diff -Nru dav1d-0.9.2/debian/changelog dav1d-1.0.0/debian/changelog --- dav1d-0.9.2/debian/changelog 2021-10-27 10:51:02.000000000 +0000 +++ dav1d-1.0.0/debian/changelog 2022-06-24 20:16:02.000000000 +0000 @@ -1,4 +1,11 @@ -dav1d (0.9.2-1sergeyd2~16.04.1) xenial; urgency=medium +dav1d (1.0.0-1sergeyd3.1~16.04.1) xenial; urgency=medium + + * New upstream release + * Bump soname to v6 + + -- Sergey Dryabzhinsky Fri, 24 Jun 2022 23:02:35 +0300 + +dav1d (0.9.2-1sergeyd2~debian11.1) bullseye; urgency=medium * New upstream release diff -Nru dav1d-0.9.2/debian/control dav1d-1.0.0/debian/control --- dav1d-0.9.2/debian/control 2021-02-21 06:46:21.000000000 +0000 +++ dav1d-1.0.0/debian/control 2022-06-24 20:26:42.000000000 +0000 @@ -3,11 +3,11 @@ Priority: optional Maintainer: Debian Multimedia Maintainers Uploaders: Dylan Aïssi -Build-Depends: debhelper (>= 9), dh-autoreconf, +Build-Depends: debhelper (>= 9), meson (>= 0.47), ninja-build, nasm (>= 2.14) [any-amd64 any-i386] -Standards-Version: 4.5.1 +Standards-Version: 4.6.0 Rules-Requires-Root: no Homepage: https://www.videolan.org/projects/dav1d.html Vcs-Browser: https://salsa.debian.org/multimedia-team/dav1d @@ -30,7 +30,7 @@ * full acceleration for ARMv8 chips * partial acceleration for ARMv7 chips -Package: libdav1d5 +Package: libdav1d6 Architecture: any Multi-Arch: same Section: libs @@ -55,7 +55,7 @@ Architecture: any Multi-Arch: same Section: libdevel -Depends: libdav1d5 (= ${binary:Version}), +Depends: libdav1d6 (= ${binary:Version}), ${misc:Depends} Description: fast and small AV1 video stream decoder (development files) dav1d is an AOMedia Video 1 (AV1) cross-platform decoder and focused on speed diff -Nru dav1d-0.9.2/debian/libdav1d5.install dav1d-1.0.0/debian/libdav1d5.install --- dav1d-0.9.2/debian/libdav1d5.install 2021-01-28 20:10:20.000000000 +0000 +++ dav1d-1.0.0/debian/libdav1d5.install 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -usr/lib/*/libdav1d.so.* diff -Nru dav1d-0.9.2/debian/libdav1d5.symbols dav1d-1.0.0/debian/libdav1d5.symbols --- dav1d-0.9.2/debian/libdav1d5.symbols 2021-07-30 09:21:25.000000000 +0000 +++ dav1d-1.0.0/debian/libdav1d5.symbols 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -libdav1d.so.5 #PACKAGE# #MINVER# -* Build-Depends-Package: libdav1d-dev - dav1d_close@Base 0.1.0 - dav1d_data_create@Base 0.1.0 - dav1d_data_unref@Base 0.1.0 - dav1d_data_wrap@Base 0.1.0 - dav1d_data_wrap_user_data@Base 0.2.1 - dav1d_default_settings@Base 0.1.0 - dav1d_flush@Base 0.1.0 - dav1d_get_event_flags@Base 0.9.0 - dav1d_get_picture@Base 0.1.0 - dav1d_open@Base 0.1.0 - dav1d_parse_sequence_header@Base 0.1.0 - dav1d_picture_unref@Base 0.1.0 - dav1d_send_data@Base 0.1.0 - dav1d_set_cpu_flags_mask@Base 0.1.0 - dav1d_version@Base 0.1.0 diff -Nru dav1d-0.9.2/debian/libdav1d6.install dav1d-1.0.0/debian/libdav1d6.install --- dav1d-0.9.2/debian/libdav1d6.install 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/debian/libdav1d6.install 2022-05-22 12:56:08.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/*/libdav1d.so.* diff -Nru dav1d-0.9.2/debian/libdav1d6.symbols dav1d-1.0.0/debian/libdav1d6.symbols --- dav1d-0.9.2/debian/libdav1d6.symbols 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/debian/libdav1d6.symbols 2022-05-22 12:56:08.000000000 +0000 @@ -0,0 +1,20 @@ +libdav1d.so.6 #PACKAGE# #MINVER# +* Build-Depends-Package: libdav1d-dev + dav1d_apply_grain@Base 1.0.0 + dav1d_close@Base 0.1.0 + dav1d_data_create@Base 0.1.0 + dav1d_data_props_unref@Base 1.0.0 + dav1d_data_unref@Base 0.1.0 + dav1d_data_wrap@Base 0.1.0 + 
dav1d_data_wrap_user_data@Base 0.2.1 + dav1d_default_settings@Base 0.1.0 + dav1d_flush@Base 0.1.0 + dav1d_get_decode_error_data_props@Base 1.0.0 + dav1d_get_event_flags@Base 0.9.0 + dav1d_get_picture@Base 0.1.0 + dav1d_open@Base 0.1.0 + dav1d_parse_sequence_header@Base 0.1.0 + dav1d_picture_unref@Base 0.1.0 + dav1d_send_data@Base 0.1.0 + dav1d_set_cpu_flags_mask@Base 0.1.0 + dav1d_version@Base 0.1.0 diff -Nru dav1d-0.9.2/debian/rules dav1d-1.0.0/debian/rules --- dav1d-0.9.2/debian/rules 2021-02-21 06:51:18.000000000 +0000 +++ dav1d-1.0.0/debian/rules 2022-06-24 20:25:27.000000000 +0000 @@ -18,7 +18,7 @@ meson \ --prefix=/usr \ --buildtype=release \ - -Denable_tests=true \ + -Denable_tests=false \ -Denable_tools=true \ -Dfuzzing_engine=none \ -Dtestdata_tests=false \ @@ -30,9 +30,9 @@ override_dh_auto_install: DESTDIR="$(CURDIR)/debian/tmp" ninja -C build install - LIB=`cd $(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/ && ls -1 libdav1d.so.5.* | sort | tail -1`; echo $$LIB; \ - ln -snf "$$LIB" $(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/libdav1d.so.5 - ln -snf libdav1d.so.5 $(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/libdav1d.so + LIB=`cd $(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/ && ls -1 libdav1d.so.6.* | sort | tail -1`; echo $$LIB; \ + ln -snf "$$LIB" $(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/libdav1d.so.6 + ln -snf libdav1d.so.6 $(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/libdav1d.so cp -v build/tools/libdav1d_*.a $(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/ override_dh_auto_test: diff -Nru dav1d-0.9.2/doc/meson.build dav1d-1.0.0/doc/meson.build --- dav1d-0.9.2/doc/meson.build 2021-09-03 15:51:24.389037100 +0000 +++ dav1d-1.0.0/doc/meson.build 2022-03-18 14:31:55.962355900 +0000 @@ -1,4 +1,4 @@ -# Copyright © 2018, VideoLAN and dav1d authors +# Copyright © 2018-2021, VideoLAN and dav1d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -22,22 +22,23 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-doxygen = find_program('doxygen', required: false) -dot = find_program('dot', required: false) +if not get_option('enable_docs') + subdir_done() +endif -if doxygen.found() and dot.found() - conf_data = configuration_data() - conf_data.set('DOXYGEN_INPUT', join_paths(dav1d_src_root, 'include/dav1d')) - conf_data.set('DOXYGEN_STRIP', join_paths(dav1d_src_root, 'include')) - conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir()) - doxyfile = configure_file(input: 'Doxyfile.in', - output: 'Doxyfile', - configuration: conf_data) +doxygen = find_program('doxygen') +dot = find_program('dot') - custom_target('doc', - build_by_default: false, - command: [doxygen, doxyfile], - output: ['html'] - ) -endif +conf_data = configuration_data() +conf_data.set('DOXYGEN_INPUT', dav1d_src_root / 'include/dav1d') +conf_data.set('DOXYGEN_STRIP', dav1d_src_root / 'include') +conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir()) +doxyfile = configure_file(input: 'Doxyfile.in', + output: 'Doxyfile', + configuration: conf_data) +custom_target('doc', + build_by_default: false, + command: [doxygen, doxyfile], + output: ['html'] +) diff -Nru dav1d-0.9.2/examples/dav1dplay.c dav1d-1.0.0/examples/dav1dplay.c --- dav1d-0.9.2/examples/dav1dplay.c 2021-09-03 15:51:24.389037100 +0000 +++ dav1d-1.0.0/examples/dav1dplay.c 2022-03-18 14:31:55.962355900 +0000 @@ -114,9 +114,9 @@ fprintf(stderr, "Supported options:\n" " --input/-i $file: input file\n" " --untimed/-u: ignore PTS, render as fast as possible\n" - " --framethreads $num: number of frame threads (default: 1)\n" - " --tilethreads $num: number of tile threads (default: 1)\n" - " --pfthreads $num: number of postfilter threads(default: 1)\n" + " --threads $num: number of threads (default: 0)\n" + " --framedelay $num: maximum frame delay, capped at $threads (default: 0);\n" + " set to 1 for low-latency decoding\n" " --highquality: enable high quality rendering\n" " --zerocopy/-z: enable zero copy upload path\n" " --gpugrain/-g: enable GPU grain synthesis\n" @@ -147,9 +147,8 @@ static const char short_opts[] = "i:vuzgr:"; enum { - ARG_FRAME_THREADS = 256, - ARG_TILE_THREADS, - ARG_POSTFILTER_THREADS, + ARG_THREADS = 256, + ARG_FRAME_DELAY, ARG_HIGH_QUALITY, }; @@ -158,9 +157,8 @@ { "input", 1, NULL, 'i' }, { "version", 0, NULL, 'v' }, { "untimed", 0, NULL, 'u' }, - { "framethreads", 1, NULL, ARG_FRAME_THREADS }, - { "tilethreads", 1, NULL, ARG_TILE_THREADS }, - { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS }, + { "threads", 1, NULL, ARG_THREADS }, + { "framedelay", 1, NULL, ARG_FRAME_DELAY }, { "highquality", 0, NULL, ARG_HIGH_QUALITY }, { "zerocopy", 0, NULL, 'z' }, { "gpugrain", 0, NULL, 'g' }, @@ -191,17 +189,13 @@ case 'r': settings->renderer_name = optarg; break; - case ARG_FRAME_THREADS: - lib_settings->n_frame_threads = - parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]); - break; - case ARG_TILE_THREADS: - lib_settings->n_tile_threads = - parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]); - break; - case ARG_POSTFILTER_THREADS: - lib_settings->n_postfilter_threads = - parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]); + case ARG_THREADS: + lib_settings->n_threads = + parse_unsigned(optarg, ARG_THREADS, argv[0]); + break; + case ARG_FRAME_DELAY: + lib_settings->max_frame_delay = + parse_unsigned(optarg, ARG_FRAME_DELAY, argv[0]); break; default: dp_settings_print_usage(argv[0], NULL); @@ -279,7 +273,7 @@ renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name); if (renderer_info == NULL) { - printf("No suitable rendered matching %s found.\n", 
+ printf("No suitable renderer matching %s found.\n", (rd_ctx->settings.renderer_name) ? rd_ctx->settings.renderer_name : "auto"); } else { printf("Using %s renderer\n", renderer_info->name); diff -Nru dav1d-0.9.2/examples/dp_renderer_placebo.c dav1d-1.0.0/examples/dp_renderer_placebo.c --- dav1d-0.9.2/examples/dp_renderer_placebo.c 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/examples/dp_renderer_placebo.c 2022-03-18 14:31:55.962355900 +0000 @@ -120,7 +120,7 @@ } #ifdef HAVE_PLACEBO_OPENGL -static void *placebo_renderer_create_gl() +static void *placebo_renderer_create_gl(void) { SDL_Window *sdlwin = NULL; SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG); @@ -134,7 +134,7 @@ sdlwin = rd_priv_ctx->win; // Init OpenGL - struct pl_opengl_params params = pl_opengl_default_params; + struct pl_opengl_params params = { 0 }; # ifndef NDEBUG params.debug = true; # endif @@ -177,7 +177,7 @@ #endif #ifdef HAVE_PLACEBO_VULKAN -static void *placebo_renderer_create_vk() +static void *placebo_renderer_create_vk(void) { SDL_Window *sdlwin = NULL; @@ -211,7 +211,7 @@ printf(" %s\n", extensions[i]); } - struct pl_vk_inst_params iparams = pl_vk_inst_default_params; + struct pl_vk_inst_params iparams = { 0 }; iparams.extensions = extensions; iparams.num_extensions = num; @@ -374,7 +374,7 @@ assert(rd_priv_ctx != NULL); SDL_LockMutex(rd_priv_ctx->lock); - int ret = pl_allocate_dav1dpicture(pic, rd_priv_ctx->gpu); + int ret = pl_allocate_dav1dpicture(pic, (void *) rd_priv_ctx->gpu); SDL_UnlockMutex(rd_priv_ctx->lock); return ret; } @@ -385,7 +385,7 @@ assert(rd_priv_ctx != NULL); SDL_LockMutex(rd_priv_ctx->lock); - pl_release_dav1dpicture(pic, rd_priv_ctx->gpu); + pl_release_dav1dpicture(pic, (void *) rd_priv_ctx->gpu); SDL_UnlockMutex(rd_priv_ctx->lock); } diff -Nru dav1d-0.9.2/include/compat/gcc/stdatomic.h dav1d-1.0.0/include/compat/gcc/stdatomic.h --- dav1d-0.9.2/include/compat/gcc/stdatomic.h 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/include/compat/gcc/stdatomic.h 2022-03-18 14:31:55.962355900 +0000 @@ -41,6 +41,8 @@ #define atomic_load_explicit(p_a, mo) __atomic_load_n(p_a, mo) #define atomic_fetch_add(p_a, inc) __atomic_fetch_add(p_a, inc, __ATOMIC_SEQ_CST) #define atomic_fetch_sub(p_a, dec) __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST) +#define atomic_exchange(p_a, v) __atomic_exchange_n(p_a, v, __ATOMIC_SEQ_CST) +#define atomic_fetch_or(p_a, v) __atomic_fetch_or(p_a, v, __ATOMIC_SEQ_CST) #endif /* !defined(__cplusplus) */ diff -Nru dav1d-0.9.2/include/compat/msvc/stdatomic.h dav1d-1.0.0/include/compat/msvc/stdatomic.h --- dav1d-0.9.2/include/compat/msvc/stdatomic.h 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/include/compat/msvc/stdatomic.h 2022-03-18 14:31:55.962355900 +0000 @@ -41,8 +41,8 @@ #include "common/attributes.h" -typedef volatile LONG __declspec(align(32)) atomic_int; -typedef volatile ULONG __declspec(align(32)) atomic_uint; +typedef volatile LONG atomic_int; +typedef volatile ULONG atomic_uint; typedef enum { memory_order_relaxed, @@ -52,6 +52,7 @@ #define atomic_init(p_a, v) do { *(p_a) = (v); } while(0) #define atomic_store(p_a, v) InterlockedExchange((LONG*)p_a, v) #define atomic_load(p_a) InterlockedCompareExchange((LONG*)p_a, 0, 0) +#define atomic_exchange(p_a, v) InterlockedExchange(p_a, v) #define atomic_load_explicit(p_a, mo) atomic_load(p_a) /* @@ -60,6 +61,7 @@ */ #define atomic_fetch_add(p_a, inc) InterlockedExchangeAdd(p_a, inc) #define atomic_fetch_sub(p_a, dec) InterlockedExchangeAdd(p_a, -(dec)) +#define atomic_fetch_or(p_a, v) 
InterlockedOr(p_a, v) #endif /* ! stdatomic.h */ diff -Nru dav1d-0.9.2/include/dav1d/common.h dav1d-1.0.0/include/dav1d/common.h --- dav1d-0.9.2/include/dav1d/common.h 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/include/dav1d/common.h 2022-03-18 14:31:55.962355900 +0000 @@ -78,4 +78,9 @@ struct Dav1dUserData user_data; ///< user-configurable data, default NULL members } Dav1dDataProps; +/** + * Release reference to a Dav1dDataProps. + */ +DAV1D_API void dav1d_data_props_unref(Dav1dDataProps *props); + #endif /* DAV1D_COMMON_H */ diff -Nru dav1d-0.9.2/include/dav1d/dav1d.h dav1d-1.0.0/include/dav1d/dav1d.h --- dav1d-0.9.2/include/dav1d/dav1d.h 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/include/dav1d/dav1d.h 2022-03-18 14:31:55.962355900 +0000 @@ -43,9 +43,8 @@ typedef struct Dav1dContext Dav1dContext; typedef struct Dav1dRef Dav1dRef; -#define DAV1D_MAX_FRAME_THREADS 256 -#define DAV1D_MAX_TILE_THREADS 64 -#define DAV1D_MAX_POSTFILTER_THREADS 256 +#define DAV1D_MAX_THREADS 256 +#define DAV1D_MAX_FRAME_DELAY 256 typedef struct Dav1dLogger { void *cookie; ///< Custom data to pass to the callback. @@ -59,17 +58,35 @@ void (*callback)(void *cookie, const char *format, va_list ap); } Dav1dLogger; +enum Dav1dInloopFilterType { + DAV1D_INLOOPFILTER_NONE = 0, + DAV1D_INLOOPFILTER_DEBLOCK = 1 << 0, + DAV1D_INLOOPFILTER_CDEF = 1 << 1, + DAV1D_INLOOPFILTER_RESTORATION = 1 << 2, + DAV1D_INLOOPFILTER_ALL = DAV1D_INLOOPFILTER_DEBLOCK | + DAV1D_INLOOPFILTER_CDEF | + DAV1D_INLOOPFILTER_RESTORATION, +}; + typedef struct Dav1dSettings { - int n_frame_threads; - int n_tile_threads; - int apply_grain; - int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31) - int all_layers; ///< output all spatial layers of a scalable AV1 biststream - unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited) + int n_threads; ///< number of threads (0 = number of logical cores in host system, default 0) + int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = ceil(sqrt(n_threads)), default 0) + int apply_grain; ///< whether to apply film grain on output frames (default 1) + int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31, default 0) + int all_layers; ///< output all spatial layers of a scalable AV1 biststream (default 1) + unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited, default 0) Dav1dPicAllocator allocator; ///< Picture allocator callback. Dav1dLogger logger; ///< Logger callback. - int n_postfilter_threads; - uint8_t reserved[28]; ///< reserved for future use + int strict_std_compliance; ///< whether to abort decoding on standard compliance violations + ///< that don't affect actual bitstream decoding (e.g. inconsistent + ///< or invalid metadata, default 0) + int output_invisible_frames; ///< output invisibly coded frames (in coding order) in addition + ///< to all visible frames. Because of show-existing-frame, this + ///< means some frames may appear twice (once when coded, + ///< once when shown, default 0) + enum Dav1dInloopFilterType inloop_filters; ///< postfilters to enable during decoding (default + ///< DAV1D_INLOOPFILTER_ALL) + uint8_t reserved[20]; ///< reserved for future use } Dav1dSettings; /** @@ -189,6 +206,27 @@ DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out); /** + * Apply film grain to a previously decoded picture. If the picture contains no + * film grain metadata, then this function merely returns a new reference. 
+ * + * @param c Input decoder instance. + * @param out Output frame. The caller assumes ownership of the returned + * reference. + * @param in Input frame. No ownership is transferred. + * + * @return + * 0: Success, and a frame is returned. + * other negative DAV1D_ERR codes: Error due to lack of memory or because of + * invalid passed-in arguments. + * + * @note If `Dav1dSettings.apply_grain` is true, film grain was already applied + * by `dav1d_get_picture`, and so calling this function leads to double + * application of film grain. Users should only call this when needed. + */ +DAV1D_API int dav1d_apply_grain(Dav1dContext *c, Dav1dPicture *out, + const Dav1dPicture *in); + +/** * Close a decoder instance and free all associated memory. * * @param c_out The decoder instance to close. *c_out will be set to NULL. @@ -236,6 +274,19 @@ */ DAV1D_API int dav1d_get_event_flags(Dav1dContext *c, enum Dav1dEventFlags *flags); +/** + * Retrieve the user-provided metadata associated with the input data packet + * for the last decoding error reported to the user, i.e. a negative return + * value (not EAGAIN) from dav1d_send_data() or dav1d_get_picture(). + * + * @param c Input decoder instance. + * @param out Output Dav1dDataProps. On success, the caller assumes ownership of + * the returned reference. + * + * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error. + */ +DAV1D_API int dav1d_get_decode_error_data_props(Dav1dContext *c, Dav1dDataProps *out); + # ifdef __cplusplus } # endif diff -Nru dav1d-0.9.2/meson.build dav1d-1.0.0/meson.build --- dav1d-0.9.2/meson.build 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/meson.build 2022-03-18 14:31:55.966356000 +0000 @@ -1,4 +1,4 @@ -# Copyright © 2018-2021, VideoLAN and dav1d authors +# Copyright © 2018-2022, VideoLAN and dav1d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -23,14 +23,14 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
project('dav1d', ['c'], - version: '0.9.2', + version: '1.0.0', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.49.0') -dav1d_soname_version = '5.1.1' +dav1d_soname_version = '6.6.0' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -109,10 +109,6 @@ cdata.set('ftello', '_ftelli64') endif - if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc') - optional_arguments += '-mcmodel=small' - endif - # On Windows, we use a compatibility layer to emulate pthread thread_dependency = [] thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c')) @@ -128,7 +124,7 @@ rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) - rc_data.set('COPYRIGHT_YEARS', '2021') + rc_data.set('COPYRIGHT_YEARS', '2022') else thread_dependency = dependency('threads') thread_compat_dep = [] @@ -203,6 +199,11 @@ cdata.set('HAVE_IO_H', 1) endif +if cc.check_header('pthread_np.h') + cdata.set('HAVE_PTHREAD_NP_H', 1) + test_args += '-DHAVE_PTHREAD_NP_H' +endif + # Function checks @@ -234,6 +235,16 @@ endif endif +pthread_np_prefix = ''' +#include +#ifdef HAVE_PTHREAD_NP_H +#include +#endif +''' +if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency) + cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1) +endif + # Compiler flag tests if cc.has_argument('-fvisibility=hidden') @@ -382,7 +393,11 @@ cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le') -if cc.symbols_have_underscore_prefix() +# meson's cc.symbols_have_underscore_prefix() is unfortunately unrelieably +# when additional flags like '-fprofile-instr-generate' are passed via CFLAGS +# see following meson issue https://github.com/mesonbuild/meson/issues/5482 +if (host_machine.system() == 'darwin' or + (host_machine.system() == 'windows' and host_machine.cpu_family() == 'x86')) cdata.set10('PREFIX', true) cdata_asm.set10('PREFIX', true) endif @@ -398,22 +413,13 @@ # check NASM version if nasm.found() - nasm_r = run_command(nasm, '-v') - - if nasm_r.returncode() != 0 - error('failed running nasm to obtain its version') - endif + nasm_r = run_command(nasm, '-v', check: true) out = nasm_r.stdout().strip().split() if out[1].to_lower() == 'version' - if out[2].version_compare('<2.13.02') - error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2])) - elif out[2].version_compare('<2.14') and get_option('enable_avx512') - error('nasm 2.14 or later is required for AVX-512 asm.\n' + - 'AVX-512 asm can be disabled with \'-Denable_avx512=false\'') + if out[2].version_compare('<2.14') + error('nasm 2.14 or later is required, found nasm @0@'.format(out[2])) endif - cdata.set10('HAVE_AVX512ICL', get_option('enable_avx512')) - cdata_asm.set10('HAVE_AVX512ICL', get_option('enable_avx512')) else error('unexpected nasm version string: @0@'.format(nasm_r.stdout())) endif diff -Nru dav1d-0.9.2/meson_options.txt dav1d-1.0.0/meson_options.txt --- dav1d-0.9.2/meson_options.txt 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/meson_options.txt 2022-03-18 14:31:55.966356000 +0000 @@ -10,11 +10,6 @@ value: true, description: 'Build asm files, if available') -option('enable_avx512', - type: 'boolean', - value: true, - description: 'Build AVX-512 asm files, requires nasm 2.14') 
- option('enable_tools', type: 'boolean', value: true, @@ -30,6 +25,11 @@ value: true, description: 'Build dav1d tests') +option('enable_docs', + type: 'boolean', + value: false, + description: 'Build dav1d documentation') + option('logging', type: 'boolean', value: true, diff -Nru dav1d-0.9.2/NEWS dav1d-1.0.0/NEWS --- dav1d-0.9.2/NEWS 2021-09-03 15:51:24.389037100 +0000 +++ dav1d-1.0.0/NEWS 2022-03-18 14:31:55.958356000 +0000 @@ -1,3 +1,33 @@ +Changes for 1.0.0 'Peregrine falcon': +------------------------------------- + +1.0.0 is a major release of dav1d, adding important features and bug fixes. + +It notably changes, in an important way, the way threading works, by adding +an automatic thread management. + +It also adds support for AVX-512 acceleration, and adds speedups to existing x86 +code (from SSE2 to AVX2). + +1.0.0 adds new grain API to ease acceleration on the GPU, and adds an API call +to get information of which frame failed to decode, in error cases. + +Finally, 1.0.0 fixes numerous small bugs that were reported since the beginning +of the project to have a proper release. + + .''. + .''. . *''* :_\/_: . + :_\/_: _\(/_ .:.*_\/_* : /\ : .'.:.'. + .''.: /\ : ./)\ ':'* /\ * : '..'. -=:o:=- + :_\/_:'.:::. ' *''* * '.\'/.' _\(/_'.':'.' + : /\ : ::::: *_\/_* -= o =- /)\ ' * + '..' ':::' * /\ * .'/.\'. ' + * *..* : + * : + * 1.0.0 + + + Changes for 0.9.2 'Golden Eagle': --------------------------------- diff -Nru dav1d-0.9.2/README.md dav1d-1.0.0/README.md --- dav1d-0.9.2/README.md 2021-09-03 15:51:24.389037100 +0000 +++ dav1d-1.0.0/README.md 2022-03-18 14:31:55.958356000 +0000 @@ -2,11 +2,13 @@ # dav1d -**dav1d** is a new **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness. +**dav1d** is an **AV1** cross-platform **d**ecoder, open-source, and focused on speed and correctness. + +It is now battle-tested and production-ready and can be used everywhere. The canonical repository URL for this repo is https://code.videolan.org/videolan/dav1d -This project is partially funded by the *Alliance for Open Media*/**AOM**. +This project was partially funded by the *Alliance for Open Media*/**AOM**. ## Goal and Features @@ -38,11 +40,11 @@ 9. Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips, 10. Make high bit-depth fast on desktop, by writing asm for AVX2 chips, 11. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips, +12. Improve threading. ### On-going -12. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list), -13. Accelerate for less common architectures, like PPC, SSE2 or AVX-512. -14. Improve threading. +13. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list), +14. Accelerate for less common architectures, like PPC, SSE2, RISC-V or AVX-512. ### After 15. Use more GPU decoding, when possible. @@ -78,7 +80,7 @@ # Compile -1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher) +1. Install [Meson](https://mesonbuild.com/) (0.49 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.14 or higher) 2. Run `mkdir build && cd build` to create a build directory and enter it 3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired 4. 
Run `ninja` to compile @@ -105,6 +107,14 @@ meson build --cross-file=package/crossfiles/i686-linux32.meson ``` +## Build documentation + +1. Install [doxygen](https://www.doxygen.nl/) and [graphviz](https://www.graphviz.org/) +2. Run `meson build -Denable_docs=true` to create the build directory +3. Run `ninja -C build doc/html` to build the docs + +The result can be found in `build/doc/html/`. An online version built from master can be found [here](https://videolan.videolan.me/dav1d/). + # Run tests 1. In the root directory, run `git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data` to fetch the test data repository @@ -146,6 +156,3 @@ ## Will you care about ? ? - We do, but we don't have either the time or the knowledge. Therefore, patches and contributions welcome. - -## Where can I find documentation? -- The current library documentation, built from master, can be found [here](https://videolan.videolan.me/dav1d/). diff -Nru dav1d-0.9.2/src/arm/32/cdef16.S dav1d-1.0.0/src/arm/32/cdef16.S --- dav1d-0.9.2/src/arm/32/cdef16.S 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/src/arm/32/cdef16.S 2022-03-18 14:31:55.966356000 +0000 @@ -32,10 +32,10 @@ // r1 = d0/q0 // r2 = d2/q1 .macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret - tst r6, #1 // CDEF_HAVE_LEFT + tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vldr s8, [\s1, #-4] @@ -52,7 +52,7 @@ vst1.16 {\r2}, [r0, :\align] vstr s11, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f @@ -72,7 +72,7 @@ vst1.16 {\r2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f @@ -80,7 +80,7 @@ 2: // !CDEF_HAVE_LEFT - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vld1.16 {\r1}, [\s1, :\align] @@ -95,7 +95,7 @@ vst1.16 {\r2}, [r0, :\align] vstr s9, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f @@ -113,7 +113,7 @@ vst1.16 {\r2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride .endif @@ -122,18 +122,19 @@ // void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], -// const pixel *const top, int h, +// const pixel *const top, +// const pixel *const bottom, int h, // enum CdefEdgeFlags edges); // r1 = d0/q0 // r2 = d2/q1 .macro padding_func_16 w, stride, r1, r2, align function cdef_padding\w\()_16bpc_neon, export=1 - push {r4-r7,lr} - ldrd r4, r5, [sp, #20] - ldr r6, [sp, #28] + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] vmov.i16 q3, #0x8000 - tst r6, #4 // CDEF_HAVE_TOP + tst r7, #4 // CDEF_HAVE_TOP bne 1f // !CDEF_HAVE_TOP sub r12, r0, #2*(2*\stride+2) @@ -145,23 +146,23 @@ b 3f 1: // CDEF_HAVE_TOP - add r7, r4, r2 + add r8, r4, r2 sub r0, r0, #2*(2*\stride) - pad_top_bot_16 r4, r7, \w, \stride, \r1, \r2, \align, 0 + pad_top_bot_16 r4, r8, \w, \stride, \r1, \r2, \align, 0 // Middle section 3: - tst r6, #1 // CDEF_HAVE_LEFT + tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vld1.32 {d2[]}, [r3, :32]! 
vldr s5, [r1, #2*\w] vld1.16 {\r1}, [r1, :\align], r2 - subs r5, r5, #1 + subs r6, r6, #1 vstr s4, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s5, [r0, #2*\w] @@ -172,7 +173,7 @@ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.32 {d2[]}, [r3, :32]! vld1.16 {\r1}, [r1, :\align], r2 - subs r5, r5, #1 + subs r6, r6, #1 vstr s4, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] @@ -180,13 +181,13 @@ bgt 1b b 3f 2: - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vldr s4, [r1, #2*\w] vld1.16 {\r1}, [r1, :\align], r2 - subs r5, r5, #1 + subs r6, r6, #1 vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s4, [r0, #2*\w] @@ -196,7 +197,7 @@ 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {\r1}, [r1, :\align], r2 - subs r5, r5, #1 + subs r6, r6, #1 vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] @@ -204,7 +205,7 @@ bgt 1b 3: - tst r6, #8 // CDEF_HAVE_BOTTOM + tst r7, #8 // CDEF_HAVE_BOTTOM bne 1f // !CDEF_HAVE_BOTTOM sub r12, r0, #4 @@ -213,11 +214,11 @@ .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif - pop {r4-r7,pc} + pop {r4-r8,pc} 1: // CDEF_HAVE_BOTTOM - add r7, r1, r2 - pad_top_bot_16 r1, r7, \w, \stride, \r1, \r2, \align, 1 + add r8, r5, r2 + pad_top_bot_16 r5, r8, \w, \stride, \r1, \r2, \align, 1 endfunc .endm diff -Nru dav1d-0.9.2/src/arm/32/cdef.S dav1d-1.0.0/src/arm/32/cdef.S --- dav1d-0.9.2/src/arm/32/cdef.S 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/src/arm/32/cdef.S 2022-03-18 14:31:55.966356000 +0000 @@ -34,10 +34,10 @@ // n2 = s4/d2 // w2 = d2/q1 .macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret - tst r6, #1 // CDEF_HAVE_LEFT + tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldrh r12, [\s1, #-2] @@ -61,7 +61,7 @@ vst1.16 {\w2}, [r0, :\align] vstr s11, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f @@ -86,7 +86,7 @@ vst1.16 {\w2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f @@ -94,7 +94,7 @@ 2: // !CDEF_HAVE_LEFT - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vldr \n1, [\s1] @@ -114,7 +114,7 @@ vst1.16 {\w2}, [r0, :\align] vstr s9, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f @@ -134,7 +134,7 @@ vst1.16 {\w2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret - pop {r4-r7,pc} + pop {r4-r8,pc} .else add r0, r0, #2*\stride .endif @@ -151,7 +151,8 @@ // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], -// const pixel *const top, int h, +// const pixel *const top, +// const pixel *const bottom, int h, // enum CdefEdgeFlags edges); // n1 = s0/d0 @@ -160,13 +161,13 @@ // w2 = d2/q1 .macro padding_func w, stride, n1, w1, n2, w2, align function cdef_padding\w\()_8bpc_neon, export=1 - push {r4-r7,lr} - ldrd r4, r5, [sp, #20] - ldr r6, [sp, #28] - cmp r6, #0xf // fully edged + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + cmp r7, #0xf // fully edged beq cdef_padding\w\()_edged_8bpc_neon vmov.i16 q3, #0x8000 - tst r6, #4 // CDEF_HAVE_TOP + tst r7, #4 // CDEF_HAVE_TOP bne 1f // !CDEF_HAVE_TOP sub r12, r0, #2*(2*\stride+2) @@ -178,23 +179,23 @@ b 3f 1: // CDEF_HAVE_TOP - add r7, r4, r2 + add r8, r4, r2 sub r0, r0, #2*(2*\stride) - pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, 
\align, 0 + pad_top_bottom r4, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 0 // Middle section 3: - tst r6, #1 // CDEF_HAVE_LEFT + tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vld1.16 {d2[]}, [r3, :16]! ldrh r12, [r1, #\w] load_n_incr d0, r1, r2, \w - subs r5, r5, #1 + subs r6, r6, #1 vmov.16 d2[1], r12 vmovl.u8 q0, d0 vmovl.u8 q1, d2 @@ -208,7 +209,7 @@ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {d2[]}, [r3, :16]! load_n_incr d0, r1, r2, \w - subs r5, r5, #1 + subs r6, r6, #1 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s4, [r0, #-4] @@ -218,14 +219,14 @@ bgt 1b b 3f 2: - tst r6, #2 // CDEF_HAVE_RIGHT + tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldrh r12, [r1, #\w] load_n_incr d0, r1, r2, \w vdup.16 d2, r12 - subs r5, r5, #1 + subs r6, r6, #1 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s12, [r0, #-4] @@ -237,7 +238,7 @@ 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr d0, r1, r2, \w - subs r5, r5, #1 + subs r6, r6, #1 vmovl.u8 q0, d0 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] @@ -246,7 +247,7 @@ bgt 1b 3: - tst r6, #8 // CDEF_HAVE_BOTTOM + tst r7, #8 // CDEF_HAVE_BOTTOM bne 1f // !CDEF_HAVE_BOTTOM sub r12, r0, #4 @@ -255,11 +256,11 @@ .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif - pop {r4-r7,pc} + pop {r4-r8,pc} 1: // CDEF_HAVE_BOTTOM - add r7, r1, r2 - pad_top_bottom r1, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 1 + add r8, r5, r2 + pad_top_bottom r5, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 1 endfunc .endm @@ -268,7 +269,8 @@ // void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], -// const pixel *const top, int h, +// const pixel *const top, +// const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_edged w, stride, reg, align @@ -277,16 +279,16 @@ ldrh r12, [r4, #-2] vldr \reg, [r4] - add r7, r4, r2 + add r8, r4, r2 strh r12, [r0, #-2] ldrh r12, [r4, #\w] vstr \reg, [r0] strh r12, [r0, #\w] - ldrh r12, [r7, #-2] - vldr \reg, [r7] + ldrh r12, [r8, #-2] + vldr \reg, [r8] strh r12, [r0, #\stride-2] - ldrh r12, [r7, #\w] + ldrh r12, [r8, #\w] vstr \reg, [r0, #\stride] strh r12, [r0, #\stride+\w] add r0, r0, #2*\stride @@ -297,28 +299,28 @@ str r12, [r0, #-2] ldrh r12, [r1, #\w] add r1, r1, r2 - subs r5, r5, #1 + subs r6, r6, #1 vstr \reg, [r0] str r12, [r0, #\w] add r0, r0, #\stride bgt 0b - ldrh r12, [r1, #-2] - vldr \reg, [r1] - add r7, r1, r2 + ldrh r12, [r5, #-2] + vldr \reg, [r5] + add r8, r5, r2 strh r12, [r0, #-2] - ldrh r12, [r1, #\w] + ldrh r12, [r5, #\w] vstr \reg, [r0] strh r12, [r0, #\w] - ldrh r12, [r7, #-2] - vldr \reg, [r7] + ldrh r12, [r8, #-2] + vldr \reg, [r8] strh r12, [r0, #\stride-2] - ldrh r12, [r7, #\w] + ldrh r12, [r8, #\w] vstr \reg, [r0, #\stride] strh r12, [r0, #\stride+\w] - pop {r4-r7,pc} + pop {r4-r8,pc} endfunc .endm diff -Nru dav1d-0.9.2/src/arm/32/film_grain16.S dav1d-1.0.0/src/arm/32/film_grain16.S --- dav1d-0.9.2/src/arm/32/film_grain16.S 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/src/arm/32/film_grain16.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,2137 +0,0 @@ -/* - * Copyright © 2021, VideoLAN and dav1d authors - * Copyright © 2021, Martin Storsjo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "src/arm/asm.S" -#include "util.S" -#include "src/arm/asm-offsets.h" - -#define GRAIN_WIDTH 82 -#define GRAIN_HEIGHT 73 - -#define SUB_GRAIN_WIDTH 44 -#define SUB_GRAIN_HEIGHT 38 - -.macro increment_seed steps, shift=1 - lsr r11, r2, #3 - lsr r12, r2, #12 - lsr lr, r2, #1 - eor r11, r2, r11 // (r >> 0) ^ (r >> 3) - eor r12, r12, lr // (r >> 12) ^ (r >> 1) - eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) -.if \shift - lsr r2, r2, #\steps -.endif - and r11, r11, #((1 << \steps) - 1) // bit -.if \shift - orr r2, r2, r11, lsl #(16 - \steps) // *state -.else - orr r2, r2, r11, lsl #16 // *state -.endif -.endm - -.macro read_rand dest, bits, age - ubfx \dest, r2, #16 - \bits - \age, #\bits -.endm - -.macro read_shift_rand dest, bits - ubfx \dest, r2, #17 - \bits, #\bits - lsr r2, r2, #1 -.endm - -// special calling convention: -// r2 holds seed -// r3 holds dav1d_gaussian_sequence -// clobbers r11-r12 -// returns in d0-d1 -function get_gaussian_neon - push {r5-r6,lr} - increment_seed 4 - read_rand r5, 11, 3 - read_rand r6, 11, 2 - add r5, r3, r5, lsl #1 - add r6, r3, r6, lsl #1 - vld1.16 {d0[0]}, [r5] - read_rand r5, 11, 1 - vld1.16 {d0[1]}, [r6] - add r5, r3, r5, lsl #1 - read_rand r6, 11, 0 - increment_seed 4 - add r6, r3, r6, lsl #1 - vld1.16 {d0[2]}, [r5] - read_rand r5, 11, 3 - vld1.16 {d0[3]}, [r6] - add r5, r3, r5, lsl #1 - read_rand r6, 11, 2 - vld1.16 {d1[0]}, [r5] - add r6, r3, r6, lsl #1 - read_rand r5, 11, 1 - vld1.16 {d1[1]}, [r6] - read_rand r6, 11, 0 - add r5, r3, r5, lsl #1 - add r6, r3, r6, lsl #1 - vld1.16 {d1[2]}, [r5] - vld1.16 {d1[3]}, [r6] - pop {r5-r6,pc} -endfunc - -function get_grain_2_neon - push {r11,lr} - increment_seed 2 - read_rand r11, 11, 1 - read_rand r12, 11, 0 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d0[0]}, [r11] - vld1.16 {d0[1]}, [r12] - vrshl.s16 d0, d0, d30 - pop {r11,pc} -endfunc - -.macro get_grain_2 dst - bl get_grain_2_neon -.ifnc \dst, d0 - vmov \dst, d0 -.endif -.endm - -function get_grain_4_neon - push {r11,lr} - increment_seed 4 - read_rand r11, 11, 3 - read_rand r12, 11, 2 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d0[0]}, [r11] - read_rand r11, 11, 1 - vld1.16 {d0[1]}, [r12] - read_rand r12, 11, 0 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d0[2]}, [r11] - vld1.16 {d0[3]}, [r12] - vrshl.s16 d0, d0, d30 - pop {r11,pc} 
-endfunc - -.macro get_grain_4 dst - bl get_grain_4_neon -.ifnc \dst, d0 - vmov \dst, d0 -.endif -.endm - -// r1 holds the number of entries to produce -// r6, r8 and r10 hold the previous output entries -// q0 holds the vector of produced entries -// q1 holds the input vector of sums from above -.macro output_lag n -function output_lag\n\()_neon - push {r0, lr} -.if \n == 1 - mvn lr, r5 // grain_min = ~grain_max -.else - mov r0, #1 - mov lr, #1 - sub r7, r7, #1 - sub r9, r9, #1 - lsl r0, r0, r7 - lsl lr, lr, r9 - add r7, r7, #1 - add r9, r9, #1 -.endif -1: - read_shift_rand r12, 11 - vmov.32 r11, d2[0] - lsl r12, r12, #1 - vext.8 q0, q0, q0, #2 - ldrsh r12, [r3, r12] -.if \n == 1 - mla r11, r6, r4, r11 // sum (above) + *coeff * prev output - add r6, r11, r8 // 1 << (ar_coeff_shift - 1) - add r12, r12, r10 - asr r6, r6, r7 // >> ar_coeff_shift - asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) - add r6, r6, r12 - cmp r6, r5 -.elseif \n == 2 - mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 - mla r11, r6, r10, r11 // += *coeff * prev output 2 - mov r8, r6 - add r6, r11, r0 // 1 << (ar_coeff_shift - 1) - add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) - asr r6, r6, r7 // >> ar_coeff_shift - asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) - add r6, r6, r12 - push {lr} - cmp r6, r5 - mvn lr, r5 // grain_min = ~grain_max -.else - push {r1-r3} - sbfx r1, r4, #0, #8 - sbfx r2, r4, #8, #8 - sbfx r3, r4, #16, #8 - mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 - mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 - mla r11, r6, r3, r11 // += *coeff * prev output 3 - pop {r1-r3} - mov r10, r8 - mov r8, r6 - - add r6, r11, r0 // 1 << (ar_coeff_shift - 1) - add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) - asr r6, r6, r7 // >> ar_coeff_shift - asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) - add r6, r6, r12 - push {lr} - cmp r6, r5 - mvn lr, r5 // grain_min = ~grain_max -.endif - it gt - movgt r6, r5 - cmp r6, lr - it lt - movlt r6, lr -.if \n >= 2 - pop {lr} -.endif - subs r1, r1, #1 - vext.8 q1, q1, q1, #4 - vmov.16 d1[3], r6 - bgt 1b - pop {r0, pc} -endfunc -.endm - -output_lag 1 -output_lag 2 -output_lag 3 - - -function sum_lag1_above_neon - sub r12, r0, #1*GRAIN_WIDTH*2 - 16 - vld1.16 {q10}, [r12] // load top right - - vext.8 q0, q8, q9, #14 // top left, top mid - vext.8 q1, q9, q10, #2 // top left, top mid - - vmull.s16 q2, d18, d28 - vmlal.s16 q2, d0, d27 - vmlal.s16 q2, d2, d29 - vmull.s16 q3, d19, d28 - vmlal.s16 q3, d1, d27 - vmlal.s16 q3, d3, d29 - - vmov q8, q9 - vmov q9, q10 - - bx lr -endfunc - -.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff -.ifc \lag\()_\edge, lag3_left - bl sum_lag3_left_above_neon -.else - bl sum_\lag\()_above_neon -.endif -.ifc \type, uv_420 - vpush {q6-q7} - add r12, r11, #GRAIN_WIDTH*2 - vld1.16 {q0, q1}, [r11]! - vld1.16 {q6, q7}, [r12]! - vpadd.i16 d0, d0, d1 - vpadd.i16 d1, d2, d3 - vpadd.i16 d12, d12, d13 - vpadd.i16 d13, d14, d15 - vadd.i16 q0, q0, q6 - vpop {q6-q7} - vrshr.s16 q0, q0, #2 -.endif -.ifc \type, uv_422 - vld1.16 {q0, q1}, [r11]! - vpadd.i16 d0, d0, d1 - vpadd.i16 d1, d2, d3 - vrshr.s16 q0, q0, #1 -.endif -.ifc \type, uv_444 - vld1.16 {q0}, [r11]! 
-.endif -.if \uv_layout -.ifnb \uv_coeff - vdup.8 d13, \uv_coeff - vmovl.s8 q6, d13 -.endif - vmlal.s16 q2, d0, d13 - vmlal.s16 q3, d1, d13 -.endif -.if \uv_layout && \elems == 8 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 444 && \elems == 7 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 422 && \elems == 1 - b sum_\lag\()_uv_420_\edge\()_start -.else -sum_\lag\()_\type\()_\edge\()_start: - push {r11} -.if \elems > 4 -.ifc \edge, left - increment_seed 4 - read_rand r11, 11, 3 - read_rand r12, 11, 2 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d1[1]}, [r11] - read_rand r11, 11, 1 - vld1.16 {d1[2]}, [r12] - add r11, r3, r11, lsl #1 - vld1.16 {d1[3]}, [r11] - lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 - vrshl.s16 d1, d1, d30 - vext.8 q2, q2, q2, #12 -.ifc \lag, lag3 - vmov.s16 r10, d1[1] -.endif -.ifnc \lag, lag1 - vmov.s16 r8, d1[2] -.endif - vmov.s16 r6, d1[3] - - vmov q1, q2 - mov r1, #1 - bl output_\lag\()_neon -.else - increment_seed 4, shift=0 - vmov q1, q2 - mov r1, #4 - bl output_\lag\()_neon -.endif - - increment_seed 4, shift=0 - vmov q1, q3 -.ifc \edge, right - mov r1, #3 - bl output_\lag\()_neon - read_shift_rand r12, 11 - add r12, r3, r12, lsl #1 - vld1.16 {d2[0]}, [r12] - vrshl.s16 d2, d2, d30 - vext.8 q0, q0, q1, #2 -.else - mov r1, #4 - bl output_\lag\()_neon -.endif -.else - // elems == 1 - increment_seed 4, shift=0 - vmov q1, q2 - mov r1, #1 - bl output_\lag\()_neon - lsr r2, r2, #3 - - read_rand r11, 11, 2 - read_rand r12, 11, 1 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d2[0]}, [r11] - read_rand r11, 11, 0 - vld1.16 {d2[1]}, [r12] - add r11, r3, r11, lsl #1 - vld1.16 {d2[2]}, [r11] - vrshl.s16 d2, d2, d30 - vext.8 q0, q0, q1, #14 -.endif - vst1.16 {q0}, [r0]! 
- pop {r11} - pop {r1, pc} -.endif -.endm - -.macro sum_lag1_func type, uv_layout, edge, elems=8 -function sum_\type\()_lag1_\edge\()_neon - push {r1, lr} -.ifc \edge, left - sub r12, r0, #1*GRAIN_WIDTH*2 - vld1.8 {q9}, [r12] // load the previous block right above -.endif - sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems -endfunc -.endm - -sum_lag1_func y, 0, left -sum_lag1_func y, 0, mid -sum_lag1_func y, 0, right, 7 -sum_lag1_func uv_444, 444, left -sum_lag1_func uv_444, 444, mid -sum_lag1_func uv_444, 444, right, 7 -sum_lag1_func uv_422, 422, left -sum_lag1_func uv_422, 422, mid -sum_lag1_func uv_422, 422, right, 1 -sum_lag1_func uv_420, 420, left -sum_lag1_func uv_420, 420, mid -sum_lag1_func uv_420, 420, right, 1 - - -function sum_lag2_above_neon - push {lr} - sub r12, r0, #2*GRAIN_WIDTH*2 - 16 - sub lr, r0, #1*GRAIN_WIDTH*2 - 16 - vld1.16 {q10}, [r12] // load top right - vld1.16 {q13}, [lr] - - vdup.8 d10, d28[0] - vext.8 q0, q8, q9, #12 // top left, top mid - vdup.8 d12, d28[1] - vext.8 q1, q8, q9, #14 - vdup.8 d14, d28[3] - vext.8 q4, q9, q10, #2 // top mid, top right - vmovl.s8 q5, d10 - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - - vmull.s16 q2, d0, d10 - vmlal.s16 q2, d2, d12 - vmlal.s16 q2, d8, d14 - vmull.s16 q3, d1, d10 - vmlal.s16 q3, d3, d12 - vmlal.s16 q3, d9, d14 - - vdup.8 d10, d28[4] - vext.8 q0, q9, q10, #4 // top mid, top right - vdup.8 d12, d28[5] - vext.8 q1, q11, q12, #12 // top left, top mid - vdup.8 d14, d28[6] - vext.8 q4, q11, q12, #14 - vmovl.s8 q5, d10 - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - - vmlal.s16 q2, d0, d10 - vmlal.s16 q2, d2, d12 - vmlal.s16 q2, d8, d14 - vmlal.s16 q3, d1, d10 - vmlal.s16 q3, d3, d12 - vmlal.s16 q3, d9, d14 - - vdup.8 d10, d29[0] - vext.8 q0, q12, q13, #2 // top mid, top right - vdup.8 d12, d29[1] - vext.8 q1, q12, q13, #4 - - vdup.8 d14, d28[2] - vdup.8 d8, d28[7] - - vmovl.s8 q5, d10 - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - vmovl.s8 q4, d8 - - vmlal.s16 q2, d0, d10 - vmlal.s16 q2, d2, d12 - vmlal.s16 q2, d18, d14 - vmlal.s16 q2, d24, d8 - vmlal.s16 q3, d1, d10 - vmlal.s16 q3, d3, d12 - vmlal.s16 q3, d19, d14 - vmlal.s16 q3, d25, d8 - - vmov q8, q9 - vmov q9, q10 - - vmov q11, q12 - vmov q12, q13 - - pop {pc} -endfunc - -.macro sum_lag2_func type, uv_layout, edge, elems=8 -function sum_\type\()_lag2_\edge\()_neon - push {r1, lr} -.ifc \edge, left - sub r12, r0, #2*GRAIN_WIDTH*2 - sub lr, r0, #1*GRAIN_WIDTH*2 - vld1.16 {q9}, [r12] // load the previous block right above - vld1.16 {q12}, [lr] -.endif - sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4] -endfunc -.endm - -sum_lag2_func y, 0, left -sum_lag2_func y, 0, mid -sum_lag2_func y, 0, right, 7 -sum_lag2_func uv_444, 444, left -sum_lag2_func uv_444, 444, mid -sum_lag2_func uv_444, 444, right, 7 -sum_lag2_func uv_422, 422, left -sum_lag2_func uv_422, 422, mid -sum_lag2_func uv_422, 422, right, 1 -sum_lag2_func uv_420, 420, left -sum_lag2_func uv_420, 420, mid -sum_lag2_func uv_420, 420, right, 1 - - -function sum_lag3_left_above_neon - // A separate codepath for the left edge, to avoid reading outside - // of the edge of the buffer. 
- sub r12, r0, #3*GRAIN_WIDTH*2 - vld1.8 {q11, q12}, [r12] - vext.8 q12, q11, q12, #10 - vext.8 q11, q11, q11, #10 - b sum_lag3_above_start -endfunc - -function sum_lag3_above_neon - movw r12, #(3*GRAIN_WIDTH + 3)*2 - sub r12, r0, r12 - vld1.8 {q11, q12}, [r12] - -sum_lag3_above_start: - vdup.8 d12, d26[0] - vext.8 q1, q11, q12, #2 - vdup.8 d14, d26[1] - vext.8 q4, q11, q12, #4 - vdup.8 d16, d26[2] - vext.8 q5, q11, q12, #6 - vdup.8 d18, d26[3] - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - vmovl.s8 q8, d16 - vmovl.s8 q9, d18 - - movw r12, #(2*GRAIN_WIDTH + 3)*2 - sub r12, r0, r12 - - vmull.s16 q2, d22, d12 - vmlal.s16 q2, d2, d14 - vmlal.s16 q2, d8, d16 - vmlal.s16 q2, d10, d18 - vmull.s16 q3, d23, d12 - vmlal.s16 q3, d3, d14 - vmlal.s16 q3, d9, d16 - vmlal.s16 q3, d11, d18 - - vdup.8 d12, d26[4] - vext.8 q0, q11, q12, #8 - vdup.8 d14, d26[5] - vext.8 q1, q11, q12, #10 - vdup.8 d16, d26[6] - vext.8 q4, q11, q12, #12 - vld1.8 {q11, q12}, [r12] - vdup.8 d18, d26[7] - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - vmovl.s8 q8, d16 - vmovl.s8 q9, d18 - - vmlal.s16 q2, d0, d12 - vmlal.s16 q2, d2, d14 - vmlal.s16 q2, d8, d16 - vmlal.s16 q2, d22, d18 - vmlal.s16 q3, d1, d12 - vmlal.s16 q3, d3, d14 - vmlal.s16 q3, d9, d16 - vmlal.s16 q3, d23, d18 - - vdup.8 d12, d27[0] - vext.8 q0, q11, q12, #2 - vdup.8 d14, d27[1] - vext.8 q1, q11, q12, #4 - vdup.8 d16, d27[2] - vext.8 q4, q11, q12, #6 - vdup.8 d18, d27[3] - vext.8 q5, q11, q12, #8 - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - vmovl.s8 q8, d16 - vmovl.s8 q9, d18 - - sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 - - vmlal.s16 q2, d0, d12 - vmlal.s16 q2, d2, d14 - vmlal.s16 q2, d8, d16 - vmlal.s16 q2, d10, d18 - vmlal.s16 q3, d1, d12 - vmlal.s16 q3, d3, d14 - vmlal.s16 q3, d9, d16 - vmlal.s16 q3, d11, d18 - - vdup.8 d12, d27[4] - vext.8 q0, q11, q12, #10 - vdup.8 d14, d27[5] - vext.8 q1, q11, q12, #12 - vld1.8 {q11, q12}, [r12] - vdup.8 d16, d27[6] - vdup.8 d18, d27[7] - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - vext.8 q5, q11, q12, #2 - vmovl.s8 q8, d16 - vmovl.s8 q9, d18 - - vmlal.s16 q2, d0, d12 - vmlal.s16 q2, d2, d14 - vmlal.s16 q2, d22, d16 - vmlal.s16 q2, d10, d18 - vmlal.s16 q3, d1, d12 - vmlal.s16 q3, d3, d14 - vmlal.s16 q3, d23, d16 - vmlal.s16 q3, d11, d18 - - vdup.8 d12, d28[0] - vext.8 q0, q11, q12, #4 - vdup.8 d14, d28[1] - vext.8 q1, q11, q12, #6 - vdup.8 d16, d28[2] - vext.8 q4, q11, q12, #8 - vdup.8 d18, d28[3] - vext.8 q5, q11, q12, #10 - vmovl.s8 q6, d12 - vmovl.s8 q7, d14 - vmovl.s8 q8, d16 - vmovl.s8 q9, d18 - - vmlal.s16 q2, d0, d12 - vmlal.s16 q2, d2, d14 - vmlal.s16 q2, d8, d16 - vmlal.s16 q2, d10, d18 - vmlal.s16 q3, d1, d12 - vmlal.s16 q3, d3, d14 - vmlal.s16 q3, d9, d16 - vmlal.s16 q3, d11, d18 - - vdup.8 d12, d28[4] - vext.8 q0, q11, q12, #12 - vmovl.s8 q6, d12 - - vmlal.s16 q2, d0, d12 - vmlal.s16 q3, d1, d12 - - bx lr -endfunc - -.macro sum_lag3_func type, uv_layout, edge, elems=8 -function sum_\type\()_lag3_\edge\()_neon - push {r1, lr} - sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0] -endfunc -.endm - -sum_lag3_func y, 0, left -sum_lag3_func y, 0, mid -sum_lag3_func y, 0, right, 7 -sum_lag3_func uv_444, 444, left -sum_lag3_func uv_444, 444, mid -sum_lag3_func uv_444, 444, right, 7 -sum_lag3_func uv_422, 422, left -sum_lag3_func uv_422, 422, mid -sum_lag3_func uv_422, 422, right, 1 -sum_lag3_func uv_420, 420, left -sum_lag3_func uv_420, 420, mid -sum_lag3_func uv_420, 420, right, 1 - -function generate_grain_rows_neon - push {r10-r11,lr} -1: - mov r10, #80 -2: - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - subs r10, r10, #8 - 
vst1.16 {q0}, [r0]! - bgt 2b - get_grain_2 d0 - subs r1, r1, #1 - vst1.32 {d0[0]}, [r0]! - bgt 1b - pop {r10-r11,pc} -endfunc - -function generate_grain_rows_44_neon - push {r10-r11,lr} -1: - mov r10, #40 -2: - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - subs r10, r10, #8 - vst1.16 {q0}, [r0]! - bgt 2b - get_grain_4 d0 - subs r1, r1, #1 - vst1.16 {d0}, [r0] - add r0, r0, #GRAIN_WIDTH*2-80 - bgt 1b - pop {r10-r11,pc} -endfunc - -function gen_grain_uv_444_lag0_neon - vld1.16 {q3}, [r11]! -gen_grain_uv_lag0_8_start: - push {r11,lr} - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 -gen_grain_uv_lag0_8_add: - vand q3, q3, q1 - vmull.s16 q2, d6, d22 - vmull.s16 q3, d7, d22 - vrshl.s32 q2, q2, q12 - vrshl.s32 q3, q3, q12 - vqmovn.s32 d4, q2 - vqmovn.s32 d5, q3 - vqadd.s16 q2, q2, q0 - vmin.s16 q2, q2, q9 - vmax.s16 q2, q2, q10 - vst1.16 {q2}, [r0]! - pop {r11,pc} -endfunc - -function gen_grain_uv_420_lag0_8_neon - add r12, r11, #GRAIN_WIDTH*2 - vld1.16 {q2,q3}, [r11]! - vld1.16 {q4,q5}, [r12] - vpadd.i16 d4, d4, d5 - vpadd.i16 d5, d6, d7 - vpadd.i16 d8, d8, d9 - vpadd.i16 d9, d10, d11 - vadd.i16 q2, q2, q4 - vrshr.s16 q3, q2, #2 - b gen_grain_uv_lag0_8_start -endfunc - -function gen_grain_uv_422_lag0_8_neon - vld1.16 {q2,q3}, [r11]! - vpadd.i16 d4, d4, d5 - vpadd.i16 d5, d6, d7 - vrshr.s16 q3, q2, #1 - b gen_grain_uv_lag0_8_start -endfunc - -function gen_grain_uv_420_lag0_4_neon - add r12, r11, #GRAIN_WIDTH*2 - vld1.16 {q2}, [r11] - vld1.16 {q0}, [r12] - add r11, r11, #32 - vpadd.i16 d4, d4, d5 - vpadd.i16 d0, d0, d1 - vadd.i16 d4, d4, d0 - vrshr.s16 d6, d4, #2 - push {r11,lr} - get_grain_4 d0 - b gen_grain_uv_lag0_8_add -endfunc - -function gen_grain_uv_422_lag0_4_neon - vld1.16 {q2}, [r11] - add r11, r11, #32 - vpadd.i16 d4, d4, d5 - vrshr.s16 d6, d4, #1 - push {r11,lr} - get_grain_4 d0 - b gen_grain_uv_lag0_8_add -endfunc - -.macro gen_grain_82 type -function generate_grain_\type\()_16bpc_neon, export=1 - push {r4-r11,lr} - -.ifc \type, uv_444 - ldr r4, [sp, #36] - mov r12, r3 - mov lr, #28 - add r11, r1, #3*GRAIN_WIDTH*2 - mov r1, r2 - mul r12, r12, lr - clz lr, r4 -.else - clz lr, r2 -.endif - movrel r3, X(gaussian_sequence) - sub lr, lr, #24 // -bitdepth_min_8 - ldr r2, [r1, #FGD_SEED] - ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] -.ifc \type, y - add r4, r1, #FGD_AR_COEFFS_Y -.else - add r4, r1, #FGD_AR_COEFFS_UV -.endif - add r9, r9, lr // grain_scale_shift - bitdepth_min_8 - adr r5, L(gen_grain_\type\()_tbl) - ldr r6, [r1, #FGD_AR_COEFF_LAG] - add r9, r9, #4 - ldr r6, [r5, r6, lsl #2] - vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift - add r5, r5, r6 - vneg.s16 q15, q15 - -.ifc \type, uv_444 - push {lr} - cmp r12, #0 - movw r10, #0x49d8 - movw lr, #0xb524 - // Intentionally using a separate register instead of moveq with an - // immediate constant, to avoid armv8 deprecated it instruction forms. 
- it eq - moveq r10, lr - add r4, r4, r12 // Add offset to ar_coeffs_uv[1] - eor r2, r2, r10 - pop {lr} -.endif - - ldr r7, [r1, #FGD_AR_COEFF_SHIFT] - neg lr, lr // bitdepth_min_8 - mov r8, #1 - mov r10, #1 - lsl r8, r8, r7 // 1 << ar_coeff_shift - lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) - lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) - lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) - - bx r5 - - .align 2 -L(gen_grain_\type\()_tbl): - .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - -L(generate_grain_\type\()_lag0): -.ifc \type, y - mov r1, #GRAIN_HEIGHT - bl generate_grain_rows_neon -.else - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - mvn r6, r5 // grain_min = ~grain_max - - mov r1, #3 - bl generate_grain_rows_neon - mov r1, #GRAIN_HEIGHT-3 - - vdup.32 q12, r7 - vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] - vmov.i8 q0, #0 - vmov.i8 q1, #255 - vdup.16 q9, r5 - vdup.16 q10, r6 - vext.8 q13, q0, q1, #10 - vext.8 q14, q1, q0, #2 - vneg.s32 q12, q12 - vmovl.s8 q11, d22 - -1: - vmov q1, q13 - bl gen_grain_uv_444_lag0_neon // 8 - vmov.i8 q1, #255 - bl gen_grain_uv_444_lag0_neon // 16 - bl gen_grain_uv_444_lag0_neon // 24 - bl gen_grain_uv_444_lag0_neon // 32 - bl gen_grain_uv_444_lag0_neon // 40 - bl gen_grain_uv_444_lag0_neon // 48 - bl gen_grain_uv_444_lag0_neon // 56 - bl gen_grain_uv_444_lag0_neon // 64 - bl gen_grain_uv_444_lag0_neon // 72 - vmov q1, q14 - bl gen_grain_uv_444_lag0_neon // 80 - get_grain_2 d16 - subs r1, r1, #1 - add r11, r11, #4 - vst1.32 {d16[0]}, [r0]! - bgt 1b -.endif - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag1): - vpush {q4-q7} - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] - vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1] - vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] -.ifc \type, y - ldrsb r4, [r4, #1] // ar_coeffs_y[3] -.else - add r4, r4, #2 -.endif - - mov r1, #3 -.ifc \type, uv_444 - vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] - ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] -.endif - bl generate_grain_rows_neon - vmovl.s8 q13, d27 - vmovl.s8 q12, d29 - vmovl.s8 q14, d28 - vmov d29, d24 -.ifc \type, uv_444 - vmovl.s8 q6, d13 -.endif - - mov r1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag1_left_neon // 8 - bl sum_\type\()_lag1_mid_neon // 16 - bl sum_\type\()_lag1_mid_neon // 24 - bl sum_\type\()_lag1_mid_neon // 32 - bl sum_\type\()_lag1_mid_neon // 40 - bl sum_\type\()_lag1_mid_neon // 48 - bl sum_\type\()_lag1_mid_neon // 56 - bl sum_\type\()_lag1_mid_neon // 64 - bl sum_\type\()_lag1_mid_neon // 72 - bl sum_\type\()_lag1_right_neon // 80 - get_grain_2 d16 - subs r1, r1, #1 -.ifc \type, uv_444 - add r11, r11, #4 -.endif - vst1.32 {d16[0]}, [r0]! 
- bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag2): - vpush {q4-q7} - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] - - vmov.s8 r4, d29[2] - vmov.s8 r10, d29[3] - - mov r1, #3 - bl generate_grain_rows_neon - - mov r1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag2_left_neon // 8 - bl sum_\type\()_lag2_mid_neon // 16 - bl sum_\type\()_lag2_mid_neon // 24 - bl sum_\type\()_lag2_mid_neon // 32 - bl sum_\type\()_lag2_mid_neon // 40 - bl sum_\type\()_lag2_mid_neon // 48 - bl sum_\type\()_lag2_mid_neon // 56 - bl sum_\type\()_lag2_mid_neon // 64 - bl sum_\type\()_lag2_mid_neon // 72 - bl sum_\type\()_lag2_right_neon // 80 - get_grain_2 d16 - subs r1, r1, #1 -.ifc \type, uv_444 - add r11, r11, #4 -.endif - vst1.32 {d16[0]}, [r0]! - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag3): - vpush {q4-q7} - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] - - vmov.u8 r4, d28[5] - vmov.u8 r10, d28[6] - vmov.u8 r12, d28[7] - - orr r4, r4, r10, lsl #8 - orr r4, r4, r12, lsl #16 - - mov r1, #3 - vpush {d26} - bl generate_grain_rows_neon - vpop {d26} - - mov r1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag3_left_neon // 8 - bl sum_\type\()_lag3_mid_neon // 16 - bl sum_\type\()_lag3_mid_neon // 24 - bl sum_\type\()_lag3_mid_neon // 32 - bl sum_\type\()_lag3_mid_neon // 40 - bl sum_\type\()_lag3_mid_neon // 48 - bl sum_\type\()_lag3_mid_neon // 56 - bl sum_\type\()_lag3_mid_neon // 64 - bl sum_\type\()_lag3_mid_neon // 72 - bl sum_\type\()_lag3_right_neon // 80 - get_grain_2 d16 - subs r1, r1, #1 -.ifc \type, uv_444 - add r11, r11, #4 -.endif - vst1.32 {d16[0]}, [r0]! - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} -endfunc -.endm - -gen_grain_82 y -gen_grain_82 uv_444 - -.macro set_height dst, type -.ifc \type, uv_420 - mov \dst, #SUB_GRAIN_HEIGHT-3 -.else - mov \dst, #GRAIN_HEIGHT-3 -.endif -.endm - -.macro increment_y_ptr reg, type -.ifc \type, uv_420 - add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) -.else - sub \reg, \reg, #6*32-GRAIN_WIDTH*2 -.endif -.endm - -.macro gen_grain_44 type -function generate_grain_\type\()_16bpc_neon, export=1 - push {r4-r11,lr} - - ldr r4, [sp, #36] - mov r12, r3 - movw r11, #(3*GRAIN_WIDTH-3)*2 - mov lr, #28 - add r11, r1, r11 - mov r1, r2 - mul r12, r12, lr - clz lr, r4 - - movrel r3, X(gaussian_sequence) - sub lr, lr, #24 // -bitdepth_min_8 - ldr r2, [r1, #FGD_SEED] - ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] - add r4, r1, #FGD_AR_COEFFS_UV - add r9, r9, lr // grain_scale_shift - bitdepth_min_8 - adr r5, L(gen_grain_\type\()_tbl) - ldr r6, [r1, #FGD_AR_COEFF_LAG] - add r9, r9, #4 - ldr r6, [r5, r6, lsl #2] - vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift - add r5, r5, r6 - vneg.s16 q15, q15 - - push {lr} - cmp r12, #0 - movw r10, #0x49d8 - movw lr, #0xb524 - // Intentionally using a separate register instead of moveq with an - // immediate constant, to avoid armv8 deprecated it instruction forms. 
- it eq - moveq r10, lr - add r4, r4, r12 // Add offset to ar_coeffs_uv[1] - eor r2, r2, r10 - pop {lr} - - ldr r7, [r1, #FGD_AR_COEFF_SHIFT] - neg lr, lr - mov r8, #1 - mov r10, #1 - lsl r8, r8, r7 // 1 << ar_coeff_shift - lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) - lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) - lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) - bx r5 - - .align 2 -L(gen_grain_\type\()_tbl): - .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - -L(generate_grain_\type\()_lag0): -.ifc \type, uv_420 - vpush {q4-q5} -.endif - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - mvn r6, r5 // grain_min = ~grain_max - - mov r1, #3 - bl generate_grain_rows_44_neon - set_height r1, \type - - vdup.32 q12, r7 - vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] - vmov.i8 q0, #0 - vmov.i8 q1, #255 - vdup.16 q9, r5 - vdup.16 q10, r6 - vext.8 q13, q0, q1, #10 - vext.8 q14, q1, q0, #14 - vneg.s32 q12, q12 - vmovl.s8 q11, d22 - -1: - vmov q1, q13 - bl gen_grain_\type\()_lag0_8_neon // 8 - vmov.i8 q1, #255 - bl gen_grain_\type\()_lag0_8_neon // 16 - bl gen_grain_\type\()_lag0_8_neon // 24 - bl gen_grain_\type\()_lag0_8_neon // 32 - bl gen_grain_\type\()_lag0_8_neon // 40 - vmov q1, q14 - bl gen_grain_\type\()_lag0_4_neon // 44 - subs r1, r1, #1 - increment_y_ptr r11, \type - add r0, r0, #GRAIN_WIDTH*2-6*16 - bgt 1b - -.ifc \type, uv_420 - vpop {q4-q5} -.endif - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag1): - vpush {q4-q7} - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] - vld1.8 {d28[]}, [r4]! 
// ar_coeffs_uv[1] - vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] - add r4, r4, #2 - - mov r1, #3 - vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] - ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] - bl generate_grain_rows_44_neon - vmovl.s8 q13, d27 - vmovl.s8 q12, d29 - vmovl.s8 q14, d28 - vmov d29, d24 - vmovl.s8 q6, d13 - - set_height r1, \type -1: - bl sum_\type\()_lag1_left_neon // 8 - bl sum_\type\()_lag1_mid_neon // 16 - bl sum_\type\()_lag1_mid_neon // 24 - bl sum_\type\()_lag1_mid_neon // 32 - bl sum_\type\()_lag1_mid_neon // 40 - bl sum_\type\()_lag1_right_neon // 44 - subs r1, r1, #1 - increment_y_ptr r11, \type - add r0, r0, #GRAIN_WIDTH*2-6*16 - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag2): - vpush {q4-q7} - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] - - vmov.s8 r4, d29[2] - vmov.s8 r10, d29[3] - - mov r1, #3 - bl generate_grain_rows_44_neon - - set_height r1, \type -1: - bl sum_\type\()_lag2_left_neon // 8 - bl sum_\type\()_lag2_mid_neon // 16 - bl sum_\type\()_lag2_mid_neon // 24 - bl sum_\type\()_lag2_mid_neon // 32 - bl sum_\type\()_lag2_mid_neon // 40 - bl sum_\type\()_lag2_right_neon // 44 - subs r1, r1, #1 - increment_y_ptr r11, \type - add r0, r0, #GRAIN_WIDTH*2-6*16 - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag3): - vpush {q4-q7} - mov r5, #128 - lsl r5, r5, lr // 128 << bitdepth_min_8 - sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 - vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] - - vmov.u8 r4, d28[5] - vmov.u8 r10, d28[6] - vmov.u8 r12, d28[7] - - orr r4, r4, r10, lsl #8 - orr r4, r4, r12, lsl #16 - - mov r1, #3 - bl generate_grain_rows_44_neon - - set_height r1, \type -1: - bl sum_\type\()_lag3_left_neon // 8 - bl sum_\type\()_lag3_mid_neon // 16 - bl sum_\type\()_lag3_mid_neon // 24 - bl sum_\type\()_lag3_mid_neon // 32 - bl sum_\type\()_lag3_mid_neon // 40 - bl sum_\type\()_lag3_right_neon // 44 - subs r1, r1, #1 - increment_y_ptr r11, \type - add r0, r0, #GRAIN_WIDTH*2-6*16 - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} -endfunc -.endm - -gen_grain_44 uv_420 -gen_grain_44 uv_422 - -.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off - vmov.u16 r11, \src1[0+\off] - vmov.u16 r12, \src3[0+\off] - add r11, r11, r3 - vmov.u16 lr, \src1[2+\off] - add r12, r12, r3 - vld1.8 {\dst1[0+\off]}, [r11] - vmov.u16 r11, \src3[2+\off] - add lr, lr, r3 - vld1.8 {\dst2[0+\off]}, [r12] - vmov.u16 r12, \src2[0+\off] - add r11, r11, r3 - vld1.8 {\dst1[2+\off]}, [lr] - vmov.u16 lr, \src4[0+\off] - add r12, r12, r3 - vld1.8 {\dst2[2+\off]}, [r11] - vmov.u16 r11, \src2[2+\off] - add lr, lr, r3 - vld1.8 {\dst1[4+\off]}, [r12] - vmov.u16 r12, \src4[2+\off] - add r11, r11, r3 - vld1.8 {\dst2[4+\off]}, [lr] - add r12, r12, r3 - vld1.8 {\dst1[6+\off]}, [r11] - vld1.8 {\dst2[6+\off]}, [r12] -.endm - -.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 - gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 - gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 - gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 - gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 -.endm - -function gather32_neon - push {r11-r12,lr} - gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7 - pop {r11-r12,pc} -endfunc - -function gather16_neon - push {r11-r12,lr} - gather_interleaved d8, d9, d0, d1, d2, d3, 0 - gather_interleaved d8, d9, d0, d1, d2, d3, 1 - 
pop {r11-r12,pc} -endfunc - -const overlap_coeffs_0, align=4 - .short 27, 17, 0, 0 - .short 17, 27, 32, 32 -endconst - -const overlap_coeffs_1, align=4 - .short 23, 0, 0, 0 - .short 22, 32, 32, 32 -endconst - -.macro calc_offset offx, offy, src, sx, sy - and \offy, \src, #0xF // randval & 0xF - lsr \offx, \src, #4 // randval >> 4 -.if \sy == 0 - add \offy, \offy, \offy // 2 * (randval & 0xF) -.endif -.if \sx == 0 - add \offx, \offx, \offx // 2 * (randval >> 4) -.endif -.endm - -.macro add_offset dst, offx, offy, src, stride - mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy - add \dst, \dst, \offx, lsl #1 // grain_lut += offx -.endm - -// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const int scaling_shift, -// const entry grain_lut[][GRAIN_WIDTH], -// const int offsets[][2], -// const int h, const ptrdiff_t clip, -// const ptrdiff_t type, -// const int bitdepth_max); -function fgy_32x32_16bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut - ldrd r6, r7, [sp, #108] // offsets, h - ldr r8, [sp, #116] // clip - mov r9, #GRAIN_WIDTH*2 // grain_lut stride - ldr r10, [sp, #124] // bitdepth_max - - eor r4, r4, #15 // 15 - scaling_shift - vdup.16 q6, r10 // bitdepth_max - clz r10, r10 - vdup.16 q13, r4 // 15 - scaling_shift - rsb r10, r10, #24 // bitdepth_min_8 - cmp r8, #0 - vdup.16 q12, r10 // bitdepth_min_8 - - movrel_local r12, overlap_coeffs_0 - - beq 1f - // clip - vmov.i16 q14, #16 - vmov.i16 q15, #235 - vshl.s16 q14, q14, q12 - vshl.s16 q15, q15, q12 - b 2f -1: - // no clip - vmov.i16 q14, #0 - vmov q15, q6 -2: - vshr.u16 q6, q6, #1 // grain_max - - vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs - - add r5, r5, #18 // grain_lut += 9 - add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride - add r5, r5, r9 // grain_lut += grain_stride - - ldr r10, [r6, #8] // offsets[1][0] - calc_offset r10, r4, r10, 0, 0 - add_offset r4, r10, r4, r5, r9 - ldr r10, [r6, #4] // offsets[0][1] - calc_offset r10, r11, r10, 0, 0 - add_offset r11, r10, r11, r5, r9 - ldr r10, [r6, #12] // offsets[1][1] - calc_offset r10, r8, r10, 0, 0 - add_offset r8, r10, r8, r5, r9 - ldr r6, [r6] // offsets[0][0] - calc_offset r6, lr, r6, 0, 0 - add_offset r5, r6, lr, r5, r9 - - add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx - add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - - ldr r10, [sp, #120] // type - adr r11, L(fgy_loop_tbl) - - tst r10, #1 - ldr r10, [r11, r10, lsl #2] - - add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx - - add r11, r11, r10 - - beq 1f - // y overlap - vdup.16 d14, d24[0] - vdup.16 d15, d24[1] - mov r10, r7 // backup actual h - mov r7, #2 -1: - sub r2, r2, #32 // src_stride -= 32 - sub r9, r9, #32 // grain_stride -= 32 - bx r11 -endfunc - -function fgy_loop_neon -L(fgy_loop_tbl): - .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB - .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB - .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB - .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB - -.macro fgy ox, oy -L(loop_\ox\oy): -1: -.if \ox - vld1.16 {d0}, [r4], r9 // grain_lut old -.endif -.if \oy - vld1.16 {q2, q3}, [r6]! // grain_lut top -.endif -.if \ox && \oy - vld1.16 {d2}, [r8], r9 // grain_lut top old -.endif -.if \oy - vld1.16 {q4, q5}, [r6], r9 // grain_lut top -.endif -.if !\ox && !\oy - vld1.16 {q0, q1}, [r1, :128]! 
// src -.endif - vld1.16 {q8, q9}, [r5]! // grain_lut -.if !\ox && !\oy - vld1.16 {q2, q3}, [r1, :128], r2 // src -.endif -.if !\oy - vmvn.i16 q5, #0xf000 // 0x0fff -.endif - vld1.16 {q10, q11}, [r5], r9 // grain_lut - -.if \ox - add r4, r4, #32 - vmull.s16 q0, d0, d24 - vmlal.s16 q0, d16, d25 -.endif - -.if \oy -.if \ox - add r8, r8, #32 - vmull.s16 q1, d2, d24 - vmlal.s16 q1, d4, d25 - vqrshrn.s32 d16, q0, #5 - vmvn d0, d12 // grain_min - vqrshrn.s32 d4, q1, #5 - vmin.s16 d16, d16, d12 - vmin.s16 d4, d4, d12 - vmax.s16 d16, d16, d0 - vmax.s16 d4, d4, d0 -.endif - - vmull.s16 q0, d4, d14 - vmull.s16 q1, d5, d14 - vmull.s16 q2, d6, d14 - vmull.s16 q3, d7, d14 - vmlal.s16 q0, d16, d15 - vmlal.s16 q1, d17, d15 - vmlal.s16 q2, d18, d15 - vmlal.s16 q3, d19, d15 - vmull.s16 q8, d20, d15 - vmull.s16 q9, d21, d15 - vmull.s16 q10, d22, d15 - vmull.s16 q11, d23, d15 - vmlal.s16 q8, d8, d14 - vmlal.s16 q9, d9, d14 - vmlal.s16 q10, d10, d14 - vmlal.s16 q11, d11, d14 - vmvn q4, q6 // grain_min - vqrshrn.s32 d0, q0, #5 - vqrshrn.s32 d1, q1, #5 - vqrshrn.s32 d2, q2, #5 - vqrshrn.s32 d3, q3, #5 - vqrshrn.s32 d4, q8, #5 - vqrshrn.s32 d5, q9, #5 - vqrshrn.s32 d6, q10, #5 - vqrshrn.s32 d7, q11, #5 - vmin.s16 q8, q0, q6 - vmin.s16 q9, q1, q6 - vld1.16 {q0, q1}, [r1, :128]! // src - vmin.s16 q10, q2, q6 - vmin.s16 q11, q3, q6 - vmax.s16 q8, q8, q4 - vmax.s16 q9, q9, q4 - vld1.16 {q2, q3}, [r1, :128], r2 // src - vmvn.i16 q5, #0xf000 // 0x0fff - vmax.s16 q10, q10, q4 - vmax.s16 q11, q11, q4 -.elseif \ox - vmvn d4, d12 // grain_min - vqrshrn.s32 d16, q0, #5 - vld1.16 {q0, q1}, [r1, :128]! // src - vmin.s16 d16, d16, d12 - vmax.s16 d16, d16, d4 - vld1.16 {q2, q3}, [r1, :128], r2 // src -.endif - - // Make sure that uninitialized pixels out of range past the right - // edge are in range; their actual values shouldn't matter. - vand q0, q0, q5 - vand q1, q1, q5 - vand q2, q2, q5 - vand q3, q3, q5 - - bl gather32_neon - -.if \ox || \oy - vpush {q6-q7} -.endif - - vmovl.u8 q6, d8 // scaling - vmovl.u8 q7, d9 - vmovl.u8 q4, d10 - vmovl.u8 q5, d11 - - vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) - vshl.u16 q7, q7, q13 - vshl.u16 q4, q4, q13 - vshl.u16 q5, q5, q13 - - vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) - vqrdmulh.s16 q9, q9, q7 - vqrdmulh.s16 q10, q10, q4 - vqrdmulh.s16 q11, q11, q5 - -.if \ox || \oy - vpop {q6-q7} -.endif - - vqadd.s16 q0, q0, q8 // *src + noise - vqadd.s16 q1, q1, q9 - vqadd.s16 q2, q2, q10 - vqadd.s16 q3, q3, q11 - - vmax.s16 q0, q0, q14 - vmax.s16 q1, q1, q14 - vmax.s16 q2, q2, q14 - vmax.s16 q3, q3, q14 - vmin.s16 q0, q0, q15 - vmin.s16 q1, q1, q15 - vmin.s16 q2, q2, q15 - vmin.s16 q3, q3, q15 - - vst1.16 {q0, q1}, [r0, :128]! 
// dst - subs r7, r7, #1 -.if \oy - vdup.16 d14, d25[0] - vdup.16 d15, d25[1] -.endif - vst1.16 {q2, q3}, [r0, :128], r2 // dst - bgt 1b - -.if \oy - cmp r10, #2 - sub r7, r10, #2 // restore actual remaining h - bgt L(loop_\ox\()0) -.endif - vpop {q4-q7} - pop {r4-r11,pc} -.endm - - fgy 0, 0 - fgy 0, 1 - fgy 1, 0 - fgy 1, 1 -endfunc - -// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, -// const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const Dav1dFilmGrainData *const data, -// const entry grain_lut[][GRAIN_WIDTH], -// const pixel *const luma_row, -// const ptrdiff_t luma_stride, -// const int offsets[][2], -// const ptrdiff_t h, const ptrdiff_t uv, -// const ptrdiff_t is_id, -// const ptrdiff_t type, -// const int bitdepth_max); -.macro fguv layout, sx, sy -function fguv_32x32_\layout\()_16bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] // data, grain_lut - ldrd r10, r11, [sp, #124] // uv, is_id - ldr r6, [sp, #136] // bitdepth_max - - clz r7, r6 - rsb r7, r7, #24 // bitdepth_min_8 - - // !csfl - add r10, r4, r10, lsl #2 // + 4*uv - add r12, r10, #FGD_UV_LUMA_MULT - add lr, r10, #FGD_UV_MULT - ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset - vld1.16 {d30[]}, [r12] // uv_luma_mult - lsl r10, r10, r7 // uv_offset << bitdepth_min_8 - vld1.16 {d30[1]}, [lr] // uv_mult - - ldr lr, [r4, #FGD_SCALING_SHIFT] - ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] - eor lr, lr, #15 // 15 - scaling_shift - - vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8 - - cmp r12, #0 - vdup.16 q13, lr // 15 - scaling_shift - - beq 1f - // clip - cmp r11, #0 - mov r8, #16 - mov r9, #240 - lsl r8, r8, r7 - lsl r9, r9, r7 - beq 2f - // is_id - mov r9, #235 - lsl r9, r9, r7 - b 2f -1: - // no clip - mov r8, #0 - mov r9, r6 // bitdepth_max -2: - vmov.16 d30[3], r6 // bitdepth_max - vdup.16 d31, r8 // clip_min - - mov r10, #GRAIN_WIDTH*2 // grain_lut stride - -.if \sy - mov r6, #23 - mov r7, #22 -.else - mov r6, #27 - mov r7, #17 -.endif - vmov.16 d31[1], r9 // clip_max - - ldrd r8, r9, [sp, #116] // offsets, h - - add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 -.if \sy - add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride - add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride -.else - add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride - add r5, r5, r10 // grain_lut += grain_stride -.endif - vmov.16 d31[2], r6 // overlap y [0] - - ldr r12, [r8, #8] // offsets[1][0] - calc_offset r12, r4, r12, \sx, \sy - add_offset r4, r12, r4, r5, r10 - - ldr r12, [r8, #4] // offsets[0][1] - calc_offset r12, lr, r12, \sx, \sy - add_offset lr, r12, lr, r5, r10 - - ldr r12, [r8, #12] // offsets[1][1] - calc_offset r12, r11, r12, \sx, \sy - add_offset r11, r12, r11, r5, r10 - - ldr r8, [r8] // offsets[0][0] - calc_offset r8, r12, r8, \sx, \sy - add_offset r5, r8, r12, r5, r10 - - vmov.16 d31[3], r7 // overlap y [1] - - add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - - movrel_local r12, overlap_coeffs_\sx - ldr lr, [sp, #132] // type - ldrd r6, r7, [sp, #108] // luma_row, luma_stride - - vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs - - movrel_local r12, L(fguv_loop_sx\sx\()_tbl) -#if CONFIG_THUMB - // This uses movrel_local instead of adr above, because the target - // can be out of range for 
adr. But movrel_local leaves the thumb bit - // set on COFF (but probably wouldn't if building for thumb on ELF), - // thus try to clear the bit for robustness. - bic r12, r12, #1 -#endif - - tst lr, #1 - ldr lr, [r12, lr, lsl #2] - - add r12, r12, lr - - beq 1f - // y overlap - sub lr, r9, #(2 >> \sy) // backup remaining h - mov r9, #(2 >> \sy) - -1: -.if \sy - add r7, r7, r7 // luma_stride *= 2 -.endif - sub r7, r7, #32 // luma_stride -= 32 - - bx r12 -endfunc -.endm - -fguv 420, 1, 1 -fguv 422, 1, 0 -fguv 444, 0, 0 - -function fguv_loop_sx0_neon -L(fguv_loop_sx0_tbl): - .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - -.macro fguv_loop_sx0 csfl, ox, oy -L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): - sub r2, r2, #32 // src_stride -= 32 - sub r10, r10, #32 // grain_stride -= 32 -.if \oy - mov r12, lr -.endif -L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): -1: -.if \ox - vld1.16 {d0}, [r4], r10 // grain_lut old -.endif -.if \oy - vld1.16 {q2, q3}, [r8]! // grain_lut top -.endif -.if \ox && \oy - vld1.16 {d2}, [r11], r10 // grain_lut top old -.endif -.if !\ox && !\oy - vld1.16 {q0, q1}, [r6, :128]! // luma -.endif - vld1.16 {q8, q9}, [r5]! // grain_lut -.if \oy - vld1.16 {q4, q5}, [r8], r10 // grain_lut top -.endif -.if !\ox && !\oy - vld1.16 {q2, q3}, [r6, :128], r7 // luma -.endif -.if \oy - vdup.16 d28, d31[2] // overlap y coeff - vdup.16 d29, d31[3] // overlap y coeff -.endif - vld1.16 {q10, q11}, [r5], r10 // grain_lut - -.if \ox - vdup.16 q7, d30[3] // bitdepth_max - add r4, r4, #32 - vmull.s16 q0, d0, d24 - vshr.u16 q7, q7, #1 // grain_max - vmlal.s16 q0, d16, d25 - vmvn q6, q7 // grain_min -.endif - -.if \oy -.if \ox - add r11, r11, #32 - vmull.s16 q1, d2, d24 - vmlal.s16 q1, d4, d25 - vqrshrn.s32 d16, q0, #5 - vqrshrn.s32 d4, q1, #5 - vmin.s16 d4, d4, d14 - vmin.s16 d16, d16, d14 - vmax.s16 d4, d4, d12 - vmax.s16 d16, d16, d12 -.endif - - vmull.s16 q0, d4, d28 - vmull.s16 q1, d5, d28 - vmull.s16 q2, d6, d28 - vmull.s16 q3, d7, d28 -.if !\ox - vdup.16 q7, d30[3] // bitdepth_max -.endif - vmlal.s16 q0, d16, d29 - vmlal.s16 q1, d17, d29 - vmlal.s16 q2, d18, d29 - vmlal.s16 q3, d19, d29 -.if !\ox - vshr.u16 q7, q7, #1 // grain_max -.endif - vmull.s16 q8, d20, d29 - vmull.s16 q9, d21, d29 - vmull.s16 q10, d22, d29 - vmull.s16 q11, d23, d29 -.if !\ox - vmvn q6, q7 // grain_min -.endif - vmlal.s16 q8, d8, d28 - vmlal.s16 q9, d9, d28 - vmlal.s16 q10, d10, d28 - vmlal.s16 q11, d11, d28 - vqrshrn.s32 d0, q0, #5 - vqrshrn.s32 d1, q1, #5 - vqrshrn.s32 d2, q2, #5 - vqrshrn.s32 d3, q3, #5 - vqrshrn.s32 d4, q8, #5 - vqrshrn.s32 d5, q9, #5 - vqrshrn.s32 d6, q10, #5 - vqrshrn.s32 d7, q11, #5 - vmin.s16 q8, q0, q7 - vmin.s16 q9, q1, q7 - vld1.16 {q0, q1}, [r6, :128]! // luma - vmin.s16 q10, q2, q7 - vmin.s16 q11, q3, q7 - vmax.s16 q8, q8, q6 - vmax.s16 q9, q9, q6 - vld1.16 {q2, q3}, [r6, :128], r7 // luma - vmax.s16 q10, q10, q6 - vmax.s16 q11, q11, q6 -.elseif \ox - vqrshrn.s32 d16, q0, #5 - vld1.16 {q0, q1}, [r6, :128]! 
// luma - vmin.s16 d16, d16, d14 - vld1.16 {q2, q3}, [r6, :128], r7 // luma - vmax.s16 d16, d16, d12 -.endif - -.if !\csfl - vdup.16 d28, d30[0] // uv_luma_mult - vld1.16 {q4, q5}, [r1, :128]! // src - vdup.16 d29, d30[1] // uv_mult - vmull.s16 q6, d0, d28 - vmull.s16 q7, d1, d28 - vmull.s16 q0, d2, d28 - vmull.s16 q1, d3, d28 - vmlal.s16 q6, d8, d29 - vmlal.s16 q7, d9, d29 - vmlal.s16 q0, d10, d29 - vmlal.s16 q1, d11, d29 - vld1.16 {q4, q5}, [r1, :128] // src - sub r1, r1, #32 - vshrn.s32 d12, q6, #6 - vshrn.s32 d13, q7, #6 - vshrn.s32 d14, q0, #6 - vshrn.s32 d15, q1, #6 - vmull.s16 q0, d4, d28 - vmull.s16 q1, d5, d28 - vmull.s16 q2, d6, d28 - vmull.s16 q3, d7, d28 - vmlal.s16 q0, d8, d29 - vmlal.s16 q1, d9, d29 - vmlal.s16 q2, d10, d29 - vmlal.s16 q3, d11, d29 - vdup.16 q14, d30[2] // uv_offset - vshrn.s32 d0, q0, #6 - vshrn.s32 d1, q1, #6 - vshrn.s32 d2, q2, #6 - vshrn.s32 d3, q3, #6 - vdup.16 q4, d30[3] // bitdepth_max - vmov.i16 q5, #0 - vadd.i16 q6, q6, q14 - vadd.i16 q7, q7, q14 - vadd.i16 q2, q0, q14 - vadd.i16 q3, q1, q14 - vmin.s16 q0, q6, q4 - vmin.s16 q1, q7, q4 - vmin.s16 q2, q2, q4 - vmin.s16 q3, q3, q4 - vmax.s16 q0, q0, q5 - vmax.s16 q1, q1, q5 - vmax.s16 q2, q2, q5 - vmax.s16 q3, q3, q5 -.else - vdup.16 q14, d30[3] // bitdepth_max - // Make sure that uninitialized pixels out of range past the right - // edge are in range; their actual values shouldn't matter. - vand q0, q0, q14 - vand q1, q1, q14 - vand q2, q2, q14 - vand q3, q3, q14 -.endif - - bl gather32_neon - - vld1.16 {q0, q1}, [r1, :128]! // src - - vmovl.u8 q6, d8 // scaling - vmovl.u8 q7, d9 - vmovl.u8 q4, d10 - vmovl.u8 q5, d11 - - vld1.16 {q2, q3}, [r1, :128], r2 // src - - vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) - vshl.u16 q7, q7, q13 - vshl.u16 q4, q4, q13 - vshl.u16 q5, q5, q13 - - vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) - vqrdmulh.s16 q9, q9, q7 - vqrdmulh.s16 q10, q10, q4 - vqrdmulh.s16 q11, q11, q5 - - - vdup.16 q4, d31[0] // clip_min - vdup.16 q5, d31[1] // clip_max - - vqadd.s16 q0, q0, q8 // *src + noise - vqadd.s16 q1, q1, q9 - vqadd.s16 q2, q2, q10 - vqadd.s16 q3, q3, q11 - -.if \oy - vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x -.endif - - vmax.s16 q0, q0, q4 - vmax.s16 q1, q1, q4 - vmax.s16 q2, q2, q4 - vmax.s16 q3, q3, q4 - vmin.s16 q0, q0, q5 - vmin.s16 q1, q1, q5 - vmin.s16 q2, q2, q5 - vmin.s16 q3, q3, q5 - - vst1.16 {q0, q1}, [r0, :128]! 
// dst - - subs r9, r9, #1 -.if \oy - vmov.32 d31[1], lr // new coeffs for overlap y -.endif - - vst1.16 {q2, q3}, [r0, :128], r2 // dst - bgt 1b - -.if \oy - cmp r12, #0 - mov r9, r12 // restore actual remaining h - bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) -.endif - b 9f -.endm - fguv_loop_sx0 0, 0, 0 - fguv_loop_sx0 0, 0, 1 - fguv_loop_sx0 0, 1, 0 - fguv_loop_sx0 0, 1, 1 - fguv_loop_sx0 1, 0, 0 - fguv_loop_sx0 1, 0, 1 - fguv_loop_sx0 1, 1, 0 - fguv_loop_sx0 1, 1, 1 - -9: - vpop {q4-q7} - pop {r4-r11,pc} -endfunc - -function fguv_loop_sx1_neon -L(fguv_loop_sx1_tbl): - .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - -.macro fguv_loop_sx1 csfl, ox, oy -L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): -.if \oy - mov r12, lr -.endif -1: -.if \ox - vld1.16 {d0}, [r4], r10 // grain_lut old -.endif -.if \ox && \oy - vld1.16 {d2}, [r11], r10 // grain_lut top old -.endif -.if \oy - vld1.16 {q2, q3}, [r8], r10 // grain_lut top -.endif -.if !\ox && !\oy - vld1.16 {q0, q1}, [r6, :128]! // luma -.endif - vld1.16 {q8, q9}, [r5], r10 // grain_lut -.if \oy - vdup.16 d28, d31[2] // overlap y coeff - vdup.16 d29, d31[3] // overlap y coeff -.endif -.if !\ox && !\oy - vld1.16 {q2, q3}, [r6, :128], r7 // luma -.endif - -.if \ox - vdup.16 q7, d30[3] // bitdepth_max - vmull.s16 q0, d0, d24 - vshr.u16 q7, q7, #1 // grain_max - vmlal.s16 q0, d16, d25 - vmvn q6, q7 // grain_min -.endif - -.if \oy -.if \ox - vmull.s16 q1, d2, d24 - vmlal.s16 q1, d4, d25 - vqrshrn.s32 d16, q0, #5 - vqrshrn.s32 d4, q1, #5 - vmin.s16 d4, d4, d14 - vmin.s16 d16, d16, d14 - vmax.s16 d4, d4, d12 - vmax.s16 d16, d16, d12 -.endif - - vmull.s16 q0, d4, d28 - vmull.s16 q1, d5, d28 - vmull.s16 q2, d6, d28 - vmull.s16 q3, d7, d28 -.if !\ox - vdup.16 q7, d30[3] // bitdepth_max -.endif - vmlal.s16 q0, d16, d29 - vmlal.s16 q1, d17, d29 - vmlal.s16 q2, d18, d29 - vmlal.s16 q3, d19, d29 -.if !\ox - vshr.u16 q7, q7, #1 // grain_max -.endif - vqrshrn.s32 d16, q0, #5 - vqrshrn.s32 d17, q1, #5 - vqrshrn.s32 d18, q2, #5 - vqrshrn.s32 d19, q3, #5 -.if !\ox - vmvn q6, q7 // grain_min -.endif - vld1.16 {q0, q1}, [r6, :128]! // luma - vmin.s16 q8, q8, q7 - vmin.s16 q9, q9, q7 - vmax.s16 q8, q8, q6 - vmax.s16 q9, q9, q6 - vld1.16 {q2, q3}, [r6, :128], r7 // luma -.elseif \ox - vqrshrn.s32 d16, q0, #5 - vld1.16 {q0, q1}, [r6, :128]! 
// luma - vmin.s16 d16, d16, d14 - vld1.16 {q2, q3}, [r6, :128], r7 // luma - vmax.s16 d16, d16, d12 -.endif - - vpadd.i16 d0, d0, d1 - vpadd.i16 d1, d2, d3 - vpadd.i16 d2, d4, d5 - vpadd.i16 d3, d6, d7 - vrshr.u16 q0, q0, #1 - vrshr.u16 q1, q1, #1 -.if !\csfl - vdup.16 d28, d30[0] // uv_luma_mult - vld1.16 {q2, q3}, [r1, :128], r2 // src - vdup.16 d29, d30[1] // uv_mult - vmull.s16 q6, d0, d28 - vmull.s16 q7, d1, d28 - vmull.s16 q0, d2, d28 - vmull.s16 q1, d3, d28 - vmlal.s16 q6, d4, d29 - vmlal.s16 q7, d5, d29 - vmlal.s16 q0, d6, d29 - vmlal.s16 q1, d7, d29 - vshrn.s32 d12, q6, #6 - vshrn.s32 d13, q7, #6 - vshrn.s32 d14, q0, #6 - vshrn.s32 d15, q1, #6 - vdup.16 q14, d30[2] // uv_offset - vdup.16 q4, d30[3] // bitdepth_max - vmov.i16 q5, #0 - vadd.i16 q6, q6, q14 - vadd.i16 q7, q7, q14 - vmin.s16 q0, q6, q4 - vmin.s16 q1, q7, q4 - vmax.s16 q0, q0, q5 - vmax.s16 q1, q1, q5 -.else - vdup.16 q14, d30[3] // bitdepth_max - vld1.16 {q2, q3}, [r1, :128], r2 // src - - // Make sure that uninitialized pixels out of range past the right - // edge are in range; their actual values shouldn't matter. - vand q0, q0, q14 - vand q1, q1, q14 -.endif - - bl gather16_neon - - vmovl.u8 q6, d8 // scaling - vmovl.u8 q7, d9 - - vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) - vshl.u16 q7, q7, q13 - - vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) - vqrdmulh.s16 q9, q9, q7 - - - vdup.16 q4, d31[0] // clip_min - vdup.16 q5, d31[1] // clip_max - - vqadd.s16 q0, q2, q8 // *src + noise - vqadd.s16 q1, q3, q9 - -.if \oy - // Swap the two last coefficients of d31, place them first in d28 - vrev64.16 d28, d31 -.endif - - vmax.s16 q0, q0, q4 - vmax.s16 q1, q1, q4 - vmin.s16 q0, q0, q5 - vmin.s16 q1, q1, q5 - - subs r9, r9, #1 -.if \oy - // Take the first two 16 bit coefficients of d28 and place them at the - // end of d31 - vtrn.32 d31, d28 -.endif - - vst1.16 {q0, q1}, [r0, :128], r2 // dst - bgt 1b - -.if \oy - cmp r12, #0 - mov r9, r12 // restore actual remaining h - bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) -.endif - - b 9f -.endm - fguv_loop_sx1 0, 0, 0 - fguv_loop_sx1 0, 0, 1 - fguv_loop_sx1 0, 1, 0 - fguv_loop_sx1 0, 1, 1 - fguv_loop_sx1 1, 0, 0 - fguv_loop_sx1 1, 0, 1 - fguv_loop_sx1 1, 1, 0 - fguv_loop_sx1 1, 1, 1 - -9: - vpop {q4-q7} - pop {r4-r11,pc} -endfunc diff -Nru dav1d-0.9.2/src/arm/32/filmgrain16.S dav1d-1.0.0/src/arm/32/filmgrain16.S --- dav1d-0.9.2/src/arm/32/filmgrain16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/arm/32/filmgrain16.S 2022-03-18 14:31:55.966356000 +0000 @@ -0,0 +1,2137 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr r11, r2, #3 + lsr r12, r2, #12 + lsr lr, r2, #1 + eor r11, r2, r11 // (r >> 0) ^ (r >> 3) + eor r12, r12, lr // (r >> 12) ^ (r >> 1) + eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr r2, r2, #\steps +.endif + and r11, r11, #((1 << \steps) - 1) // bit +.if \shift + orr r2, r2, r11, lsl #(16 - \steps) // *state +.else + orr r2, r2, r11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, r2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, r2, #17 - \bits, #\bits + lsr r2, r2, #1 +.endm + +// special calling convention: +// r2 holds seed +// r3 holds dav1d_gaussian_sequence +// clobbers r11-r12 +// returns in d0-d1 +function get_gaussian_neon + push {r5-r6,lr} + increment_seed 4 + read_rand r5, 11, 3 + read_rand r6, 11, 2 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d0[0]}, [r5] + read_rand r5, 11, 1 + vld1.16 {d0[1]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 0 + increment_seed 4 + add r6, r3, r6, lsl #1 + vld1.16 {d0[2]}, [r5] + read_rand r5, 11, 3 + vld1.16 {d0[3]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 2 + vld1.16 {d1[0]}, [r5] + add r6, r3, r6, lsl #1 + read_rand r5, 11, 1 + vld1.16 {d1[1]}, [r6] + read_rand r6, 11, 0 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d1[2]}, [r5] + vld1.16 {d1[3]}, [r6] + pop {r5-r6,pc} +endfunc + +function get_grain_2_neon + push {r11,lr} + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +function get_grain_4_neon + push {r11,lr} + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d0[1]}, [r12] + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[2]}, [r11] + vld1.16 {d0[3]}, [r12] + vrshl.s16 d0, d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +// r1 holds the number of entries to produce +// r6, r8 and r10 hold the previous output entries +// q0 holds the vector of produced entries +// q1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon + push {r0, lr} +.if \n == 1 + mvn lr, r5 // grain_min = ~grain_max +.else + mov r0, #1 + mov lr, #1 + sub r7, r7, #1 + sub r9, r9, #1 + lsl r0, r0, r7 + lsl lr, lr, r9 + add r7, r7, #1 + add r9, r9, #1 +.endif +1: + read_shift_rand r12, 11 + vmov.32 r11, d2[0] + 
lsl r12, r12, #1 + vext.8 q0, q0, q0, #2 + ldrsh r12, [r3, r12] +.if \n == 1 + mla r11, r6, r4, r11 // sum (above) + *coeff * prev output + add r6, r11, r8 // 1 << (ar_coeff_shift - 1) + add r12, r12, r10 + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + cmp r6, r5 +.elseif \n == 2 + mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 + mla r11, r6, r10, r11 // += *coeff * prev output 2 + mov r8, r6 + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.else + push {r1-r3} + sbfx r1, r4, #0, #8 + sbfx r2, r4, #8, #8 + sbfx r3, r4, #16, #8 + mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 + mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 + mla r11, r6, r3, r11 // += *coeff * prev output 3 + pop {r1-r3} + mov r10, r8 + mov r8, r6 + + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.endif + it gt + movgt r6, r5 + cmp r6, lr + it lt + movlt r6, lr +.if \n >= 2 + pop {lr} +.endif + subs r1, r1, #1 + vext.8 q1, q1, q1, #4 + vmov.16 d1[3], r6 + bgt 1b + pop {r0, pc} +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub r12, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + + vext.8 q0, q8, q9, #14 // top left, top mid + vext.8 q1, q9, q10, #2 // top left, top mid + + vmull.s16 q2, d18, d28 + vmlal.s16 q2, d0, d27 + vmlal.s16 q2, d2, d29 + vmull.s16 q3, d19, d28 + vmlal.s16 q3, d1, d27 + vmlal.s16 q3, d3, d29 + + vmov q8, q9 + vmov q9, q10 + + bx lr +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff +.ifc \lag\()_\edge, lag3_left + bl sum_lag3_left_above_neon +.else + bl sum_\lag\()_above_neon +.endif +.ifc \type, uv_420 + vpush {q6-q7} + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q0, q1}, [r11]! + vld1.16 {q6, q7}, [r12]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d12, d12, d13 + vpadd.i16 d13, d14, d15 + vadd.i16 q0, q0, q6 + vpop {q6-q7} + vrshr.s16 q0, q0, #2 +.endif +.ifc \type, uv_422 + vld1.16 {q0, q1}, [r11]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vrshr.s16 q0, q0, #1 +.endif +.ifc \type, uv_444 + vld1.16 {q0}, [r11]! 
+.endif +.if \uv_layout +.ifnb \uv_coeff + vdup.8 d13, \uv_coeff + vmovl.s8 q6, d13 +.endif + vmlal.s16 q2, d0, d13 + vmlal.s16 q3, d1, d13 +.endif +.if \uv_layout && \elems == 8 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: + push {r11} +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d1[1]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d1[2]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d1[3]}, [r11] + lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 + vrshl.s16 d1, d1, d30 + vext.8 q2, q2, q2, #12 +.ifc \lag, lag3 + vmov.s16 r10, d1[1] +.endif +.ifnc \lag, lag1 + vmov.s16 r8, d1[2] +.endif + vmov.s16 r6, d1[3] + + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + vmov q1, q3 +.ifc \edge, right + mov r1, #3 + bl output_\lag\()_neon + read_shift_rand r12, 11 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r12] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #2 +.else + mov r1, #4 + bl output_\lag\()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon + lsr r2, r2, #3 + + read_rand r11, 11, 2 + read_rand r12, 11, 1 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r11] + read_rand r11, 11, 0 + vld1.16 {d2[1]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d2[2]}, [r11] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #14 +.endif + vst1.16 {q0}, [r0]! 
+ pop {r11} + pop {r1, pc} +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag1_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #1*GRAIN_WIDTH*2 + vld1.8 {q9}, [r12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + push {lr} + sub r12, r0, #2*GRAIN_WIDTH*2 - 16 + sub lr, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + vld1.16 {q13}, [lr] + + vdup.8 d10, d28[0] + vext.8 q0, q8, q9, #12 // top left, top mid + vdup.8 d12, d28[1] + vext.8 q1, q8, q9, #14 + vdup.8 d14, d28[3] + vext.8 q4, q9, q10, #2 // top mid, top right + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmull.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmull.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d28[4] + vext.8 q0, q9, q10, #4 // top mid, top right + vdup.8 d12, d28[5] + vext.8 q1, q11, q12, #12 // top left, top mid + vdup.8 d14, d28[6] + vext.8 q4, q11, q12, #14 + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d29[0] + vext.8 q0, q12, q13, #2 // top mid, top right + vdup.8 d12, d29[1] + vext.8 q1, q12, q13, #4 + + vdup.8 d14, d28[2] + vdup.8 d8, d28[7] + + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q4, d8 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d18, d14 + vmlal.s16 q2, d24, d8 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d19, d14 + vmlal.s16 q3, d25, d8 + + vmov q8, q9 + vmov q9, q10 + + vmov q11, q12 + vmov q12, q13 + + pop {pc} +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag2_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #2*GRAIN_WIDTH*2 + sub lr, r0, #1*GRAIN_WIDTH*2 + vld1.16 {q9}, [r12] // load the previous block right above + vld1.16 {q12}, [lr] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_left_above_neon + // A separate codepath for the left edge, to avoid reading outside + // of the edge of the buffer. 
+ sub r12, r0, #3*GRAIN_WIDTH*2 + vld1.8 {q11, q12}, [r12] + vext.8 q12, q11, q12, #10 + vext.8 q11, q11, q11, #10 + b sum_lag3_above_start +endfunc + +function sum_lag3_above_neon + movw r12, #(3*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + vld1.8 {q11, q12}, [r12] + +sum_lag3_above_start: + vdup.8 d12, d26[0] + vext.8 q1, q11, q12, #2 + vdup.8 d14, d26[1] + vext.8 q4, q11, q12, #4 + vdup.8 d16, d26[2] + vext.8 q5, q11, q12, #6 + vdup.8 d18, d26[3] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + movw r12, #(2*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + + vmull.s16 q2, d22, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmull.s16 q3, d23, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d26[4] + vext.8 q0, q11, q12, #8 + vdup.8 d14, d26[5] + vext.8 q1, q11, q12, #10 + vdup.8 d16, d26[6] + vext.8 q4, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d18, d26[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d22, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d23, d18 + + vdup.8 d12, d27[0] + vext.8 q0, q11, q12, #2 + vdup.8 d14, d27[1] + vext.8 q1, q11, q12, #4 + vdup.8 d16, d27[2] + vext.8 q4, q11, q12, #6 + vdup.8 d18, d27[3] + vext.8 q5, q11, q12, #8 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d27[4] + vext.8 q0, q11, q12, #10 + vdup.8 d14, d27[5] + vext.8 q1, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d16, d27[6] + vdup.8 d18, d27[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vext.8 q5, q11, q12, #2 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d22, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d23, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[0] + vext.8 q0, q11, q12, #4 + vdup.8 d14, d28[1] + vext.8 q1, q11, q12, #6 + vdup.8 d16, d28[2] + vext.8 q4, q11, q12, #8 + vdup.8 d18, d28[3] + vext.8 q5, q11, q12, #10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[4] + vext.8 q0, q11, q12, #12 + vmovl.s8 q6, d12 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q3, d1, d12 + + bx lr +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag3_\edge\()_neon + push {r1, lr} + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + push {r10-r11,lr} +1: + mov r10, #80 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + 
vst1.16 {q0}, [r0]! + bgt 2b + get_grain_2 d0 + subs r1, r1, #1 + vst1.32 {d0[0]}, [r0]! + bgt 1b + pop {r10-r11,pc} +endfunc + +function generate_grain_rows_44_neon + push {r10-r11,lr} +1: + mov r10, #40 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + vst1.16 {q0}, [r0]! + bgt 2b + get_grain_4 d0 + subs r1, r1, #1 + vst1.16 {d0}, [r0] + add r0, r0, #GRAIN_WIDTH*2-80 + bgt 1b + pop {r10-r11,pc} +endfunc + +function gen_grain_uv_444_lag0_neon + vld1.16 {q3}, [r11]! +gen_grain_uv_lag0_8_start: + push {r11,lr} + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 +gen_grain_uv_lag0_8_add: + vand q3, q3, q1 + vmull.s16 q2, d6, d22 + vmull.s16 q3, d7, d22 + vrshl.s32 q2, q2, q12 + vrshl.s32 q3, q3, q12 + vqmovn.s32 d4, q2 + vqmovn.s32 d5, q3 + vqadd.s16 q2, q2, q0 + vmin.s16 q2, q2, q9 + vmax.s16 q2, q2, q10 + vst1.16 {q2}, [r0]! + pop {r11,pc} +endfunc + +function gen_grain_uv_420_lag0_8_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2,q3}, [r11]! + vld1.16 {q4,q5}, [r12] + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d8, d8, d9 + vpadd.i16 d9, d10, d11 + vadd.i16 q2, q2, q4 + vrshr.s16 q3, q2, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + vld1.16 {q2,q3}, [r11]! + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vrshr.s16 q3, q2, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2}, [r11] + vld1.16 {q0}, [r12] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vpadd.i16 d0, d0, d1 + vadd.i16 d4, d4, d0 + vrshr.s16 d6, d4, #2 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + vld1.16 {q2}, [r11] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vrshr.s16 d6, d4, #1 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + +.ifc \type, uv_444 + ldr r4, [sp, #36] + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH*2 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 +.else + clz lr, r2 +.endif + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add r4, r1, #FGD_AR_COEFFS_Y +.else + add r4, r1, #FGD_AR_COEFFS_UV +.endif + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + +.ifc \type, uv_444 + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} +.endif + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr // bitdepth_min_8 + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, y + mov r1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_neon + mov r1, #GRAIN_HEIGHT-3 + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #2 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_uv_444_lag0_neon // 8 + vmov.i8 q1, #255 + bl gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + vmov q1, q14 + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 + add r11, r11, #4 + vst1.32 {d16[0]}, [r0]! + bgt 1b +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] + vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb r4, [r4, #1] // ar_coeffs_y[3] +.else + add r4, r4, #2 +.endif + + mov r1, #3 +.ifc \type, uv_444 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 +.ifc \type, uv_444 + vmovl.s8 q6, d13 +.endif + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_mid_neon // 48 + bl sum_\type\()_lag1_mid_neon // 56 + bl sum_\type\()_lag1_mid_neon // 64 + bl sum_\type\()_lag1_mid_neon // 72 + bl sum_\type\()_lag1_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! 
+ bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_mid_neon // 48 + bl sum_\type\()_lag2_mid_neon // 56 + bl sum_\type\()_lag2_mid_neon // 64 + bl sum_\type\()_lag2_mid_neon // 72 + bl sum_\type\()_lag2_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + vpush {d26} + bl generate_grain_rows_neon + vpop {d26} + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_mid_neon // 48 + bl sum_\type\()_lag3_mid_neon // 56 + bl sum_\type\()_lag3_mid_neon // 64 + bl sum_\type\()_lag3_mid_neon // 72 + bl sum_\type\()_lag3_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + + ldr r4, [sp, #36] + mov r12, r3 + movw r11, #(3*GRAIN_WIDTH-3)*2 + mov lr, #28 + add r11, r1, r11 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 + + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] + add r4, r1, #FGD_AR_COEFFS_UV + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, uv_420 + vpush {q4-q5} +.endif + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_44_neon + set_height r1, \type + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #14 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_\type\()_lag0_8_neon // 8 + vmov.i8 q1, #255 + bl gen_grain_\type\()_lag0_8_neon // 16 + bl gen_grain_\type\()_lag0_8_neon // 24 + bl gen_grain_\type\()_lag0_8_neon // 32 + bl gen_grain_\type\()_lag0_8_neon // 40 + vmov q1, q14 + bl gen_grain_\type\()_lag0_4_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + +.ifc \type, uv_420 + vpop {q4-q5} +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] + vld1.8 {d28[]}, [r4]! 
// ar_coeffs_uv[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] + add r4, r4, #2 + + mov r1, #3 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 + vmovl.s8 q6, d13 + + set_height r1, \type +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off + vmov.u16 r11, \src1[0+\off] + vmov.u16 r12, \src3[0+\off] + add r11, r11, r3 + vmov.u16 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u16 r11, \src3[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, [r12] + vmov.u16 r12, \src2[0+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u16 lr, \src4[0+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u16 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u16 r12, \src4[2+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, d2, d3, 0 + gather_interleaved d8, d9, d0, d1, d2, d3, 1 + 
pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, lsl #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH*2 // grain_lut stride + ldr r10, [sp, #124] // bitdepth_max + + eor r4, r4, #15 // 15 - scaling_shift + vdup.16 q6, r10 // bitdepth_max + clz r10, r10 + vdup.16 q13, r4 // 15 - scaling_shift + rsb r10, r10, #24 // bitdepth_min_8 + cmp r8, #0 + vdup.16 q12, r10 // bitdepth_min_8 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i16 q14, #16 + vmov.i16 q15, #235 + vshl.s16 q14, q14, q12 + vshl.s16 q15, q15, q12 + b 2f +1: + // no clip + vmov.i16 q14, #0 + vmov q15, q6 +2: + vshr.u16 q6, q6, #1 // grain_max + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #18 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.16 d14, d24[0] + vdup.16 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + sub r2, r2, #32 // src_stride -= 32 + sub r9, r9, #32 // grain_stride -= 32 + bx r11 +endfunc + +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.16 {d0}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r6]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r8], r9 // grain_lut top old +.endif +.if \oy + vld1.16 {q4, q5}, [r6], r9 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r1, :128]! 
// src +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if !\ox && !\oy + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif +.if !\oy + vmvn.i16 q5, #0xf000 // 0x0fff +.endif + vld1.16 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vmlal.s16 q0, d16, d25 +.endif + +.if \oy +.if \ox + add r8, r8, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vmvn d0, d12 // grain_min + vqrshrn.s32 d4, q1, #5 + vmin.s16 d16, d16, d12 + vmin.s16 d4, d4, d12 + vmax.s16 d16, d16, d0 + vmax.s16 d4, d4, d0 +.endif + + vmull.s16 q0, d4, d14 + vmull.s16 q1, d5, d14 + vmull.s16 q2, d6, d14 + vmull.s16 q3, d7, d14 + vmlal.s16 q0, d16, d15 + vmlal.s16 q1, d17, d15 + vmlal.s16 q2, d18, d15 + vmlal.s16 q3, d19, d15 + vmull.s16 q8, d20, d15 + vmull.s16 q9, d21, d15 + vmull.s16 q10, d22, d15 + vmull.s16 q11, d23, d15 + vmlal.s16 q8, d8, d14 + vmlal.s16 q9, d9, d14 + vmlal.s16 q10, d10, d14 + vmlal.s16 q11, d11, d14 + vmvn q4, q6 // grain_min + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q6 + vmin.s16 q9, q1, q6 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 q10, q2, q6 + vmin.s16 q11, q3, q6 + vmax.s16 q8, q8, q4 + vmax.s16 q9, q9, q4 + vld1.16 {q2, q3}, [r1, :128], r2 // src + vmvn.i16 q5, #0xf000 // 0x0fff + vmax.s16 q10, q10, q4 + vmax.s16 q11, q11, q4 +.elseif \ox + vmvn d4, d12 // grain_min + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 d16, d16, d12 + vmax.s16 d16, d16, d4 + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q5 + vand q1, q1, q5 + vand q2, q2, q5 + vand q3, q3, q5 + + bl gather32_neon + +.if \ox || \oy + vpush {q6-q7} +.endif + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + +.if \ox || \oy + vpop {q6-q7} +.endif + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + + vst1.16 {q0, q1}, [r0, :128]! 
// dst + subs r7, r7, #1 +.if \oy + vdup.16 d14, d25[0] + vdup.16 d15, d25[1] +.endif + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r10, r11, [sp, #124] // uv, is_id + ldr r6, [sp, #136] // bitdepth_max + + clz r7, r6 + rsb r7, r7, #24 // bitdepth_min_8 + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset + vld1.16 {d30[]}, [r12] // uv_luma_mult + lsl r10, r10, r7 // uv_offset << bitdepth_min_8 + vld1.16 {d30[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + eor lr, lr, #15 // 15 - scaling_shift + + vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8 + + cmp r12, #0 + vdup.16 q13, lr // 15 - scaling_shift + + beq 1f + // clip + cmp r11, #0 + mov r8, #16 + mov r9, #240 + lsl r8, r8, r7 + lsl r9, r9, r7 + beq 2f + // is_id + mov r9, #235 + lsl r9, r9, r7 + b 2f +1: + // no clip + mov r8, #0 + mov r9, r6 // bitdepth_max +2: + vmov.16 d30[3], r6 // bitdepth_max + vdup.16 d31, r8 // clip_min + + mov r10, #GRAIN_WIDTH*2 // grain_lut stride + +.if \sy + mov r6, #23 + mov r7, #22 +.else + mov r6, #27 + mov r7, #17 +.endif + vmov.16 d31[1], r9 // clip_max + + ldrd r8, r9, [sp, #116] // offsets, h + + add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + vmov.16 d31[2], r6 // overlap y [0] + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + vmov.16 d31[3], r7 // overlap y [1] + + add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for 
adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + sub r7, r7, #32 // luma_stride -= 32 + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + sub r2, r2, #32 // src_stride -= 32 + sub r10, r10, #32 // grain_stride -= 32 +.if \oy + mov r12, lr +.endif +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r8]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if \oy + vld1.16 {q4, q5}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif + vld1.16 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + add r11, r11, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vmull.s16 q8, d20, d29 + vmull.s16 q9, d21, d29 + vmull.s16 q10, d22, d29 + vmull.s16 q11, d23, d29 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vmlal.s16 q8, d8, d28 + vmlal.s16 q9, d9, d28 + vmlal.s16 q10, d10, d28 + vmlal.s16 q11, d11, d28 + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q7 + vmin.s16 q9, q1, q7 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q10, q2, q7 + vmin.s16 q11, q3, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 q10, q10, q6 + vmax.s16 q11, q11, q6 +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q4, q5}, [r1, :128]! // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d8, d29 + vmlal.s16 q7, d9, d29 + vmlal.s16 q0, d10, d29 + vmlal.s16 q1, d11, d29 + vld1.16 {q4, q5}, [r1, :128] // src + sub r1, r1, #32 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 + vmlal.s16 q0, d8, d29 + vmlal.s16 q1, d9, d29 + vmlal.s16 q2, d10, d29 + vmlal.s16 q3, d11, d29 + vdup.16 q14, d30[2] // uv_offset + vshrn.s32 d0, q0, #6 + vshrn.s32 d1, q1, #6 + vshrn.s32 d2, q2, #6 + vshrn.s32 d3, q3, #6 + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vadd.i16 q2, q0, q14 + vadd.i16 q3, q1, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmin.s16 q2, q2, q4 + vmin.s16 q3, q3, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 + vmax.s16 q2, q2, q5 + vmax.s16 q3, q3, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 + vand q2, q2, q14 + vand q3, q3, q14 +.endif + + bl gather32_neon + + vld1.16 {q0, q1}, [r1, :128]! // src + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vld1.16 {q2, q3}, [r1, :128], r2 // src + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + +.if \oy + vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmax.s16 q2, q2, q4 + vmax.s16 q3, q3, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + vmin.s16 q2, q2, q5 + vmin.s16 q3, q3, q5 + + vst1.16 {q0, q1}, [r0, :128]! 
// dst + + subs r9, r9, #1 +.if \oy + vmov.32 d31[1], lr // new coeffs for overlap y +.endif + + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if \oy + vld1.16 {q2, q3}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5], r10 // grain_lut +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d17, q1, #5 + vqrshrn.s32 d18, q2, #5 + vqrshrn.s32 d19, q3, #5 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q8, q8, q7 + vmin.s16 q9, q9, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vrshr.u16 q0, q0, #1 + vrshr.u16 q1, q1, #1 +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q2, q3}, [r1, :128], r2 // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d4, d29 + vmlal.s16 q7, d5, d29 + vmlal.s16 q0, d6, d29 + vmlal.s16 q1, d7, d29 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vdup.16 q14, d30[2] // uv_offset + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + vld1.16 {q2, q3}, [r1, :128], r2 // src + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 +.endif + + bl gather16_neon + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q2, q8 // *src + noise + vqadd.s16 q1, q3, q9 + +.if \oy + // Swap the two last coefficients of d31, place them first in d28 + vrev64.16 d28, d31 +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + + subs r9, r9, #1 +.if \oy + // Take the first two 16 bit coefficients of d28 and place them at the + // end of d31 + vtrn.32 d31, d28 +.endif + + vst1.16 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.9.2/src/arm/32/film_grain.S dav1d-1.0.0/src/arm/32/film_grain.S --- dav1d-0.9.2/src/arm/32/film_grain.S 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/src/arm/32/film_grain.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,2039 +0,0 @@ -/* - * Copyright © 2021, VideoLAN and dav1d authors - * Copyright © 2021, Martin Storsjo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "src/arm/asm.S" -#include "util.S" -#include "src/arm/asm-offsets.h" - -#define GRAIN_WIDTH 82 -#define GRAIN_HEIGHT 73 - -#define SUB_GRAIN_WIDTH 44 -#define SUB_GRAIN_HEIGHT 38 - -.macro increment_seed steps, shift=1 - lsr r11, r2, #3 - lsr r12, r2, #12 - lsr lr, r2, #1 - eor r11, r2, r11 // (r >> 0) ^ (r >> 3) - eor r12, r12, lr // (r >> 12) ^ (r >> 1) - eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) -.if \shift - lsr r2, r2, #\steps -.endif - and r11, r11, #((1 << \steps) - 1) // bit -.if \shift - orr r2, r2, r11, lsl #(16 - \steps) // *state -.else - orr r2, r2, r11, lsl #16 // *state -.endif -.endm - -.macro read_rand dest, bits, age - ubfx \dest, r2, #16 - \bits - \age, #\bits -.endm - -.macro read_shift_rand dest, bits - ubfx \dest, r2, #17 - \bits, #\bits - lsr r2, r2, #1 -.endm - -// special calling convention: -// r2 holds seed -// r3 holds dav1d_gaussian_sequence -// clobbers r11-r12 -// returns in d0-d1 -function get_gaussian_neon - push {r5-r6,lr} - increment_seed 4 - read_rand r5, 11, 3 - read_rand r6, 11, 2 - add r5, r3, r5, lsl #1 - add r6, r3, r6, lsl #1 - vld1.16 {d0[0]}, [r5] - read_rand r5, 11, 1 - vld1.16 {d0[1]}, [r6] - add r5, r3, r5, lsl #1 - read_rand r6, 11, 0 - increment_seed 4 - add r6, r3, r6, lsl #1 - vld1.16 {d0[2]}, [r5] - read_rand r5, 11, 3 - vld1.16 {d0[3]}, [r6] - add r5, r3, r5, lsl #1 - read_rand r6, 11, 2 - vld1.16 {d1[0]}, [r5] - add r6, r3, r6, lsl #1 - read_rand r5, 11, 1 - vld1.16 {d1[1]}, [r6] - read_rand r6, 11, 0 - add r5, r3, r5, lsl #1 - add r6, r3, r6, lsl #1 - vld1.16 {d1[2]}, [r5] - vld1.16 {d1[3]}, [r6] - pop {r5-r6,pc} -endfunc - -.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r0, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r1, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r2, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r3, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r4, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r5, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r6, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r7, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r8, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r9, q0 - increment_seed 2 - read_rand r11, 11, 1 - read_rand r12, 11, 0 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d0[0]}, [r11] - vld1.16 {d0[1]}, [r12] - vrshl.s16 d0, d0, d30 - vmovn.i16 \r10, q0 -.endm - -.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 - vst1.16 {\r0, \r1, \r2, \r3}, [r0]! - vst1.16 {\r4, \r5, \r6, \r7}, [r0]! - vst1.16 {\r8, \r9}, [r0]! - vst1.16 {\r10[0]}, [r0]! 
-.endm - -.macro get_grain_row_44 r0, r1, r2, r3, r4, r5 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r0, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r1, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r2, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r3, q0 - bl get_gaussian_neon - vrshl.s16 q0, q0, q15 - vmovn.i16 \r4, q0 - increment_seed 4 - read_rand r11, 11, 3 - read_rand r12, 11, 2 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d0[]}, [r11] - read_rand r11, 11, 1 - vld1.16 {d0[1]}, [r12] - add r11, r3, r11, lsl #1 - read_rand r12, 11, 0 - vld1.16 {d0[2]}, [r11] - add r12, r3, r12, lsl #1 - vld1.16 {d0[3]}, [r12] - vrshl.s16 d0, d0, d30 - vmovn.i16 \r5, q0 -.endm - -.macro store_grain_row_44 r0, r1, r2, r3, r4, r5 - vst1.16 {\r0, \r1, \r2, \r3}, [r0]! - vst1.16 {\r4, \r5}, [r0] - add r0, r0, #GRAIN_WIDTH-32 -.endm - -function get_grain_2_neon - push {r11,lr} - increment_seed 2 - read_rand r11, 11, 1 - read_rand r12, 11, 0 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d0[0]}, [r11] - vld1.16 {d0[1]}, [r12] - vrshl.s16 d0, d0, d30 - vmovn.i16 d0, q0 - pop {r11,pc} -endfunc - -.macro get_grain_2 dst - bl get_grain_2_neon -.ifnc \dst, d0 - vmov \dst, d0 -.endif -.endm - -// r1 holds the number of entries to produce -// r6, r8 and r10 hold the previous output entries -// q0 holds the vector of produced entries -// q1 holds the input vector of sums from above -.macro output_lag n -function output_lag\n\()_neon - push {r0, lr} -.if \n == 1 - mov lr, #-128 -.else - mov r0, #1 - mov lr, #1 - sub r7, r7, #1 - sub r9, r9, #1 - lsl r0, r0, r7 - lsl lr, lr, r9 - add r7, r7, #1 - add r9, r9, #1 -.endif -1: - read_shift_rand r12, 11 - vmov.32 r11, d2[0] - lsl r12, r12, #1 - vext.8 q0, q0, q0, #1 - ldrsh r12, [r3, r12] -.if \n == 1 - mla r11, r6, r4, r11 // sum (above) + *coeff * prev output - add r6, r11, r8 // 1 << (ar_coeff_shift - 1) - add r12, r12, r10 - asr r6, r6, r7 // >> ar_coeff_shift - asr r12, r12, r9 // >> (4 + grain_scale_shift) - add r6, r6, r12 - cmp r6, r5 -.elseif \n == 2 - mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 - mla r11, r6, r10, r11 // += *coeff * prev output 2 - mov r8, r6 - add r6, r11, r0 // 1 << (ar_coeff_shift - 1) - add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) - asr r6, r6, r7 // >> ar_coeff_shift - asr r12, r12, r9 // >> (4 + grain_scale_shift) - add r6, r6, r12 - push {lr} - cmp r6, r5 - mov lr, #-128 -.else - push {r1-r3} - sbfx r1, r4, #0, #8 - sbfx r2, r4, #8, #8 - sbfx r3, r4, #16, #8 - mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 - mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 - mla r11, r6, r3, r11 // += *coeff * prev output 3 - pop {r1-r3} - mov r10, r8 - mov r8, r6 - - add r6, r11, r0 // 1 << (ar_coeff_shift - 1) - add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) - asr r6, r6, r7 // >> ar_coeff_shift - asr r12, r12, r9 // >> (4 + grain_scale_shift) - add r6, r6, r12 - push {lr} - cmp r6, r5 - mov lr, #-128 -.endif - it gt - movgt r6, r5 - cmp r6, lr - it lt - movlt r6, lr -.if \n >= 2 - pop {lr} -.endif - subs r1, r1, #1 - vext.8 q1, q1, q1, #4 - vmov.8 d1[7], r6 - bgt 1b - pop {r0, pc} -endfunc -.endm - -output_lag 1 -output_lag 2 -output_lag 3 - - -function sum_lag1_above_neon - vmull.s8 q2, d6, d28 - vmull.s8 q3, d7, d28 - vmull.s8 q4, d0, d27 - vmull.s8 q5, d1, d27 - - vaddl.s16 q0, d4, d8 - vaddl.s16 q2, d5, d9 - vaddl.s16 q4, d6, d10 - vaddl.s16 q5, d7, d11 - - vmull.s8 q3, d3, d29 - vmull.s8 
q1, d2, d29 - - vaddw.s16 q4, q4, d6 - vaddw.s16 q5, q5, d7 - vaddw.s16 q3, q2, d3 - vaddw.s16 q2, q0, d2 - bx lr -endfunc - -.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff -.ifc \lag\()_\edge, lag3_left - bl sum_lag3_left_above_neon -.else - bl sum_\lag\()_above_neon -.endif -.ifc \type, uv_420 - vpush {q6-q7} - add r12, r11, #GRAIN_WIDTH - vld1.16 {q0, q1}, [r11]! - vld1.16 {q6, q7}, [r12]! - vpaddl.s8 q0, q0 - vpaddl.s8 q1, q1 - vpaddl.s8 q6, q6 - vpaddl.s8 q7, q7 - vadd.i16 q0, q0, q6 - vadd.i16 q1, q1, q7 - vpop {q6-q7} - vrshrn.s16 d0, q0, #2 - vrshrn.s16 d1, q1, #2 -.endif -.ifc \type, uv_422 - vld1.8 {q0, q1}, [r11]! - vpaddl.s8 q0, q0 - vpaddl.s8 q1, q1 - vrshrn.s16 d0, q0, #1 - vrshrn.s16 d1, q1, #1 -.endif -.ifc \type, uv_444 - vld1.8 {q0}, [r11]! -.endif -.if \uv_layout -.ifnb \uv_coeff - vdup.8 d13, \uv_coeff -.endif - vmull.s8 q1, d0, d13 - vmull.s8 q0, d1, d13 - vaddw.s16 q2, q2, d2 - vaddw.s16 q3, q3, d3 - vaddw.s16 q4, q4, d0 - vaddw.s16 q5, q5, d1 -.endif -.if \uv_layout && \elems == 16 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 444 && \elems == 15 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 422 && \elems == 9 - b sum_\lag\()_uv_420_\edge\()_start -.else -sum_\lag\()_\type\()_\edge\()_start: - push {r11} -.ifc \edge, left - increment_seed 4 - read_rand r11, 11, 3 - read_rand r12, 11, 2 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d1[1]}, [r11] - read_rand r11, 11, 1 - vld1.16 {d1[2]}, [r12] - add r11, r3, r11, lsl #1 - vld1.16 {d1[3]}, [r11] - lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 - vrshl.s16 d1, d1, d30 - vmovn.i16 d1, q0 - vext.8 q2, q2, q2, #12 -.ifc \lag, lag3 - vmov.s8 r10, d1[5] -.endif -.ifnc \lag, lag1 - vmov.s8 r8, d1[6] -.endif - vmov.s8 r6, d1[7] - - vmov q1, q2 - mov r1, #1 - bl output_\lag\()_neon -.else - increment_seed 4, shift=0 - vmov q1, q2 - mov r1, #4 - bl output_\lag\()_neon -.endif - - increment_seed 4, shift=0 - vmov q1, q3 - mov r1, #4 - bl output_\lag\()_neon - - increment_seed 4, shift=0 - vmov q1, q4 -.if \elems == 9 - mov r1, #1 - bl output_\lag\()_neon - lsr r2, r2, #3 - - read_rand r11, 11, 2 - read_rand r12, 11, 1 - add r11, r3, r11, lsl #1 - add r12, r3, r12, lsl #1 - vld1.16 {d2[0]}, [r11] - read_rand r11, 11, 0 - vld1.16 {d2[1]}, [r12] - add r11, r3, r11, lsl #1 - vld1.16 {d2[2]}, [r11] - vrshl.s16 d2, d2, d30 - vmovn.i16 d2, q1 - vext.8 q0, q0, q1, #7 -.else - mov r1, #4 - bl output_\lag\()_neon - - increment_seed 4, shift=0 - vmov q1, q5 - -.ifc \edge, right - mov r1, #3 - bl output_\lag\()_neon - read_shift_rand r11, 11 - add r11, r3, r11, lsl #1 - vld1.16 {d2[0]}, [r11] - vrshl.s16 d2, d2, d30 - vext.8 q0, q0, q1, #1 -.else - mov r1, #4 - bl output_\lag\()_neon -.endif -.endif -.if \store - vst1.8 {q0}, [r0]! 
-.endif - pop {r11} - pop {r1, pc} -.endif -.endm - -.macro sum_lag1_func type, uv_layout, edge, elems=16 -function sum_\type\()_lag1_\edge\()_neon - push {r1, lr} - sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 -endfunc -.endm - -sum_lag1_func y, 0, left -sum_lag1_func y, 0, mid -sum_lag1_func y, 0, right, 15 -sum_lag1_func uv_444, 444, left -sum_lag1_func uv_444, 444, mid -sum_lag1_func uv_444, 444, right, 15 -sum_lag1_func uv_422, 422, left -sum_lag1_func uv_422, 422, mid -sum_lag1_func uv_422, 422, right, 9 -sum_lag1_func uv_420, 420, left -sum_lag1_func uv_420, 420, mid -sum_lag1_func uv_420, 420, right, 9 - -.macro sum_lag1 type, dst, left, mid, right, edge=mid - vmov q3, \mid - vext.8 q0, \left, \mid, #15 - vext.8 q1, \mid, \right, #1 - bl sum_\type\()_lag1_\edge\()_neon - vmov \dst, q0 -.endm - -.macro sum_y_lag1 dst, left, mid, right, edge=mid - sum_lag1 y, \dst, \left, \mid, \right, \edge -.endm - -.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid - sum_lag1 uv_444, \dst, \left, \mid, \right, \edge -.endm - -.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid - sum_lag1 uv_422, \dst, \left, \mid, \right, \edge -.endm - -.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid - sum_lag1 uv_420, \dst, \left, \mid, \right, \edge -.endm - - -function sum_lag2_above_neon - push {lr} - sub r12, r0, #2*GRAIN_WIDTH - 16 - sub lr, r0, #1*GRAIN_WIDTH - 16 - vld1.8 {q10}, [r12] // load top right - vld1.8 {q13}, [lr] - - vext.8 q6, q8, q9, #14 // top left, top mid - vdup.8 d14, d28[0] - vext.8 q8, q8, q9, #15 - vdup.8 d15, d28[1] - - vmull.s8 q0, d12, d14 - vmull.s8 q1, d13, d14 - vmull.s8 q6, d16, d15 - vmull.s8 q8, d17, d15 - - vaddl.s16 q2, d0, d12 - vaddl.s16 q3, d1, d13 - vaddl.s16 q4, d2, d16 - vaddl.s16 q5, d3, d17 - - vext.8 q6, q9, q10, #1 // top mid, top right - vdup.8 d14, d28[3] - vext.8 q8, q9, q10, #2 - vdup.8 d15, d28[4] - - vmull.s8 q0, d12, d14 - vmull.s8 q1, d13, d14 - vmull.s8 q6, d16, d15 - vmull.s8 q8, d17, d15 - - vaddl.s16 q7, d0, d12 - vaddl.s16 q0, d1, d13 - vaddl.s16 q6, d2, d16 - vaddl.s16 q1, d3, d17 - - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q0 - vadd.i32 q4, q4, q6 - vadd.i32 q5, q5, q1 - - vext.8 q6, q11, q12, #14 // top left, top mid - vdup.8 d14, d28[5] - vext.8 q8, q11, q12, #15 - vdup.8 d15, d28[6] - - vmull.s8 q0, d12, d14 - vmull.s8 q1, d13, d14 - vmull.s8 q6, d16, d15 - vmull.s8 q8, d17, d15 - - vaddl.s16 q7, d0, d12 - vaddl.s16 q0, d1, d13 - vaddl.s16 q6, d2, d16 - vaddl.s16 q1, d3, d17 - - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q0 - vadd.i32 q4, q4, q6 - vadd.i32 q5, q5, q1 - - vext.8 q6, q12, q13, #1 // top mid, top right - vdup.8 d14, d29[0] - vext.8 q8, q12, q13, #2 - vdup.8 d15, d29[1] - - vmull.s8 q0, d12, d14 - vmull.s8 q1, d13, d14 - vmull.s8 q6, d16, d15 - vmull.s8 q8, d17, d15 - - vaddl.s16 q7, d0, d12 - vaddl.s16 q0, d1, d13 - vaddl.s16 q6, d2, d16 - vaddl.s16 q1, d3, d17 - - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q0 - vadd.i32 q4, q4, q6 - vadd.i32 q5, q5, q1 - - vdup.8 d14, d28[2] - vdup.8 d15, d28[7] - - vmull.s8 q0, d18, d14 - vmull.s8 q1, d19, d14 - vmull.s8 q6, d24, d15 - vmull.s8 q8, d25, d15 - - vaddl.s16 q7, d0, d12 - vaddl.s16 q0, d1, d13 - vaddl.s16 q6, d2, d16 - vaddl.s16 q1, d3, d17 - - vmov q8, q9 - vmov q9, q10 - - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q0 - vadd.i32 q4, q4, q6 - vadd.i32 q5, q5, q1 - - vmov q11, q12 - vmov q12, q13 - - pop {pc} -endfunc - -.macro sum_lag2_func type, uv_layout, edge, elems=16 -function sum_\type\()_lag2_\edge\()_neon - push {r1, lr} -.ifc \edge, left - sub r12, r0, 
#2*GRAIN_WIDTH - sub lr, r0, #1*GRAIN_WIDTH - vld1.8 {q9}, [r12] // load the previous block right above - vld1.8 {q12}, [lr] -.endif - sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4] -endfunc -.endm - -sum_lag2_func y, 0, left -sum_lag2_func y, 0, mid -sum_lag2_func y, 0, right, 15 -sum_lag2_func uv_444, 444, left -sum_lag2_func uv_444, 444, mid -sum_lag2_func uv_444, 444, right, 15 -sum_lag2_func uv_422, 422, left -sum_lag2_func uv_422, 422, mid -sum_lag2_func uv_422, 422, right, 9 -sum_lag2_func uv_420, 420, left -sum_lag2_func uv_420, 420, mid -sum_lag2_func uv_420, 420, right, 9 - - -function sum_lag3_left_above_neon - // A separate codepath for the left edge, to avoid reading outside - // of the edge of the buffer. - sub r12, r0, #3*GRAIN_WIDTH - vld1.8 {q11, q12}, [r12] - vext.8 q12, q11, q12, #13 - vext.8 q11, q11, q11, #13 - b sum_lag3_above_start -endfunc - -function sum_lag3_above_neon - sub r12, r0, #3*GRAIN_WIDTH + 3 - vld1.8 {q11, q12}, [r12] - -sum_lag3_above_start: - vdup.8 d20, d26[0] - vext.8 q9, q11, q12, #1 - vdup.8 d21, d26[1] - - vmull.s8 q0, d22, d20 - vmull.s8 q1, d23, d20 - vmull.s8 q6, d18, d21 - vmull.s8 q7, d19, d21 - - vext.8 q8, q11, q12, #2 - vdup.8 d20, d26[2] - vext.8 q9, q11, q12, #3 - vdup.8 d21, d26[3] - - vaddl.s16 q2, d0, d12 - vaddl.s16 q3, d1, d13 - vaddl.s16 q4, d2, d14 - vaddl.s16 q5, d3, d15 - - vmull.s8 q0, d16, d20 - vmull.s8 q1, d17, d20 - vmull.s8 q6, d18, d21 - vmull.s8 q7, d19, d21 - - vaddl.s16 q8, d0, d12 - vaddl.s16 q9, d1, d13 - vaddl.s16 q0, d2, d14 - vaddl.s16 q1, d3, d15 - - vext.8 q6, q11, q12, #4 - vdup.8 d20, d26[4] - vext.8 q7, q11, q12, #5 - vdup.8 d21, d26[5] - - vadd.i32 q2, q2, q8 - vadd.i32 q3, q3, q9 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d12, d20 - vmull.s8 q1, d13, d20 - vmull.s8 q8, d14, d21 - vmull.s8 q9, d15, d21 - - sub r12, r0, #2*GRAIN_WIDTH + 3 - - vaddl.s16 q6, d0, d16 - vaddl.s16 q7, d1, d17 - vaddl.s16 q0, d2, d18 - vaddl.s16 q1, d3, d19 - - vext.8 q8, q11, q12, #6 - vld1.8 {q11, q12}, [r12] - vdup.8 d20, d26[6] - vdup.8 d21, d26[7] - - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d16, d20 - vmull.s8 q1, d17, d20 - vmull.s8 q6, d22, d21 - vmull.s8 q7, d23, d21 - - vaddl.s16 q8, d0, d12 - vaddl.s16 q9, d1, d13 - vaddl.s16 q0, d2, d14 - vaddl.s16 q1, d3, d15 - - vext.8 q6, q11, q12, #1 - vdup.8 d20, d27[0] - vext.8 q7, q11, q12, #2 - vdup.8 d21, d27[1] - - vadd.i32 q2, q2, q8 - vadd.i32 q3, q3, q9 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d12, d20 - vmull.s8 q1, d13, d20 - vmull.s8 q8, d14, d21 - vmull.s8 q9, d15, d21 - - vaddl.s16 q6, d0, d16 - vaddl.s16 q7, d1, d17 - vaddl.s16 q0, d2, d18 - vaddl.s16 q1, d3, d19 - - vext.8 q8, q11, q12, #3 - vdup.8 d20, d27[2] - vext.8 q9, q11, q12, #4 - vdup.8 d21, d27[3] - - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d16, d20 - vmull.s8 q1, d17, d20 - vmull.s8 q6, d18, d21 - vmull.s8 q7, d19, d21 - - sub r12, r0, #1*GRAIN_WIDTH + 3 - - vaddl.s16 q8, d0, d12 - vaddl.s16 q9, d1, d13 - vaddl.s16 q0, d2, d14 - vaddl.s16 q1, d3, d15 - - vext.8 q6, q11, q12, #5 - vdup.8 d20, d27[4] - vext.8 q7, q11, q12, #6 - vdup.8 d21, d27[5] - - vld1.8 {q11, q12}, [r12] - - vadd.i32 q2, q2, q8 - vadd.i32 q3, q3, q9 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d12, d20 - vmull.s8 q1, d13, d20 - vmull.s8 q8, d14, d21 - vmull.s8 q9, d15, d21 - - vaddl.s16 q6, d0, d16 - vaddl.s16 q7, d1, d17 - 
vaddl.s16 q0, d2, d18 - vaddl.s16 q1, d3, d19 - - vdup.8 d20, d27[6] - vext.8 q9, q11, q12, #1 - vdup.8 d21, d27[7] - - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d22, d20 - vmull.s8 q1, d23, d20 - vmull.s8 q6, d18, d21 - vmull.s8 q7, d19, d21 - - vaddl.s16 q8, d0, d12 - vaddl.s16 q9, d1, d13 - vaddl.s16 q0, d2, d14 - vaddl.s16 q1, d3, d15 - - vext.8 q6, q11, q12, #2 - vdup.8 d20, d28[0] - vext.8 q7, q11, q12, #3 - vdup.8 d21, d28[1] - - vadd.i32 q2, q2, q8 - vadd.i32 q3, q3, q9 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d12, d20 - vmull.s8 q1, d13, d20 - vmull.s8 q8, d14, d21 - vmull.s8 q9, d15, d21 - - vaddl.s16 q6, d0, d16 - vaddl.s16 q7, d1, d17 - vaddl.s16 q0, d2, d18 - vaddl.s16 q1, d3, d19 - - vext.8 q8, q11, q12, #4 - vdup.8 d20, d28[2] - vext.8 q9, q11, q12, #5 - vdup.8 d21, d28[3] - - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d16, d20 - vmull.s8 q1, d17, d20 - vmull.s8 q6, d18, d21 - vmull.s8 q7, d19, d21 - - vaddl.s16 q8, d0, d12 - vaddl.s16 q9, d1, d13 - vaddl.s16 q0, d2, d14 - vaddl.s16 q1, d3, d15 - - vext.8 q6, q11, q12, #6 - vdup.8 d20, d28[4] - - vadd.i32 q2, q2, q8 - vadd.i32 q3, q3, q9 - vadd.i32 q4, q4, q0 - vadd.i32 q5, q5, q1 - - vmull.s8 q0, d12, d20 - vmull.s8 q1, d13, d20 - - vaddw.s16 q2, q2, d0 - vaddw.s16 q3, q3, d1 - vaddw.s16 q4, q4, d2 - vaddw.s16 q5, q5, d3 - - bx lr -endfunc - -.macro sum_lag3_func type, uv_layout, edge, elems=16 -function sum_\type\()_lag3_\edge\()_neon - push {r1, lr} - sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0] -endfunc -.endm - -sum_lag3_func y, 0, left -sum_lag3_func y, 0, mid -sum_lag3_func y, 0, right, 15 -sum_lag3_func uv_444, 444, left -sum_lag3_func uv_444, 444, mid -sum_lag3_func uv_444, 444, right, 15 -sum_lag3_func uv_422, 422, left -sum_lag3_func uv_422, 422, mid -sum_lag3_func uv_422, 422, right, 9 -sum_lag3_func uv_420, 420, left -sum_lag3_func uv_420, 420, mid -sum_lag3_func uv_420, 420, right, 9 - -function generate_grain_rows_neon - push {r11,lr} -1: - get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 - subs r1, r1, #1 - store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 - bgt 1b - pop {r11,pc} -endfunc - -function generate_grain_rows_44_neon - push {r11,lr} -1: - get_grain_row_44 d16, d17, d18, d19, d20, d21 - subs r1, r1, #1 - store_grain_row_44 d16, d17, d18, d19, d20, d21 - bgt 1b - pop {r11,pc} -endfunc - -function gen_grain_uv_444_lag0_neon - vld1.8 {q3}, [r11]! - push {r11,lr} - bl get_gaussian_neon - vrshl.s16 q8, q0, q15 - bl get_gaussian_neon - vrshl.s16 q9, q0, q15 - vqmovn.s16 d0, q8 - vqmovn.s16 d1, q9 - - vand q3, q3, q1 - vmull.s8 q2, d6, d22 - vmull.s8 q3, d7, d22 - vrshl.s16 q2, q2, q12 - vrshl.s16 q3, q3, q12 - vaddw.s8 q2, q2, d0 - vaddw.s8 q3, q3, d1 - vqmovn.s16 d4, q2 - vqmovn.s16 d5, q3 - vst1.8 {q2}, [r0]! - pop {r11,pc} -endfunc - -function get_grain_row_44_neon - push {r11,lr} - get_grain_row_44 d16, d17, d18, d19, d20, d21 - pop {r11,pc} -endfunc - -function add_uv_420_coeff_lag0_neon - vld1.16 {q2, q3}, [r11]! - vld1.16 {q4, q5}, [r12]! - vpaddl.s8 q2, q2 - vpaddl.s8 q3, q3 - vpaddl.s8 q4, q4 - vpaddl.s8 q5, q5 - vadd.i16 q2, q2, q4 - vadd.i16 q3, q3, q5 - vrshrn.s16 d4, q2, #2 - vrshrn.s16 d5, q3, #2 - b add_coeff_lag0_start -endfunc - -function add_uv_422_coeff_lag0_neon - vld1.16 {q2, q3}, [r11]! 
- vpaddl.s8 q2, q2 - vpaddl.s8 q3, q3 - vrshrn.s16 d4, q2, #1 - vrshrn.s16 d5, q3, #1 - -add_coeff_lag0_start: - vand q3, q2, q1 - vmull.s8 q2, d6, d22 - vmull.s8 q3, d7, d22 - vrshl.s16 q2, q2, q12 - vrshl.s16 q3, q3, q12 - vaddw.s8 q2, q2, d0 - vaddw.s8 q3, q3, d1 - vqmovn.s16 d4, q2 - vqmovn.s16 d5, q3 - bx lr -endfunc - -.macro gen_grain_82 type -function generate_grain_\type\()_8bpc_neon, export=1 - push {r4-r11,lr} - -.ifc \type, uv_444 - mov r12, r3 - mov lr, #28 - add r11, r1, #3*GRAIN_WIDTH - mov r1, r2 - mul r12, r12, lr -.endif - movrel r3, X(gaussian_sequence) - ldr r2, [r1, #FGD_SEED] - ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] -.ifc \type, y - add r4, r1, #FGD_AR_COEFFS_Y -.else - add r4, r1, #FGD_AR_COEFFS_UV -.endif - adr r5, L(gen_grain_\type\()_tbl) - ldr r6, [r1, #FGD_AR_COEFF_LAG] - add r9, r9, #4 - ldr r6, [r5, r6, lsl #2] - vdup.16 q15, r9 // 4 + data->grain_scale_shift - add r5, r5, r6 - vneg.s16 q15, q15 - -.ifc \type, uv_444 - cmp r12, #0 - movw r10, #0x49d8 - movw lr, #0xb524 - // Intentionally using a separate register instead of moveq with an - // immediate constant, to avoid armv8 deprecated it instruction forms. - it eq - moveq r10, lr - add r4, r4, r12 // Add offset to ar_coeffs_uv[1] - eor r2, r2, r10 -.endif - - ldr r7, [r1, #FGD_AR_COEFF_SHIFT] - mov r8, #1 - mov r10, #1 - lsl r8, r8, r7 // 1 << ar_coeff_shift - lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) - lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) - lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) - - bx r5 - - .align 2 -L(gen_grain_\type\()_tbl): - .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - -L(generate_grain_\type\()_lag0): -.ifc \type, y - mov r1, #GRAIN_HEIGHT - bl generate_grain_rows_neon -.else - - mov r1, #3 - bl generate_grain_rows_neon - mov r1, #GRAIN_HEIGHT-3 - - vdup.16 q12, r7 - vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] - vmov.i8 q0, #0 - vmov.i8 q1, #255 - vext.8 q13, q0, q1, #13 - vext.8 q14, q1, q0, #1 - vneg.s16 q12, q12 - -1: - vmov q1, q13 - bl gen_grain_uv_444_lag0_neon // 16 - vmov.i8 q1, #255 - bl gen_grain_uv_444_lag0_neon // 32 - bl gen_grain_uv_444_lag0_neon // 48 - bl gen_grain_uv_444_lag0_neon // 64 - vmov q1, q14 - bl gen_grain_uv_444_lag0_neon // 80 - get_grain_2 d16 - subs r1, r1, #1 - add r11, r11, #2 - vst1.16 {d16[0]}, [r0]! - bgt 1b -.endif - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag1): - vpush {q4-q7} - mov r5, #127 - vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] - vld1.8 {d28[]}, [r4]! 
// ar_coeffs_y[1] - vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] -.ifc \type, y - ldrsb r4, [r4, #1] // ar_coeffs_y[3] -.else - add r4, r4, #2 -.endif - - mov r1, #3 -.ifc \type, uv_444 - vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] - ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] -.endif - bl generate_grain_rows_neon - - mov r1, #GRAIN_HEIGHT - 3 -1: - sum_\type\()_lag1 q7, q8, q8, q9, left - sum_\type\()_lag1 q8, q8, q9, q10 - sum_\type\()_lag1 q9, q9, q10, q11 - sum_\type\()_lag1 q10, q10, q11, q12 - sum_\type\()_lag1 q12, q11, q12, q13, right - get_grain_2 d26 - subs r1, r1, #1 -.ifc \type, uv_444 - add r11, r11, #2 -.endif - store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26 - vmov q11, q10 - vmov q10, q9 - vmov q9, q8 - vmov q8, q7 - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag2): - vpush {q4-q7} - mov r5, #127 - vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] - - vmov.s8 r4, d29[2] - vmov.s8 r10, d29[3] - - mov r1, #3 - bl generate_grain_rows_neon - - mov r1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag2_left_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_right_neon - get_grain_2 d16 - subs r1, r1, #1 -.ifc \type, uv_444 - add r11, r11, #2 -.endif - vst1.16 {d16[0]}, [r0]! - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag3): - vpush {q4-q7} - mov r5, #127 - vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] - - vmov.u8 r4, d28[5] - vmov.u8 r10, d28[6] - vmov.u8 r12, d28[7] - - orr r4, r4, r10, lsl #8 - orr r4, r4, r12, lsl #16 - - mov r1, #3 - vpush {d26} - bl generate_grain_rows_neon - vpop {d26} - - mov r1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag3_left_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_right_neon - get_grain_2 d16 - subs r1, r1, #1 -.ifc \type, uv_444 - add r11, r11, #2 -.endif - vst1.16 {d16[0]}, [r0]! - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} -endfunc -.endm - -gen_grain_82 y -gen_grain_82 uv_444 - -.macro set_height dst, type -.ifc \type, uv_420 - mov \dst, #SUB_GRAIN_HEIGHT-3 -.else - mov \dst, #GRAIN_HEIGHT-3 -.endif -.endm - -.macro increment_y_ptr reg, type -.ifc \type, uv_420 - add \reg, \reg, #2*GRAIN_WIDTH-(3*32) -.else - sub \reg, \reg, #3*32-GRAIN_WIDTH -.endif -.endm - -.macro gen_grain_44 type -function generate_grain_\type\()_8bpc_neon, export=1 - push {r4-r11,lr} - - mov r12, r3 - mov lr, #28 - add r11, r1, #3*GRAIN_WIDTH-3 - mov r1, r2 - mul r12, r12, lr - - movrel r3, X(gaussian_sequence) - ldr r2, [r1, #FGD_SEED] - ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] - add r4, r1, #FGD_AR_COEFFS_UV - adr r5, L(gen_grain_\type\()_tbl) - ldr r6, [r1, #FGD_AR_COEFF_LAG] - add r9, r9, #4 - ldr r6, [r5, r6, lsl #2] - vdup.16 q15, r9 // 4 + data->grain_scale_shift - add r5, r5, r6 - vneg.s16 q15, q15 - - cmp r12, #0 - movw r10, #0x49d8 - movw lr, #0xb524 - // Intentionally using a separate register instead of moveq with an - // immediate constant, to avoid armv8 deprecated it instruction forms. 
- it eq - moveq r10, lr - add r4, r4, r12 // Add offset to ar_coeffs_uv[1] - eor r2, r2, r10 - - ldr r7, [r1, #FGD_AR_COEFF_SHIFT] - mov r8, #1 - mov r10, #1 - lsl r8, r8, r7 // 1 << ar_coeff_shift - lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) - lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) - lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) - bx r5 - - .align 2 -L(gen_grain_\type\()_tbl): - .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB - -L(generate_grain_\type\()_lag0): -.ifc \type, uv_420 - vpush {q4-q5} -.endif - mov r1, #3 - bl generate_grain_rows_44_neon - set_height r1, \type - - vdup.16 q12, r7 - vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] - vmov.i8 q0, #0 - vmov.i8 q1, #255 - vext.8 q13, q0, q1, #13 - vext.8 q14, q1, q0, #7 - vneg.s16 q12, q12 - -1: - bl get_grain_row_44_neon -.ifc \type, uv_420 - add r12, r11, #GRAIN_WIDTH -.endif - vmov q1, q13 - vmov q0, q8 - bl add_\type\()_coeff_lag0_neon - vmov.i8 q1, #255 - vmov q0, q9 - vmov q8, q2 - bl add_\type\()_coeff_lag0_neon - vmov.i8 q1, q14 - vmov q0, q10 - vmov q9, q2 - bl add_\type\()_coeff_lag0_neon - vmov q10, q2 - subs r1, r1, #1 - increment_y_ptr r11, \type - store_grain_row_44 d16, d17, d18, d19, d20, d21 - bgt 1b - -.ifc \type, uv_420 - vpop {q4-q5} -.endif - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag1): - vpush {q4-q7} - mov r5, #127 - vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] - vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1] - vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] - add r4, r4, #2 - - mov r1, #3 - vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] - ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] - bl generate_grain_rows_44_neon - - set_height r1, \type -1: - sum_\type\()_lag1 q7, q8, q8, q9, left - sum_\type\()_lag1 q8, q8, q9, q10 - sum_\type\()_lag1 q10, q9, q10, q11, right - subs r1, r1, #1 - increment_y_ptr r11, \type - store_grain_row_44 d14, d15, d16, d17, d20, d21 - vmov q9, q8 - vmov q8, q7 - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag2): - vpush {q4-q7} - mov r5, #127 - vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] - - vmov.s8 r4, d29[2] - vmov.s8 r10, d29[3] - - mov r1, #3 - bl generate_grain_rows_44_neon - - set_height r1, \type -1: - bl sum_\type\()_lag2_left_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_right_neon - subs r1, r1, #1 - increment_y_ptr r11, \type - add r0, r0, #GRAIN_WIDTH-48 - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} - -L(generate_grain_\type\()_lag3): - vpush {q4-q7} - mov r5, #127 - vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] - - vmov.u8 r4, d28[5] - vmov.u8 r10, d28[6] - vmov.u8 r12, d28[7] - - orr r4, r4, r10, lsl #8 - orr r4, r4, r12, lsl #16 - - mov r1, #3 - bl generate_grain_rows_44_neon - - set_height r1, \type -1: - bl sum_\type\()_lag3_left_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_right_neon - subs r1, r1, #1 - increment_y_ptr r11, \type - add r0, r0, #GRAIN_WIDTH-48 - bgt 1b - - vpop {q4-q7} - pop {r4-r11,pc} -endfunc -.endm - -gen_grain_44 uv_420 -gen_grain_44 uv_422 - -.macro gather_interleaved dst1, dst2, src1, src2, off - vmov.u8 r11, \src1[0+\off] - vmov.u8 r12, \src2[0+\off] - add r11, r11, r3 - vmov.u8 lr, \src1[2+\off] - add r12, r12, r3 - vld1.8 {\dst1[0+\off]}, [r11] - vmov.u8 r11, \src2[2+\off] - add lr, lr, r3 - 
vld1.8 {\dst2[0+\off]}, [r12] - vmov.u8 r12, \src1[4+\off] - add r11, r11, r3 - vld1.8 {\dst1[2+\off]}, [lr] - vmov.u8 lr, \src2[4+\off] - add r12, r12, r3 - vld1.8 {\dst2[2+\off]}, [r11] - vmov.u8 r11, \src1[6+\off] - add lr, lr, r3 - vld1.8 {\dst1[4+\off]}, [r12] - vmov.u8 r12, \src2[6+\off] - add r11, r11, r3 - vld1.8 {\dst2[4+\off]}, [lr] - add r12, r12, r3 - vld1.8 {\dst1[6+\off]}, [r11] - vld1.8 {\dst2[6+\off]}, [r12] -.endm - -.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 - gather_interleaved \dst1, \dst3, \src1, \src3, 0 - gather_interleaved \dst1, \dst3, \src1, \src3, 1 - gather_interleaved \dst2, \dst4, \src2, \src4, 0 - gather_interleaved \dst2, \dst4, \src2, \src4, 1 -.endm - -function gather32_neon - push {r11-r12,lr} - gather d8, d9, d10, d11, d0, d1, d2, d3 - pop {r11-r12,pc} -endfunc - -function gather16_neon - push {r11-r12,lr} - gather_interleaved d8, d9, d0, d1, 0 - gather_interleaved d8, d9, d0, d1, 1 - pop {r11-r12,pc} -endfunc - -const overlap_coeffs_0, align=4 - .byte 27, 17, 0, 0, 0, 0, 0, 0 - .byte 17, 27, 32, 32, 32, 32, 32, 32 -endconst - -const overlap_coeffs_1, align=4 - .byte 23, 0, 0, 0, 0, 0, 0, 0 - .byte 22, 32, 32, 32, 32, 32, 32, 32 -endconst - -.macro calc_offset offx, offy, src, sx, sy - and \offy, \src, #0xF // randval & 0xF - lsr \offx, \src, #4 // randval >> 4 -.if \sy == 0 - add \offy, \offy, \offy // 2 * (randval & 0xF) -.endif -.if \sx == 0 - add \offx, \offx, \offx // 2 * (randval >> 4) -.endif -.endm - -.macro add_offset dst, offx, offy, src, stride - mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy - add \dst, \dst, \offx // grain_lut += offx -.endm - -// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const int scaling_shift, -// const entry grain_lut[][GRAIN_WIDTH], -// const int offsets[][2], -// const int h, const ptrdiff_t clip, -// const ptrdiff_t type); -function fgy_32x32_8bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut - ldrd r6, r7, [sp, #108] // offsets, h - ldr r8, [sp, #116] // clip - mov r9, #GRAIN_WIDTH // grain_lut stride - - neg r4, r4 - vdup.16 q13, r4 // -scaling_shift - cmp r8, #0 - - movrel_local r12, overlap_coeffs_0 - - beq 1f - // clip - vmov.i8 q14, #16 - vmov.i8 q15, #235 - b 2f -1: - // no clip - vmov.i8 q14, #0 - vmov.i8 q15, #255 -2: - - vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs - - add r5, r5, #9 // grain_lut += 9 - add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride - add r5, r5, r9 // grain_lut += grain_stride - - ldr r10, [r6, #8] // offsets[1][0] - calc_offset r10, r4, r10, 0, 0 - add_offset r4, r10, r4, r5, r9 - ldr r10, [r6, #4] // offsets[0][1] - calc_offset r10, r11, r10, 0, 0 - add_offset r11, r10, r11, r5, r9 - ldr r10, [r6, #12] // offsets[1][1] - calc_offset r10, r8, r10, 0, 0 - add_offset r8, r10, r8, r5, r9 - ldr r6, [r6] // offsets[0][0] - calc_offset r6, lr, r6, 0, 0 - add_offset r5, r6, lr, r5, r9 - - add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx - add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - - ldr r10, [sp, #120] // type - adr r11, L(fgy_loop_tbl) - - tst r10, #1 - ldr r10, [r11, r10, lsl #2] - - add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx - - add r11, r11, r10 - - beq 1f - // y overlap - vdup.8 d14, d24[0] - vdup.8 d15, d24[1] - mov r10, r7 // backup actual h - mov r7, #2 -1: - bx r11 -endfunc 
- -function fgy_loop_neon -L(fgy_loop_tbl): - .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB - .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB - .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB - .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB - -.macro fgy ox, oy -L(loop_\ox\oy): -1: -.if \ox - vld1.8 {d8}, [r4], r9 // grain_lut old -.endif -.if \oy - vld1.8 {q2, q3}, [r6], r9 // grain_lut top -.endif -.if \ox && \oy - vld1.8 {d10}, [r8], r9 // grain_lut top old -.endif - vld1.8 {q0, q1}, [r1, :128], r2 // src - vld1.8 {q10, q11}, [r5], r9 // grain_lut - -.if \ox - vmull.s8 q4, d8, d24 - vmlal.s8 q4, d20, d25 -.endif - -.if \oy -.if \ox - vmull.s8 q5, d10, d24 - vmlal.s8 q5, d4, d25 - vqrshrn.s16 d20, q4, #5 - vqrshrn.s16 d4, q5, #5 -.endif - - vmull.s8 q4, d20, d15 - vmull.s8 q5, d21, d15 - vmull.s8 q8, d22, d15 - vmull.s8 q9, d23, d15 - vmlal.s8 q4, d4, d14 - vmlal.s8 q5, d5, d14 - vmlal.s8 q8, d6, d14 - vmlal.s8 q9, d7, d14 - vqrshrn.s16 d20, q4, #5 - vqrshrn.s16 d21, q5, #5 - vqrshrn.s16 d22, q8, #5 - vqrshrn.s16 d23, q9, #5 -.elseif \ox - vqrshrn.s16 d20, q4, #5 -.endif - - bl gather32_neon - - vmovl.s8 q8, d20 // grain - vmovl.s8 q9, d21 - vmovl.s8 q10, d22 - vmovl.s8 q11, d23 - - vmovl.u8 q2, d8 // scaling - vmovl.u8 q3, d9 - vmovl.u8 q4, d10 - vmovl.u8 q5, d11 - - vmul.i16 q8, q8, q2 // scaling * grain - vmul.i16 q9, q9, q3 - vmul.i16 q10, q10, q4 - vmul.i16 q11, q11, q5 - - vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) - vrshl.s16 q9, q9, q13 - vrshl.s16 q10, q10, q13 - vrshl.s16 q11, q11, q13 - - vaddw.u8 q8, q8, d0 // *src + noise - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - - vmax.u8 q0, q0, q14 - vmax.u8 q1, q1, q14 - vmin.u8 q0, q0, q15 - vmin.u8 q1, q1, q15 - - subs r7, r7, #1 -.if \oy - vdup.8 d14, d25[0] - vdup.8 d15, d25[1] -.endif - vst1.8 {q0, q1}, [r0, :128], r2 // dst - bgt 1b - -.if \oy - cmp r10, #2 - sub r7, r10, #2 // restore actual remaining h - bgt L(loop_\ox\()0) -.endif - vpop {q4-q7} - pop {r4-r11,pc} -.endm - - fgy 0, 0 - fgy 0, 1 - fgy 1, 0 - fgy 1, 1 -endfunc - -// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, -// const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const Dav1dFilmGrainData *const data, -// const entry grain_lut[][GRAIN_WIDTH], -// const pixel *const luma_row, -// const ptrdiff_t luma_stride, -// const int offsets[][2], -// const ptrdiff_t h, const ptrdiff_t uv, -// const ptrdiff_t is_id, -// const ptrdiff_t type); -.macro fguv layout, sx, sy -function fguv_32x32_\layout\()_8bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] // data, grain_lut - ldrd r6, r7, [sp, #108] // luma_row, luma_stride - ldrd r8, r9, [sp, #116] // offsets, h - ldrd r10, r11, [sp, #124] // uv, is_id - - // !csfl - add r10, r4, r10, lsl #2 // + 4*uv - add r12, r10, #FGD_UV_LUMA_MULT - add lr, r10, #FGD_UV_MULT - add r10, r10, #FGD_UV_OFFSET - vld1.16 {d4[]}, [r12] // uv_luma_mult - vld1.16 {d4[2]}, [r10] // uv_offset - vld1.16 {d4[1]}, [lr] // uv_mult - - ldr lr, [r4, #FGD_SCALING_SHIFT] - ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] - neg lr, lr // -scaling_shift - - cmp r12, #0 - vdup.16 q13, lr // -scaling_shift - - beq 1f - // clip - cmp r11, #0 - vmov.i8 q14, #16 - vmov.i8 q15, #240 - beq 2f - // is_id - vmov.i8 q15, #235 - b 2f -1: - // no clip - vmov.i8 q14, #0 - vmov.i8 q15, #255 -2: - - mov r10, #GRAIN_WIDTH // grain_lut stride - - add r5, 
r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 -.if \sy - add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride - add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride -.else - add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride - add r5, r5, r10 // grain_lut += grain_stride -.endif - - ldr r12, [r8, #8] // offsets[1][0] - calc_offset r12, r4, r12, \sx, \sy - add_offset r4, r12, r4, r5, r10 - - ldr r12, [r8, #4] // offsets[0][1] - calc_offset r12, lr, r12, \sx, \sy - add_offset lr, r12, lr, r5, r10 - - ldr r12, [r8, #12] // offsets[1][1] - calc_offset r12, r11, r12, \sx, \sy - add_offset r11, r12, r11, r5, r10 - - ldr r8, [r8] // offsets[0][0] - calc_offset r8, r12, r8, \sx, \sy - add_offset r5, r8, r12, r5, r10 - - add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - - movrel_local r12, overlap_coeffs_\sx - ldr lr, [sp, #132] // type - - vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs - - movrel_local r12, L(fguv_loop_sx\sx\()_tbl) -#if CONFIG_THUMB - // This uses movrel_local instead of adr above, because the target - // can be out of range for adr. But movrel_local leaves the thumb bit - // set on COFF (but probably wouldn't if building for thumb on ELF), - // thus try to clear the bit for robustness. - bic r12, r12, #1 -#endif - - tst lr, #1 - ldr lr, [r12, lr, lsl #2] - - add r12, r12, lr - - beq 1f - // y overlap - sub lr, r9, #(2 >> \sy) // backup remaining h - mov r9, #(2 >> \sy) - -1: - -.if \sy - vmov.i8 d6, #23 - vmov.i8 d7, #22 -.else - vmov.i8 d6, #27 - vmov.i8 d7, #17 -.endif - -.if \sy - add r7, r7, r7 // luma_stride *= 2 -.endif - - bx r12 -endfunc -.endm - -fguv 420, 1, 1 -fguv 422, 1, 0 -fguv 444, 0, 0 - -function fguv_loop_sx0_neon -L(fguv_loop_sx0_tbl): - .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB - -.macro fguv_loop_sx0 csfl, ox, oy -L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): -.if \oy - mov r12, lr -.endif -1: -.if \ox - vld1.8 {d8}, [r4], r10 // grain_lut old -.endif -.if \oy - vld1.8 {q8, q9}, [r8], r10 // grain_lut top -.endif -.if \ox && \oy - vld1.8 {d10}, [r11], r10 // grain_lut top old -.endif - vld1.8 {q0, q1}, [r6, :128], r7 // luma - vld1.8 {q10, q11}, [r5], r10 // grain_lut - -.if \ox - vmull.s8 q4, d8, d24 - vmlal.s8 q4, d20, d25 -.endif - -.if \oy -.if \ox - vmull.s8 q5, d10, d24 - vmlal.s8 q5, d16, d25 - vqrshrn.s16 d20, q4, #5 - vqrshrn.s16 d16, q5, #5 -.endif - - vmull.s8 q4, d20, d7 - vmull.s8 q5, d21, d7 - vmull.s8 q6, d22, d7 - vmull.s8 q7, d23, d7 - vmlal.s8 q4, d16, d6 - vmlal.s8 q5, d17, d6 - vmlal.s8 q6, d18, d6 - vmlal.s8 q7, d19, d6 - vqrshrn.s16 d20, q4, #5 - vqrshrn.s16 d21, q5, #5 - vqrshrn.s16 d22, q6, #5 - vqrshrn.s16 d23, q7, #5 -.elseif \ox - vqrshrn.s16 d20, q4, #5 -.endif -.if !\csfl - vld1.8 {q8, q9}, [r1, :128] // src - vmovl.u8 q4, d0 - vmovl.u8 q5, d1 - vmovl.u8 q6, d2 - 
vmovl.u8 q7, d3 - vmovl.u8 q0, d16 - vmovl.u8 q1, d17 - vmovl.u8 q8, d18 - vmovl.u8 q9, d19 - vmul.i16 q4, q4, d4[0] - vmul.i16 q5, q5, d4[0] - vmul.i16 q6, q6, d4[0] - vmul.i16 q7, q7, d4[0] - vmul.i16 q0, q0, d4[1] - vmul.i16 q1, q1, d4[1] - vmul.i16 q8, q8, d4[1] - vmul.i16 q9, q9, d4[1] - vqadd.s16 q4, q4, q0 - vqadd.s16 q5, q5, q1 - vqadd.s16 q6, q6, q8 - vqadd.s16 q7, q7, q9 - vdup.16 q0, d4[2] - vshr.s16 q4, q4, #6 - vshr.s16 q5, q5, #6 - vshr.s16 q6, q6, #6 - vshr.s16 q7, q7, #6 - vadd.i16 q4, q4, q0 - vadd.i16 q5, q5, q0 - vadd.i16 q6, q6, q0 - vadd.i16 q7, q7, q0 - vqmovun.s16 d0, q4 - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 - vqmovun.s16 d3, q7 -.endif - - bl gather32_neon - - vld1.8 {q0, q1}, [r1, :128], r2 // src - - vmovl.s8 q8, d20 // grain - vmovl.s8 q9, d21 - vmovl.s8 q10, d22 - vmovl.s8 q11, d23 - - vmovl.u8 q6, d8 // scaling - vmovl.u8 q7, d9 - vmovl.u8 q4, d10 - vmovl.u8 q5, d11 - - vmul.i16 q8, q8, q6 // scaling * grain - vmul.i16 q9, q9, q7 - vmul.i16 q10, q10, q4 - vmul.i16 q11, q11, q5 - - vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) - vrshl.s16 q9, q9, q13 - vrshl.s16 q10, q10, q13 - vrshl.s16 q11, q11, q13 - - vaddw.u8 q8, q8, d0 // *src + noise - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - - vmax.u8 q0, q0, q14 - vmax.u8 q1, q1, q14 - vmin.u8 q0, q0, q15 - vmin.u8 q1, q1, q15 - - subs r9, r9, #1 -.if \oy - vdup.8 d6, d25[0] - vdup.8 d7, d25[1] -.endif - - vst1.8 {q0, q1}, [r0, :128], r2 // dst - bgt 1b - -.if \oy - cmp r12, #0 - mov r9, r12 // restore actual remaining h - bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) -.endif - b 9f -.endm - fguv_loop_sx0 0, 0, 0 - fguv_loop_sx0 0, 0, 1 - fguv_loop_sx0 0, 1, 0 - fguv_loop_sx0 0, 1, 1 - fguv_loop_sx0 1, 0, 0 - fguv_loop_sx0 1, 0, 1 - fguv_loop_sx0 1, 1, 0 - fguv_loop_sx0 1, 1, 1 - -9: - vpop {q4-q7} - pop {r4-r11,pc} -endfunc - -function fguv_loop_sx1_neon -L(fguv_loop_sx1_tbl): - .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB - -.macro fguv_loop_sx1 csfl, ox, oy -L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): -.if \oy - mov r12, lr -.endif -1: -.if \ox - vld1.8 {d8}, [r4], r10 // grain_lut old -.endif -.if \oy - vld1.8 {q8}, [r8], r10 // grain_lut top -.endif -.if \ox && \oy - vld1.8 {d10}, [r11], r10 // grain_lut top old -.endif - vld1.8 {q0, q1}, [r6, :128], r7 // luma - vld1.8 {q10}, [r5], r10 // grain_lut - vld1.8 {q11}, [r1, :128], r2 // src - -.if \ox - vmull.s8 q4, d8, d24 - vmlal.s8 q4, d20, d25 -.endif - - vpaddl.u8 q0, q0 - vpaddl.u8 q1, q1 -.if \oy -.if \ox - vmull.s8 q5, d10, d24 - vmlal.s8 q5, d16, d25 - vqrshrn.s16 d20, q4, #5 - vqrshrn.s16 d16, q5, #5 -.endif - - vmull.s8 q4, d20, d7 - vmull.s8 q5, d21, d7 - vmlal.s8 q4, d16, d6 - vmlal.s8 q5, d17, d6 - vqrshrn.s16 d20, q4, #5 - vqrshrn.s16 d21, q5, #5 -.elseif \ox - vqrshrn.s16 d20, q4, #5 -.endif -.if \csfl - vrshrn.u16 d0, q0, #1 - vrshrn.u16 d1, q1, #1 -.else - vrshr.u16 q4, q0, #1 - vrshr.u16 q5, q1, #1 - 
vmovl.u8 q0, d22 - vmovl.u8 q1, d23 - vmul.i16 q4, q4, d4[0] - vmul.i16 q5, q5, d4[0] - vmul.i16 q0, q0, d4[1] - vmul.i16 q1, q1, d4[1] - vqadd.s16 q4, q4, q0 - vqadd.s16 q5, q5, q1 - vdup.16 q0, d4[2] - vshr.s16 q4, q4, #6 - vshr.s16 q5, q5, #6 - vadd.i16 q4, q4, q0 - vadd.i16 q5, q5, q0 - vqmovun.s16 d0, q4 - vqmovun.s16 d1, q5 -.endif - - bl gather16_neon - - vmovl.s8 q8, d20 // grain - vmovl.s8 q9, d21 - - vmovl.u8 q6, d8 // scaling - vmovl.u8 q7, d9 - - vmul.i16 q8, q8, q6 // scaling * grain - vmul.i16 q9, q9, q7 - - vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) - vrshl.s16 q9, q9, q13 - - vaddw.u8 q8, q8, d22 // *src + noise - vaddw.u8 q9, q9, d23 - - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - - vmax.u8 q0, q0, q14 - vmin.u8 q0, q0, q15 - - subs r9, r9, #1 -.if \oy - vswp d6, d7 -.endif - vst1.8 {q0}, [r0, :128], r2 // dst - bgt 1b - -.if \oy - cmp r12, #0 - mov r9, r12 // restore actual remaining h - bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) -.endif - - b 9f -.endm - fguv_loop_sx1 0, 0, 0 - fguv_loop_sx1 0, 0, 1 - fguv_loop_sx1 0, 1, 0 - fguv_loop_sx1 0, 1, 1 - fguv_loop_sx1 1, 0, 0 - fguv_loop_sx1 1, 0, 1 - fguv_loop_sx1 1, 1, 0 - fguv_loop_sx1 1, 1, 1 - -9: - vpop {q4-q7} - pop {r4-r11,pc} -endfunc diff -Nru dav1d-0.9.2/src/arm/32/filmgrain.S dav1d-1.0.0/src/arm/32/filmgrain.S --- dav1d-0.9.2/src/arm/32/filmgrain.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/arm/32/filmgrain.S 2022-03-18 14:31:55.966356000 +0000 @@ -0,0 +1,2039 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr r11, r2, #3 + lsr r12, r2, #12 + lsr lr, r2, #1 + eor r11, r2, r11 // (r >> 0) ^ (r >> 3) + eor r12, r12, lr // (r >> 12) ^ (r >> 1) + eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr r2, r2, #\steps +.endif + and r11, r11, #((1 << \steps) - 1) // bit +.if \shift + orr r2, r2, r11, lsl #(16 - \steps) // *state +.else + orr r2, r2, r11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, r2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, r2, #17 - \bits, #\bits + lsr r2, r2, #1 +.endm + +// special calling convention: +// r2 holds seed +// r3 holds dav1d_gaussian_sequence +// clobbers r11-r12 +// returns in d0-d1 +function get_gaussian_neon + push {r5-r6,lr} + increment_seed 4 + read_rand r5, 11, 3 + read_rand r6, 11, 2 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d0[0]}, [r5] + read_rand r5, 11, 1 + vld1.16 {d0[1]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 0 + increment_seed 4 + add r6, r3, r6, lsl #1 + vld1.16 {d0[2]}, [r5] + read_rand r5, 11, 3 + vld1.16 {d0[3]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 2 + vld1.16 {d1[0]}, [r5] + add r6, r3, r6, lsl #1 + read_rand r5, 11, 1 + vld1.16 {d1[1]}, [r6] + read_rand r6, 11, 0 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d1[2]}, [r5] + vld1.16 {d1[3]}, [r6] + pop {r5-r6,pc} +endfunc + +.macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r0, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r1, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r2, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r3, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r4, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r5, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r6, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r7, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r8, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r9, q0 + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 \r10, q0 +.endm + +.macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 + vst1.16 {\r0, \r1, \r2, \r3}, [r0]! + vst1.16 {\r4, \r5, \r6, \r7}, [r0]! + vst1.16 {\r8, \r9}, [r0]! + vst1.16 {\r10[0]}, [r0]! 
+.endm + +.macro get_grain_row_44 r0, r1, r2, r3, r4, r5 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r0, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r1, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r2, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r3, q0 + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + vmovn.i16 \r4, q0 + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d0[1]}, [r12] + add r11, r3, r11, lsl #1 + read_rand r12, 11, 0 + vld1.16 {d0[2]}, [r11] + add r12, r3, r12, lsl #1 + vld1.16 {d0[3]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 \r5, q0 +.endm + +.macro store_grain_row_44 r0, r1, r2, r3, r4, r5 + vst1.16 {\r0, \r1, \r2, \r3}, [r0]! + vst1.16 {\r4, \r5}, [r0] + add r0, r0, #GRAIN_WIDTH-32 +.endm + +function get_grain_2_neon + push {r11,lr} + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, d0, d30 + vmovn.i16 d0, q0 + pop {r11,pc} +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +// r1 holds the number of entries to produce +// r6, r8 and r10 hold the previous output entries +// q0 holds the vector of produced entries +// q1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon + push {r0, lr} +.if \n == 1 + mov lr, #-128 +.else + mov r0, #1 + mov lr, #1 + sub r7, r7, #1 + sub r9, r9, #1 + lsl r0, r0, r7 + lsl lr, lr, r9 + add r7, r7, #1 + add r9, r9, #1 +.endif +1: + read_shift_rand r12, 11 + vmov.32 r11, d2[0] + lsl r12, r12, #1 + vext.8 q0, q0, q0, #1 + ldrsh r12, [r3, r12] +.if \n == 1 + mla r11, r6, r4, r11 // sum (above) + *coeff * prev output + add r6, r11, r8 // 1 << (ar_coeff_shift - 1) + add r12, r12, r10 + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + cmp r6, r5 +.elseif \n == 2 + mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 + mla r11, r6, r10, r11 // += *coeff * prev output 2 + mov r8, r6 + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mov lr, #-128 +.else + push {r1-r3} + sbfx r1, r4, #0, #8 + sbfx r2, r4, #8, #8 + sbfx r3, r4, #16, #8 + mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 + mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 + mla r11, r6, r3, r11 // += *coeff * prev output 3 + pop {r1-r3} + mov r10, r8 + mov r8, r6 + + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mov lr, #-128 +.endif + it gt + movgt r6, r5 + cmp r6, lr + it lt + movlt r6, lr +.if \n >= 2 + pop {lr} +.endif + subs r1, r1, #1 + vext.8 q1, q1, q1, #4 + vmov.8 d1[7], r6 + bgt 1b + pop {r0, pc} +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + vmull.s8 q2, d6, d28 + vmull.s8 q3, d7, d28 + vmull.s8 q4, d0, d27 + vmull.s8 q5, d1, d27 + + vaddl.s16 q0, d4, d8 + vaddl.s16 q2, d5, d9 + vaddl.s16 q4, d6, d10 + vaddl.s16 q5, d7, d11 + + vmull.s8 q3, d3, d29 + vmull.s8 
q1, d2, d29 + + vaddw.s16 q4, q4, d6 + vaddw.s16 q5, q5, d7 + vaddw.s16 q3, q2, d3 + vaddw.s16 q2, q0, d2 + bx lr +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff +.ifc \lag\()_\edge, lag3_left + bl sum_lag3_left_above_neon +.else + bl sum_\lag\()_above_neon +.endif +.ifc \type, uv_420 + vpush {q6-q7} + add r12, r11, #GRAIN_WIDTH + vld1.16 {q0, q1}, [r11]! + vld1.16 {q6, q7}, [r12]! + vpaddl.s8 q0, q0 + vpaddl.s8 q1, q1 + vpaddl.s8 q6, q6 + vpaddl.s8 q7, q7 + vadd.i16 q0, q0, q6 + vadd.i16 q1, q1, q7 + vpop {q6-q7} + vrshrn.s16 d0, q0, #2 + vrshrn.s16 d1, q1, #2 +.endif +.ifc \type, uv_422 + vld1.8 {q0, q1}, [r11]! + vpaddl.s8 q0, q0 + vpaddl.s8 q1, q1 + vrshrn.s16 d0, q0, #1 + vrshrn.s16 d1, q1, #1 +.endif +.ifc \type, uv_444 + vld1.8 {q0}, [r11]! +.endif +.if \uv_layout +.ifnb \uv_coeff + vdup.8 d13, \uv_coeff +.endif + vmull.s8 q1, d0, d13 + vmull.s8 q0, d1, d13 + vaddw.s16 q2, q2, d2 + vaddw.s16 q3, q3, d3 + vaddw.s16 q4, q4, d0 + vaddw.s16 q5, q5, d1 +.endif +.if \uv_layout && \elems == 16 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: + push {r11} +.ifc \edge, left + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d1[1]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d1[2]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d1[3]}, [r11] + lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 + vrshl.s16 d1, d1, d30 + vmovn.i16 d1, q0 + vext.8 q2, q2, q2, #12 +.ifc \lag, lag3 + vmov.s8 r10, d1[5] +.endif +.ifnc \lag, lag1 + vmov.s8 r8, d1[6] +.endif + vmov.s8 r6, d1[7] + + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + vmov q1, q3 + mov r1, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + vmov q1, q4 +.if \elems == 9 + mov r1, #1 + bl output_\lag\()_neon + lsr r2, r2, #3 + + read_rand r11, 11, 2 + read_rand r12, 11, 1 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r11] + read_rand r11, 11, 0 + vld1.16 {d2[1]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d2[2]}, [r11] + vrshl.s16 d2, d2, d30 + vmovn.i16 d2, q1 + vext.8 q0, q0, q1, #7 +.else + mov r1, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + vmov q1, q5 + +.ifc \edge, right + mov r1, #3 + bl output_\lag\()_neon + read_shift_rand r11, 11 + add r11, r3, r11, lsl #1 + vld1.16 {d2[0]}, [r11] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #1 +.else + mov r1, #4 + bl output_\lag\()_neon +.endif +.endif +.if \store + vst1.8 {q0}, [r0]! 
+.endif + pop {r11} + pop {r1, pc} +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag1_\edge\()_neon + push {r1, lr} + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 15 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 15 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 9 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 9 + +.macro sum_lag1 type, dst, left, mid, right, edge=mid + vmov q3, \mid + vext.8 q0, \left, \mid, #15 + vext.8 q1, \mid, \right, #1 + bl sum_\type\()_lag1_\edge\()_neon + vmov \dst, q0 +.endm + +.macro sum_y_lag1 dst, left, mid, right, edge=mid + sum_lag1 y, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_444, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_422, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_420, \dst, \left, \mid, \right, \edge +.endm + + +function sum_lag2_above_neon + push {lr} + sub r12, r0, #2*GRAIN_WIDTH - 16 + sub lr, r0, #1*GRAIN_WIDTH - 16 + vld1.8 {q10}, [r12] // load top right + vld1.8 {q13}, [lr] + + vext.8 q6, q8, q9, #14 // top left, top mid + vdup.8 d14, d28[0] + vext.8 q8, q8, q9, #15 + vdup.8 d15, d28[1] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q2, d0, d12 + vaddl.s16 q3, d1, d13 + vaddl.s16 q4, d2, d16 + vaddl.s16 q5, d3, d17 + + vext.8 q6, q9, q10, #1 // top mid, top right + vdup.8 d14, d28[3] + vext.8 q8, q9, q10, #2 + vdup.8 d15, d28[4] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vext.8 q6, q11, q12, #14 // top left, top mid + vdup.8 d14, d28[5] + vext.8 q8, q11, q12, #15 + vdup.8 d15, d28[6] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vext.8 q6, q12, q13, #1 // top mid, top right + vdup.8 d14, d29[0] + vext.8 q8, q12, q13, #2 + vdup.8 d15, d29[1] + + vmull.s8 q0, d12, d14 + vmull.s8 q1, d13, d14 + vmull.s8 q6, d16, d15 + vmull.s8 q8, d17, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vdup.8 d14, d28[2] + vdup.8 d15, d28[7] + + vmull.s8 q0, d18, d14 + vmull.s8 q1, d19, d14 + vmull.s8 q6, d24, d15 + vmull.s8 q8, d25, d15 + + vaddl.s16 q7, d0, d12 + vaddl.s16 q0, d1, d13 + vaddl.s16 q6, d2, d16 + vaddl.s16 q1, d3, d17 + + vmov q8, q9 + vmov q9, q10 + + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q0 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q1 + + vmov q11, q12 + vmov q12, q13 + + pop {pc} +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag2_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, 
#2*GRAIN_WIDTH + sub lr, r0, #1*GRAIN_WIDTH + vld1.8 {q9}, [r12] // load the previous block right above + vld1.8 {q12}, [lr] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 15 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 15 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 9 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 9 + + +function sum_lag3_left_above_neon + // A separate codepath for the left edge, to avoid reading outside + // of the edge of the buffer. + sub r12, r0, #3*GRAIN_WIDTH + vld1.8 {q11, q12}, [r12] + vext.8 q12, q11, q12, #13 + vext.8 q11, q11, q11, #13 + b sum_lag3_above_start +endfunc + +function sum_lag3_above_neon + sub r12, r0, #3*GRAIN_WIDTH + 3 + vld1.8 {q11, q12}, [r12] + +sum_lag3_above_start: + vdup.8 d20, d26[0] + vext.8 q9, q11, q12, #1 + vdup.8 d21, d26[1] + + vmull.s8 q0, d22, d20 + vmull.s8 q1, d23, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vext.8 q8, q11, q12, #2 + vdup.8 d20, d26[2] + vext.8 q9, q11, q12, #3 + vdup.8 d21, d26[3] + + vaddl.s16 q2, d0, d12 + vaddl.s16 q3, d1, d13 + vaddl.s16 q4, d2, d14 + vaddl.s16 q5, d3, d15 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #4 + vdup.8 d20, d26[4] + vext.8 q7, q11, q12, #5 + vdup.8 d21, d26[5] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + sub r12, r0, #2*GRAIN_WIDTH + 3 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #6 + vld1.8 {q11, q12}, [r12] + vdup.8 d20, d26[6] + vdup.8 d21, d26[7] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d22, d21 + vmull.s8 q7, d23, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #1 + vdup.8 d20, d27[0] + vext.8 q7, q11, q12, #2 + vdup.8 d21, d27[1] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #3 + vdup.8 d20, d27[2] + vext.8 q9, q11, q12, #4 + vdup.8 d21, d27[3] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + sub r12, r0, #1*GRAIN_WIDTH + 3 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #5 + vdup.8 d20, d27[4] + vext.8 q7, q11, q12, #6 + vdup.8 d21, d27[5] + + vld1.8 {q11, q12}, [r12] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + 
vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vdup.8 d20, d27[6] + vext.8 q9, q11, q12, #1 + vdup.8 d21, d27[7] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d22, d20 + vmull.s8 q1, d23, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #2 + vdup.8 d20, d28[0] + vext.8 q7, q11, q12, #3 + vdup.8 d21, d28[1] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + vmull.s8 q8, d14, d21 + vmull.s8 q9, d15, d21 + + vaddl.s16 q6, d0, d16 + vaddl.s16 q7, d1, d17 + vaddl.s16 q0, d2, d18 + vaddl.s16 q1, d3, d19 + + vext.8 q8, q11, q12, #4 + vdup.8 d20, d28[2] + vext.8 q9, q11, q12, #5 + vdup.8 d21, d28[3] + + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d16, d20 + vmull.s8 q1, d17, d20 + vmull.s8 q6, d18, d21 + vmull.s8 q7, d19, d21 + + vaddl.s16 q8, d0, d12 + vaddl.s16 q9, d1, d13 + vaddl.s16 q0, d2, d14 + vaddl.s16 q1, d3, d15 + + vext.8 q6, q11, q12, #6 + vdup.8 d20, d28[4] + + vadd.i32 q2, q2, q8 + vadd.i32 q3, q3, q9 + vadd.i32 q4, q4, q0 + vadd.i32 q5, q5, q1 + + vmull.s8 q0, d12, d20 + vmull.s8 q1, d13, d20 + + vaddw.s16 q2, q2, d0 + vaddw.s16 q3, q3, d1 + vaddw.s16 q4, q4, d2 + vaddw.s16 q5, q5, d3 + + bx lr +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag3_\edge\()_neon + push {r1, lr} + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 15 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 15 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 9 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 9 + +function generate_grain_rows_neon + push {r11,lr} +1: + get_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 + subs r1, r1, #1 + store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26 + bgt 1b + pop {r11,pc} +endfunc + +function generate_grain_rows_44_neon + push {r11,lr} +1: + get_grain_row_44 d16, d17, d18, d19, d20, d21 + subs r1, r1, #1 + store_grain_row_44 d16, d17, d18, d19, d20, d21 + bgt 1b + pop {r11,pc} +endfunc + +function gen_grain_uv_444_lag0_neon + vld1.8 {q3}, [r11]! + push {r11,lr} + bl get_gaussian_neon + vrshl.s16 q8, q0, q15 + bl get_gaussian_neon + vrshl.s16 q9, q0, q15 + vqmovn.s16 d0, q8 + vqmovn.s16 d1, q9 + + vand q3, q3, q1 + vmull.s8 q2, d6, d22 + vmull.s8 q3, d7, d22 + vrshl.s16 q2, q2, q12 + vrshl.s16 q3, q3, q12 + vaddw.s8 q2, q2, d0 + vaddw.s8 q3, q3, d1 + vqmovn.s16 d4, q2 + vqmovn.s16 d5, q3 + vst1.8 {q2}, [r0]! + pop {r11,pc} +endfunc + +function get_grain_row_44_neon + push {r11,lr} + get_grain_row_44 d16, d17, d18, d19, d20, d21 + pop {r11,pc} +endfunc + +function add_uv_420_coeff_lag0_neon + vld1.16 {q2, q3}, [r11]! + vld1.16 {q4, q5}, [r12]! + vpaddl.s8 q2, q2 + vpaddl.s8 q3, q3 + vpaddl.s8 q4, q4 + vpaddl.s8 q5, q5 + vadd.i16 q2, q2, q4 + vadd.i16 q3, q3, q5 + vrshrn.s16 d4, q2, #2 + vrshrn.s16 d5, q3, #2 + b add_coeff_lag0_start +endfunc + +function add_uv_422_coeff_lag0_neon + vld1.16 {q2, q3}, [r11]! 
+ vpaddl.s8 q2, q2 + vpaddl.s8 q3, q3 + vrshrn.s16 d4, q2, #1 + vrshrn.s16 d5, q3, #1 + +add_coeff_lag0_start: + vand q3, q2, q1 + vmull.s8 q2, d6, d22 + vmull.s8 q3, d7, d22 + vrshl.s16 q2, q2, q12 + vrshl.s16 q3, q3, q12 + vaddw.s8 q2, q2, d0 + vaddw.s8 q3, q3, d1 + vqmovn.s16 d4, q2 + vqmovn.s16 d5, q3 + bx lr +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + +.ifc \type, uv_444 + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH + mov r1, r2 + mul r12, r12, lr +.endif + movrel r3, X(gaussian_sequence) + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add r4, r1, #FGD_AR_COEFFS_Y +.else + add r4, r1, #FGD_AR_COEFFS_UV +.endif + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + +.ifc \type, uv_444 + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. + it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 +.endif + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, y + mov r1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + + mov r1, #3 + bl generate_grain_rows_neon + mov r1, #GRAIN_HEIGHT-3 + + vdup.16 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vext.8 q13, q0, q1, #13 + vext.8 q14, q1, q0, #1 + vneg.s16 q12, q12 + +1: + vmov q1, q13 + bl gen_grain_uv_444_lag0_neon // 16 + vmov.i8 q1, #255 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 64 + vmov q1, q14 + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 + add r11, r11, #2 + vst1.16 {d16[0]}, [r0]! + bgt 1b +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] + vld1.8 {d28[]}, [r4]! 
// ar_coeffs_y[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb r4, [r4, #1] // ar_coeffs_y[3] +.else + add r4, r4, #2 +.endif + + mov r1, #3 +.ifc \type, uv_444 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + sum_\type\()_lag1 q7, q8, q8, q9, left + sum_\type\()_lag1 q8, q8, q9, q10 + sum_\type\()_lag1 q9, q9, q10, q11 + sum_\type\()_lag1 q10, q10, q11, q12 + sum_\type\()_lag1 q12, q11, q12, q13, right + get_grain_2 d26 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26 + vmov q11, q10 + vmov q10, q9 + vmov q9, q8 + vmov q8, q7 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + vst1.16 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #127 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + vpush {d26} + bl generate_grain_rows_neon + vpop {d26} + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #2 +.endif + vst1.16 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH-(3*32) +.else + sub \reg, \reg, #3*32-GRAIN_WIDTH +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_8bpc_neon, export=1 + push {r4-r11,lr} + + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH-3 + mov r1, r2 + mul r12, r12, lr + + movrel r3, X(gaussian_sequence) + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] + add r4, r1, #FGD_AR_COEFFS_UV + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, uv_420 + vpush {q4-q5} +.endif + mov r1, #3 + bl generate_grain_rows_44_neon + set_height r1, \type + + vdup.16 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vext.8 q13, q0, q1, #13 + vext.8 q14, q1, q0, #7 + vneg.s16 q12, q12 + +1: + bl get_grain_row_44_neon +.ifc \type, uv_420 + add r12, r11, #GRAIN_WIDTH +.endif + vmov q1, q13 + vmov q0, q8 + bl add_\type\()_coeff_lag0_neon + vmov.i8 q1, #255 + vmov q0, q9 + vmov q8, q2 + bl add_\type\()_coeff_lag0_neon + vmov.i8 q1, q14 + vmov q0, q10 + vmov q9, q2 + bl add_\type\()_coeff_lag0_neon + vmov q10, q2 + subs r1, r1, #1 + increment_y_ptr r11, \type + store_grain_row_44 d16, d17, d18, d19, d20, d21 + bgt 1b + +.ifc \type, uv_420 + vpop {q4-q5} +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] + vld1.8 {d28[]}, [r4]! // ar_coeffs_uv[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] + add r4, r4, #2 + + mov r1, #3 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + sum_\type\()_lag1 q7, q8, q8, q9, left + sum_\type\()_lag1 q8, q8, q9, q10 + sum_\type\()_lag1 q10, q9, q10, q11, right + subs r1, r1, #1 + increment_y_ptr r11, \type + store_grain_row_44 d14, d15, d16, d17, d20, d21 + vmov q9, q8 + vmov q8, q7 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #127 + vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH-48 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #127 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH-48 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + vmov.u8 r11, \src1[0+\off] + vmov.u8 r12, \src2[0+\off] + add r11, r11, r3 + vmov.u8 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u8 r11, \src2[2+\off] + add lr, lr, r3 + 
vld1.8 {\dst2[0+\off]}, [r12] + vmov.u8 r12, \src1[4+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u8 lr, \src2[4+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u8 r11, \src1[6+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u8 r12, \src2[6+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 + gather_interleaved \dst1, \dst3, \src1, \src3, 0 + gather_interleaved \dst1, \dst3, \src1, \src3, 1 + gather_interleaved \dst2, \dst4, \src2, \src4, 0 + gather_interleaved \dst2, \dst4, \src2, \src4, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, 0 + gather_interleaved d8, d9, d0, d1, 1 + pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH // grain_lut stride + + neg r4, r4 + vdup.16 q13, r4 // -scaling_shift + cmp r8, #0 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i8 q14, #16 + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #9 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.8 d14, d24[0] + vdup.8 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + bx r11 +endfunc 
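
For reference, the increment_seed/read_rand macros earlier in the new filmgrain.S vectorize a 16-bit shift-register PRNG; the feedback taps (bits 0, 1, 3 and 12) and the 11-bit indices into dav1d_gaussian_sequence are spelled out in their inline comments. A scalar sketch of that update, with an illustrative function name, could look like this:

/* One step of the shift register described by the increment_seed and
 * read_rand comments above; *state is assumed to stay within 16 bits. */
static inline unsigned get_random_number(const int bits, unsigned *const state) {
    const unsigned r = *state;
    const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
    *state = (r >> 1) | (bit << 15);                     /* shift in the feedback bit */
    return (*state >> (16 - bits)) & ((1 << bits) - 1);  /* take the top 'bits' bits */
}

Per the rounding shift by -(4 + data->grain_scale_shift) applied after each get_gaussian_neon call, a grain sample is then roughly round2(dav1d_gaussian_sequence[get_random_number(11, &seed)], 4 + grain_scale_shift).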
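
The calc_offset/add_offset macros used by fgy_32x32_8bpc_neon above turn one random byte into a position inside the grain LUT, doubling the offset along any axis that is not subsampled. Following the register comments (names here are illustrative):

#include <stddef.h>
#include <stdint.h>

/* Sketch of calc_offset + add_offset: randval >> 4 selects the column,
 * randval & 0xF the row, each doubled when sx/sy is 0. */
static int8_t *grain_lut_entry(int8_t *const grain_lut, const ptrdiff_t stride,
                               const unsigned randval, const int sx, const int sy) {
    int offx = randval >> 4;           /* randval >> 4 */
    int offy = randval & 0xF;          /* randval & 0xF */
    if (!sx) offx *= 2;                /* 2 * (randval >> 4) */
    if (!sy) offy *= 2;                /* 2 * (randval & 0xF) */
    return grain_lut + offy * stride + offx;  /* grain_lut += stride * offy + offx */
}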
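
The fgy loop that follows then applies the grain per pixel: a scaling-table lookup on the source value (done byte by byte via gather32_neon), a rounding shift by scaling_shift, adding the noise to the source, and clamping to the range set up in q14/q15. A scalar sketch of one 32-pixel row, omitting the ox/oy edge-overlap blending:

#include <stdint.h>

static inline int round2(const int x, const int shift) {
    return (x + (1 << (shift - 1))) >> shift;
}
static inline int iclip(const int v, const int lo, const int hi) {
    return v < lo ? lo : v > hi ? hi : v;
}

/* One row of the 32x32 luma block: noise = round2(scaling[src] * grain,
 * scaling_shift); dst = clip(src + noise).  The bounds mirror the q14/q15
 * setup above: [16,235] with clip, [0,255] without. */
static void fgy_row_sketch(uint8_t *const dst, const uint8_t *const src,
                           const uint8_t scaling[256], const int8_t *const grain_lut,
                           const int scaling_shift, const int clip) {
    const int mn = clip ? 16 : 0, mx = clip ? 235 : 255;
    for (int x = 0; x < 32; x++) {
        const int noise = round2(scaling[src[x]] * grain_lut[x], scaling_shift);
        dst[x] = (uint8_t)iclip(src[x] + noise, mn, mx);
    }
}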
+ +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.8 {d8}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.8 {q2, q3}, [r6], r9 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r8], r9 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r1, :128], r2 // src + vld1.8 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d4, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d4, q5, #5 +.endif + + vmull.s8 q4, d20, d15 + vmull.s8 q5, d21, d15 + vmull.s8 q8, d22, d15 + vmull.s8 q9, d23, d15 + vmlal.s8 q4, d4, d14 + vmlal.s8 q5, d5, d14 + vmlal.s8 q8, d6, d14 + vmlal.s8 q9, d7, d14 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q8, #5 + vqrshrn.s16 d23, q9, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif + + bl gather32_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q2, d8 // scaling + vmovl.u8 q3, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q2 // scaling * grain + vmul.i16 q9, q9, q3 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r7, r7, #1 +.if \oy + vdup.8 d14, d25[0] + vdup.8 d15, d25[1] +.endif + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + ldrd r8, r9, [sp, #116] // offsets, h + ldrd r10, r11, [sp, #124] // uv, is_id + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + add r10, r10, #FGD_UV_OFFSET + vld1.16 {d4[]}, [r12] // uv_luma_mult + vld1.16 {d4[2]}, [r10] // uv_offset + vld1.16 {d4[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg lr, lr // -scaling_shift + + cmp r12, #0 + vdup.16 q13, lr // -scaling_shift + + beq 1f + // clip + cmp r11, #0 + vmov.i8 q14, #16 + vmov.i8 q15, #240 + beq 2f + // is_id + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + mov r10, #GRAIN_WIDTH // grain_lut stride + + add r5, 
r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: + +.if \sy + vmov.i8 d6, #23 + vmov.i8 d7, #22 +.else + vmov.i8 d6, #27 + vmov.i8 d7, #17 +.endif + +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8, q9}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmull.s8 q6, d22, d7 + vmull.s8 q7, d23, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vmlal.s8 q6, d18, d6 + vmlal.s8 q7, d19, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q6, #5 + vqrshrn.s16 d23, q7, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if !\csfl + vld1.8 {q8, q9}, [r1, :128] // src + vmovl.u8 q4, d0 + vmovl.u8 q5, d1 + vmovl.u8 q6, d2 + 
vmovl.u8 q7, d3 + vmovl.u8 q0, d16 + vmovl.u8 q1, d17 + vmovl.u8 q8, d18 + vmovl.u8 q9, d19 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q6, q6, d4[0] + vmul.i16 q7, q7, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vmul.i16 q8, q8, d4[1] + vmul.i16 q9, q9, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vqadd.s16 q6, q6, q8 + vqadd.s16 q7, q7, q9 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vshr.s16 q6, q6, #6 + vshr.s16 q7, q7, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vadd.i16 q6, q6, q0 + vadd.i16 q7, q7, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 + vqmovun.s16 d2, q6 + vqmovun.s16 d3, q7 +.endif + + bl gather32_neon + + vld1.8 {q0, q1}, [r1, :128], r2 // src + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r9, r9, #1 +.if \oy + vdup.8 d6, d25[0] + vdup.8 d7, d25[1] +.endif + + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10}, [r5], r10 // grain_lut + vld1.8 {q11}, [r1, :128], r2 // src + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if \csfl + vrshrn.u16 d0, q0, #1 + vrshrn.u16 d1, q1, #1 +.else + vrshr.u16 q4, q0, #1 + vrshr.u16 q5, q1, #1 + 
vmovl.u8 q0, d22 + vmovl.u8 q1, d23 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 +.endif + + bl gather16_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + + vaddw.u8 q8, q8, d22 // *src + noise + vaddw.u8 q9, q9, d23 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + + vmax.u8 q0, q0, q14 + vmin.u8 q0, q0, q15 + + subs r9, r9, #1 +.if \oy + vswp d6, d7 +.endif + vst1.8 {q0}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.9.2/src/arm/32/itx.S dav1d-1.0.0/src/arm/32/itx.S --- dav1d-0.9.2/src/arm/32/itx.S 2021-09-03 15:51:24.393037000 +0000 +++ dav1d-1.0.0/src/arm/32/itx.S 2022-03-18 14:31:55.970356000 +0000 @@ -134,9 +134,9 @@ vmlsl.s16 \d1, \s3, \c1 .endm -.macro vrshrn_8h d0, d1, s0, s1, shift - vrshrn.i32 \d0, \s0, \shift - vrshrn.i32 \d1, \s1, \shift +.macro vqrshrn_8h d0, d1, s0, s1, shift + vqrshrn.s32 \d0, \s0, \shift + vqrshrn.s32 \d1, \s1, \shift .endm .macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 @@ -418,11 +418,11 @@ vmull_vmlal q3, \r1, \r3, d0[3], d0[2] vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] vmull_vmlal q1, \r0, \r2, d0[0], d0[0] - vrshrn.i32 d6, q3, #12 - vrshrn.i32 d7, q2, #12 + vqrshrn.s32 d6, q3, #12 + vqrshrn.s32 d7, q2, #12 vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] - vrshrn.i32 d2, q1, #12 - vrshrn.i32 d3, q2, #12 + vqrshrn.s32 d2, q1, #12 + vqrshrn.s32 d3, q2, #12 vqadd.s16 \r0, d2, d6 vqsub.s16 \r3, d2, d6 vqadd.s16 \r1, d3, d7 @@ -433,11 +433,11 @@ vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2] vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3] vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0] - vrshrn_8h d12, d13, q6, q7, #12 - vrshrn_8h d14, d15, q4, q5, #12 + vqrshrn_8h d12, d13, q6, q7, #12 + vqrshrn_8h d14, d15, q4, q5, #12 vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0] - vrshrn_8h d4, d5, q2, q3, #12 - vrshrn_8h d6, d7, q4, q5, #12 + vqrshrn_8h d4, d5, q2, q3, #12 + vqrshrn_8h d6, d7, q4, q5, #12 vqadd.s16 \q0, q2, q6 vqsub.s16 \q3, q2, q6 vqadd.s16 \q1, q3, q7 @@ -478,10 +478,10 @@ vadd.s32 q3, q3, q10 vsub.s32 q11, q11, q10 - vrshrn.i32 \o0, q2, #12 - vrshrn.i32 \o2, q1, #12 - vrshrn.i32 \o1, q3, #12 - vrshrn.i32 \o3, q11, #12 + vqrshrn.s32 \o0, q2, #12 + vqrshrn.s32 \o2, q1, #12 + vqrshrn.s32 \o1, q3, #12 + vqrshrn.s32 \o3, q11, #12 .endm function inv_adst_4h_x4_neon, export=1 @@ -533,21 +533,21 @@ vsub.s32 q4, q4, q2 // out3 vsub.s32 q5, q5, q3 - vrshrn.i32 d20, q10, #12 - vrshrn.i32 d21, q11, #12 + vqrshrn.s32 d20, q10, #12 + vqrshrn.s32 d21, q11, #12 - vrshrn.i32 \o0, q8, #12 - vrshrn.i32 \o1, q9, #12 + vqrshrn.s32 \o0, q8, #12 + vqrshrn.s32 \o1, q9, #12 .ifc \o4, d18 vmov q9, q10 .endif - vrshrn.i32 \o2, q6, #12 - vrshrn.i32 \o3, q7, #12 + vqrshrn.s32 \o2, q6, #12 + vqrshrn.s32 \o3, q7, #12 - vrshrn.i32 
\o6, q4, #12 - vrshrn.i32 \o7, q5, #12 + vqrshrn.s32 \o6, q4, #12 + vqrshrn.s32 \o7, q5, #12 .endm function inv_adst_8h_x4_neon, export=1 @@ -702,11 +702,11 @@ vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a - vrshrn_8h \r2, \r3, q2, q3, #12 // t4a - vrshrn_8h \r14, \r15, q4, q5, #12 // t7a + vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a + vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a - vrshrn_8h \r6, \r7, q6, q7, #12 // t5a - vrshrn_8h \r10, \r11, q2, q3, #12 // t6a + vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a + vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a vqadd.s16 q2, \q1, \q3 // t4 vqsub.s16 \q1, \q1, \q3 // t5a @@ -715,8 +715,8 @@ vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5 vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6 - vrshrn_8h d8, d9, q4, q5, #12 // t5 - vrshrn_8h d10, d11, q6, q7, #12 // t6 + vqrshrn_8h d8, d9, q4, q5, #12 // t5 + vqrshrn_8h d10, d11, q6, q7, #12 // t6 vqsub.s16 \q7, \q0, q3 // out7 vqadd.s16 \q0, \q0, q3 // out0 @@ -735,11 +735,11 @@ vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a - vrshrn.i32 \r1, q1, #12 // t4a + vqrshrn.s32 \r1, q1, #12 // t4a vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a - vrshrn.i32 \r7, q2, #12 // t7a - vrshrn.i32 \r3, q3, #12 // t5a - vrshrn.i32 \r5, q1, #12 // taa + vqrshrn.s32 \r7, q2, #12 // t7a + vqrshrn.s32 \r3, q3, #12 // t5a + vqrshrn.s32 \r5, q1, #12 // taa vqadd.s16 d2, \r1, \r3 // t4 vqsub.s16 \r1, \r1, \r3 // t5a @@ -748,8 +748,8 @@ vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5 vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6 - vrshrn.i32 d4, q2, #12 // t5 - vrshrn.i32 d5, q3, #12 // t6 + vqrshrn.s32 d4, q2, #12 // t5 + vqrshrn.s32 d5, q3, #12 // t6 vqsub.s16 \r7, \r0, d3 // out7 vqadd.s16 \r0, \r0, d3 // out0 @@ -783,19 +783,19 @@ vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] - vrshrn_8h d16, d17, q2, q3, #12 // t0a - vrshrn_8h d30, d31, q4, q5, #12 // t1a + vqrshrn_8h d16, d17, q2, q3, #12 // t0a + vqrshrn_8h d30, d31, q4, q5, #12 // t1a vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] - vrshrn_8h d20, d21, q6, q7, #12 // t2a - vrshrn_8h d26, d27, q2, q3, #12 // t3a + vqrshrn_8h d20, d21, q6, q7, #12 // t2a + vqrshrn_8h d26, d27, q2, q3, #12 // t3a vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] - vrshrn_8h d24, d25, q4, q5, #12 // t4a - vrshrn_8h d22, d23, q6, q7, #12 // t5a + vqrshrn_8h d24, d25, q4, q5, #12 // t4a + vqrshrn_8h d22, d23, q6, q7, #12 // t5a vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] - vrshrn_8h d28, d29, q2, q3, #12 // t6a - vrshrn_8h d18, d19, q4, q5, #12 // t7a + vqrshrn_8h d28, d29, q2, q3, #12 // t6a + vqrshrn_8h d18, d19, q4, q5, #12 // t7a vqadd.s16 q2, q8, q12 // t0 vqsub.s16 q3, q8, q12 // t4 @@ -810,13 +810,13 @@ vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2] - vrshrn_8h d6, d7, q8, q9, #12 // t4a - vrshrn_8h d10, d11, q12, q13, #12 // t5a + vqrshrn_8h d6, d7, q8, q9, #12 // t4a + vqrshrn_8h d10, d11, q12, q13, #12 // t5a 
vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3] - vrshrn_8h d14, d15, q14, q15, #12 // t6a - vrshrn_8h d22, d23, q8, q9, #12 // t7a + vqrshrn_8h d14, d15, q14, q15, #12 // t6a + vqrshrn_8h d22, d23, q8, q9, #12 // t7a vqadd.s16 \q0, q2, q6 // out0 vqsub.s16 q2, q2, q6 // t2 @@ -833,11 +833,11 @@ vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) - vrshrn_8h d4, d5, q10, q11, #12 // out3 + vqrshrn_8h d4, d5, q10, q11, #12 // out3 vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) - vrshrn_8h d6, d7, q12, q13, #12 // out5 - vrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) - vrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) + vqrshrn_8h d6, d7, q12, q13, #12 // out5 + vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) + vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) vqneg.s16 \q3, q2 // out3 vqneg.s16 \q5, q3 // out5 @@ -850,19 +850,19 @@ vmull_vmlal q2, d23, d16, d0[0], d0[1] vmull_vmlsl q3, d23, d16, d0[1], d0[0] vmull_vmlal q4, d21, d18, d0[2], d0[3] - vrshrn.i32 d16, q2, #12 // t0a - vrshrn.i32 d23, q3, #12 // t1a + vqrshrn.s32 d16, q2, #12 // t0a + vqrshrn.s32 d23, q3, #12 // t1a vmull_vmlsl q5, d21, d18, d0[3], d0[2] vmull_vmlal q6, d19, d20, d1[0], d1[1] - vrshrn.i32 d18, q4, #12 // t2a - vrshrn.i32 d21, q5, #12 // t3a + vqrshrn.s32 d18, q4, #12 // t2a + vqrshrn.s32 d21, q5, #12 // t3a vmull_vmlsl q7, d19, d20, d1[1], d1[0] vmull_vmlal q2, d17, d22, d1[2], d1[3] - vrshrn.i32 d20, q6, #12 // t4a - vrshrn.i32 d19, q7, #12 // t5a + vqrshrn.s32 d20, q6, #12 // t4a + vqrshrn.s32 d19, q7, #12 // t5a vmull_vmlsl q3, d17, d22, d1[3], d1[2] - vrshrn.i32 d22, q2, #12 // t6a - vrshrn.i32 d17, q3, #12 // t7a + vqrshrn.s32 d22, q2, #12 // t6a + vqrshrn.s32 d17, q3, #12 // t7a vqadd.s16 d4, d16, d20 // t0 vqsub.s16 d5, d16, d20 // t4 @@ -877,13 +877,13 @@ vmull_vmlsl q10, d5, d7, d2[2], d2[3] vmull_vmlsl q11, d19, d9, d2[3], d2[2] - vrshrn.i32 d5, q8, #12 // t4a - vrshrn.i32 d7, q10, #12 // t5a + vqrshrn.s32 d5, q8, #12 // t4a + vqrshrn.s32 d7, q10, #12 // t5a vmull_vmlal q8, d19, d9, d2[2], d2[3] - vrshrn.i32 d9, q11, #12 // t6a - vrshrn.i32 d19, q8, #12 // t7a + vqrshrn.s32 d9, q11, #12 // t6a + vqrshrn.s32 d19, q8, #12 // t7a vqadd.s16 \r0, d4, d8 // out0 vqsub.s16 d4, d4, d8 // t2 @@ -900,11 +900,11 @@ vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) - vrshrn.i32 d4, q9, #12 // out3 + vqrshrn.s32 d4, q9, #12 // out3 vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) - vrshrn.i32 d5, q10, #12 // out5 - vrshrn.i32 \r2, q9, #12 // out2 (d18 or d21) - vrshrn.i32 \r4, q4, #12 // out4 (d20 or d19) + vqrshrn.s32 d5, q10, #12 // out5 + vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21) + vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19) vqneg.s16 \r3, d4 // out3 vqneg.s16 \r5, d5 // out5 @@ -1122,19 +1122,19 @@ vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a - vrshrn.i32 d17, q2, #12 // t8a - vrshrn.i32 d31, q3, #12 // t15a + vqrshrn.s32 d17, q2, #12 // t8a + vqrshrn.s32 d31, q3, #12 // t15a vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a - vrshrn.i32 d23, q4, #12 // t9a - vrshrn.i32 
d25, q2, #12 // t14a + vqrshrn.s32 d23, q4, #12 // t9a + vqrshrn.s32 d25, q2, #12 // t14a vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a - vrshrn.i32 d21, q3, #12 // t10a - vrshrn.i32 d27, q4, #12 // t13a + vqrshrn.s32 d21, q3, #12 // t10a + vqrshrn.s32 d27, q4, #12 // t13a vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a - vrshrn.i32 d19, q2, #12 // t11a - vrshrn.i32 d29, q3, #12 // t12a + vqrshrn.s32 d19, q2, #12 // t11a + vqrshrn.s32 d29, q3, #12 // t12a idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30 @@ -1149,14 +1149,14 @@ vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a - vrshrn.i32 d21, q3, #12 // t9a - vrshrn.i32 d27, q4, #12 // t14a + vqrshrn.s32 d21, q3, #12 // t9a + vqrshrn.s32 d27, q4, #12 // t14a vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a - vrshrn.i32 d29, q3, #12 // t13a + vqrshrn.s32 d29, q3, #12 // t13a vneg.s32 q4, q4 - vrshrn.i32 d23, q4, #12 // t10a + vqrshrn.s32 d23, q4, #12 // t10a vqsub.s16 d4, d17, d19 // t11a vqadd.s16 d17, d17, d19 // t8a @@ -1171,11 +1171,11 @@ vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a - vrshrn.i32 d6, q3, #12 // t11 - vrshrn.i32 d7, q4, #12 // t12 + vqrshrn.s32 d6, q3, #12 // t11 + vqrshrn.s32 d7, q4, #12 // t12 vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a - vrshrn.i32 d4, q2, #12 // t10a - vrshrn.i32 d5, q4, #12 // t13a + vqrshrn.s32 d4, q2, #12 // t10a + vqrshrn.s32 d5, q4, #12 // t13a vqadd.s16 d8, d16, d31 // out0 vqsub.s16 d31, d16, d31 // out15 @@ -1208,35 +1208,35 @@ vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 - vrshrn.i32 d16, q2, #12 // t0 - vrshrn.i32 d31, q3, #12 // t1 + vqrshrn.s32 d16, q2, #12 // t0 + vqrshrn.s32 d31, q3, #12 // t1 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 - vrshrn.i32 d18, q4, #12 // t2 - vrshrn.i32 d29, q2, #12 // t3 + vqrshrn.s32 d18, q4, #12 // t2 + vqrshrn.s32 d29, q2, #12 // t3 vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 - vrshrn.i32 d20, q3, #12 // t4 - vrshrn.i32 d27, q4, #12 // t5 + vqrshrn.s32 d20, q3, #12 // t4 + vqrshrn.s32 d27, q4, #12 // t5 vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 - vrshrn.i32 d22, q2, #12 // t6 - vrshrn.i32 d25, q3, #12 // t7 + vqrshrn.s32 d22, q2, #12 // t6 + vqrshrn.s32 d25, q3, #12 // t7 vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 - vrshrn.i32 d23, q4, #12 // t8 - vrshrn.i32 d24, q2, #12 // t9 + vqrshrn.s32 d23, q4, #12 // t8 + vqrshrn.s32 d24, q2, #12 // t9 vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 - vrshrn.i32 d21, q3, #12 // t10 - vrshrn.i32 d26, q4, #12 // t11 + vqrshrn.s32 d21, q3, #12 // t10 + vqrshrn.s32 d26, q4, #12 // t11 vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 - vrshrn.i32 d19, q2, #12 // t12 - vrshrn.i32 d28, q3, #12 // t13 + vqrshrn.s32 d19, q2, #12 // t12 + vqrshrn.s32 d28, q3, #12 // t13 vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 - vrshrn.i32 d17, q4, #12 // t14 - vrshrn.i32 d30, q2, #12 // t15 + vqrshrn.s32 d17, q4, #12 // t14 + vqrshrn.s32 d30, q2, #12 // t15 vld1.16 {q0}, [r12, :128] @@ -1260,19 +1260,19 @@ vmull_vmlal q2, 
d2, d3, d1[1], d1[0] // -> t8 vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 - vrshrn.i32 d17, q2, #12 // t8 - vrshrn.i32 d30, q3, #12 // t9 + vqrshrn.s32 d17, q2, #12 // t8 + vqrshrn.s32 d30, q3, #12 // t9 vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 - vrshrn.i32 d18, q4, #12 // t10 - vrshrn.i32 d29, q2, #12 // t11 + vqrshrn.s32 d18, q4, #12 // t10 + vqrshrn.s32 d29, q2, #12 // t11 vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 - vrshrn.i32 d27, q3, #12 // t12 - vrshrn.i32 d20, q4, #12 // t13 + vqrshrn.s32 d27, q3, #12 // t12 + vqrshrn.s32 d20, q4, #12 // t13 vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 - vrshrn.i32 d25, q2, #12 // t14 - vrshrn.i32 d22, q3, #12 // t15 + vqrshrn.s32 d25, q2, #12 // t14 + vqrshrn.s32 d22, q3, #12 // t15 vqsub.s16 d2, d16, d21 // t4 vqadd.s16 d16, d16, d21 // t0 @@ -1294,19 +1294,19 @@ vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a - vrshrn.i32 d22, q2, #12 // t4a - vrshrn.i32 d25, q3, #12 // t5a + vqrshrn.s32 d22, q2, #12 // t4a + vqrshrn.s32 d25, q3, #12 // t5a vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12 - vrshrn.i32 d24, q4, #12 // t6a - vrshrn.i32 d23, q2, #12 // t7a + vqrshrn.s32 d24, q4, #12 // t6a + vqrshrn.s32 d23, q2, #12 // t7a vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 - vrshrn.i32 d17, q3, #12 // t12 + vqrshrn.s32 d17, q3, #12 // t12 vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 - vrshrn.i32 d29, q4, #12 // t13 - vrshrn.i32 d30, q2, #12 // t14 - vrshrn.i32 d18, q3, #12 // t15 + vqrshrn.s32 d29, q4, #12 // t13 + vqrshrn.s32 d30, q2, #12 // t14 + vqrshrn.s32 d18, q3, #12 // t15 vqsub.s16 d2, d16, d21 // t2a .ifc \o0, d16 @@ -1343,21 +1343,21 @@ vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) - vrshrn.i32 d24, q12, #12 // out8 - vrshrn.i32 d4, q2, #12 // out7 - vrshrn.i32 d5, q3, #12 // out5 + vqrshrn.s32 d24, q12, #12 // out8 + vqrshrn.s32 d4, q2, #12 // out7 + vqrshrn.s32 d5, q3, #12 // out5 vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) - vrshrn.i32 d26, q4, #12 // out10 + vqrshrn.s32 d26, q4, #12 // out10 vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) - vrshrn.i32 \o4, q1, #12 // out4 - vrshrn.i32 d7, q3, #12 // out9 - vrshrn.i32 d6, q4, #12 // out11 - vrshrn.i32 \o6, q11, #12 // out6 + vqrshrn.s32 \o4, q1, #12 // out4 + vqrshrn.s32 d7, q3, #12 // out9 + vqrshrn.s32 d6, q4, #12 // out11 + vqrshrn.s32 \o6, q11, #12 // out6 .ifc \o8, d23 vmov \o8, d24 @@ -1927,35 +1927,35 @@ vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a - vrshrn.i32 d16, q2, #12 // t16a - vrshrn.i32 d31, q3, #12 // t31a + vqrshrn.s32 d16, q2, #12 // t16a + vqrshrn.s32 d31, q3, #12 // t31a vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a - vrshrn.i32 d24, q4, #12 // t17a - vrshrn.i32 d23, q2, #12 // t30a + vqrshrn.s32 d24, q4, #12 // t17a + vqrshrn.s32 d23, q2, #12 // t30a 
vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a - vrshrn.i32 d20, q3, #12 // t18a - vrshrn.i32 d27, q4, #12 // t29a + vqrshrn.s32 d20, q3, #12 // t18a + vqrshrn.s32 d27, q4, #12 // t29a vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a - vrshrn.i32 d28, q2, #12 // t19a - vrshrn.i32 d19, q3, #12 // t28a + vqrshrn.s32 d28, q2, #12 // t19a + vqrshrn.s32 d19, q3, #12 // t28a vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a - vrshrn.i32 d18, q4, #12 // t20a - vrshrn.i32 d29, q2, #12 // t27a + vqrshrn.s32 d18, q4, #12 // t20a + vqrshrn.s32 d29, q2, #12 // t27a vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a - vrshrn.i32 d26, q3, #12 // t21a - vrshrn.i32 d21, q4, #12 // t26a + vqrshrn.s32 d26, q3, #12 // t21a + vqrshrn.s32 d21, q4, #12 // t26a vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a - vrshrn.i32 d22, q2, #12 // t22a - vrshrn.i32 d25, q3, #12 // t25a + vqrshrn.s32 d22, q2, #12 // t22a + vqrshrn.s32 d25, q3, #12 // t25a vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a - vrshrn.i32 d30, q4, #12 // t23a - vrshrn.i32 d17, q2, #12 // t24a + vqrshrn.s32 d30, q4, #12 // t23a + vqrshrn.s32 d17, q2, #12 // t24a vld1.16 {q0}, [r12, :128] @@ -1979,21 +1979,21 @@ vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a - vrshrn.i32 d21, q2, #12 // t17a - vrshrn.i32 d27, q3, #12 // t30a + vqrshrn.s32 d21, q2, #12 // t17a + vqrshrn.s32 d27, q3, #12 // t30a vneg.s32 q4, q4 // -> t18a vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a - vrshrn.i32 d19, q4, #12 // t18a - vrshrn.i32 d24, q1, #12 // t29a + vqrshrn.s32 d19, q4, #12 // t18a + vqrshrn.s32 d24, q1, #12 // t29a vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a - vrshrn.i32 d22, q2, #12 // t21a - vrshrn.i32 d18, q3, #12 // t26a + vqrshrn.s32 d22, q2, #12 // t21a + vqrshrn.s32 d18, q3, #12 // t26a vneg.s32 q4, q4 // -> t22a vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a - vrshrn.i32 d17, q4, #12 // t22a - vrshrn.i32 d20, q1, #12 // t25a + vqrshrn.s32 d17, q4, #12 // t22a + vqrshrn.s32 d20, q1, #12 // t25a vqsub.s16 d2, d27, d24 // t29 vqadd.s16 d27, d27, d24 // t30 @@ -2015,21 +2015,21 @@ vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 - vrshrn.i32 d18, q2, #12 // t18a - vrshrn.i32 d25, q3, #12 // t29a + vqrshrn.s32 d18, q2, #12 // t18a + vqrshrn.s32 d25, q3, #12 // t29a vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 - vrshrn.i32 d29, q4, #12 // t19 - vrshrn.i32 d24, q1, #12 // t28 + vqrshrn.s32 d29, q4, #12 // t19 + vqrshrn.s32 d24, q1, #12 // t28 vneg.s32 q2, q2 // -> t20 vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a - vrshrn.i32 d26, q2, #12 // t20 - vrshrn.i32 d19, q3, #12 // t27 + vqrshrn.s32 d26, q2, #12 // t20 + vqrshrn.s32 d19, q3, #12 // t27 vneg.s32 q4, q4 // -> t21a vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a - vrshrn.i32 d20, q4, #12 // t21a - vrshrn.i32 d28, q1, #12 // t26a + vqrshrn.s32 d20, q4, #12 // t21a + vqrshrn.s32 d28, q1, #12 // t26a vqsub.s16 d2, d16, d30 // t23 vqadd.s16 d16, d16, 
d30 // t16 = out16 @@ -2051,24 +2051,24 @@ vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 - vrshrn.i32 d20, q2, #12 // t20 - vrshrn.i32 d22, q3, #12 // t27 + vqrshrn.s32 d20, q2, #12 // t20 + vqrshrn.s32 d22, q3, #12 // t27 vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a vmov d27, d22 // t27 - vrshrn.i32 d26, q2, #12 // t26a + vqrshrn.s32 d26, q2, #12 // t26a vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 - vrshrn.i32 d21, q3, #12 // t21a - vrshrn.i32 d22, q12, #12 // t22 - vrshrn.i32 d25, q2, #12 // t25 + vqrshrn.s32 d21, q3, #12 // t21a + vqrshrn.s32 d22, q12, #12 // t22 + vqrshrn.s32 d25, q2, #12 // t25 vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a - vrshrn.i32 d23, q2, #12 // t23a - vrshrn.i32 d24, q3, #12 // t24a + vqrshrn.s32 d23, q2, #12 // t23a + vqrshrn.s32 d24, q3, #12 // t24a bx lr endfunc @@ -2679,11 +2679,11 @@ vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a vneg.s32 q2, q2 // t34a vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a - vrshrn.i32 d26, q2, #12 // t34a + vqrshrn.s32 d26, q2, #12 // t34a vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a - vrshrn.i32 d29, q3, #12 // t61a - vrshrn.i32 d25, q4, #12 // t33a - vrshrn.i32 d30, q2, #12 // t62a + vqrshrn.s32 d29, q3, #12 // t61a + vqrshrn.s32 d25, q4, #12 // t33a + vqrshrn.s32 d30, q2, #12 // t62a vqadd.s16 d16, d24, d27 // t32a vqsub.s16 d19, d24, d27 // t35a @@ -2697,11 +2697,11 @@ vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 - vrshrn.i32 d21, q2, #12 // t61a - vrshrn.i32 d18, q3, #12 // t34a + vqrshrn.s32 d21, q2, #12 // t61a + vqrshrn.s32 d18, q3, #12 // t34a vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 - vrshrn.i32 d20, q4, #12 // t60 - vrshrn.i32 d19, q2, #12 // t35 + vqrshrn.s32 d20, q4, #12 // t60 + vqrshrn.s32 d19, q2, #12 // t35 vst1.16 {d16, d17, d18, d19}, [r6, :128]! vst1.16 {d20, d21, d22, d23}, [r6, :128]! 
@@ -2738,12 +2738,12 @@ vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a - vrshrn.i32 d25, q2, #12 // t56a - vrshrn.i32 d27, q3, #12 // t39a + vqrshrn.s32 d25, q2, #12 // t56a + vqrshrn.s32 d27, q3, #12 // t39a vneg.s32 q4, q4 // t40a vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a - vrshrn.i32 d31, q4, #12 // t40a - vrshrn.i32 d28, q2, #12 // t55a + vqrshrn.s32 d31, q4, #12 // t40a + vqrshrn.s32 d28, q2, #12 // t55a vqadd.s16 d16, d24, d29 // t32a vqsub.s16 d19, d24, d29 // t47a @@ -2757,11 +2757,11 @@ vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 - vrshrn.i32 d18, q2, #12 // t40a - vrshrn.i32 d21, q3, #12 // t55a + vqrshrn.s32 d18, q2, #12 // t40a + vqrshrn.s32 d21, q3, #12 // t55a vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 - vrshrn.i32 d19, q4, #12 // t47 - vrshrn.i32 d20, q2, #12 // t48 + vqrshrn.s32 d19, q4, #12 // t47 + vqrshrn.s32 d20, q2, #12 // t48 vstr d16, [r6, #2*4*0] // t32a vstr d17, [r9, #2*4*0] // t39 diff -Nru dav1d-0.9.2/src/arm/32/mc16.S dav1d-1.0.0/src/arm/32/mc16.S --- dav1d-0.9.2/src/arm/32/mc16.S 2021-09-03 15:51:24.397037000 +0000 +++ dav1d-1.0.0/src/arm/32/mc16.S 2022-03-18 14:31:55.974356000 +0000 @@ -1748,7 +1748,7 @@ vst1_32 \d_strd, d16, d17 pop {r4-r11,pc} -28: // 2x8, 2x16 v +28: // 2x6, 2x8, 2x12, 2x16 v vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd @@ -1761,25 +1761,29 @@ interleave_1_32 d2, d3, d4, d5, d6 interleave_1_32 d6, d7, d16 216: - subs \h, \h, #8 + subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d17, d18, d19, d20 - load_32 \sr2, \src, \s_strd, d21, d22, d23, d24 interleave_1_32 d16, d17, d18, d19, d20 - interleave_1_32 d20, d21, d22, d23, d24 vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19 - vmull_vmlal_8 q2, d6, d7, d16, d17, d18, d19, d20, d21 - vmull_vmlal_8 q3, d16, d17, d18, d19, d20, d21, d22, d23 - vqrshrun_s32 6, q13, d26, q1, d27, q2, d2, q3, d3 - vmin_u16 q15, q13, q1 + vqrshrun_s32 6, q13, d26, q1, d27 + vmin_u16 q15, q13 vst1_32 \d_strd, d26, d27 - vst1_32 \d_strd, d2, d3 ble 0f - vmov q1, q9 - vmov q2, q10 - vmov q3, q11 - vmov d16, d24 + cmp \h, #2 + vmov q1, q3 + vmov q2, q8 + vmov q3, q9 + vmov d16, d20 + beq 26f b 216b +26: + load_32 \sr2, \src, \s_strd, d17, d18 + interleave_1_32 d16, d17, d18 + vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 + vqrshrun_s32 6, q13, d26 + vmin_u16 d30, d26 + vst1_32 \d_strd, d26 0: pop {r4-r11,pc} .endif @@ -1810,7 +1814,7 @@ 0: pop {r4-r11,pc} -480: // 4x8, 4x16 v +480: // 4x6, 4x8, 4x12, 4x16 v vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd @@ -1830,11 +1834,18 @@ vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26 shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5 ble 0f + cmp \h, #2 vmov q8, q10 vmov q9, q11 vmov q10, q12 vmov d22, d26 + beq 46f b 48b +46: + load_reg \sr2, \src, \s_strd, d23, d24 + vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 + vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 + shift_store_4 \type, \d_strd, q1, q2, d2, d3 0: pop {r4-r11,pc} @@ -2801,6 +2812,7 @@ // 2x2 v vld1.32 {d16[]}, [\src], \s_strd bgt 24f +22: vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #4 @@ -2811,11 +2823,12 @@ vst1.32 {d16[0]}, [\dst, :32] vst1.32 {d16[1]}, [\ds2, :32] pop {r4-r11,pc} -24: // 
2x4, 2x8, ... v +24: // 2x4, 2x6, 2x8, ... v vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vld1.32 {d19[]}, [\sr2], \s_strd vld1.32 {d20[]}, [\src], \s_strd + subs \h, \h, #4 vext.8 d16, d16, d17, #4 vext.8 d17, d17, d18, #4 vext.8 d18, d18, d19, #4 @@ -2823,14 +2836,15 @@ vswp d17, d18 vmul.i16 q8, q8, q2 vmla.i16 q8, q9, q3 - subs \h, \h, #4 + cmp \h, #2 vrshr.u16 q8, q8, #4 vst1.32 {d16[0]}, [\dst, :32], \d_strd vst1.32 {d16[1]}, [\ds2, :32], \d_strd vst1.32 {d17[0]}, [\dst, :32], \d_strd vst1.32 {d17[1]}, [\ds2, :32], \d_strd - ble 0f + blt 0f vmov d16, d20 + beq 22b b 24b 0: pop {r4-r11,pc} diff -Nru dav1d-0.9.2/src/arm/32/mc.S dav1d-1.0.0/src/arm/32/mc.S --- dav1d-0.9.2/src/arm/32/mc.S 2021-09-03 15:51:24.397037000 +0000 +++ dav1d-1.0.0/src/arm/32/mc.S 2022-03-18 14:31:55.974356000 +0000 @@ -1146,6 +1146,16 @@ vmla.s16 \d, \s2, d0[2] vmla.s16 \d, \s3, d0[3] .endm +.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 + vmul.s16 \d0, \s0, d0[0] + vmla.s16 \d0, \s1, d0[1] + vmla.s16 \d0, \s2, d0[2] + vmla.s16 \d0, \s3, d0[3] + vmla.s16 \d0, \s4, d1[0] + vmla.s16 \d0, \s5, d1[1] + vmla.s16 \d0, \s6, d1[2] + vmla.s16 \d0, \s7, d1[3] +.endm .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 vmul.s16 \d0, \s0, d0[0] vmla.s16 \d0, \s1, d0[1] @@ -1182,24 +1192,6 @@ vmla.s16 \d1, \s8, d1[2] vmla.s16 \d1, \s9, d1[3] .endm -.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 - vmul.s16 \d0, \s0, d0[0] - vmla.s16 \d0, \s1, d0[1] - vmla.s16 \d0, \s2, d0[2] - vmla.s16 \d0, \s3, d0[3] - vmla.s16 \d0, \s4, d1[0] - vmla.s16 \d0, \s5, d1[1] - vmla.s16 \d0, \s6, d1[2] - vmla.s16 \d0, \s7, d1[3] - vmul.s16 \d1, \s4, d0[0] - vmla.s16 \d1, \s5, d0[1] - vmla.s16 \d1, \s6, d0[2] - vmla.s16 \d1, \s7, d0[3] - vmla.s16 \d1, \s8, d1[0] - vmla.s16 \d1, \s9, d1[1] - vmla.s16 \d1, \s10, d1[2] - vmla.s16 \d1, \s11, d1[3] -.endm .macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3 vqrshrun.s16 \d0, \q0, #\shift .ifnb \q1 @@ -1623,7 +1615,7 @@ st_16 \d_strd, d6, 4 pop {r4-r11,pc} -28: // 2x8, 2x16 v +28: // 2x6, 2x8, 2x12, 2x16 v vpush {q4-q7} vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 @@ -1642,34 +1634,37 @@ vmov d7, d10 vmov d9, d12 216: - subs \h, \h, #8 + subs \h, \h, #4 load_16 \sr2, \src, \s_strd, d16, d18, d20, d22 - load_16 \sr2, \src, \s_strd, d24, d26, d28, d30 interleave_1_16 d14, d16, d18, d20, d22 - interleave_1_16 d22, d24, d26, d28, d30 vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20 - vmovl_u8 q11, d22, q12, d24, q13, d26, q14, d28 vmov d11, d14 vmov d13, d16 vmov d15, d18 vmov d17, d20 - vmov d19, d22 - vmov d21, d24 - vmov d23, d26 - vmov d25, d28 - mul_mla_8_4 q1, q2, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12 - vqrshrun_s16 6, q1, d2, q2, d4 + mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8 + vqrshrun_s16 6, q1, d2 st_16 \d_strd, d2, 4 - st_16 \d_strd, d4, 4 ble 0f - vmov q1, q9 - vmov q2, q10 - vmov q3, q11 - vmov q4, q12 - vmov q5, q13 - vmov q6, q14 - vmov d14, d30 + cmp \h, #2 + vmov q1, q5 + vmov q2, q6 + vmov q3, q7 + vmov q4, q8 + vmov q5, q9 + vmov q6, q10 + vmov d14, d22 + beq 26f b 216b +26: + load_16 \sr2, \src, \s_strd, d16, d18 + interleave_1_16 d14, d16, d18 + vmovl_u8 q7, d14, q8, d16 + vmov d11, d14 + vmov d13, d16 + mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16 + vqrshrun_s16 6, q1, d2 + st_16 \d_strd, d2, 2 0: vpop {q4-q7} pop {r4-r11,pc} @@ -1703,7 +1698,7 @@ 0: pop {r4-r11,pc} -480: // 4x8, 4x16 v +480: // 4x6, 4x8, 4x12, 4x16 v vpush {q4} vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 @@ 
-1726,12 +1721,19 @@ mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13 shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5 ble 0f - subs \h, \h, #4 - load_32 \sr2, \src, \s_strd, d30, d2, d4, d6 - interleave_1_32 d28, d30, d2, d4, d6 - vmovl_u8 q14, d28, q15, d30, q1, d2, q2, d4 - mul_mla_8_2 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1, q2 - shift_store_4 \type, \d_strd, q8, d16, d17, q9, d18, d19 + load_32 \sr2, \src, \s_strd, d30, d2 + subs \h, \h, #2 + interleave_1_32 d28, d30, d2 + vmovl_u8 q14, d28, q15, d30 + mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15 + shift_store_4 \type, \d_strd, q8, d16, d17 + ble 0f + load_32 \sr2, \src, \s_strd, d4, d6 + subs \h, \h, #2 + interleave_1_32 d2, d4, d6 + vmovl_u8 q1, d2, q2, d4 + mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2 + shift_store_4 \type, \d_strd, q9, d18, d19 ble 0f subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d8, d16, d18, d20 @@ -2643,6 +2645,7 @@ // 2x2 v vld1.16 {d16[]}, [\src], \s_strd bgt 24f +22: vld1.16 {d17[]}, [\sr2], \s_strd vld1.16 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #6 @@ -2653,11 +2656,12 @@ vst1.16 {d4[0]}, [\dst, :16] vst1.16 {d4[1]}, [\ds2, :16] pop {r4-r11,pc} -24: // 2x4, 2x8, ... v +24: // 2x4, 2x6, 2x8, ... v vld1.16 {d17[]}, [\sr2], \s_strd vld1.16 {d18[]}, [\src], \s_strd vld1.16 {d19[]}, [\sr2], \s_strd vld1.16 {d20[]}, [\src], \s_strd + sub \h, \h, #4 vext.8 d16, d16, d17, #6 vext.8 d17, d17, d18, #6 vext.8 d18, d18, d19, #6 @@ -2666,14 +2670,15 @@ vtrn.32 d17, d19 vmull.u8 q2, d16, d2 vmlal.u8 q2, d17, d3 - subs \h, \h, #4 + cmp \h, #2 vqrshrn.u16 d4, q2, #4 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd vst1.16 {d4[2]}, [\dst, :16], \d_strd vst1.16 {d4[3]}, [\ds2, :16], \d_strd - ble 0f + blt 0f vmov d16, d20 + beq 22b b 24b 0: pop {r4-r11,pc} diff -Nru dav1d-0.9.2/src/arm/64/cdef16.S dav1d-1.0.0/src/arm/64/cdef16.S --- dav1d-0.9.2/src/arm/64/cdef16.S 2021-09-03 15:51:24.397037000 +0000 +++ dav1d-1.0.0/src/arm/64/cdef16.S 2022-03-18 14:31:55.974356000 +0000 @@ -30,12 +30,12 @@ #include "cdef_tmpl.S" .macro pad_top_bot_16 s1, s2, w, stride, reg, ret - tst w6, #1 // CDEF_HAVE_LEFT + tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT sub \s1, \s1, #4 sub \s2, \s2, #4 - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] @@ -76,7 +76,7 @@ 2: // !CDEF_HAVE_LEFT - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] @@ -126,7 +126,8 @@ // void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], -// const pixel *const top, int h, +// const pixel *const top, +// const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_16 w, stride, reg @@ -134,7 +135,7 @@ movi v30.8h, #0x80, lsl #8 mov v31.16b, v30.16b sub x0, x0, #2*(2*\stride+2) - tst w6, #4 // CDEF_HAVE_TOP + tst w7, #4 // CDEF_HAVE_TOP b.ne 1f // !CDEF_HAVE_TOP st1 {v30.8h, v31.8h}, [x0], #32 @@ -149,17 +150,17 @@ // Middle section 3: - tst w6, #1 // CDEF_HAVE_LEFT + tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ld1 {v0.s}[0], [x3], #4 ldr s2, [x1, #2*\w] load_n_incr_16 v1, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 str s0, [x0] stur \reg\()1, [x0, #4] str s2, [x0, #4+2*\w] @@ -170,7 +171,7 @@ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ld1 
{v0.s}[0], [x3], #4 load_n_incr_16 v1, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 str s0, [x0] stur \reg\()1, [x0, #4] str s31, [x0, #4+2*\w] @@ -178,13 +179,13 @@ b.gt 1b b 3f 2: - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldr s1, [x1, #2*\w] load_n_incr_16 v0, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 str s31, [x0] stur \reg\()0, [x0, #4] str s1, [x0, #4+2*\w] @@ -194,7 +195,7 @@ 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr_16 v0, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 str s31, [x0] stur \reg\()0, [x0, #4] str s31, [x0, #4+2*\w] @@ -202,7 +203,7 @@ b.gt 1b 3: - tst w6, #8 // CDEF_HAVE_BOTTOM + tst w7, #8 // CDEF_HAVE_BOTTOM b.ne 1f // !CDEF_HAVE_BOTTOM st1 {v30.8h, v31.8h}, [x0], #32 @@ -212,8 +213,8 @@ ret 1: // CDEF_HAVE_BOTTOM - add x9, x1, x2 - pad_top_bot_16 x1, x9, \w, \stride, \reg, 1 + add x9, x5, x2 + pad_top_bot_16 x5, x9, \w, \stride, \reg, 1 endfunc .endm diff -Nru dav1d-0.9.2/src/arm/64/cdef.S dav1d-1.0.0/src/arm/64/cdef.S --- dav1d-0.9.2/src/arm/64/cdef.S 2021-09-03 15:51:24.397037000 +0000 +++ dav1d-1.0.0/src/arm/64/cdef.S 2022-03-18 14:31:55.974356000 +0000 @@ -30,12 +30,12 @@ #include "cdef_tmpl.S" .macro pad_top_bottom s1, s2, w, stride, rn, rw, ret - tst w6, #1 // CDEF_HAVE_LEFT + tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT sub \s1, \s1, #2 sub \s2, \s2, #2 - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] @@ -84,7 +84,7 @@ 2: // !CDEF_HAVE_LEFT - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] @@ -140,17 +140,18 @@ // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], -// const pixel *const top, int h, +// const pixel *const top, +// const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func w, stride, rn, rw function cdef_padding\w\()_8bpc_neon, export=1 - cmp w6, #0xf // fully edged + cmp w7, #0xf // fully edged b.eq cdef_padding\w\()_edged_8bpc_neon movi v30.8h, #0x80, lsl #8 mov v31.16b, v30.16b sub x0, x0, #2*(2*\stride+2) - tst w6, #4 // CDEF_HAVE_TOP + tst w7, #4 // CDEF_HAVE_TOP b.ne 1f // !CDEF_HAVE_TOP st1 {v30.8h, v31.8h}, [x0], #32 @@ -165,17 +166,17 @@ // Middle section 3: - tst w6, #1 // CDEF_HAVE_LEFT + tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ld1 {v0.h}[0], [x3], #2 ldr h2, [x1, #\w] load_n_incr v1, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b @@ -189,7 +190,7 @@ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ld1 {v0.h}[0], [x3], #2 load_n_incr v1, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s0, [x0] @@ -199,13 +200,13 @@ b.gt 1b b 3f 2: - tst w6, #2 // CDEF_HAVE_RIGHT + tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldr h1, [x1, #\w] load_n_incr v0, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s31, [x0] @@ -217,7 +218,7 @@ 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr v0, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 uxtl v0.8h, v0.8b str s31, [x0] stur \rw\()0, [x0, #4] @@ -226,7 +227,7 @@ b.gt 1b 3: - tst w6, #8 // CDEF_HAVE_BOTTOM + tst w7, #8 // CDEF_HAVE_BOTTOM b.ne 1f // !CDEF_HAVE_BOTTOM st1 {v30.8h, v31.8h}, [x0], #32 @@ 
-236,8 +237,8 @@ ret 1: // CDEF_HAVE_BOTTOM - add x9, x1, x2 - pad_top_bottom x1, x9, \w, \stride, \rn, \rw, 1 + add x9, x5, x2 + pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1 endfunc .endm @@ -246,12 +247,14 @@ // void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], -// const pixel *const top, int h, +// const pixel *const top, +// const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_edged w, stride, reg function cdef_padding\w\()_edged_8bpc_neon, export=1 sub x4, x4, #2 + sub x5, x5, #2 sub x0, x0, #(2*\stride+2) .if \w == 4 @@ -275,22 +278,21 @@ ld1 {v0.h}[0], [x3], #2 ldr h2, [x1, #\w] load_n_incr v1, x1, x2, \w - subs w5, w5, #1 + subs w6, w6, #1 str h0, [x0] stur \reg\()1, [x0, #2] str h2, [x0, #2+\w] add x0, x0, #\stride b.gt 0b - sub x1, x1, #2 .if \w == 4 - ldr d0, [x1] - ldr d1, [x1, x2] + ldr d0, [x5] + ldr d1, [x5, x2] st1 {v0.8b, v1.8b}, [x0], #16 .else - add x9, x1, x2 - ldr d0, [x1] - ldr s1, [x1, #8] + add x9, x5, x2 + ldr d0, [x5] + ldr s1, [x5, #8] ldr d2, [x9] ldr s3, [x9, #8] str d0, [x0] @@ -473,7 +475,7 @@ // To handle the offset for negative values, use both halving w/ and w/o rounding. srhadd v5.16b, v1.16b, v2.16b // sum >> 1 shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 - sshr v1.16b, v5.16b, #7 // sum < 0 + cmlt v1.16b, v5.16b, #0 // sum < 0 bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4 diff -Nru dav1d-0.9.2/src/arm/64/cdef_tmpl.S dav1d-1.0.0/src/arm/64/cdef_tmpl.S --- dav1d-0.9.2/src/arm/64/cdef_tmpl.S 2021-09-03 15:51:24.397037000 +0000 +++ dav1d-1.0.0/src/arm/64/cdef_tmpl.S 2022-03-18 14:31:55.974356000 +0000 @@ -208,7 +208,7 @@ .endif b.ne 2b - sshr v4.8h, v1.8h, #15 // -(sum < 0) + cmlt v4.8h, v1.8h, #0 // -(sum < 0) add v1.8h, v1.8h, v4.8h // sum - (sum < 0) srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4 diff -Nru dav1d-0.9.2/src/arm/64/film_grain16.S dav1d-1.0.0/src/arm/64/film_grain16.S --- dav1d-0.9.2/src/arm/64/film_grain16.S 2021-09-03 15:51:24.397037000 +0000 +++ dav1d-1.0.0/src/arm/64/film_grain16.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,1957 +0,0 @@ -/* - * Copyright © 2021, VideoLAN and dav1d authors - * Copyright © 2021, Martin Storsjo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "src/arm/asm.S" -#include "util.S" -#include "src/arm/asm-offsets.h" - -#define GRAIN_WIDTH 82 -#define GRAIN_HEIGHT 73 - -#define SUB_GRAIN_WIDTH 44 -#define SUB_GRAIN_HEIGHT 38 - -.macro increment_seed steps, shift=1 - lsr w11, w2, #3 - lsr w12, w2, #12 - lsr w13, w2, #1 - eor w11, w2, w11 // (r >> 0) ^ (r >> 3) - eor w12, w12, w13 // (r >> 12) ^ (r >> 1) - eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) -.if \shift - lsr w2, w2, #\steps -.endif - and w11, w11, #((1 << \steps) - 1) // bit -.if \shift - orr w2, w2, w11, lsl #(16 - \steps) // *state -.else - orr w2, w2, w11, lsl #16 // *state -.endif -.endm - -.macro read_rand dest, bits, age - ubfx \dest, x2, #16 - \bits - \age, #\bits -.endm - -.macro read_shift_rand dest, bits - ubfx \dest, x2, #17 - \bits, #\bits - lsr w2, w2, #1 -.endm - -// special calling convention: -// w2 holds seed -// x3 holds dav1d_gaussian_sequence -// clobbers x11-x15 -// returns in v0.8h -function get_gaussian_neon - increment_seed 4 - read_rand x14, 11, 3 - read_rand x15, 11, 2 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[0], [x14] - read_rand x14, 11, 1 - ld1 {v0.h}[1], [x15] - add x14, x3, x14, lsl #1 - read_rand x15, 11, 0 - increment_seed 4 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[2], [x14] - read_rand x14, 11, 3 - ld1 {v0.h}[3], [x15] - add x14, x3, x14, lsl #1 - read_rand x15, 11, 2 - ld1 {v0.h}[4], [x14] - add x15, x3, x15, lsl #1 - read_rand x14, 11, 1 - ld1 {v0.h}[5], [x15] - read_rand x15, 11, 0 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[6], [x14] - ld1 {v0.h}[7], [x15] - ret -endfunc - -.macro store_grain_row r0, r1, r2, r3, r4, r5 - st1 {\r0\().16b,\r1\().16b}, [x0], #32 - st1 {\r2\().16b,\r3\().16b}, [x0], #32 - st1 {\r4\().16b}, [x0], #16 - st1 {\r5\().h}[0], [x0], #2 -.endm - -function get_grain_2_neon - increment_seed 2 - read_rand x14, 11, 1 - read_rand x15, 11, 0 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[0], [x14] - ld1 {v0.h}[1], [x15] - srshl v0.4h, v0.4h, v31.4h - ret -endfunc - -.macro get_grain_2 dst - bl get_grain_2_neon -.ifnc \dst, v0 - mov \dst\().8b, v0.8b -.endif -.endm - -function get_grain_4_neon - increment_seed 4 - read_rand x14, 11, 3 - read_rand x15, 11, 2 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[0], [x14] - read_rand x14, 11, 1 - ld1 {v0.h}[1], [x15] - add x14, x3, x14, lsl #1 - read_rand x15, 11, 0 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[2], [x14] - ld1 {v0.h}[3], [x15] - srshl v0.4h, v0.4h, v31.4h - ret -endfunc - -.macro get_grain_4 dst - bl get_grain_4_neon -.ifnc \dst, v0 - mov \dst\().8b, v0.8b -.endif -.endm - -// w15 holds the number of entries to produce -// w14, w16 and w17 hold the previous output entries -// v0 holds the vector of produced entries -// v1 holds the input vector of sums from above -.macro output_lag n -function output_lag\n\()_neon -1: - read_shift_rand x13, 11 - mov w11, v1.s[0] - ldrsh w12, [x3, x13, lsl #1] - ext v0.16b, v0.16b, v0.16b, 
#2 -.if \n == 1 - madd w11, w14, w4, w11 // sum (above) + *coeff * prev output -.elseif \n == 2 - madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 - madd w11, w14, w17, w11 // += *coeff * prev output 2 - mov w16, w14 -.else - madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 - madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 - madd w11, w14, w21, w11 // += *coeff * prev output 3 - mov w17, w16 - mov w16, w14 -.endif - add w14, w11, w8 // 1 << (ar_coeff_shift - 1) - add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) - asr w14, w14, w7 // >> ar_coeff_shift - asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift) - add w14, w14, w12 - cmp w14, w5 - csel w14, w14, w5, le - cmp w14, w6 - csel w14, w14, w6, ge - subs w15, w15, #1 - ext v1.16b, v1.16b, v1.16b, #4 - ins v0.h[7], w14 - b.gt 1b - ret -endfunc -.endm - -output_lag 1 -output_lag 2 -output_lag 3 - - -function sum_lag1_above_neon - sub x12, x0, #1*GRAIN_WIDTH*2 - 16 - ld1 {v18.8h}, [x12] // load top right - - ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid - ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right - - smull v4.4s, v17.4h, v28.4h - smlal v4.4s, v0.4h, v27.4h - smlal v4.4s, v1.4h, v29.4h - smull2 v5.4s, v17.8h, v28.8h - smlal2 v5.4s, v0.8h, v27.8h - smlal2 v5.4s, v1.8h, v29.8h - - mov v16.16b, v17.16b - mov v17.16b, v18.16b - - ret -endfunc - -.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff - bl sum_\lag\()_above_neon -.ifc \type, uv_420 - add x12, x19, #GRAIN_WIDTH*2 - ld1 {v22.8h, v23.8h}, [x19], #32 - ld1 {v24.8h, v25.8h}, [x12] - addp v22.8h, v22.8h, v23.8h - addp v23.8h, v24.8h, v25.8h - add v22.8h, v22.8h, v23.8h - srshr v0.8h, v22.8h, #2 -.endif -.ifc \type, uv_422 - ld1 {v22.8h, v23.8h}, [x19], #32 - addp v22.8h, v22.8h, v23.8h - srshr v0.8h, v22.8h, #1 -.endif -.ifc \type, uv_444 - ld1 {v0.8h}, [x19], #16 -.endif -.if \uv_layout -.ifnb \uv_coeff - dup v1.8b, \uv_coeff - sxtl v1.8h, v1.8b - smlal v4.4s, v0.4h, v1.4h - smlal2 v5.4s, v0.8h, v1.8h -.else - smlal v4.4s, v0.4h, v30.4h - smlal2 v5.4s, v0.8h, v30.8h -.endif -.endif -.if \uv_layout && \elems == 8 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 444 && \elems == 7 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 422 && \elems == 1 - b sum_\lag\()_uv_420_\edge\()_start -.else -sum_\lag\()_\type\()_\edge\()_start: -.if \elems > 4 -.ifc \edge, left - increment_seed 4 - read_rand x12, 11, 3 - read_rand x13, 11, 2 - read_rand x14, 11, 1 - add x12, x3, x12, lsl #1 - add x13, x3, x13, lsl #1 - add x14, x3, x14, lsl #1 - ld1 {v0.h}[5], [x12] - ld1 {v0.h}[6], [x13] - ld1 {v0.h}[7], [x14] - lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 - srshl v0.8h, v0.8h, v31.8h - ext v4.16b, v4.16b, v4.16b, #12 -.ifc \lag, lag3 - smov w17, v0.h[5] -.endif -.ifnc \lag, lag1 - smov w16, v0.h[6] -.endif - smov w14, v0.h[7] - - mov v1.16b, v4.16b - mov w15, #1 - bl output_\lag\()_neon -.else - increment_seed 4, shift=0 - mov v1.16b, v4.16b - mov w15, #4 - bl output_\lag\()_neon -.endif - - increment_seed 4, shift=0 - mov v1.16b, v5.16b -.ifc \edge, right - mov w15, #3 - bl output_\lag\()_neon - read_shift_rand x15, 11 - add x15, x3, x15, lsl #1 - ld1 {v1.h}[0], [x15] - srshl v1.4h, v1.4h, v31.4h - ext v0.16b, v0.16b, v1.16b, #2 -.else - mov w15, #4 - bl output_\lag\()_neon -.endif -.else - // elems == 1 - increment_seed 4, shift=0 - mov v1.16b, v4.16b - mov w15, #1 - bl output_\lag\()_neon - lsr w2, w2, #3 - - read_rand x12, 11, 2 - 
read_rand x13, 11, 1 - read_rand x14, 11, 0 - add x12, x3, x12, lsl #1 - add x13, x3, x13, lsl #1 - add x14, x3, x14, lsl #1 - ld1 {v1.h}[0], [x12] - ld1 {v1.h}[1], [x13] - ld1 {v1.h}[2], [x14] - srshl v1.4h, v1.4h, v31.4h - ext v0.16b, v0.16b, v1.16b, #14 -.endif - st1 {v0.8h}, [x0], #16 - ldr x30, [sp], #16 - ret -.endif -.endm - -.macro sum_lag1_func type, uv_layout, edge, elems=8 -function sum_\type\()_lag1_\edge\()_neon - str x30, [sp, #-16]! -.ifc \edge, left - sub x12, x0, #1*GRAIN_WIDTH*2 - ld1 {v17.8h}, [x12] // load the previous block right above -.endif - sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems -endfunc -.endm - -sum_lag1_func y, 0, left -sum_lag1_func y, 0, mid -sum_lag1_func y, 0, right, 7 -sum_lag1_func uv_444, 444, left -sum_lag1_func uv_444, 444, mid -sum_lag1_func uv_444, 444, right, 7 -sum_lag1_func uv_422, 422, left -sum_lag1_func uv_422, 422, mid -sum_lag1_func uv_422, 422, right, 1 -sum_lag1_func uv_420, 420, left -sum_lag1_func uv_420, 420, mid -sum_lag1_func uv_420, 420, right, 1 - - -function sum_lag2_above_neon - sub x12, x0, #2*GRAIN_WIDTH*2 - 16 - sub x13, x0, #1*GRAIN_WIDTH*2 - 16 - ld1 {v18.8h}, [x12] // load top right - ld1 {v21.8h}, [x13] - - dup v26.8b, v30.b[0] - ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid - dup v27.8b, v30.b[1] - ext v23.16b, v16.16b, v17.16b, #14 - sxtl v26.8h, v26.8b - dup v28.8b, v30.b[3] - ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right - sxtl v27.8h, v27.8b - dup v29.8b, v30.b[4] - ext v1.16b, v17.16b, v18.16b, #4 - sxtl v28.8h, v28.8b - sxtl v29.8h, v29.8b - - smull v4.4s, v22.4h, v26.4h - smlal v4.4s, v23.4h, v27.4h - smlal v4.4s, v0.4h, v28.4h - smlal v4.4s, v1.4h, v29.4h - smull2 v5.4s, v22.8h, v26.8h - smlal2 v5.4s, v23.8h, v27.8h - smlal2 v5.4s, v0.8h, v28.8h - smlal2 v5.4s, v1.8h, v29.8h - - dup v26.16b, v30.b[5] - ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid - dup v27.16b, v30.b[6] - ext v23.16b, v19.16b, v20.16b, #14 - sxtl v26.8h, v26.8b - dup v28.16b, v30.b[8] - ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right - sxtl v27.8h, v27.8b - dup v29.16b, v30.b[9] - ext v1.16b, v20.16b, v21.16b, #4 - sxtl v28.8h, v28.8b - sxtl v29.8h, v29.8b - - smlal v4.4s, v22.4h, v26.4h - smlal v4.4s, v23.4h, v27.4h - smlal v4.4s, v0.4h, v28.4h - smlal v4.4s, v1.4h, v29.4h - smlal2 v5.4s, v22.8h, v26.8h - smlal2 v5.4s, v23.8h, v27.8h - smlal2 v5.4s, v0.8h, v28.8h - smlal2 v5.4s, v1.8h, v29.8h - - dup v26.16b, v30.b[2] - dup v27.16b, v30.b[7] - sxtl v26.8h, v26.8b - sxtl v27.8h, v27.8b - - smlal v4.4s, v17.4h, v26.4h - smlal v4.4s, v20.4h, v27.4h - smlal2 v5.4s, v17.8h, v26.8h - smlal2 v5.4s, v20.8h, v27.8h - mov v16.16b, v17.16b - mov v17.16b, v18.16b - - mov v19.16b, v20.16b - mov v20.16b, v21.16b - ret -endfunc - -.macro sum_lag2_func type, uv_layout, edge, elems=8 -function sum_\type\()_lag2_\edge\()_neon - str x30, [sp, #-16]! 
-.ifc \edge, left - sub x12, x0, #2*GRAIN_WIDTH*2 - sub x13, x0, #1*GRAIN_WIDTH*2 - ld1 {v17.8h}, [x12] // load the previous block right above - ld1 {v20.8h}, [x13] -.endif - sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] -endfunc -.endm - -sum_lag2_func y, 0, left -sum_lag2_func y, 0, mid -sum_lag2_func y, 0, right, 7 -sum_lag2_func uv_444, 444, left -sum_lag2_func uv_444, 444, mid -sum_lag2_func uv_444, 444, right, 7 -sum_lag2_func uv_422, 422, left -sum_lag2_func uv_422, 422, mid -sum_lag2_func uv_422, 422, right, 1 -sum_lag2_func uv_420, 420, left -sum_lag2_func uv_420, 420, mid -sum_lag2_func uv_420, 420, right, 1 - - -function sum_lag3_above_neon - sub x11, x0, #3*GRAIN_WIDTH*2 - 16 - sub x12, x0, #2*GRAIN_WIDTH*2 - 16 - sub x13, x0, #1*GRAIN_WIDTH*2 - 16 - ld1 {v15.8h}, [x11] // load top right - ld1 {v18.8h}, [x12] - ld1 {v21.8h}, [x13] - - dup v22.8b, v29.b[0] - ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid - dup v23.8b, v29.b[1] - ext v9.16b, v13.16b, v14.16b, #12 - sxtl v22.8h, v22.8b - dup v24.8b, v29.b[2] - sxtl v23.8h, v23.8b - dup v25.8b, v29.b[3] - ext v10.16b, v13.16b, v14.16b, #14 - sxtl v24.8h, v24.8b - dup v26.8b, v29.b[4] - ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right - sxtl v25.8h, v25.8b - dup v27.8b, v29.b[5] - ext v12.16b, v14.16b, v15.16b, #4 - sxtl v26.8h, v26.8b - dup v28.8b, v29.b[6] - ext v13.16b, v14.16b, v15.16b, #6 - sxtl v27.8h, v27.8b - sxtl v28.8h, v28.8b - - smull v4.4s, v8.4h, v22.4h - smlal v4.4s, v9.4h, v23.4h - smlal v4.4s, v10.4h, v24.4h - smlal v4.4s, v11.4h, v26.4h - smlal v4.4s, v12.4h, v27.4h - smlal v4.4s, v13.4h, v28.4h - smlal v4.4s, v14.4h, v25.4h - smull2 v5.4s, v8.8h, v22.8h - smlal2 v5.4s, v9.8h, v23.8h - smlal2 v5.4s, v10.8h, v24.8h - smlal2 v5.4s, v11.8h, v26.8h - smlal2 v5.4s, v12.8h, v27.8h - smlal2 v5.4s, v13.8h, v28.8h - smlal2 v5.4s, v14.8h, v25.8h - - dup v22.8b, v29.b[7] - ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid - dup v23.8b, v29.b[8] - ext v9.16b, v16.16b, v17.16b, #12 - sxtl v22.8h, v22.8b - dup v24.8b, v29.b[9] - sxtl v23.8h, v23.8b - dup v25.8b, v29.b[10] - ext v10.16b, v16.16b, v17.16b, #14 - sxtl v24.8h, v24.8b - dup v26.8b, v29.b[11] - ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right - sxtl v25.8h, v25.8b - dup v27.8b, v29.b[12] - ext v12.16b, v17.16b, v18.16b, #4 - sxtl v26.8h, v26.8b - dup v28.8b, v29.b[13] - ext v13.16b, v17.16b, v18.16b, #6 - sxtl v27.8h, v27.8b - sxtl v28.8h, v28.8b - - smlal v4.4s, v8.4h, v22.4h - smlal v4.4s, v9.4h, v23.4h - smlal v4.4s, v10.4h, v24.4h - smlal v4.4s, v11.4h, v26.4h - smlal v4.4s, v12.4h, v27.4h - smlal v4.4s, v13.4h, v28.4h - smlal v4.4s, v17.4h, v25.4h - smlal2 v5.4s, v8.8h, v22.8h - smlal2 v5.4s, v9.8h, v23.8h - smlal2 v5.4s, v10.8h, v24.8h - smlal2 v5.4s, v11.8h, v26.8h - smlal2 v5.4s, v12.8h, v27.8h - smlal2 v5.4s, v13.8h, v28.8h - smlal2 v5.4s, v17.8h, v25.8h - - dup v22.8b, v29.b[14] - ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid - dup v23.8b, v29.b[15] - ext v9.16b, v19.16b, v20.16b, #12 - sxtl v22.8h, v22.8b - dup v24.8b, v30.b[0] - sxtl v23.8h, v23.8b - dup v25.8b, v30.b[1] - ext v10.16b, v19.16b, v20.16b, #14 - sxtl v24.8h, v24.8b - dup v26.8b, v30.b[2] - ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right - sxtl v25.8h, v25.8b - dup v27.8b, v30.b[3] - ext v12.16b, v20.16b, v21.16b, #4 - sxtl v26.8h, v26.8b - dup v28.8b, v30.b[4] - ext v13.16b, v20.16b, v21.16b, #6 - sxtl v27.8h, v27.8b - sxtl v28.8h, v28.8b - - smlal v4.4s, v8.4h, v22.4h - smlal v4.4s, v9.4h, v23.4h - smlal v4.4s, v10.4h, v24.4h - 
smlal v4.4s, v11.4h, v26.4h - smlal v4.4s, v12.4h, v27.4h - smlal v4.4s, v13.4h, v28.4h - smlal v4.4s, v20.4h, v25.4h - mov v16.16b, v17.16b - mov v17.16b, v18.16b - smlal2 v5.4s, v8.8h, v22.8h - smlal2 v5.4s, v9.8h, v23.8h - smlal2 v5.4s, v10.8h, v24.8h - smlal2 v5.4s, v11.8h, v26.8h - smlal2 v5.4s, v12.8h, v27.8h - smlal2 v5.4s, v13.8h, v28.8h - smlal2 v5.4s, v20.8h, v25.8h - - mov v13.16b, v14.16b - mov v14.16b, v15.16b - - mov v19.16b, v20.16b - mov v20.16b, v21.16b - ret -endfunc - -.macro sum_lag3_func type, uv_layout, edge, elems=8 -function sum_\type\()_lag3_\edge\()_neon - str x30, [sp, #-16]! -.ifc \edge, left - sub x11, x0, #3*GRAIN_WIDTH*2 - sub x12, x0, #2*GRAIN_WIDTH*2 - sub x13, x0, #1*GRAIN_WIDTH*2 - ld1 {v14.8h}, [x11] // load the previous block right above - ld1 {v17.8h}, [x12] - ld1 {v20.8h}, [x13] -.endif - sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] -endfunc -.endm - -sum_lag3_func y, 0, left -sum_lag3_func y, 0, mid -sum_lag3_func y, 0, right, 7 -sum_lag3_func uv_444, 444, left -sum_lag3_func uv_444, 444, mid -sum_lag3_func uv_444, 444, right, 7 -sum_lag3_func uv_422, 422, left -sum_lag3_func uv_422, 422, mid -sum_lag3_func uv_422, 422, right, 1 -sum_lag3_func uv_420, 420, left -sum_lag3_func uv_420, 420, mid -sum_lag3_func uv_420, 420, right, 1 - -function generate_grain_rows_neon - str x30, [sp, #-16]! -1: - mov w16, #80 -2: - bl get_gaussian_neon - srshl v0.8h, v0.8h, v31.8h - subs w16, w16, #8 - st1 {v0.8h}, [x0], #16 - b.gt 2b - get_grain_2 v0 - subs w1, w1, #1 - st1 {v0.s}[0], [x0], #4 - b.gt 1b - ldr x30, [sp], #16 - ret -endfunc - -function generate_grain_rows_44_neon - str x30, [sp, #-16]! -1: - mov w16, #40 -2: - bl get_gaussian_neon - srshl v0.8h, v0.8h, v31.8h - subs w16, w16, #8 - st1 {v0.8h}, [x0], #16 - b.gt 2b - get_grain_4 v0 - subs w1, w1, #1 - st1 {v0.4h}, [x0] - add x0, x0, #GRAIN_WIDTH*2-80 - b.gt 1b - ldr x30, [sp], #16 - ret -endfunc - -function gen_grain_uv_444_lag0_neon - str x30, [sp, #-16]! - ld1 {v4.8h}, [x19], #16 -gen_grain_uv_lag0_8_start: - bl get_gaussian_neon - srshl v0.8h, v0.8h, v31.8h -gen_grain_uv_lag0_8_add: - and v4.16b, v4.16b, v1.16b - smull v2.4s, v4.4h, v27.4h - smull2 v3.4s, v4.8h, v27.8h - srshl v2.4s, v2.4s, v28.4s - srshl v3.4s, v3.4s, v28.4s - sqxtn v2.4h, v2.4s - sqxtn2 v2.8h, v3.4s - sqadd v2.8h, v2.8h, v0.8h - smin v2.8h, v2.8h, v25.8h - smax v2.8h, v2.8h, v26.8h - st1 {v2.8h}, [x0], #16 - ldr x30, [sp], #16 - ret -endfunc - -function gen_grain_uv_420_lag0_8_neon - add x12, x19, #GRAIN_WIDTH*2 - str x30, [sp, #-16]! - ld1 {v16.8h, v17.8h}, [x19], #32 - ld1 {v18.8h, v19.8h}, [x12] - addp v16.8h, v16.8h, v17.8h - addp v17.8h, v18.8h, v19.8h - add v16.8h, v16.8h, v17.8h - srshr v4.8h, v16.8h, #2 - b gen_grain_uv_lag0_8_start -endfunc - -function gen_grain_uv_422_lag0_8_neon - str x30, [sp, #-16]! - ld1 {v16.8h, v17.8h}, [x19], #32 - addp v16.8h, v16.8h, v17.8h - srshr v4.8h, v16.8h, #1 - b gen_grain_uv_lag0_8_start -endfunc - -function gen_grain_uv_420_lag0_4_neon - add x12, x19, #GRAIN_WIDTH*2 - str x30, [sp, #-16]! - ld1 {v16.4h, v17.4h}, [x19] - ld1 {v18.4h, v19.4h}, [x12] - add x19, x19, #32 - addp v16.4h, v16.4h, v17.4h - addp v17.4h, v18.4h, v19.4h - add v16.4h, v16.4h, v17.4h - srshr v4.4h, v16.4h, #2 - get_grain_4 v0 - b gen_grain_uv_lag0_8_add -endfunc - -function gen_grain_uv_422_lag0_4_neon - str x30, [sp, #-16]! 
- ld1 {v16.4h, v17.4h}, [x19] - add x19, x19, #32 - addp v16.4h, v16.4h, v17.4h - srshr v4.4h, v16.4h, #1 - get_grain_4 v0 - b gen_grain_uv_lag0_8_add -endfunc - -.macro gen_grain_82 type -function generate_grain_\type\()_16bpc_neon, export=1 - stp x30, x19, [sp, #-96]! - -.ifc \type, uv_444 - mov w13, w3 - mov w14, #28 - add x19, x1, #3*GRAIN_WIDTH*2 - mov x1, x2 - mul w13, w13, w14 - clz w15, w4 -.else - clz w15, w2 -.endif - movrel x3, X(gaussian_sequence) - sub w15, w15, #24 // -bitdepth_min_8 - ldr w2, [x1, #FGD_SEED] - ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] -.ifc \type, y - add x4, x1, #FGD_AR_COEFFS_Y -.else - add x4, x1, #FGD_AR_COEFFS_UV -.endif - add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 - adr x16, L(gen_grain_\type\()_tbl) - ldr w17, [x1, #FGD_AR_COEFF_LAG] - add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] - dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift - sub x16, x16, w17, uxtw - neg v31.8h, v31.8h - -.ifc \type, uv_444 - cmp w13, #0 - mov w11, #0x49d8 - mov w14, #0xb524 - add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] - csel w11, w11, w14, ne -.endif - - ldr w7, [x1, #FGD_AR_COEFF_SHIFT] - neg w15, w15 // bitdepth_min_8 - mov w8, #1 - mov w10, #1 - lsl w8, w8, w7 // 1 << ar_coeff_shift - lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) - lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) - lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) - mov w5, #128 - lsl w5, w5, w15 // 128 << bitdepth_min_8 - neg w6, w5 // -(128 << bitpdeth_min_8) - sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 - -.ifc \type, uv_444 - eor w2, w2, w11 -.endif - - br x16 - -L(generate_grain_\type\()_lag0): -.ifc \type, y - mov w1, #GRAIN_HEIGHT - bl generate_grain_rows_neon -.else - dup v28.4s, w7 - ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] - movi v0.16b, #0 - movi v1.16b, #255 - dup v25.8h, w5 - dup v26.8h, w6 - ext v29.16b, v0.16b, v1.16b, #10 - ext v30.16b, v1.16b, v0.16b, #2 - neg v28.4s, v28.4s - sxtl v27.8h, v27.8b - - mov w1, #3 - bl generate_grain_rows_neon - mov w1, #GRAIN_HEIGHT-3 -1: - mov v1.16b, v29.16b - bl gen_grain_uv_444_lag0_neon // 8 - movi v1.16b, #255 - bl gen_grain_uv_444_lag0_neon // 16 - bl gen_grain_uv_444_lag0_neon // 24 - bl gen_grain_uv_444_lag0_neon // 32 - bl gen_grain_uv_444_lag0_neon // 40 - bl gen_grain_uv_444_lag0_neon // 48 - bl gen_grain_uv_444_lag0_neon // 56 - bl gen_grain_uv_444_lag0_neon // 64 - bl gen_grain_uv_444_lag0_neon // 72 - mov v1.16b, v30.16b - bl gen_grain_uv_444_lag0_neon // 80 - get_grain_2 v16 - subs w1, w1, #1 - add x19, x19, #4 - st1 {v16.s}[0], [x0], #4 - b.gt 1b -.endif - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag1): - ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] - ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] - ld1r {v29.8b}, [x4] // ar_coeffs_y[2] -.ifc \type, y - ldrsb w4, [x4, #1] // ar_coeffs_y[3] -.else - add x4, x4, #2 -.endif - - mov w1, #3 -.ifc \type, uv_444 - ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] - ldursb w4, [x4, #-1] // ar_coeffs_uv[3] -.endif - bl generate_grain_rows_neon - sxtl v27.8h, v27.8b - sxtl v28.8h, v28.8b - sxtl v29.8h, v29.8b -.ifc \type, uv_444 - sxtl v30.8h, v30.8b -.endif - - mov w1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag1_left_neon // 8 - bl sum_\type\()_lag1_mid_neon // 16 - bl sum_\type\()_lag1_mid_neon // 24 - bl sum_\type\()_lag1_mid_neon // 32 - bl sum_\type\()_lag1_mid_neon // 40 - bl sum_\type\()_lag1_mid_neon // 48 - bl sum_\type\()_lag1_mid_neon // 56 - bl sum_\type\()_lag1_mid_neon // 64 - bl sum_\type\()_lag1_mid_neon // 72 - bl 
sum_\type\()_lag1_right_neon // 80 - get_grain_2 v16 - subs w1, w1, #1 -.ifc \type, uv_444 - add x19, x19, #4 -.endif - st1 {v16.s}[0], [x0], #4 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag2): - ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] - - smov w4, v30.b[10] - smov w17, v30.b[11] - - mov w1, #3 - bl generate_grain_rows_neon - - mov w1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag2_left_neon // 8 - bl sum_\type\()_lag2_mid_neon // 16 - bl sum_\type\()_lag2_mid_neon // 24 - bl sum_\type\()_lag2_mid_neon // 32 - bl sum_\type\()_lag2_mid_neon // 40 - bl sum_\type\()_lag2_mid_neon // 48 - bl sum_\type\()_lag2_mid_neon // 56 - bl sum_\type\()_lag2_mid_neon // 64 - bl sum_\type\()_lag2_mid_neon // 72 - bl sum_\type\()_lag2_right_neon // 80 - get_grain_2 v16 - subs w1, w1, #1 -.ifc \type, uv_444 - add x19, x19, #4 -.endif - st1 {v16.s}[0], [x0], #4 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag3): - ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] - stp x20, x21, [sp, #80] - - smov w4, v30.b[5] - smov w20, v30.b[6] - smov w21, v30.b[7] - - mov w1, #3 - bl generate_grain_rows_neon - - mov w1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag3_left_neon // 8 - bl sum_\type\()_lag3_mid_neon // 16 - bl sum_\type\()_lag3_mid_neon // 24 - bl sum_\type\()_lag3_mid_neon // 32 - bl sum_\type\()_lag3_mid_neon // 40 - bl sum_\type\()_lag3_mid_neon // 48 - bl sum_\type\()_lag3_mid_neon // 56 - bl sum_\type\()_lag3_mid_neon // 64 - bl sum_\type\()_lag3_mid_neon // 72 - bl sum_\type\()_lag3_right_neon // 80 - get_grain_2 v16 - subs w1, w1, #1 -.ifc \type, uv_444 - add x19, x19, #4 -.endif - st1 {v16.s}[0], [x0], #4 - b.gt 1b - - ldp x20, x21, [sp, #80] - ldp d14, d15, [sp, #64] - ldp d12, d13, [sp, #48] - ldp d10, d11, [sp, #32] - ldp d8, d9, [sp, #16] - ldp x30, x19, [sp], #96 - ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) -endfunc -.endm - -gen_grain_82 y -gen_grain_82 uv_444 - -.macro set_height dst, type -.ifc \type, uv_420 - mov \dst, #SUB_GRAIN_HEIGHT-3 -.else - mov \dst, #GRAIN_HEIGHT-3 -.endif -.endm - -.macro increment_y_ptr reg, type -.ifc \type, uv_420 - add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) -.else - sub \reg, \reg, #6*32-GRAIN_WIDTH*2 -.endif -.endm - -.macro gen_grain_44 type -function generate_grain_\type\()_16bpc_neon, export=1 - stp x30, x19, [sp, #-96]! 
- - mov w13, w3 - mov w14, #28 - add x19, x1, #(3*GRAIN_WIDTH-3)*2 - mov x1, x2 - mul w13, w13, w14 - clz w15, w4 - - movrel x3, X(gaussian_sequence) - sub w15, w15, #24 // -bitdepth_min_8 - ldr w2, [x1, #FGD_SEED] - ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] - add x4, x1, #FGD_AR_COEFFS_UV - add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 - adr x16, L(gen_grain_\type\()_tbl) - ldr w17, [x1, #FGD_AR_COEFF_LAG] - add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] - dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift - sub x16, x16, w17, uxtw - neg v31.8h, v31.8h - - cmp w13, #0 - mov w11, #0x49d8 - mov w14, #0xb524 - add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] - csel w11, w11, w14, ne - - ldr w7, [x1, #FGD_AR_COEFF_SHIFT] - neg w15, w15 // bitdepth_min_8 - mov w8, #1 - mov w10, #1 - lsl w8, w8, w7 // 1 << ar_coeff_shift - lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) - lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) - lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) - mov w5, #128 - lsl w5, w5, w15 // 128 << bitdepth_min_8 - neg w6, w5 // -(128 << bitpdeth_min_8) - sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 - - eor w2, w2, w11 - - br x16 - -L(generate_grain_\type\()_lag0): - dup v28.4s, w7 - ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] - movi v0.16b, #0 - movi v1.16b, #255 - dup v25.8h, w5 - dup v26.8h, w6 - ext v29.16b, v0.16b, v1.16b, #10 - ext v30.16b, v1.16b, v0.16b, #14 - neg v28.4s, v28.4s - sxtl v27.8h, v27.8b - - mov w1, #3 - bl generate_grain_rows_44_neon - set_height w1, \type -1: - mov v1.16b, v29.16b - bl gen_grain_\type\()_lag0_8_neon // 8 - movi v1.16b, #255 - bl gen_grain_\type\()_lag0_8_neon // 16 - bl gen_grain_\type\()_lag0_8_neon // 24 - bl gen_grain_\type\()_lag0_8_neon // 32 - bl gen_grain_\type\()_lag0_8_neon // 40 - mov v1.16b, v30.16b - bl gen_grain_\type\()_lag0_4_neon // 44 - subs w1, w1, #1 - increment_y_ptr x19, \type - add x0, x0, #GRAIN_WIDTH*2-6*16 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag1): - ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0] - ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1] - ld1r {v29.8b}, [x4] // ar_coeffs_uv[2] - add x4, x4, #2 - - mov w1, #3 - ld1r {v30.8b}, [x4] // ar_coeffs_u4[4] - ldursb w4, [x4, #-1] // ar_coeffs_uv[3] - bl generate_grain_rows_44_neon - - sxtl v27.8h, v27.8b - sxtl v28.8h, v28.8b - sxtl v29.8h, v29.8b - sxtl v30.8h, v30.8b - set_height w1, \type -1: - bl sum_\type\()_lag1_left_neon // 8 - bl sum_\type\()_lag1_mid_neon // 16 - bl sum_\type\()_lag1_mid_neon // 24 - bl sum_\type\()_lag1_mid_neon // 32 - bl sum_\type\()_lag1_mid_neon // 40 - bl sum_\type\()_lag1_right_neon // 44 - subs w1, w1, #1 - increment_y_ptr x19, \type - add x0, x0, #GRAIN_WIDTH*2-6*16 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag2): - ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] - - smov w4, v30.b[10] - smov w17, v30.b[11] - - mov w1, #3 - bl generate_grain_rows_44_neon - - set_height w1, \type -1: - bl sum_\type\()_lag2_left_neon // 8 - bl sum_\type\()_lag2_mid_neon // 16 - bl sum_\type\()_lag2_mid_neon // 24 - bl sum_\type\()_lag2_mid_neon // 32 - bl sum_\type\()_lag2_mid_neon // 40 - bl sum_\type\()_lag2_right_neon // 44 - subs w1, w1, #1 - increment_y_ptr x19, \type - add x0, x0, #GRAIN_WIDTH*2-6*16 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag3): - ldr q29, [x4] // ar_coeffs_uv[0-15] - ldr q30, [x4, #16] // ar_coeffs_uv[16-24] - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] - stp 
x20, x21, [sp, #80] - - smov w4, v30.b[5] - smov w20, v30.b[6] - smov w21, v30.b[7] - - mov w1, #3 - bl generate_grain_rows_44_neon - - set_height w1, \type -1: - bl sum_\type\()_lag3_left_neon // 8 - bl sum_\type\()_lag3_mid_neon // 16 - bl sum_\type\()_lag3_mid_neon // 24 - bl sum_\type\()_lag3_mid_neon // 32 - bl sum_\type\()_lag3_mid_neon // 40 - bl sum_\type\()_lag3_right_neon // 44 - subs w1, w1, #1 - increment_y_ptr x19, \type - add x0, x0, #GRAIN_WIDTH*2-6*16 - b.gt 1b - - ldp x20, x21, [sp, #80] - ldp d14, d15, [sp, #64] - ldp d12, d13, [sp, #48] - ldp d10, d11, [sp, #32] - ldp d8, d9, [sp, #16] - ldp x30, x19, [sp], #96 - ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) -endfunc -.endm - -gen_grain_44 uv_420 -gen_grain_44 uv_422 - -.macro gather_interleaved dst1, dst2, src1, src2, off - umov w14, \src1[0] - umov w15, \src2[1] - umov w16, \src1[2] - add x14, x14, x3 - umov w17, \src2[3] - add x15, x15, x3 - ld1 {\dst1}[0+\off], [x14] - umov w14, \src1[4] - add x16, x16, x3 - ld1 {\dst2}[1+\off], [x15] - umov w15, \src2[5] - add x17, x17, x3 - ld1 {\dst1}[2+\off], [x16] - umov w16, \src1[6] - add x14, x14, x3 - ld1 {\dst2}[3+\off], [x17] - umov w17, \src2[7] - add x15, x15, x3 - ld1 {\dst1}[4+\off], [x14] - add x16, x16, x3 - ld1 {\dst2}[5+\off], [x15] - add x17, x17, x3 - ld1 {\dst1}[6+\off], [x16] - ld1 {\dst2}[7+\off], [x17] -.endm - -.macro gather dst1, dst2, src1, src2, src3, src4 - gather_interleaved \dst1, \dst2, \src1, \src3, 0 - gather_interleaved \dst2, \dst1, \src3, \src1, 0 - gather_interleaved \dst1, \dst2, \src2, \src4, 8 - gather_interleaved \dst2, \dst1, \src4, \src2, 8 -.endm - -function gather32_neon - gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h - ret -endfunc - -function gather16_neon - gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 - gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 - ins v6.d[1], v7.d[0] - ret -endfunc - -const overlap_coeffs_0, align=4 - .short 27, 17, 0, 0 - .short 17, 27, 32, 32 -endconst - -const overlap_coeffs_1, align=4 - .short 23, 0, 0, 0 - .short 22, 32, 32, 32 -endconst - -.macro calc_offset offx, offy, src, sx, sy - and \offy, \src, #0xF // randval & 0xF - lsr \offx, \src, #4 // randval >> 4 -.if \sy == 0 - add \offy, \offy, \offy // 2 * (randval & 0xF) -.endif -.if \sx == 0 - add \offx, \offx, \offx // 2 * (randval >> 4) -.endif -.endm - -.macro add_offset dst, offx, offy, src, stride - madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy - add \dst, \dst, \offx, uxtw #1 // grain_lut += offx -.endm - -// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const int scaling_shift, -// const entry grain_lut[][GRAIN_WIDTH], -// const int offsets[][2], -// const int h, const ptrdiff_t clip, -// const ptrdiff_t type, -// const int bitdepth_max); -function fgy_32x32_16bpc_neon, export=1 - str x30, [sp, #-80]! 
- stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - str d14, [sp, #64] - eor w4, w4, #15 // 15 - scaling_shift - ldr w11, [x6, #8] // offsets[1][0] - ldr w13, [x6, #4] // offsets[0][1] - ldr w15, [x6, #12] // offsets[1][1] - ldr w10, [sp, #96] // bitdepth_max - ldr w6, [x6] // offsets[0][0] - dup v26.8h, w10 // bitdepth_max - clz w10, w10 - ldr w8, [sp, #80] // clip - sub w10, w10, #24 // -bitdepth_min_8 - mov x9, #GRAIN_WIDTH*2 // grain_lut stride - neg w10, w10 // bitdepth_min_8 - - dup v29.8h, w4 // 15 - scaling_shift - dup v27.8h, w10 // bitdepth_min_8 - - movrel x16, overlap_coeffs_0 - - cbz w8, 1f - // clip - movi v30.8h, #16 - movi v31.8h, #235 - sshl v30.8h, v30.8h, v27.8h - sshl v31.8h, v31.8h, v27.8h - b 2f -1: - // no clip - movi v30.8h, #0 - mov v31.16b, v26.16b // bitdepth_max -2: - - ushr v26.8h, v26.8h, #1 // grain_max - not v25.16b, v26.16b // grain_min - - ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs - - add x5, x5, #18 // grain_lut += 9 - add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride - add x5, x5, x9 // grain_lut += grain_stride - - calc_offset w11, w12, w11, 0, 0 - calc_offset w13, w14, w13, 0, 0 - calc_offset w15, w16, w15, 0, 0 - calc_offset w6, w10, w6, 0, 0 - - add_offset x12, w11, x12, x5, x9 - add_offset x14, w13, x14, x5, x9 - add_offset x16, w15, x16, x5, x9 - add_offset x5, w6, x10, x5, x9 - - ldr w11, [sp, #88] // type - adr x13, L(fgy_loop_tbl) - - add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx - add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - - tst w11, #1 - ldrh w11, [x13, w11, uxtw #1] - - add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx - - sub x11, x13, w11, uxtw - - b.eq 1f - // y overlap - dup v8.8h, v27.h[0] - dup v9.8h, v27.h[1] - mov w10, w7 // backup actual h - mov w7, #2 -1: - br x11 -endfunc - -function fgy_loop_neon -.macro fgy ox, oy -L(loop_\ox\oy): -1: - ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src -.if \ox - ld1 {v20.4h}, [x4], x9 // grain_lut old -.endif -.if \oy - ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top -.endif -.if \ox && \oy - ld1 {v14.4h}, [x8], x9 // grain_lut top old -.endif - mvni v4.8h, #0xf0, lsl #8 // 0x0fff - ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut - - // Make sure that uninitialized pixels out of range past the right - // edge are in range; their actual values shouldn't matter. 
- and v0.16b, v0.16b, v4.16b - and v1.16b, v1.16b, v4.16b - and v2.16b, v2.16b, v4.16b - and v3.16b, v3.16b, v4.16b - bl gather32_neon - -.if \ox - smull v20.4s, v20.4h, v27.4h - smlal v20.4s, v16.4h, v28.4h -.endif - -.if \oy -.if \ox - smull v14.4s, v14.4h, v27.4h - smlal v14.4s, v21.4h, v28.4h - sqrshrn v20.4h, v20.4s, #5 - sqrshrn v14.4h, v14.4s, #5 - smin v20.4h, v20.4h, v26.4h - smin v14.4h, v14.4h, v26.4h - smax v20.4h, v20.4h, v25.4h - smax v14.4h, v14.4h, v25.4h -.endif - -.if \ox - smull v10.4s, v20.4h, v9.4h -.else - smull v10.4s, v16.4h, v9.4h -.endif - smull2 v11.4s, v16.8h, v9.8h - smull v12.4s, v17.4h, v9.4h - smull2 v13.4s, v17.8h, v9.8h - smull v16.4s, v18.4h, v9.4h - smull2 v17.4s, v18.8h, v9.8h - smull v18.4s, v19.4h, v9.4h - smull2 v19.4s, v19.8h, v9.8h -.if \ox - smlal v10.4s, v14.4h, v8.4h -.else - smlal v10.4s, v21.4h, v8.4h -.endif - smlal2 v11.4s, v21.8h, v8.8h - smlal v12.4s, v22.4h, v8.4h - smlal2 v13.4s, v22.8h, v8.8h - smlal v16.4s, v23.4h, v8.4h - smlal2 v17.4s, v23.8h, v8.8h - smlal v18.4s, v24.4h, v8.4h - smlal2 v19.4s, v24.8h, v8.8h - sqrshrn v10.4h, v10.4s, #5 - sqrshrn2 v10.8h, v11.4s, #5 - sqrshrn v11.4h, v12.4s, #5 - sqrshrn2 v11.8h, v13.4s, #5 - sqrshrn v12.4h, v16.4s, #5 - sqrshrn2 v12.8h, v17.4s, #5 - sqrshrn v13.4h, v18.4s, #5 - sqrshrn2 v13.8h, v19.4s, #5 - smin v16.8h, v10.8h, v26.8h - smin v17.8h, v11.8h, v26.8h - smin v18.8h, v12.8h, v26.8h - smin v19.8h, v13.8h, v26.8h - smax v16.8h, v16.8h, v25.8h - smax v17.8h, v17.8h, v25.8h - smax v18.8h, v18.8h, v25.8h - smax v19.8h, v19.8h, v25.8h -.endif - - uxtl v4.8h, v6.8b // scaling -.if \ox && !\oy - sqrshrn v20.4h, v20.4s, #5 -.endif - uxtl2 v5.8h, v6.16b -.if \ox && !\oy - smin v20.4h, v20.4h, v26.4h -.endif - uxtl v6.8h, v7.8b -.if \ox && !\oy - smax v20.4h, v20.4h, v25.4h -.endif - uxtl2 v7.8h, v7.16b -.if \ox && !\oy - ins v16.d[0], v20.d[0] -.endif - ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) - ushl v5.8h, v5.8h, v29.8h - ushl v6.8h, v6.8h, v29.8h - ushl v7.8h, v7.8h, v29.8h - - sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) - sqrdmulh v21.8h, v17.8h, v5.8h - sqrdmulh v22.8h, v18.8h, v6.8h - sqrdmulh v23.8h, v19.8h, v7.8h - - usqadd v0.8h, v20.8h // *src + noise - usqadd v1.8h, v21.8h - usqadd v2.8h, v22.8h - usqadd v3.8h, v23.8h - - umax v0.8h, v0.8h, v30.8h - umax v1.8h, v1.8h, v30.8h - umax v2.8h, v2.8h, v30.8h - umax v3.8h, v3.8h, v30.8h - umin v0.8h, v0.8h, v31.8h - umin v1.8h, v1.8h, v31.8h - umin v2.8h, v2.8h, v31.8h - umin v3.8h, v3.8h, v31.8h - - subs w7, w7, #1 -.if \oy - dup v8.8h, v28.h[0] - dup v9.8h, v28.h[1] -.endif - st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst - b.gt 1b - -.if \oy - cmp w10, #2 - sub w7, w10, #2 // restore actual remaining h - b.gt L(loop_\ox\()0) -.endif - ldr d14, [sp, #64] - ldp d12, d13, [sp, #48] - ldp d10, d11, [sp, #32] - ldp d8, d9, [sp, #16] - ldr x30, [sp], #80 - ret -.endm - - fgy 0, 0 - fgy 0, 1 - fgy 1, 0 - fgy 1, 1 - -L(fgy_loop_tbl): - .hword L(fgy_loop_tbl) - L(loop_00) - .hword L(fgy_loop_tbl) - L(loop_01) - .hword L(fgy_loop_tbl) - L(loop_10) - .hword L(fgy_loop_tbl) - L(loop_11) -endfunc - -// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, -// const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const Dav1dFilmGrainData *const data, -// const entry grain_lut[][GRAIN_WIDTH], -// const pixel *const luma_row, -// const ptrdiff_t luma_stride, -// const int offsets[][2], -// const ptrdiff_t h, const ptrdiff_t uv, -// const ptrdiff_t 
is_id, -// const ptrdiff_t type, -// const int bitdepth_max); -.macro fguv layout, sx, sy -function fguv_32x32_\layout\()_16bpc_neon, export=1 - str x30, [sp, #-80]! - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] - - ldp x8, x9, [sp, #80] // offsets, h - ldp x10, x11, [sp, #96] // uv, is_id - ldr w16, [sp, #120] // bitdepth_max - - ldr w13, [x4, #FGD_SCALING_SHIFT] - ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] - dup v23.8h, w16 // bitdepth_max - clz w16, w16 - eor w13, w13, #15 // 15 - scaling_shift - sub w16, w16, #24 // -bitdepth_min_8 - - // !csfl - add x10, x4, x10, lsl #2 // + 4*uv - add x14, x10, #FGD_UV_LUMA_MULT - add x15, x10, #FGD_UV_MULT - add x10, x10, #FGD_UV_OFFSET - neg w16, w16 // bitdepth_min_8 - ld1r {v8.8h}, [x14] // uv_luma_mult - ld1r {v24.8h}, [x10] // uv_offset - ld1r {v9.8h}, [x15] // uv_mult - - dup v29.8h, w13 // 15 - scaling_shift - dup v27.8h, w16 // bitdepth_min_8 - - cbz w12, 1f - // clip - movi v30.8h, #16 - movi v31.8h, #240 - sshl v30.8h, v30.8h, v27.8h - sshl v31.8h, v31.8h, v27.8h - cbz w11, 2f - // is_id - movi v31.8h, #235 - sshl v31.8h, v31.8h, v27.8h - b 2f -1: - // no clip - movi v30.8h, #0 - mov v31.16b, v23.16b // bitdepth_max -2: - - ushr v15.8h, v23.8h, #1 // grain_max - sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 - not v14.16b, v15.16b // grain_min - - ldr w12, [x8, #8] // offsets[1][0] - ldr w14, [x8, #4] // offsets[0][1] - ldr w16, [x8, #12] // offsets[1][1] - ldr w8, [x8] // offsets[0][0] - - mov x10, #GRAIN_WIDTH*2 // grain_lut stride - - add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 -.if \sy - add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride - add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride -.else - add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride - add x5, x5, x10 // grain_lut += grain_stride -.endif - - calc_offset w12, w13, w12, \sx, \sy - calc_offset w14, w15, w14, \sx, \sy - calc_offset w16, w17, w16, \sx, \sy - calc_offset w8, w11, w8, \sx, \sy - - add_offset x13, w12, x13, x5, x10 - add_offset x15, w14, x15, x5, x10 - add_offset x17, w16, x17, x5, x10 - add_offset x5, w8, x11, x5, x10 - - add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - - ldr w13, [sp, #112] // type - - movrel x16, overlap_coeffs_\sx - adr x14, L(fguv_loop_sx\sx\()_tbl) - - ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs - tst w13, #1 - ldrh w13, [x14, w13, uxtw #1] - - b.eq 1f - // y overlap - sub w12, w9, #(2 >> \sy) // backup remaining h - mov w9, #(2 >> \sy) - -1: - sub x13, x14, w13, uxtw - -.if \sy - movi v25.8h, #23 - movi v26.8h, #22 -.else - movi v25.8h, #27 - movi v26.8h, #17 -.endif - -.if \sy - add x7, x7, x7 // luma_stride *= 2 -.endif - - br x13 -endfunc -.endm - -fguv 420, 1, 1 -fguv 422, 1, 0 -fguv 444, 0, 0 - -function fguv_loop_sx0_neon -.macro fguv_loop_sx0 csfl, ox, oy -L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): -1: -.if \ox - ld1 {v4.4h}, [x4], x10 // grain_lut old -.endif -.if \oy - ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top -.endif -.if \ox && \oy - ld1 {v5.4h}, [x11], x10 // grain_lut top old -.endif - ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut - -.if \ox - smull v4.4s, v4.4h, v27.4h - smlal v4.4s, v16.4h, v28.4h -.endif - -.if \oy -.if \ox - smull v5.4s, v5.4h, v27.4h - 
smlal v5.4s, v0.4h, v28.4h - sqrshrn v4.4h, v4.4s, #5 - sqrshrn v5.4h, v5.4s, #5 - smin v4.4h, v4.4h, v15.4h - smin v5.4h, v5.4h, v15.4h - smax v4.4h, v4.4h, v14.4h - smax v5.4h, v5.4h, v14.4h - ins v16.d[0], v4.d[0] - ins v0.d[0], v5.d[0] -.endif - - smull v6.4s, v16.4h, v26.4h - smull2 v7.4s, v16.8h, v26.8h - smull v10.4s, v17.4h, v26.4h - smull2 v11.4s, v17.8h, v26.8h - smull v16.4s, v18.4h, v26.4h - smull2 v17.4s, v18.8h, v26.8h - smull v18.4s, v19.4h, v26.4h - smull2 v19.4s, v19.8h, v26.8h - smlal v6.4s, v0.4h, v25.4h - smlal2 v7.4s, v0.8h, v25.8h - smlal v10.4s, v1.4h, v25.4h - smlal2 v11.4s, v1.8h, v25.8h - smlal v16.4s, v2.4h, v25.4h - smlal2 v17.4s, v2.8h, v25.8h - smlal v18.4s, v3.4h, v25.4h - smlal2 v19.4s, v3.8h, v25.8h - sqrshrn v6.4h, v6.4s, #5 - sqrshrn2 v6.8h, v7.4s, #5 - sqrshrn v7.4h, v10.4s, #5 - sqrshrn2 v7.8h, v11.4s, #5 - sqrshrn v10.4h, v16.4s, #5 - sqrshrn2 v10.8h, v17.4s, #5 - sqrshrn v11.4h, v18.4s, #5 - sqrshrn2 v11.8h, v19.4s, #5 -.endif - -.if \ox && !\oy - sqrshrn v4.4h, v4.4s, #5 - smin v4.4h, v4.4h, v15.4h -.endif - ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma -.if \oy - smin v16.8h, v6.8h, v15.8h - smin v17.8h, v7.8h, v15.8h - smin v18.8h, v10.8h, v15.8h - smin v19.8h, v11.8h, v15.8h - smax v16.8h, v16.8h, v14.8h - smax v17.8h, v17.8h, v14.8h - smax v18.8h, v18.8h, v14.8h - smax v19.8h, v19.8h, v14.8h -.endif - -.if \ox && !\oy - smax v4.4h, v4.4h, v14.4h -.endif - ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src -.if \ox && !\oy - ins v16.d[0], v4.d[0] -.endif - -.if !\csfl - smull v4.4s, v0.4h, v8.4h - smull2 v5.4s, v0.8h, v8.8h - smull v6.4s, v1.4h, v8.4h - smull2 v7.4s, v1.8h, v8.8h - smull v0.4s, v2.4h, v8.4h - smull2 v1.4s, v2.8h, v8.8h - smull v2.4s, v3.4h, v8.4h - smull2 v3.4s, v3.8h, v8.8h - smlal v4.4s, v10.4h, v9.4h - smlal2 v5.4s, v10.8h, v9.8h - smlal v6.4s, v11.4h, v9.4h - smlal2 v7.4s, v11.8h, v9.8h - smlal v0.4s, v12.4h, v9.4h - smlal2 v1.4s, v12.8h, v9.8h - smlal v2.4s, v13.4h, v9.4h - smlal2 v3.4s, v13.8h, v9.8h - shrn v4.4h, v4.4s, #6 - shrn2 v4.8h, v5.4s, #6 - shrn v5.4h, v6.4s, #6 - shrn2 v5.8h, v7.4s, #6 - shrn v6.4h, v0.4s, #6 - shrn2 v6.8h, v1.4s, #6 - shrn v7.4h, v2.4s, #6 - shrn2 v7.8h, v3.4s, #6 - add v0.8h, v4.8h, v24.8h - add v1.8h, v5.8h, v24.8h - add v2.8h, v6.8h, v24.8h - add v3.8h, v7.8h, v24.8h - movi v20.8h, #0 - smin v0.8h, v0.8h, v23.8h - smin v1.8h, v1.8h, v23.8h - smin v2.8h, v2.8h, v23.8h - smin v3.8h, v3.8h, v23.8h - smax v0.8h, v0.8h, v20.8h - smax v1.8h, v1.8h, v20.8h - smax v2.8h, v2.8h, v20.8h - smax v3.8h, v3.8h, v20.8h -.else - // Make sure that uninitialized pixels out of range past the right - // edge are in range; their actual values shouldn't matter. 
- and v0.16b, v0.16b, v23.16b - and v1.16b, v1.16b, v23.16b - and v2.16b, v2.16b, v23.16b - and v3.16b, v3.16b, v23.16b -.endif - - bl gather32_neon - - uxtl v4.8h, v6.8b // scaling - uxtl2 v5.8h, v6.16b - uxtl v6.8h, v7.8b - uxtl2 v7.8h, v7.16b - - ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) - ushl v5.8h, v5.8h, v29.8h - ushl v6.8h, v6.8h, v29.8h - ushl v7.8h, v7.8h, v29.8h - - sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) - sqrdmulh v17.8h, v17.8h, v5.8h - sqrdmulh v18.8h, v18.8h, v6.8h - sqrdmulh v19.8h, v19.8h, v7.8h - - usqadd v10.8h, v16.8h // *src + noise - usqadd v11.8h, v17.8h - usqadd v12.8h, v18.8h - usqadd v13.8h, v19.8h - - umax v0.8h, v10.8h, v30.8h - umax v1.8h, v11.8h, v30.8h - umax v2.8h, v12.8h, v30.8h - umax v3.8h, v13.8h, v30.8h - umin v0.8h, v0.8h, v31.8h - umin v1.8h, v1.8h, v31.8h - umin v2.8h, v2.8h, v31.8h - umin v3.8h, v3.8h, v31.8h - - subs w9, w9, #1 -.if \oy - dup v25.8h, v28.h[0] - dup v26.8h, v28.h[1] -.endif - st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst - b.gt 1b - -.if \oy - cmp w12, #0 - mov w9, w12 // restore actual remaining h - b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) -.endif - b 9f -.endm - fguv_loop_sx0 0, 0, 0 - fguv_loop_sx0 0, 0, 1 - fguv_loop_sx0 0, 1, 0 - fguv_loop_sx0 0, 1, 1 - fguv_loop_sx0 1, 0, 0 - fguv_loop_sx0 1, 0, 1 - fguv_loop_sx0 1, 1, 0 - fguv_loop_sx0 1, 1, 1 - -9: - ldp d14, d15, [sp, #64] - ldp d12, d13, [sp, #48] - ldp d10, d11, [sp, #32] - ldp d8, d9, [sp, #16] - ldr x30, [sp], #80 - ret - -L(fguv_loop_sx0_tbl): - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) -endfunc - -function fguv_loop_sx1_neon -.macro fguv_loop_sx1 csfl, ox, oy -L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): -1: -.if \ox - ld1 {v18.4h}, [x4], x10 // grain_lut old -.endif -.if \oy - ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top -.endif -.if \ox && \oy - ld1 {v19.4h}, [x11], x10 // grain_lut top old -.endif - ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut - -.if \ox - smull v18.4s, v18.4h, v27.4h - smlal v18.4s, v16.4h, v28.4h -.endif - -.if \oy -.if \ox - smull v19.4s, v19.4h, v27.4h - smlal v19.4s, v20.4h, v28.4h - sqrshrn v18.4h, v18.4s, #5 - sqrshrn v19.4h, v19.4s, #5 - smin v18.4h, v18.4h, v15.4h - smin v19.4h, v19.4h, v15.4h - smax v18.4h, v18.4h, v14.4h - smax v19.4h, v19.4h, v14.4h - ins v16.d[0], v18.d[0] - ins v20.d[0], v19.d[0] -.endif - - smull v0.4s, v16.4h, v26.4h - smull2 v1.4s, v16.8h, v26.8h - smull v2.4s, v17.4h, v26.4h - smull2 v3.4s, v17.8h, v26.8h - smlal v0.4s, v20.4h, v25.4h - smlal2 v1.4s, v20.8h, v25.8h - smlal v2.4s, v21.4h, v25.4h - smlal2 v3.4s, v21.8h, v25.8h - sqrshrn v16.4h, v0.4s, #5 - sqrshrn2 v16.8h, v1.4s, #5 - sqrshrn v17.4h, v2.4s, #5 - sqrshrn2 v17.8h, v3.4s, #5 -.endif - -.if \ox && !\oy - sqrshrn v18.4h, v18.4s, #5 - smin v18.4h, v18.4h, v15.4h -.endif - ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma -.if \oy - smin v16.8h, v16.8h, v15.8h - smin v17.8h, v17.8h, v15.8h - smax v16.8h, v16.8h, v14.8h - smax v17.8h, v17.8h, v14.8h -.endif - -.if \ox && !\oy - smax v18.4h, v18.4h, v14.4h -.endif - ld1 {v10.8h, v11.8h}, [x1], x2 // src -.if \ox && !\oy - ins 
v16.d[0], v18.d[0] -.endif - addp v0.8h, v0.8h, v1.8h - addp v1.8h, v2.8h, v3.8h - urshr v0.8h, v0.8h, #1 - urshr v1.8h, v1.8h, #1 -.if !\csfl - smull v2.4s, v0.4h, v8.4h - smull2 v3.4s, v0.8h, v8.8h - smull v0.4s, v1.4h, v8.4h - smull2 v1.4s, v1.8h, v8.8h - smlal v2.4s, v10.4h, v9.4h - smlal2 v3.4s, v10.8h, v9.8h - smlal v0.4s, v11.4h, v9.4h - smlal2 v1.4s, v11.8h, v9.8h - shrn v2.4h, v2.4s, #6 - shrn2 v2.8h, v3.4s, #6 - shrn v3.4h, v0.4s, #6 - shrn2 v3.8h, v1.4s, #6 - add v0.8h, v2.8h, v24.8h - add v1.8h, v3.8h, v24.8h - movi v2.8h, #0 - smin v0.8h, v0.8h, v23.8h - smin v1.8h, v1.8h, v23.8h - smax v0.8h, v0.8h, v2.8h - smax v1.8h, v1.8h, v2.8h -.else - // Make sure that uninitialized pixels out of range past the right - // edge are in range; their actual values shouldn't matter. - and v0.16b, v0.16b, v23.16b - and v1.16b, v1.16b, v23.16b -.endif - - bl gather16_neon - - uxtl v4.8h, v6.8b // scaling - uxtl2 v5.8h, v6.16b - - ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) - ushl v5.8h, v5.8h, v29.8h - - sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) - sqrdmulh v17.8h, v17.8h, v5.8h - - usqadd v10.8h, v16.8h // *src + noise - usqadd v11.8h, v17.8h - - umax v0.8h, v10.8h, v30.8h - umax v1.8h, v11.8h, v30.8h - umin v0.8h, v0.8h, v31.8h - umin v1.8h, v1.8h, v31.8h - -.if \oy - mov v16.16b, v25.16b -.endif - subs w9, w9, #1 -.if \oy - mov v25.16b, v26.16b - mov v26.16b, v16.16b -.endif - st1 {v0.8h, v1.8h}, [x0], x2 // dst - b.gt 1b - -.if \oy - cmp w12, #0 - mov w9, w12 // restore actual remaining h - b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) -.endif - - b 9f -.endm - fguv_loop_sx1 0, 0, 0 - fguv_loop_sx1 0, 0, 1 - fguv_loop_sx1 0, 1, 0 - fguv_loop_sx1 0, 1, 1 - fguv_loop_sx1 1, 0, 0 - fguv_loop_sx1 1, 0, 1 - fguv_loop_sx1 1, 1, 0 - fguv_loop_sx1 1, 1, 1 - -9: - ldp d14, d15, [sp, #64] - ldp d12, d13, [sp, #48] - ldp d10, d11, [sp, #32] - ldp d8, d9, [sp, #16] - ldr x30, [sp], #80 - ret - -L(fguv_loop_sx1_tbl): - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) -endfunc diff -Nru dav1d-0.9.2/src/arm/64/filmgrain16.S dav1d-1.0.0/src/arm/64/filmgrain16.S --- dav1d-0.9.2/src/arm/64/filmgrain16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/arm/64/filmgrain16.S 2022-03-18 14:31:55.974356000 +0000 @@ -0,0 +1,1997 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b,\r3\().16b}, [x0], #32 + st1 {\r4\().16b}, [x0], #16 + st1 {\r5\().h}[0], [x0], #2 +.endm + +function get_grain_2_neon + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + ld1 {v0.h}[1], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +function get_grain_4_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + ret +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output 
entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #2 +.if \n == 1 + madd w11, w14, w4, w11 // sum (above) + *coeff * prev output +.elseif \n == 2 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 +.else + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 +.endif + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + add w12, w12, w10 // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + asr w12, w12, w9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.h[7], w14 + b.gt 1b + ret +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub x12, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + + ext v0.16b, v16.16b, v17.16b, #14 // top left, top mid + ext v1.16b, v17.16b, v18.16b, #2 // top mid, top right + + smull v4.4s, v17.4h, v28.4h + smlal v4.4s, v0.4h, v27.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v17.8h, v28.8h + smlal2 v5.4s, v0.8h, v27.8h + smlal2 v5.4s, v1.8h, v29.8h + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + ret +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff + bl sum_\lag\()_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH*2 + ld1 {v22.8h, v23.8h}, [x19], #32 + ld1 {v24.8h, v25.8h}, [x12] + addp v22.8h, v22.8h, v23.8h + addp v23.8h, v24.8h, v25.8h + add v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.8h, v23.8h}, [x19], #32 + addp v22.8h, v22.8h, v23.8h + srshr v0.8h, v22.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.8h}, [x19], #16 +.endif +.if \uv_layout +.ifnb \uv_coeff + dup v1.8b, \uv_coeff + sxtl v1.8h, v1.8b + smlal v4.4s, v0.4h, v1.4h + smlal2 v5.4s, v0.8h, v1.8h +.else + smlal v4.4s, v0.4h, v30.4h + smlal2 v5.4s, v0.8h, v30.8h +.endif +.endif +.if \uv_layout && \elems == 8 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + ext v4.16b, v4.16b, v4.16b, #12 +.ifc \lag, lag3 + smov w17, v0.h[5] +.endif +.ifnc \lag, lag1 + smov w16, v0.h[6] +.endif + smov w14, v0.h[7] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b +.ifc \edge, right + mov w15, #3 + bl output_\lag\()_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, 
v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #2 +.else + mov w15, #4 + bl output_\lag\()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag\()_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #14 +.endif + st1 {v0.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag1_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x12, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v18.8h}, [x12] // load top right + ld1 {v21.8h}, [x13] + + dup v26.8b, v30.b[0] + ext v22.16b, v16.16b, v17.16b, #12 // top left, top mid + dup v27.8b, v30.b[1] + ext v23.16b, v16.16b, v17.16b, #14 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[3] + ext v0.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.8b, v30.b[4] + ext v1.16b, v17.16b, v18.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smull v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smull2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, v30.b[5] + ext v22.16b, v19.16b, v20.16b, #12 // top left, top mid + dup v27.16b, v30.b[6] + ext v23.16b, v19.16b, v20.16b, #14 + sxtl v26.8h, v26.8b + dup v28.16b, v30.b[8] + ext v0.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v27.8h, v27.8b + dup v29.16b, v30.b[9] + ext v1.16b, v20.16b, v21.16b, #4 + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + + smlal v4.4s, v22.4h, v26.4h + smlal v4.4s, v23.4h, v27.4h + smlal v4.4s, v0.4h, v28.4h + smlal v4.4s, v1.4h, v29.4h + smlal2 v5.4s, v22.8h, v26.8h + smlal2 v5.4s, v23.8h, v27.8h + smlal2 v5.4s, v0.8h, v28.8h + smlal2 v5.4s, v1.8h, v29.8h + + dup v26.16b, v30.b[2] + dup v27.16b, v30.b[7] + sxtl v26.8h, v26.8b + sxtl v27.8h, v27.8b + + smlal v4.4s, v17.4h, v26.4h + smlal v4.4s, v20.4h, v27.4h + smlal2 v5.4s, v17.8h, v26.8h + smlal2 v5.4s, v20.8h, v27.8h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag2_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v17.8h}, [x12] // load the previous block right above + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, v30.b[12] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH*2 - 16 + sub x12, x0, #2*GRAIN_WIDTH*2 - 16 + sub x13, x0, #1*GRAIN_WIDTH*2 - 16 + ld1 {v15.8h}, [x11] // load top right + ld1 {v18.8h}, [x12] + ld1 {v21.8h}, [x13] + + dup v22.8b, v29.b[0] + ext v8.16b, v13.16b, v14.16b, #10 // top left, top mid + dup v23.8b, v29.b[1] + ext v9.16b, v13.16b, v14.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[2] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[3] + ext v10.16b, v13.16b, v14.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[4] + ext v11.16b, v14.16b, v15.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[5] + ext v12.16b, v14.16b, v15.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[6] + ext v13.16b, v14.16b, v15.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smull v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v14.4h, v25.4h + smull2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v14.8h, v25.8h + + dup v22.8b, v29.b[7] + ext v8.16b, v16.16b, v17.16b, #10 // top left, top mid + dup v23.8b, v29.b[8] + ext v9.16b, v16.16b, v17.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v29.b[9] + sxtl v23.8h, v23.8b + dup v25.8b, v29.b[10] + ext v10.16b, v16.16b, v17.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v29.b[11] + ext v11.16b, v17.16b, v18.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v29.b[12] + ext v12.16b, v17.16b, v18.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v29.b[13] + ext v13.16b, v17.16b, v18.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v17.4h, v25.4h + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v17.8h, v25.8h + + dup v22.8b, v29.b[14] + ext v8.16b, v19.16b, v20.16b, #10 // top left, top mid + dup v23.8b, v29.b[15] + ext v9.16b, v19.16b, v20.16b, #12 + sxtl v22.8h, v22.8b + dup v24.8b, v30.b[0] + sxtl v23.8h, v23.8b + dup v25.8b, v30.b[1] + ext v10.16b, v19.16b, v20.16b, #14 + sxtl v24.8h, v24.8b + dup v26.8b, v30.b[2] + ext v11.16b, v20.16b, v21.16b, #2 // top mid, top right + sxtl v25.8h, v25.8b + dup v27.8b, v30.b[3] + ext v12.16b, v20.16b, v21.16b, #4 + sxtl v26.8h, v26.8b + dup v28.8b, v30.b[4] + ext v13.16b, v20.16b, v21.16b, #6 + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + + smlal v4.4s, v8.4h, v22.4h + smlal v4.4s, v9.4h, v23.4h + smlal v4.4s, v10.4h, v24.4h + 
smlal v4.4s, v11.4h, v26.4h + smlal v4.4s, v12.4h, v27.4h + smlal v4.4s, v13.4h, v28.4h + smlal v4.4s, v20.4h, v25.4h + mov v16.16b, v17.16b + mov v17.16b, v18.16b + smlal2 v5.4s, v8.8h, v22.8h + smlal2 v5.4s, v9.8h, v23.8h + smlal2 v5.4s, v10.8h, v24.8h + smlal2 v5.4s, v11.8h, v26.8h + smlal2 v5.4s, v12.8h, v27.8h + smlal2 v5.4s, v13.8h, v28.8h + smlal2 v5.4s, v20.8h, v25.8h + + mov v13.16b, v14.16b + mov v14.16b, v15.16b + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag3_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH*2 + sub x12, x0, #2*GRAIN_WIDTH*2 + sub x13, x0, #1*GRAIN_WIDTH*2 + ld1 {v14.8h}, [x11] // load the previous block right above + ld1 {v17.8h}, [x12] + ld1 {v20.8h}, [x13] +.endif + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, v30.b[8] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + mov w16, #80 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_2 v0 + subs w1, w1, #1 + st1 {v0.s}[0], [x0], #4 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function generate_grain_rows_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + mov w16, #40 +2: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h + subs w16, w16, #8 + st1 {v0.8h}, [x0], #16 + b.gt 2b + get_grain_4 v0 + subs w1, w1, #1 + st1 {v0.4h}, [x0] + add x0, x0, #GRAIN_WIDTH*2-80 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_444_lag0_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v4.8h}, [x19], #16 +gen_grain_uv_lag0_8_start: + bl get_gaussian_neon + srshl v0.8h, v0.8h, v31.8h +gen_grain_uv_lag0_8_add: + and v4.16b, v4.16b, v1.16b + smull v2.4s, v4.4h, v27.4h + smull2 v3.4s, v4.8h, v27.8h + srshl v2.4s, v2.4s, v28.4s + srshl v3.4s, v3.4s, v28.4s + sqxtn v2.4h, v2.4s + sqxtn2 v2.8h, v3.4s + sqadd v2.8h, v2.8h, v0.8h + smin v2.8h, v2.8h, v25.8h + smax v2.8h, v2.8h, v26.8h + st1 {v2.8h}, [x0], #16 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function gen_grain_uv_420_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + add x12, x19, #GRAIN_WIDTH*2 + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + ld1 {v18.8h, v19.8h}, [x12] + addp v16.8h, v16.8h, v17.8h + addp v17.8h, v18.8h, v19.8h + add v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.8h, v17.8h}, [x19], #32 + addp v16.8h, v16.8h, v17.8h + srshr v4.8h, v16.8h, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add x12, x19, #GRAIN_WIDTH*2 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+ ld1 {v16.4h, v17.4h}, [x19] + ld1 {v18.4h, v19.4h}, [x12] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + addp v17.4h, v18.4h, v19.4h + add v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #2 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + ld1 {v16.4h, v17.4h}, [x19] + add x19, x19, #32 + addp v16.4h, v16.4h, v17.4h + srshr v4.4h, v16.4h, #1 + get_grain_4 v0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! + +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 +.else + clz w15, w2 +.endif + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #2 + neg v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + mov v1.16b, v29.16b + bl gen_grain_uv_444_lag0_neon // 8 + movi v1.16b, #255 + bl gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + mov v1.16b, v30.16b + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 + add x19, x19, #4 + st1 {v16.s}[0], [x0], #4 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.8b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[3] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.8b}, [x4] // ar_coeffs_uv[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, 
v29.8b +.ifc \type, uv_444 + sxtl v30.8h, v30.8b +.endif + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_mid_neon // 48 + bl sum_\type\()_lag1_mid_neon // 56 + bl sum_\type\()_lag1_mid_neon // 64 + bl sum_\type\()_lag1_mid_neon // 72 + bl sum_\type\()_lag1_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_mid_neon // 48 + bl sum_\type\()_lag2_mid_neon // 56 + bl sum_\type\()_lag2_mid_neon // 64 + bl sum_\type\()_lag2_mid_neon // 72 + bl sum_\type\()_lag2_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + AARCH64_VALID_JUMP_TARGET + ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_mid_neon // 48 + bl sum_\type\()_lag3_mid_neon // 56 + bl sum_\type\()_lag3_mid_neon // 64 + bl sum_\type\()_lag3_mid_neon // 72 + bl sum_\type\()_lag3_right_neon // 80 + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #4 +.endif + st1 {v16.s}[0], [x0], #4 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
+ + mov w13, w3 + mov w14, #28 + add x19, x1, #(3*GRAIN_WIDTH-3)*2 + mov x1, x2 + mul w13, w13, w14 + clz w15, w4 + + movrel x3, X(gaussian_sequence) + sub w15, w15, #24 // -bitdepth_min_8 + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + neg w15, w15 // bitdepth_min_8 + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #128 + lsl w5, w5, w15 // 128 << bitdepth_min_8 + neg w6, w5 // -(128 << bitpdeth_min_8) + sub w5, w5, #1 // (128 << bitdepth_min_8) - 1 + + eor w2, w2, w11 + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET + dup v28.4s, w7 + ld1r {v27.8b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + dup v25.8h, w5 + dup v26.8h, w6 + ext v29.16b, v0.16b, v1.16b, #10 + ext v30.16b, v1.16b, v0.16b, #14 + neg v28.4s, v28.4s + sxtl v27.8h, v27.8b + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + mov v1.16b, v29.16b + bl gen_grain_\type\()_lag0_8_neon // 8 + movi v1.16b, #255 + bl gen_grain_\type\()_lag0_8_neon // 16 + bl gen_grain_\type\()_lag0_8_neon // 24 + bl gen_grain_\type\()_lag0_8_neon // 32 + bl gen_grain_\type\()_lag0_8_neon // 40 + mov v1.16b, v30.16b + bl gen_grain_\type\()_lag0_4_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.8b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.8b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.8b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.8b}, [x4] // ar_coeffs_u4[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + sxtl v27.8h, v27.8b + sxtl v28.8h, v28.8b + sxtl v29.8h, v29.8b + sxtl v30.8h, v30.8b + set_height w1, \type +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + 
AARCH64_VALID_JUMP_TARGET + ldr q29, [x4] // ar_coeffs_uv[0-15] + ldr q30, [x4, #16] // ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_right_neon // 44 + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH*2-6*16 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0] + umov w15, \src2[1] + umov w16, \src1[2] + add x14, x14, x3 + umov w17, \src2[3] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4] + add x16, x16, x3 + ld1 {\dst2}[1+\off], [x15] + umov w15, \src2[5] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6] + add x14, x14, x3 + ld1 {\dst2}[3+\off], [x17] + umov w17, \src2[7] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[5+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[7+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2, src3, src4 + gather_interleaved \dst1, \dst2, \src1, \src3, 0 + gather_interleaved \dst2, \dst1, \src3, \src1, 0 + gather_interleaved \dst1, \dst2, \src2, \src4, 8 + gather_interleaved \dst2, \dst1, \src4, \src2, 8 +.endm + +function gather32_neon + gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h + ret +endfunc + +function gather16_neon + gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 + gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 + ins v6.d[1], v7.d[0] + ret +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-80]! 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + str d14, [sp, #64] + eor w4, w4, #15 // 15 - scaling_shift + ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w10, [sp, #96] // bitdepth_max + ldr w6, [x6] // offsets[0][0] + dup v26.8h, w10 // bitdepth_max + clz w10, w10 + ldr w8, [sp, #80] // clip + sub w10, w10, #24 // -bitdepth_min_8 + mov x9, #GRAIN_WIDTH*2 // grain_lut stride + neg w10, w10 // bitdepth_min_8 + + dup v29.8h, w4 // 15 - scaling_shift + dup v27.8h, w10 // bitdepth_min_8 + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #235 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v26.16b // bitdepth_max +2: + + ushr v26.8h, v26.8h, #1 // grain_max + not v25.16b, v26.16b // grain_min + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + + add x5, x5, #18 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #88] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v8.8h, v27.h[0] + dup v9.8h, v27.h[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src +.if \ox + ld1 {v20.4h}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v14.4h}, [x8], x9 // grain_lut top old +.endif + mvni v4.8h, #0xf0, lsl #8 // 0x0fff + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
+ and v0.16b, v0.16b, v4.16b + and v1.16b, v1.16b, v4.16b + and v2.16b, v2.16b, v4.16b + and v3.16b, v3.16b, v4.16b + bl gather32_neon + +.if \ox + smull v20.4s, v20.4h, v27.4h + smlal v20.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v14.4s, v14.4h, v27.4h + smlal v14.4s, v21.4h, v28.4h + sqrshrn v20.4h, v20.4s, #5 + sqrshrn v14.4h, v14.4s, #5 + smin v20.4h, v20.4h, v26.4h + smin v14.4h, v14.4h, v26.4h + smax v20.4h, v20.4h, v25.4h + smax v14.4h, v14.4h, v25.4h +.endif + +.if \ox + smull v10.4s, v20.4h, v9.4h +.else + smull v10.4s, v16.4h, v9.4h +.endif + smull2 v11.4s, v16.8h, v9.8h + smull v12.4s, v17.4h, v9.4h + smull2 v13.4s, v17.8h, v9.8h + smull v16.4s, v18.4h, v9.4h + smull2 v17.4s, v18.8h, v9.8h + smull v18.4s, v19.4h, v9.4h + smull2 v19.4s, v19.8h, v9.8h +.if \ox + smlal v10.4s, v14.4h, v8.4h +.else + smlal v10.4s, v21.4h, v8.4h +.endif + smlal2 v11.4s, v21.8h, v8.8h + smlal v12.4s, v22.4h, v8.4h + smlal2 v13.4s, v22.8h, v8.8h + smlal v16.4s, v23.4h, v8.4h + smlal2 v17.4s, v23.8h, v8.8h + smlal v18.4s, v24.4h, v8.4h + smlal2 v19.4s, v24.8h, v8.8h + sqrshrn v10.4h, v10.4s, #5 + sqrshrn2 v10.8h, v11.4s, #5 + sqrshrn v11.4h, v12.4s, #5 + sqrshrn2 v11.8h, v13.4s, #5 + sqrshrn v12.4h, v16.4s, #5 + sqrshrn2 v12.8h, v17.4s, #5 + sqrshrn v13.4h, v18.4s, #5 + sqrshrn2 v13.8h, v19.4s, #5 + smin v16.8h, v10.8h, v26.8h + smin v17.8h, v11.8h, v26.8h + smin v18.8h, v12.8h, v26.8h + smin v19.8h, v13.8h, v26.8h + smax v16.8h, v16.8h, v25.8h + smax v17.8h, v17.8h, v25.8h + smax v18.8h, v18.8h, v25.8h + smax v19.8h, v19.8h, v25.8h +.endif + + uxtl v4.8h, v6.8b // scaling +.if \ox && !\oy + sqrshrn v20.4h, v20.4s, #5 +.endif + uxtl2 v5.8h, v6.16b +.if \ox && !\oy + smin v20.4h, v20.4h, v26.4h +.endif + uxtl v6.8h, v7.8b +.if \ox && !\oy + smax v20.4h, v20.4h, v25.4h +.endif + uxtl2 v7.8h, v7.16b +.if \ox && !\oy + ins v16.d[0], v20.d[0] +.endif + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v21.8h, v17.8h, v5.8h + sqrdmulh v22.8h, v18.8h, v6.8h + sqrdmulh v23.8h, v19.8h, v7.8h + + usqadd v0.8h, v20.8h // *src + noise + usqadd v1.8h, v21.8h + usqadd v2.8h, v22.8h + usqadd v3.8h, v23.8h + + umax v0.8h, v0.8h, v30.8h + umax v1.8h, v1.8h, v30.8h + umax v2.8h, v2.8h, v30.8h + umax v3.8h, v3.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w7, w7, #1 +.if \oy + dup v8.8h, v28.h[0] + dup v9.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox\()0) +.endif + ldr d14, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const 
ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-80]! + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldp x8, x9, [sp, #80] // offsets, h + ldp x10, x11, [sp, #96] // uv, is_id + ldr w16, [sp, #120] // bitdepth_max + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + dup v23.8h, w16 // bitdepth_max + clz w16, w16 + eor w13, w13, #15 // 15 - scaling_shift + sub w16, w16, #24 // -bitdepth_min_8 + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + neg w16, w16 // bitdepth_min_8 + ld1r {v8.8h}, [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1r {v9.8h}, [x15] // uv_mult + + dup v29.8h, w13 // 15 - scaling_shift + dup v27.8h, w16 // bitdepth_min_8 + + cbz w12, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #240 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + cbz w11, 2f + // is_id + movi v31.8h, #235 + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v23.16b // bitdepth_max +2: + + ushr v15.8h, v23.8h, #1 // grain_max + sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 + not v14.16b, v15.16b // grain_min + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH*2 // grain_lut stride + + add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #112] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx\()_tbl) + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.8h, #23 + movi v26.8h, #22 +.else + movi v25.8h, #27 + movi v26.8h, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: +.if \ox + ld1 {v4.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v5.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut + +.if \ox + smull v4.4s, v4.4h, 
v27.4h + smlal v4.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v5.4s, v5.4h, v27.4h + smlal v5.4s, v0.4h, v28.4h + sqrshrn v4.4h, v4.4s, #5 + sqrshrn v5.4h, v5.4s, #5 + smin v4.4h, v4.4h, v15.4h + smin v5.4h, v5.4h, v15.4h + smax v4.4h, v4.4h, v14.4h + smax v5.4h, v5.4h, v14.4h + ins v16.d[0], v4.d[0] + ins v0.d[0], v5.d[0] +.endif + + smull v6.4s, v16.4h, v26.4h + smull2 v7.4s, v16.8h, v26.8h + smull v10.4s, v17.4h, v26.4h + smull2 v11.4s, v17.8h, v26.8h + smull v16.4s, v18.4h, v26.4h + smull2 v17.4s, v18.8h, v26.8h + smull v18.4s, v19.4h, v26.4h + smull2 v19.4s, v19.8h, v26.8h + smlal v6.4s, v0.4h, v25.4h + smlal2 v7.4s, v0.8h, v25.8h + smlal v10.4s, v1.4h, v25.4h + smlal2 v11.4s, v1.8h, v25.8h + smlal v16.4s, v2.4h, v25.4h + smlal2 v17.4s, v2.8h, v25.8h + smlal v18.4s, v3.4h, v25.4h + smlal2 v19.4s, v3.8h, v25.8h + sqrshrn v6.4h, v6.4s, #5 + sqrshrn2 v6.8h, v7.4s, #5 + sqrshrn v7.4h, v10.4s, #5 + sqrshrn2 v7.8h, v11.4s, #5 + sqrshrn v10.4h, v16.4s, #5 + sqrshrn2 v10.8h, v17.4s, #5 + sqrshrn v11.4h, v18.4s, #5 + sqrshrn2 v11.8h, v19.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v4.4h, v4.4s, #5 + smin v4.4h, v4.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v6.8h, v15.8h + smin v17.8h, v7.8h, v15.8h + smin v18.8h, v10.8h, v15.8h + smin v19.8h, v11.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h + smax v18.8h, v18.8h, v14.8h + smax v19.8h, v19.8h, v14.8h +.endif + +.if \ox && !\oy + smax v4.4h, v4.4h, v14.4h +.endif + ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v4.d[0] +.endif + +.if !\csfl + smull v4.4s, v0.4h, v8.4h + smull2 v5.4s, v0.8h, v8.8h + smull v6.4s, v1.4h, v8.4h + smull2 v7.4s, v1.8h, v8.8h + smull v0.4s, v2.4h, v8.4h + smull2 v1.4s, v2.8h, v8.8h + smull v2.4s, v3.4h, v8.4h + smull2 v3.4s, v3.8h, v8.8h + smlal v4.4s, v10.4h, v9.4h + smlal2 v5.4s, v10.8h, v9.8h + smlal v6.4s, v11.4h, v9.4h + smlal2 v7.4s, v11.8h, v9.8h + smlal v0.4s, v12.4h, v9.4h + smlal2 v1.4s, v12.8h, v9.8h + smlal v2.4s, v13.4h, v9.4h + smlal2 v3.4s, v13.8h, v9.8h + shrn v4.4h, v4.4s, #6 + shrn2 v4.8h, v5.4s, #6 + shrn v5.4h, v6.4s, #6 + shrn2 v5.8h, v7.4s, #6 + shrn v6.4h, v0.4s, #6 + shrn2 v6.8h, v1.4s, #6 + shrn v7.4h, v2.4s, #6 + shrn2 v7.8h, v3.4s, #6 + add v0.8h, v4.8h, v24.8h + add v1.8h, v5.8h, v24.8h + add v2.8h, v6.8h, v24.8h + add v3.8h, v7.8h, v24.8h + movi v20.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smin v2.8h, v2.8h, v23.8h + smin v3.8h, v3.8h, v23.8h + smax v0.8h, v0.8h, v20.8h + smax v1.8h, v1.8h, v20.8h + smax v2.8h, v2.8h, v20.8h + smax v3.8h, v3.8h, v20.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
+ and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b + and v2.16b, v2.16b, v23.16b + and v3.16b, v3.16b, v23.16b +.endif + + bl gather32_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + uxtl v6.8h, v7.8b + uxtl2 v7.8h, v7.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + sqrdmulh v18.8h, v18.8h, v6.8h + sqrdmulh v19.8h, v19.8h, v7.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + usqadd v12.8h, v18.8h + usqadd v13.8h, v19.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umax v2.8h, v12.8h, v30.8h + umax v3.8h, v13.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w9, w9, #1 +.if \oy + dup v25.8h, v28.h[0] + dup v26.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: +.if \ox + ld1 {v18.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v19.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut + +.if \ox + smull v18.4s, v18.4h, v27.4h + smlal v18.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v19.4s, v19.4h, v27.4h + smlal v19.4s, v20.4h, v28.4h + sqrshrn v18.4h, v18.4s, #5 + sqrshrn v19.4h, v19.4s, #5 + smin v18.4h, v18.4h, v15.4h + smin v19.4h, v19.4h, v15.4h + smax v18.4h, v18.4h, v14.4h + smax v19.4h, v19.4h, v14.4h + ins v16.d[0], v18.d[0] + ins v20.d[0], v19.d[0] +.endif + + smull v0.4s, v16.4h, v26.4h + smull2 v1.4s, v16.8h, v26.8h + smull v2.4s, v17.4h, v26.4h + smull2 v3.4s, v17.8h, v26.8h + smlal v0.4s, v20.4h, v25.4h + smlal2 v1.4s, v20.8h, v25.8h + smlal v2.4s, v21.4h, v25.4h + smlal2 v3.4s, v21.8h, v25.8h + sqrshrn v16.4h, v0.4s, #5 + sqrshrn2 v16.8h, v1.4s, #5 + sqrshrn v17.4h, v2.4s, #5 + sqrshrn2 v17.8h, v3.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v18.4h, v18.4s, #5 + smin v18.4h, v18.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v16.8h, v15.8h + smin v17.8h, v17.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h +.endif + +.if \ox && !\oy + smax v18.4h, v18.4h, v14.4h +.endif + 
ld1 {v10.8h, v11.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v18.d[0] +.endif + addp v0.8h, v0.8h, v1.8h + addp v1.8h, v2.8h, v3.8h + urshr v0.8h, v0.8h, #1 + urshr v1.8h, v1.8h, #1 +.if !\csfl + smull v2.4s, v0.4h, v8.4h + smull2 v3.4s, v0.8h, v8.8h + smull v0.4s, v1.4h, v8.4h + smull2 v1.4s, v1.8h, v8.8h + smlal v2.4s, v10.4h, v9.4h + smlal2 v3.4s, v10.8h, v9.8h + smlal v0.4s, v11.4h, v9.4h + smlal2 v1.4s, v11.8h, v9.8h + shrn v2.4h, v2.4s, #6 + shrn2 v2.8h, v3.4s, #6 + shrn v3.4h, v0.4s, #6 + shrn2 v3.8h, v1.4s, #6 + add v0.8h, v2.8h, v24.8h + add v1.8h, v3.8h, v24.8h + movi v2.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smax v0.8h, v0.8h, v2.8h + smax v1.8h, v1.8h, v2.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b +.endif + + bl gather16_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.8h, v1.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff -Nru dav1d-0.9.2/src/arm/64/film_grain.S dav1d-1.0.0/src/arm/64/film_grain.S --- dav1d-0.9.2/src/arm/64/film_grain.S 2021-09-03 15:51:24.397037000 +0000 +++ dav1d-1.0.0/src/arm/64/film_grain.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,1972 +0,0 @@ -/* - * Copyright © 2021, VideoLAN and dav1d authors - * Copyright © 2021, Martin Storsjo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "src/arm/asm.S" -#include "util.S" -#include "src/arm/asm-offsets.h" - -#define GRAIN_WIDTH 82 -#define GRAIN_HEIGHT 73 - -#define SUB_GRAIN_WIDTH 44 -#define SUB_GRAIN_HEIGHT 38 - -.macro increment_seed steps, shift=1 - lsr w11, w2, #3 - lsr w12, w2, #12 - lsr w13, w2, #1 - eor w11, w2, w11 // (r >> 0) ^ (r >> 3) - eor w12, w12, w13 // (r >> 12) ^ (r >> 1) - eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) -.if \shift - lsr w2, w2, #\steps -.endif - and w11, w11, #((1 << \steps) - 1) // bit -.if \shift - orr w2, w2, w11, lsl #(16 - \steps) // *state -.else - orr w2, w2, w11, lsl #16 // *state -.endif -.endm - -.macro read_rand dest, bits, age - ubfx \dest, x2, #16 - \bits - \age, #\bits -.endm - -.macro read_shift_rand dest, bits - ubfx \dest, x2, #17 - \bits, #\bits - lsr w2, w2, #1 -.endm - -// special calling convention: -// w2 holds seed -// x3 holds dav1d_gaussian_sequence -// clobbers x11-x15 -// returns in v0.8h -function get_gaussian_neon - increment_seed 4 - read_rand x14, 11, 3 - read_rand x15, 11, 2 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[0], [x14] - read_rand x14, 11, 1 - ld1 {v0.h}[1], [x15] - add x14, x3, x14, lsl #1 - read_rand x15, 11, 0 - increment_seed 4 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[2], [x14] - read_rand x14, 11, 3 - ld1 {v0.h}[3], [x15] - add x14, x3, x14, lsl #1 - read_rand x15, 11, 2 - ld1 {v0.h}[4], [x14] - add x15, x3, x15, lsl #1 - read_rand x14, 11, 1 - ld1 {v0.h}[5], [x15] - read_rand x15, 11, 0 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[6], [x14] - ld1 {v0.h}[7], [x15] - ret -endfunc - -.macro get_grain_row r0, r1, r2, r3, r4, r5 - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn \r0\().8b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn2 \r0\().16b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn \r1\().8b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn2 \r1\().16b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn \r2\().8b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn2 \r2\().16b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn \r3\().8b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn2 \r3\().16b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn \r4\().8b, \r5\().8h - bl get_gaussian_neon - srshl \r5\().8h, v0.8h, v31.8h - xtn2 \r4\().16b, \r5\().8h - increment_seed 2 - read_rand x14, 11, 1 - read_rand x15, 11, 0 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {\r5\().h}[0], [x14] - ld1 {\r5\().h}[1], [x15] - srshl v0.4h, \r5\().4h, v31.4h - xtn \r5\().8b, v0.8h -.endm - 
-.macro store_grain_row r0, r1, r2, r3, r4, r5 - st1 {\r0\().16b,\r1\().16b}, [x0], #32 - st1 {\r2\().16b,\r3\().16b}, [x0], #32 - st1 {\r4\().16b}, [x0], #16 - st1 {\r5\().h}[0], [x0], #2 -.endm - -.macro get_grain_row_44 r0, r1, r2 - bl get_gaussian_neon - srshl \r2\().8h, v0.8h, v31.8h - xtn \r0\().8b, \r2\().8h - bl get_gaussian_neon - srshl \r2\().8h, v0.8h, v31.8h - xtn2 \r0\().16b, \r2\().8h - bl get_gaussian_neon - srshl \r2\().8h, v0.8h, v31.8h - xtn \r1\().8b, \r2\().8h - bl get_gaussian_neon - srshl \r2\().8h, v0.8h, v31.8h - xtn2 \r1\().16b, \r2\().8h - bl get_gaussian_neon - srshl \r2\().8h, v0.8h, v31.8h - xtn \r2\().8b, \r2\().8h - - increment_seed 4 - read_rand x14, 11, 3 - read_rand x15, 11, 2 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[0], [x14] - read_rand x14, 11, 1 - ld1 {v0.h}[1], [x15] - read_rand x15, 11, 0 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[2], [x14] - ld1 {v0.h}[3], [x15] - srshl v0.4h, v0.4h, v31.4h - xtn2 \r2\().16b, v0.8h -.endm - -.macro store_grain_row_44 r0, r1, r2 - st1 {\r0\().16b,\r1\().16b}, [x0], #32 - st1 {\r2\().16b}, [x0] - add x0, x0, #GRAIN_WIDTH-32 -.endm - -function get_grain_2_neon - increment_seed 2 - read_rand x14, 11, 1 - read_rand x15, 11, 0 - add x14, x3, x14, lsl #1 - add x15, x3, x15, lsl #1 - ld1 {v0.h}[0], [x14] - ld1 {v0.h}[1], [x15] - srshl v0.4h, v0.4h, v31.4h - xtn v0.8b, v0.8h - ret -endfunc - -.macro get_grain_2 dst - bl get_grain_2_neon -.ifnc \dst, v0 - mov \dst\().8b, v0.8b -.endif -.endm - -// w15 holds the number of entries to produce -// w14, w16 and w17 hold the previous output entries -// v0 holds the vector of produced entries -// v1 holds the input vector of sums from above -.macro output_lag n -function output_lag\n\()_neon -1: - read_shift_rand x13, 11 - mov w11, v1.s[0] - ldrsh w12, [x3, x13, lsl #1] - ext v0.16b, v0.16b, v0.16b, #1 -.if \n == 1 - madd w11, w14, w4, w11 // sum (above) + *coeff * prev output -.elseif \n == 2 - madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 - madd w11, w14, w17, w11 // += *coeff * prev output 2 - mov w16, w14 -.else - madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 - madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 - madd w11, w14, w21, w11 // += *coeff * prev output 3 - mov w17, w16 - mov w16, w14 -.endif - add w14, w11, w8 // 1 << (ar_coeff_shift - 1) - add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1) - asr w14, w14, w7 // >> ar_coeff_shift - asr w12, w12, w9 // >> (4 + grain_scale_shift) - add w14, w14, w12 - cmp w14, w5 - csel w14, w14, w5, le - cmp w14, w6 - csel w14, w14, w6, ge - subs w15, w15, #1 - ext v1.16b, v1.16b, v1.16b, #4 - ins v0.b[15], w14 - b.gt 1b - ret -endfunc -.endm - -output_lag 1 -output_lag 2 -output_lag 3 - - -function sum_lag1_above_neon - smull v2.8h, v3.8b, v28.8b - smull2 v3.8h, v3.16b, v28.16b - smull v4.8h, v0.8b, v27.8b - smull2 v5.8h, v0.16b, v27.16b - smull v6.8h, v1.8b, v29.8b - smull2 v7.8h, v1.16b, v29.16b - saddl v0.4s, v2.4h, v4.4h - saddl2 v1.4s, v2.8h, v4.8h - saddl v2.4s, v3.4h, v5.4h - saddl2 v3.4s, v3.8h, v5.8h - saddw v4.4s, v0.4s, v6.4h - saddw2 v5.4s, v1.4s, v6.8h - saddw v6.4s, v2.4s, v7.4h - saddw2 v7.4s, v3.4s, v7.8h - ret -endfunc - -.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff - bl sum_\lag\()_above_neon -.ifc \type, uv_420 - add x12, x19, #GRAIN_WIDTH - ld1 {v22.16b, v23.16b}, [x19], #32 - ld1 {v24.16b, v25.16b}, [x12] - saddlp v22.8h, v22.16b - saddlp v23.8h, v23.16b - saddlp v24.8h, v24.16b - saddlp 
v25.8h, v25.16b - add v22.8h, v22.8h, v24.8h - add v23.8h, v23.8h, v25.8h - rshrn v0.8b, v22.8h, #2 - rshrn2 v0.16b, v23.8h, #2 -.endif -.ifc \type, uv_422 - ld1 {v22.16b, v23.16b}, [x19], #32 - saddlp v22.8h, v22.16b - saddlp v23.8h, v23.16b - rshrn v0.8b, v22.8h, #1 - rshrn2 v0.16b, v23.8h, #1 -.endif -.ifc \type, uv_444 - ld1 {v0.16b}, [x19], #16 -.endif -.if \uv_layout -.ifnb \uv_coeff - dup v1.16b, \uv_coeff - smull v2.8h, v0.8b, v1.8b - smull2 v3.8h, v0.16b, v1.16b -.else - smull v2.8h, v0.8b, v30.8b - smull2 v3.8h, v0.16b, v30.16b -.endif - saddw v4.4s, v4.4s, v2.4h - saddw2 v5.4s, v5.4s, v2.8h - saddw v6.4s, v6.4s, v3.4h - saddw2 v7.4s, v7.4s, v3.8h -.endif -.if \uv_layout && \elems == 16 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 444 && \elems == 15 - b sum_\lag\()_y_\edge\()_start -.elseif \uv_layout == 422 && \elems == 9 - b sum_\lag\()_uv_420_\edge\()_start -.else -sum_\lag\()_\type\()_\edge\()_start: -.ifc \edge, left - increment_seed 4 - read_rand x12, 11, 3 - read_rand x13, 11, 2 - read_rand x14, 11, 1 - add x12, x3, x12, lsl #1 - add x13, x3, x13, lsl #1 - add x14, x3, x14, lsl #1 - ld1 {v0.h}[5], [x12] - ld1 {v0.h}[6], [x13] - ld1 {v0.h}[7], [x14] - lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 - srshl v0.8h, v0.8h, v31.8h - xtn2 v0.16b, v0.8h - ext v4.16b, v4.16b, v4.16b, #12 -.ifc \lag, lag3 - smov w17, v0.b[13] -.endif -.ifnc \lag, lag1 - smov w16, v0.b[14] -.endif - smov w14, v0.b[15] - - mov v1.16b, v4.16b - mov w15, #1 - bl output_\lag\()_neon -.else - increment_seed 4, shift=0 - mov v1.16b, v4.16b - mov w15, #4 - bl output_\lag\()_neon -.endif - - increment_seed 4, shift=0 - mov v1.16b, v5.16b - mov w15, #4 - bl output_\lag\()_neon - - increment_seed 4, shift=0 - mov v1.16b, v6.16b -.if \elems == 9 - mov w15, #1 - bl output_\lag\()_neon - lsr w2, w2, #3 - - read_rand x12, 11, 2 - read_rand x13, 11, 1 - read_rand x14, 11, 0 - add x12, x3, x12, lsl #1 - add x13, x3, x13, lsl #1 - add x14, x3, x14, lsl #1 - ld1 {v1.h}[0], [x12] - ld1 {v1.h}[1], [x13] - ld1 {v1.h}[2], [x14] - srshl v1.4h, v1.4h, v31.4h - xtn v1.8b, v1.8h - ext v0.16b, v0.16b, v1.16b, #7 -.else - mov w15, #4 - bl output_\lag\()_neon - - increment_seed 4, shift=0 - mov v1.16b, v7.16b - -.ifc \edge, right - mov w15, #3 - bl output_\lag\()_neon - read_shift_rand x15, 11 - add x15, x3, x15, lsl #1 - ld1 {v1.h}[0], [x15] - srshl v1.4h, v1.4h, v31.4h - ext v0.16b, v0.16b, v1.16b, #1 -.else - mov w15, #4 - bl output_\lag\()_neon -.endif -.endif -.if \store - st1 {v0.16b}, [x0], #16 -.endif - ldr x30, [sp], #16 - ret -.endif -.endm - -.macro sum_lag1_func type, uv_layout, edge, elems=16 -function sum_\type\()_lag1_\edge\()_neon - str x30, [sp, #-16]! 
- sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 -endfunc -.endm - -sum_lag1_func y, 0, left -sum_lag1_func y, 0, mid -sum_lag1_func y, 0, right, 15 -sum_lag1_func uv_444, 444, left -sum_lag1_func uv_444, 444, mid -sum_lag1_func uv_444, 444, right, 15 -sum_lag1_func uv_422, 422, left -sum_lag1_func uv_422, 422, mid -sum_lag1_func uv_422, 422, right, 9 -sum_lag1_func uv_420, 420, left -sum_lag1_func uv_420, 420, mid -sum_lag1_func uv_420, 420, right, 9 - -.macro sum_lag1 type, dst, left, mid, right, edge=mid - mov v3.16b, \mid\().16b - ext v0.16b, \left\().16b, \mid\().16b, #15 - ext v1.16b, \mid\().16b, \right\().16b, #1 - bl sum_\type\()_lag1_\edge\()_neon - mov \dst\().16b, v0.16b -.endm - -.macro sum_y_lag1 dst, left, mid, right, edge=mid - sum_lag1 y, \dst, \left, \mid, \right, \edge -.endm - -.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid - sum_lag1 uv_444, \dst, \left, \mid, \right, \edge -.endm - -.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid - sum_lag1 uv_422, \dst, \left, \mid, \right, \edge -.endm - -.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid - sum_lag1 uv_420, \dst, \left, \mid, \right, \edge -.endm - - -function sum_lag2_above_neon - sub x12, x0, #2*GRAIN_WIDTH - 16 - sub x13, x0, #1*GRAIN_WIDTH - 16 - ld1 {v18.16b}, [x12] // load top right - ld1 {v21.16b}, [x13] - - ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid - dup v26.16b, v30.b[0] - ext v23.16b, v16.16b, v17.16b, #15 - dup v27.16b, v30.b[1] - ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right - dup v28.16b, v30.b[3] - ext v1.16b, v17.16b, v18.16b, #2 - dup v29.16b, v30.b[4] - - smull v2.8h, v22.8b, v26.8b - smull2 v3.8h, v22.16b, v26.16b - smull v4.8h, v23.8b, v27.8b - smull2 v5.8h, v23.16b, v27.16b - smull v6.8h, v0.8b, v28.8b - smull2 v7.8h, v0.16b, v28.16b - smull v0.8h, v1.8b, v29.8b - smull2 v1.8h, v1.16b, v29.16b - saddl v22.4s, v2.4h, v4.4h - saddl2 v23.4s, v2.8h, v4.8h - saddl v26.4s, v3.4h, v5.4h - saddl2 v27.4s, v3.8h, v5.8h - saddl v2.4s, v0.4h, v6.4h - saddl2 v3.4s, v0.8h, v6.8h - saddl v6.4s, v1.4h, v7.4h - saddl2 v7.4s, v1.8h, v7.8h - add v4.4s, v22.4s, v2.4s - add v5.4s, v23.4s, v3.4s - add v6.4s, v26.4s, v6.4s - add v7.4s, v27.4s, v7.4s - - ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid - dup v26.16b, v30.b[5] - ext v23.16b, v19.16b, v20.16b, #15 - dup v27.16b, v30.b[6] - ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right - dup v28.16b, v30.b[8] - ext v1.16b, v20.16b, v21.16b, #2 - dup v29.16b, v30.b[9] - - smull v2.8h, v22.8b, v26.8b - smull2 v3.8h, v22.16b, v26.16b - smull v22.8h, v23.8b, v27.8b - smull2 v23.8h, v23.16b, v27.16b - smull v26.8h, v0.8b, v28.8b - smull2 v27.8h, v0.16b, v28.16b - smull v28.8h, v1.8b, v29.8b - smull2 v29.8h, v1.16b, v29.16b - saddl v0.4s, v2.4h, v22.4h - saddl2 v1.4s, v2.8h, v22.8h - saddl v2.4s, v3.4h, v23.4h - saddl2 v3.4s, v3.8h, v23.8h - saddl v22.4s, v26.4h, v28.4h - saddl2 v23.4s, v26.8h, v28.8h - saddl v26.4s, v27.4h, v29.4h - saddl2 v27.4s, v27.8h, v29.8h - add v0.4s, v0.4s, v22.4s - add v1.4s, v1.4s, v23.4s - add v2.4s, v2.4s, v26.4s - add v3.4s, v3.4s, v27.4s - dup v26.16b, v30.b[2] - dup v27.16b, v30.b[7] - smull v22.8h, v17.8b, v26.8b - smull2 v23.8h, v17.16b, v26.16b - smull v24.8h, v20.8b, v27.8b - smull2 v25.8h, v20.16b, v27.16b - add v4.4s, v4.4s, v0.4s - add v5.4s, v5.4s, v1.4s - add v6.4s, v6.4s, v2.4s - add v7.4s, v7.4s, v3.4s - - mov v16.16b, v17.16b - mov v17.16b, v18.16b - - saddl v0.4s, v22.4h, v24.4h - saddl2 v1.4s, v22.8h, v24.8h - saddl v2.4s, v23.4h, v25.4h - saddl2 v3.4s, 
v23.8h, v25.8h - mov v19.16b, v20.16b - mov v20.16b, v21.16b - add v4.4s, v4.4s, v0.4s - add v5.4s, v5.4s, v1.4s - add v6.4s, v6.4s, v2.4s - add v7.4s, v7.4s, v3.4s - ret -endfunc - -.macro sum_lag2_func type, uv_layout, edge, elems=16 -function sum_\type\()_lag2_\edge\()_neon - str x30, [sp, #-16]! -.ifc \edge, left - sub x12, x0, #2*GRAIN_WIDTH - sub x13, x0, #1*GRAIN_WIDTH - ld1 {v17.16b}, [x12] // load the previous block right above - ld1 {v20.16b}, [x13] -.endif - sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12] -endfunc -.endm - -sum_lag2_func y, 0, left -sum_lag2_func y, 0, mid -sum_lag2_func y, 0, right, 15 -sum_lag2_func uv_444, 444, left -sum_lag2_func uv_444, 444, mid -sum_lag2_func uv_444, 444, right, 15 -sum_lag2_func uv_422, 422, left -sum_lag2_func uv_422, 422, mid -sum_lag2_func uv_422, 422, right, 9 -sum_lag2_func uv_420, 420, left -sum_lag2_func uv_420, 420, mid -sum_lag2_func uv_420, 420, right, 9 - - -function sum_lag3_above_neon - sub x11, x0, #3*GRAIN_WIDTH - 16 - sub x12, x0, #2*GRAIN_WIDTH - 16 - sub x13, x0, #1*GRAIN_WIDTH - 16 - ld1 {v15.16b}, [x11] // load top right - ld1 {v18.16b}, [x12] - ld1 {v21.16b}, [x13] - - ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid - dup v22.16b, v29.b[0] - ext v9.16b, v13.16b, v14.16b, #14 - dup v23.16b, v29.b[1] - ext v10.16b, v13.16b, v14.16b, #15 - dup v24.16b, v29.b[2] - dup v25.16b, v29.b[3] - ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right - dup v26.16b, v29.b[4] - ext v12.16b, v14.16b, v15.16b, #2 - dup v27.16b, v29.b[5] - ext v13.16b, v14.16b, v15.16b, #3 - dup v28.16b, v29.b[6] - - smull v0.8h, v8.8b, v22.8b - smull2 v1.8h, v8.16b, v22.16b - smull v2.8h, v9.8b, v23.8b - smull2 v3.8h, v9.16b, v23.16b - smull v8.8h, v10.8b, v24.8b - smull2 v9.8h, v10.16b, v24.16b - smull v10.8h, v11.8b, v26.8b - smull2 v11.8h, v11.16b, v26.16b - saddl v22.4s, v0.4h, v2.4h - saddl2 v23.4s, v0.8h, v2.8h - saddl v24.4s, v1.4h, v3.4h - saddl2 v26.4s, v1.8h, v3.8h - saddl v0.4s, v8.4h, v10.4h - saddl2 v1.4s, v8.8h, v10.8h - saddl v2.4s, v9.4h, v11.4h - saddl2 v3.4s, v9.8h, v11.8h - smull v8.8h, v12.8b, v27.8b - smull2 v9.8h, v12.16b, v27.16b - smull v10.8h, v13.8b, v28.8b - smull2 v11.8h, v13.16b, v28.16b - smull v12.8h, v14.8b, v25.8b - smull2 v13.8h, v14.16b, v25.16b - add v4.4s, v22.4s, v0.4s - add v5.4s, v23.4s, v1.4s - add v6.4s, v24.4s, v2.4s - add v7.4s, v26.4s, v3.4s - saddl v0.4s, v8.4h, v10.4h - saddl2 v1.4s, v8.8h, v10.8h - saddl v2.4s, v9.4h, v11.4h - saddl2 v3.4s, v9.8h, v11.8h - add v4.4s, v4.4s, v0.4s - add v5.4s, v5.4s, v1.4s - add v6.4s, v6.4s, v2.4s - add v7.4s, v7.4s, v3.4s - saddw v4.4s, v4.4s, v12.4h - saddw2 v5.4s, v5.4s, v12.8h - saddw v6.4s, v6.4s, v13.4h - saddw2 v7.4s, v7.4s, v13.8h - - ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid - dup v22.16b, v29.b[7] - ext v9.16b, v16.16b, v17.16b, #14 - dup v23.16b, v29.b[8] - ext v10.16b, v16.16b, v17.16b, #15 - dup v24.16b, v29.b[9] - dup v25.16b, v29.b[10] - ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right - dup v26.16b, v29.b[11] - ext v12.16b, v17.16b, v18.16b, #2 - dup v27.16b, v29.b[12] - ext v13.16b, v17.16b, v18.16b, #3 - dup v28.16b, v29.b[13] - - smull v0.8h, v8.8b, v22.8b - smull2 v1.8h, v8.16b, v22.16b - smull v2.8h, v9.8b, v23.8b - smull2 v3.8h, v9.16b, v23.16b - smull v8.8h, v10.8b, v24.8b - smull2 v9.8h, v10.16b, v24.16b - smull v10.8h, v11.8b, v26.8b - smull2 v11.8h, v11.16b, v26.16b - saddl v22.4s, v0.4h, v2.4h - saddl2 v23.4s, v0.8h, v2.8h - saddl v24.4s, v1.4h, v3.4h - saddl2 v26.4s, v1.8h, v3.8h - 
saddl v0.4s, v8.4h, v10.4h - saddl2 v1.4s, v8.8h, v10.8h - saddl v2.4s, v9.4h, v11.4h - saddl2 v3.4s, v9.8h, v11.8h - smull v8.8h, v12.8b, v27.8b - smull2 v9.8h, v12.16b, v27.16b - smull v10.8h, v13.8b, v28.8b - smull2 v11.8h, v13.16b, v28.16b - smull v12.8h, v17.8b, v25.8b - smull2 v13.8h, v17.16b, v25.16b - add v22.4s, v22.4s, v0.4s - add v23.4s, v23.4s, v1.4s - add v24.4s, v24.4s, v2.4s - add v26.4s, v26.4s, v3.4s - saddl v0.4s, v8.4h, v10.4h - saddl2 v1.4s, v8.8h, v10.8h - saddl v2.4s, v9.4h, v11.4h - saddl2 v3.4s, v9.8h, v11.8h - add v4.4s, v4.4s, v22.4s - add v5.4s, v5.4s, v23.4s - add v6.4s, v6.4s, v24.4s - add v7.4s, v7.4s, v26.4s - add v4.4s, v4.4s, v0.4s - add v5.4s, v5.4s, v1.4s - add v6.4s, v6.4s, v2.4s - add v7.4s, v7.4s, v3.4s - saddw v4.4s, v4.4s, v12.4h - saddw2 v5.4s, v5.4s, v12.8h - saddw v6.4s, v6.4s, v13.4h - saddw2 v7.4s, v7.4s, v13.8h - - ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid - dup v22.16b, v29.b[14] - ext v9.16b, v19.16b, v20.16b, #14 - dup v23.16b, v29.b[15] - ext v10.16b, v19.16b, v20.16b, #15 - dup v24.16b, v30.b[0] - dup v25.16b, v30.b[1] - ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right - dup v26.16b, v30.b[2] - ext v12.16b, v20.16b, v21.16b, #2 - dup v27.16b, v30.b[3] - ext v13.16b, v20.16b, v21.16b, #3 - dup v28.16b, v30.b[4] - - smull v0.8h, v8.8b, v22.8b - smull2 v1.8h, v8.16b, v22.16b - smull v2.8h, v9.8b, v23.8b - smull2 v3.8h, v9.16b, v23.16b - smull v8.8h, v10.8b, v24.8b - smull2 v9.8h, v10.16b, v24.16b - smull v10.8h, v11.8b, v26.8b - smull2 v11.8h, v11.16b, v26.16b - saddl v22.4s, v0.4h, v2.4h - saddl2 v23.4s, v0.8h, v2.8h - saddl v24.4s, v1.4h, v3.4h - saddl2 v26.4s, v1.8h, v3.8h - saddl v0.4s, v8.4h, v10.4h - saddl2 v1.4s, v8.8h, v10.8h - saddl v2.4s, v9.4h, v11.4h - saddl2 v3.4s, v9.8h, v11.8h - smull v8.8h, v12.8b, v27.8b - smull2 v9.8h, v12.16b, v27.16b - smull v10.8h, v13.8b, v28.8b - smull2 v11.8h, v13.16b, v28.16b - smull v12.8h, v20.8b, v25.8b - smull2 v19.8h, v20.16b, v25.16b - add v22.4s, v22.4s, v0.4s - add v23.4s, v23.4s, v1.4s - add v24.4s, v24.4s, v2.4s - add v26.4s, v26.4s, v3.4s - saddl v0.4s, v8.4h, v10.4h - saddl2 v1.4s, v8.8h, v10.8h - saddl v2.4s, v9.4h, v11.4h - saddl2 v3.4s, v9.8h, v11.8h - add v4.4s, v4.4s, v22.4s - add v5.4s, v5.4s, v23.4s - add v6.4s, v6.4s, v24.4s - add v7.4s, v7.4s, v26.4s - mov v13.16b, v14.16b - mov v14.16b, v15.16b - add v4.4s, v4.4s, v0.4s - add v5.4s, v5.4s, v1.4s - add v6.4s, v6.4s, v2.4s - add v7.4s, v7.4s, v3.4s - mov v16.16b, v17.16b - mov v17.16b, v18.16b - saddw v4.4s, v4.4s, v12.4h - saddw2 v5.4s, v5.4s, v12.8h - saddw v6.4s, v6.4s, v19.4h - saddw2 v7.4s, v7.4s, v19.8h - - mov v19.16b, v20.16b - mov v20.16b, v21.16b - ret -endfunc - -.macro sum_lag3_func type, uv_layout, edge, elems=16 -function sum_\type\()_lag3_\edge\()_neon - str x30, [sp, #-16]! 
-.ifc \edge, left - sub x11, x0, #3*GRAIN_WIDTH - sub x12, x0, #2*GRAIN_WIDTH - sub x13, x0, #1*GRAIN_WIDTH - ld1 {v14.16b}, [x11] // load the previous block right above - ld1 {v17.16b}, [x12] - ld1 {v20.16b}, [x13] -.endif - sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8] -endfunc -.endm - -sum_lag3_func y, 0, left -sum_lag3_func y, 0, mid -sum_lag3_func y, 0, right, 15 -sum_lag3_func uv_444, 444, left -sum_lag3_func uv_444, 444, mid -sum_lag3_func uv_444, 444, right, 15 -sum_lag3_func uv_422, 422, left -sum_lag3_func uv_422, 422, mid -sum_lag3_func uv_422, 422, right, 9 -sum_lag3_func uv_420, 420, left -sum_lag3_func uv_420, 420, mid -sum_lag3_func uv_420, 420, right, 9 - -function generate_grain_rows_neon - str x30, [sp, #-16]! -1: - get_grain_row v16, v17, v18, v19, v20, v21 - subs w1, w1, #1 - store_grain_row v16, v17, v18, v19, v20, v21 - b.gt 1b - ldr x30, [sp], #16 - ret -endfunc - -function generate_grain_rows_44_neon - str x30, [sp, #-16]! -1: - get_grain_row_44 v16, v17, v18 - subs w1, w1, #1 - store_grain_row_44 v16, v17, v18 - b.gt 1b - ldr x30, [sp], #16 - ret -endfunc - -function get_grain_row_neon - str x30, [sp, #-16]! - get_grain_row v16, v17, v18, v19, v20, v21 - ldr x30, [sp], #16 - ret -endfunc - -function get_grain_row_44_neon - str x30, [sp, #-16]! - get_grain_row_44 v16, v17, v18 - ldr x30, [sp], #16 - ret -endfunc - -function add_uv_444_coeff_lag0_neon -add_coeff_lag0_start: - smull v2.8h, v0.8b, v27.8b - smull2 v3.8h, v0.16b, v27.16b - srshl v2.8h, v2.8h, v28.8h - srshl v3.8h, v3.8h, v28.8h - saddw v2.8h, v2.8h, v1.8b - saddw2 v3.8h, v3.8h, v1.16b - sqxtn v2.8b, v2.8h - sqxtn2 v2.16b, v3.8h - ret -endfunc - -function add_uv_420_coeff_lag0_neon - ld1 {v4.16b, v5.16b}, [x19], #32 - ld1 {v6.16b, v7.16b}, [x12], #32 - saddlp v4.8h, v4.16b - saddlp v5.8h, v5.16b - saddlp v6.8h, v6.16b - saddlp v7.8h, v7.16b - add v4.8h, v4.8h, v6.8h - add v5.8h, v5.8h, v7.8h - rshrn v4.8b, v4.8h, #2 - rshrn2 v4.16b, v5.8h, #2 - and v0.16b, v4.16b, v0.16b - b add_coeff_lag0_start -endfunc - -function add_uv_422_coeff_lag0_neon - ld1 {v4.16b, v5.16b}, [x19], #32 - saddlp v4.8h, v4.16b - saddlp v5.8h, v5.16b - rshrn v4.8b, v4.8h, #1 - rshrn2 v4.16b, v5.8h, #1 - and v0.16b, v4.16b, v0.16b - b add_coeff_lag0_start -endfunc - -.macro gen_grain_82 type -function generate_grain_\type\()_8bpc_neon, export=1 - stp x30, x19, [sp, #-96]! 
- -.ifc \type, uv_444 - mov w13, w3 - mov w14, #28 - add x19, x1, #3*GRAIN_WIDTH - mov x1, x2 - mul w13, w13, w14 -.endif - movrel x3, X(gaussian_sequence) - ldr w2, [x1, #FGD_SEED] - ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] -.ifc \type, y - add x4, x1, #FGD_AR_COEFFS_Y -.else - add x4, x1, #FGD_AR_COEFFS_UV -.endif - adr x16, L(gen_grain_\type\()_tbl) - ldr w17, [x1, #FGD_AR_COEFF_LAG] - add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] - dup v31.8h, w9 // 4 + data->grain_scale_shift - sub x16, x16, w17, uxtw - neg v31.8h, v31.8h - -.ifc \type, uv_444 - cmp w13, #0 - mov w11, #0x49d8 - mov w14, #0xb524 - add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] - csel w11, w11, w14, ne -.endif - - ldr w7, [x1, #FGD_AR_COEFF_SHIFT] - mov w8, #1 - mov w10, #1 - lsl w8, w8, w7 // 1 << ar_coeff_shift - lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) - lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) - lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) - mov w5, #127 - mov w6, #-128 - -.ifc \type, uv_444 - eor w2, w2, w11 -.endif - - br x16 - -L(generate_grain_\type\()_lag0): -.ifc \type, y - mov w1, #GRAIN_HEIGHT - bl generate_grain_rows_neon -.else - dup v28.8h, w7 - ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] - movi v0.16b, #0 - movi v1.16b, #255 - ext v29.16b, v0.16b, v1.16b, #13 - ext v30.16b, v1.16b, v0.16b, #1 - neg v28.8h, v28.8h - - mov w1, #3 - bl generate_grain_rows_neon - mov w1, #GRAIN_HEIGHT-3 -1: - ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 - bl get_grain_row_neon - and v0.16b, v22.16b, v29.16b - mov v1.16b, v16.16b - bl add_uv_444_coeff_lag0_neon - mov v0.16b, v23.16b - mov v1.16b, v17.16b - mov v16.16b, v2.16b - bl add_uv_444_coeff_lag0_neon - ld1 {v26.16b}, [x19], #16 - mov v0.16b, v24.16b - mov v1.16b, v18.16b - mov v17.16b, v2.16b - bl add_uv_444_coeff_lag0_neon - add x19, x19, #2 - mov v0.16b, v25.16b - mov v1.16b, v19.16b - mov v18.16b, v2.16b - bl add_uv_444_coeff_lag0_neon - and v0.16b, v26.16b, v30.16b - mov v1.16b, v20.16b - mov v19.16b, v2.16b - bl add_uv_444_coeff_lag0_neon - mov v20.16b, v2.16b - subs w1, w1, #1 - store_grain_row v16, v17, v18, v19, v20, v21 - b.gt 1b -.endif - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag1): - ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0] - ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1] - ld1r {v29.16b}, [x4] // ar_coeffs_y[2] -.ifc \type, y - ldrsb w4, [x4, #1] // ar_coeffs_y[3] -.else - add x4, x4, #2 -.endif - - mov w1, #3 -.ifc \type, uv_444 - ld1r {v30.16b}, [x4] // ar_coeffs_uv[4] - ldursb w4, [x4, #-1] // ar_coeffs_uv[3] -.endif - bl generate_grain_rows_neon - - mov w1, #GRAIN_HEIGHT - 3 -1: - sum_\type\()_lag1 v22, v16, v16, v17, left - sum_\type\()_lag1 v23, v16, v17, v18 - sum_\type\()_lag1 v24, v17, v18, v19 - sum_\type\()_lag1 v25, v18, v19, v20 - sum_\type\()_lag1 v20, v19, v20, v21, right - get_grain_2 v21 - subs w1, w1, #1 -.ifc \type, uv_444 - add x19, x19, #2 -.endif - store_grain_row v22, v23, v24, v25, v20, v21 - mov v16.16b, v22.16b - mov v17.16b, v23.16b - mov v18.16b, v24.16b - mov v19.16b, v25.16b - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag2): - ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] - - smov w4, v30.b[10] - smov w17, v30.b[11] - - mov w1, #3 - bl generate_grain_rows_neon - - mov w1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag2_left_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_right_neon - get_grain_2 v16 - subs w1, w1, #1 -.ifc \type, uv_444 - add x19, x19, #2 
-.endif - st1 {v16.h}[0], [x0], #2 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag3): - ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] - stp x20, x21, [sp, #80] - - smov w4, v30.b[5] - smov w20, v30.b[6] - smov w21, v30.b[7] - - mov w1, #3 - bl generate_grain_rows_neon - - mov w1, #GRAIN_HEIGHT - 3 -1: - bl sum_\type\()_lag3_left_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_right_neon - get_grain_2 v16 - subs w1, w1, #1 -.ifc \type, uv_444 - add x19, x19, #2 -.endif - st1 {v16.h}[0], [x0], #2 - b.gt 1b - - ldp x20, x21, [sp, #80] - ldp d14, d15, [sp, #64] - ldp d12, d13, [sp, #48] - ldp d10, d11, [sp, #32] - ldp d8, d9, [sp, #16] - ldp x30, x19, [sp], #96 - ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) -endfunc -.endm - -gen_grain_82 y -gen_grain_82 uv_444 - -.macro set_height dst, type -.ifc \type, uv_420 - mov \dst, #SUB_GRAIN_HEIGHT-3 -.else - mov \dst, #GRAIN_HEIGHT-3 -.endif -.endm - -.macro increment_y_ptr reg, type -.ifc \type, uv_420 - add \reg, \reg, #2*GRAIN_WIDTH-(3*32) -.else - sub \reg, \reg, #3*32-GRAIN_WIDTH -.endif -.endm - -.macro gen_grain_44 type -function generate_grain_\type\()_8bpc_neon, export=1 - stp x30, x19, [sp, #-96]! - - mov w13, w3 - mov w14, #28 - add x19, x1, #3*GRAIN_WIDTH-3 - mov x1, x2 - mul w13, w13, w14 - - movrel x3, X(gaussian_sequence) - ldr w2, [x1, #FGD_SEED] - ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] - add x4, x1, #FGD_AR_COEFFS_UV - adr x16, L(gen_grain_\type\()_tbl) - ldr w17, [x1, #FGD_AR_COEFF_LAG] - add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] - dup v31.8h, w9 // 4 + data->grain_scale_shift - sub x16, x16, w17, uxtw - neg v31.8h, v31.8h - - cmp w13, #0 - mov w11, #0x49d8 - mov w14, #0xb524 - add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] - csel w11, w11, w14, ne - - ldr w7, [x1, #FGD_AR_COEFF_SHIFT] - mov w8, #1 - mov w10, #1 - lsl w8, w8, w7 // 1 << ar_coeff_shift - lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) - lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) - lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) - mov w5, #127 - mov w6, #-128 - - eor w2, w2, w11 - - br x16 - -L(generate_grain_\type\()_lag0): - dup v28.8h, w7 - ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] - movi v0.16b, #0 - movi v1.16b, #255 - ext v29.16b, v0.16b, v1.16b, #13 - ext v30.16b, v1.16b, v0.16b, #7 - neg v28.8h, v28.8h - - mov w1, #3 - bl generate_grain_rows_44_neon - set_height w1, \type -1: - bl get_grain_row_44_neon -.ifc \type, uv_420 - add x12, x19, #GRAIN_WIDTH -.endif - mov v0.16b, v29.16b - mov v1.16b, v16.16b - bl add_\type\()_coeff_lag0_neon - movi v0.16b, #255 - mov v1.16b, v17.16b - mov v16.16b, v2.16b - bl add_\type\()_coeff_lag0_neon - mov v0.16b, v30.16b - mov v1.16b, v18.16b - mov v17.16b, v2.16b - bl add_\type\()_coeff_lag0_neon - mov v18.16b, v2.16b - subs w1, w1, #1 - increment_y_ptr x19, \type - store_grain_row_44 v16, v17, v18 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag1): - ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0] - ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1] - ld1r {v29.16b}, [x4] // ar_coeffs_uv[2] - add 
x4, x4, #2 - - mov w1, #3 - ld1r {v30.16b}, [x4] // ar_coeffs_u4[4] - ldursb w4, [x4, #-1] // ar_coeffs_uv[3] - bl generate_grain_rows_44_neon - - set_height w1, \type -1: - sum_\type\()_lag1 v20, v16, v16, v17, left - sum_\type\()_lag1 v21, v16, v17, v18 - sum_\type\()_lag1 v18, v17, v18, v18, right - subs w1, w1, #1 - increment_y_ptr x19, \type - store_grain_row_44 v20, v21, v18 - mov v16.16b, v20.16b - mov v17.16b, v21.16b - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag2): - ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] - - smov w4, v30.b[10] - smov w17, v30.b[11] - - mov w1, #3 - bl generate_grain_rows_44_neon - - set_height w1, \type -1: - bl sum_\type\()_lag2_left_neon - bl sum_\type\()_lag2_mid_neon - bl sum_\type\()_lag2_right_neon - subs w1, w1, #1 - increment_y_ptr x19, \type - add x0, x0, #GRAIN_WIDTH-48 - b.gt 1b - - ldp x30, x19, [sp], #96 - ret - -L(generate_grain_\type\()_lag3): - ldr q29, [x4] // ar_coeffs_uv[0-15] - ldr q30, [x4, #16] // ar_coeffs_uv[16-24] - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] - stp x20, x21, [sp, #80] - - smov w4, v30.b[5] - smov w20, v30.b[6] - smov w21, v30.b[7] - - mov w1, #3 - bl generate_grain_rows_44_neon - - set_height w1, \type -1: - bl sum_\type\()_lag3_left_neon - bl sum_\type\()_lag3_mid_neon - bl sum_\type\()_lag3_right_neon - subs w1, w1, #1 - increment_y_ptr x19, \type - add x0, x0, #GRAIN_WIDTH-48 - b.gt 1b - - ldp x20, x21, [sp, #80] - ldp d14, d15, [sp, #64] - ldp d12, d13, [sp, #48] - ldp d10, d11, [sp, #32] - ldp d8, d9, [sp, #16] - ldp x30, x19, [sp], #96 - ret - -L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) -endfunc -.endm - -gen_grain_44 uv_420 -gen_grain_44 uv_422 - -.macro gather_interleaved dst1, dst2, src1, src2, off - umov w14, \src1[0+\off] - umov w15, \src2[8+\off] - umov w16, \src1[2+\off] - add x14, x14, x3 - umov w17, \src2[10+\off] - add x15, x15, x3 - ld1 {\dst1}[0+\off], [x14] - umov w14, \src1[4+\off] - add x16, x16, x3 - ld1 {\dst2}[8+\off], [x15] - umov w15, \src2[12+\off] - add x17, x17, x3 - ld1 {\dst1}[2+\off], [x16] - umov w16, \src1[6+\off] - add x14, x14, x3 - ld1 {\dst2}[10+\off], [x17] - umov w17, \src2[14+\off] - add x15, x15, x3 - ld1 {\dst1}[4+\off], [x14] - add x16, x16, x3 - ld1 {\dst2}[12+\off], [x15] - add x17, x17, x3 - ld1 {\dst1}[6+\off], [x16] - ld1 {\dst2}[14+\off], [x17] -.endm - -.macro gather dst1, dst2, src1, src2 - gather_interleaved \dst1, \dst2, \src1, \src2, 0 - gather_interleaved \dst2, \dst1, \src2, \src1, 0 - gather_interleaved \dst1, \dst2, \src1, \src2, 1 - gather_interleaved \dst2, \dst1, \src2, \src1, 1 -.endm - -function gather32_neon - gather v4.b, v5.b, v0.b, v1.b - ret -endfunc - -function gather16_neon - gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 - gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 - ins v4.d[1], v5.d[1] - ret -endfunc - -const overlap_coeffs_0, align=4 - .byte 27, 17, 0, 0, 0, 0, 0, 0 - .byte 17, 27, 32, 32, 32, 32, 32, 32 -endconst - -const overlap_coeffs_1, align=4 - .byte 23, 0, 0, 0, 0, 0, 0, 0 - .byte 22, 32, 32, 32, 32, 32, 32, 32 -endconst - -.macro calc_offset offx, offy, src, sx, sy - and \offy, \src, #0xF // randval & 0xF - lsr \offx, \src, #4 // randval >> 4 -.if \sy == 0 - add \offy, \offy, \offy // 2 * (randval 
& 0xF) -.endif -.if \sx == 0 - add \offx, \offx, \offx // 2 * (randval >> 4) -.endif -.endm - -.macro add_offset dst, offx, offy, src, stride - madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy - add \dst, \dst, \offx, uxtw // grain_lut += offx -.endm - -// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const int scaling_shift, -// const entry grain_lut[][GRAIN_WIDTH], -// const int offsets[][2], -// const int h, const ptrdiff_t clip, -// const ptrdiff_t type); -function fgy_32x32_8bpc_neon, export=1 - str x30, [sp, #-16]! - ldr w11, [x6, #8] // offsets[1][0] - ldr w13, [x6, #4] // offsets[0][1] - ldr w15, [x6, #12] // offsets[1][1] - ldr w6, [x6] // offsets[0][0] - ldr w8, [sp, #16] // clip - mov x9, #GRAIN_WIDTH // grain_lut stride - - neg w4, w4 - dup v29.8h, w4 // -scaling_shift - - movrel x16, overlap_coeffs_0 - - cbz w8, 1f - // clip - movi v30.16b, #16 - movi v31.16b, #235 - b 2f -1: - // no clip - movi v30.16b, #0 - movi v31.16b, #255 -2: - - ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs - - add x5, x5, #9 // grain_lut += 9 - add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride - add x5, x5, x9 // grain_lut += grain_stride - - calc_offset w11, w12, w11, 0, 0 - calc_offset w13, w14, w13, 0, 0 - calc_offset w15, w16, w15, 0, 0 - calc_offset w6, w10, w6, 0, 0 - - add_offset x12, w11, x12, x5, x9 - add_offset x14, w13, x14, x5, x9 - add_offset x16, w15, x16, x5, x9 - add_offset x5, w6, x10, x5, x9 - - ldr w11, [sp, #24] // type - adr x13, L(fgy_loop_tbl) - - add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx - add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - - tst w11, #1 - ldrh w11, [x13, w11, uxtw #1] - - add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx - - sub x11, x13, w11, uxtw - - b.eq 1f - // y overlap - dup v6.16b, v27.b[0] - dup v7.16b, v27.b[1] - mov w10, w7 // backup actual h - mov w7, #2 -1: - br x11 -endfunc - -function fgy_loop_neon -.macro fgy ox, oy -L(loop_\ox\oy): -1: - ld1 {v0.16b, v1.16b}, [x1], x2 // src -.if \ox - ld1 {v20.8b}, [x4], x9 // grain_lut old -.endif -.if \oy - ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top -.endif -.if \ox && \oy - ld1 {v21.8b}, [x8], x9 // grain_lut top old -.endif - ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut - - bl gather32_neon - -.if \ox - smull v20.8h, v20.8b, v27.8b - smlal v20.8h, v18.8b, v28.8b -.endif - -.if \oy -.if \ox - smull v21.8h, v21.8b, v27.8b - smlal v21.8h, v22.8b, v28.8b - sqrshrn v20.8b, v20.8h, #5 - sqrshrn v21.8b, v21.8h, #5 -.endif - -.if \ox - smull v16.8h, v20.8b, v7.8b -.else - smull v16.8h, v18.8b, v7.8b -.endif - smull2 v17.8h, v18.16b, v7.16b - smull v18.8h, v19.8b, v7.8b - smull2 v19.8h, v19.16b, v7.16b -.if \ox - smlal v16.8h, v21.8b, v6.8b -.else - smlal v16.8h, v22.8b, v6.8b -.endif - smlal2 v17.8h, v22.16b, v6.16b - smlal v18.8h, v23.8b, v6.8b - smlal2 v19.8h, v23.16b, v6.16b - sqrshrn v22.8b, v16.8h, #5 - sqrshrn2 v22.16b, v17.8h, #5 - sqrshrn v23.8b, v18.8h, #5 - sqrshrn2 v23.16b, v19.8h, #5 -.endif - - // sxtl of grain -.if \oy - sxtl v16.8h, v22.8b - sxtl2 v17.8h, v22.16b - sxtl v18.8h, v23.8b - sxtl2 v19.8h, v23.16b -.elseif \ox - sqrshrn v20.8b, v20.8h, #5 - sxtl2 v17.8h, v18.16b - sxtl v18.8h, v19.8b - sxtl2 v19.8h, v19.16b - sxtl v16.8h, v20.8b -.else - sxtl v16.8h, v18.8b - sxtl2 v17.8h, v18.16b - sxtl v18.8h, v19.8b - sxtl2 v19.8h, v19.16b -.endif - - uxtl v2.8h, v4.8b // 
scaling - uxtl2 v3.8h, v4.16b - uxtl v4.8h, v5.8b - uxtl2 v5.8h, v5.16b - - mul v16.8h, v16.8h, v2.8h // scaling * grain - mul v17.8h, v17.8h, v3.8h - mul v18.8h, v18.8h, v4.8h - mul v19.8h, v19.8h, v5.8h - - srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) - srshl v17.8h, v17.8h, v29.8h - srshl v18.8h, v18.8h, v29.8h - srshl v19.8h, v19.8h, v29.8h - - uaddw v16.8h, v16.8h, v0.8b // *src + noise - uaddw2 v17.8h, v17.8h, v0.16b - uaddw v18.8h, v18.8h, v1.8b - uaddw2 v19.8h, v19.8h, v1.16b - - sqxtun v0.8b, v16.8h - sqxtun2 v0.16b, v17.8h - sqxtun v1.8b, v18.8h - sqxtun2 v1.16b, v19.8h - - umax v0.16b, v0.16b, v30.16b - umax v1.16b, v1.16b, v30.16b - umin v0.16b, v0.16b, v31.16b - umin v1.16b, v1.16b, v31.16b - - subs w7, w7, #1 -.if \oy - dup v6.16b, v28.b[0] - dup v7.16b, v28.b[1] -.endif - st1 {v0.16b, v1.16b}, [x0], x2 // dst - b.gt 1b - -.if \oy - cmp w10, #2 - sub w7, w10, #2 // restore actual remaining h - b.gt L(loop_\ox\()0) -.endif - ldr x30, [sp], #16 - ret -.endm - - fgy 0, 0 - fgy 0, 1 - fgy 1, 0 - fgy 1, 1 - -L(fgy_loop_tbl): - .hword L(fgy_loop_tbl) - L(loop_00) - .hword L(fgy_loop_tbl) - L(loop_01) - .hword L(fgy_loop_tbl) - L(loop_10) - .hword L(fgy_loop_tbl) - L(loop_11) -endfunc - -// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, -// const pixel *const src, -// const ptrdiff_t stride, -// const uint8_t scaling[SCALING_SIZE], -// const Dav1dFilmGrainData *const data, -// const entry grain_lut[][GRAIN_WIDTH], -// const pixel *const luma_row, -// const ptrdiff_t luma_stride, -// const int offsets[][2], -// const ptrdiff_t h, const ptrdiff_t uv, -// const ptrdiff_t is_id, -// const ptrdiff_t type); -.macro fguv layout, sx, sy -function fguv_32x32_\layout\()_8bpc_neon, export=1 - str x30, [sp, #-32]! - str d8, [sp, #16] - ldp x8, x9, [sp, #32] // offsets, h - ldp x10, x11, [sp, #48] // uv, is_id - - ldr w13, [x4, #FGD_SCALING_SHIFT] - ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] - neg w13, w13 // -scaling_shift - - // !csfl - add x10, x4, x10, lsl #2 // + 4*uv - add x14, x10, #FGD_UV_LUMA_MULT - add x15, x10, #FGD_UV_MULT - add x10, x10, #FGD_UV_OFFSET - ld1 {v8.h}[0], [x14] // uv_luma_mult - ld1r {v24.8h}, [x10] // uv_offset - ld1 {v8.h}[1], [x15] // uv_mult - - dup v29.8h, w13 // -scaling_shift - - cbz w12, 1f - // clip - movi v30.16b, #16 - movi v31.16b, #240 - cbz w11, 2f - // is_id - movi v31.16b, #235 - b 2f -1: - // no clip - movi v30.16b, #0 - movi v31.16b, #255 -2: - - ldr w12, [x8, #8] // offsets[1][0] - ldr w14, [x8, #4] // offsets[0][1] - ldr w16, [x8, #12] // offsets[1][1] - ldr w8, [x8] // offsets[0][0] - - mov x10, #GRAIN_WIDTH // grain_lut stride - - add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 -.if \sy - add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride - add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride -.else - add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride - add x5, x5, x10 // grain_lut += grain_stride -.endif - - calc_offset w12, w13, w12, \sx, \sy - calc_offset w14, w15, w14, \sx, \sy - calc_offset w16, w17, w16, \sx, \sy - calc_offset w8, w11, w8, \sx, \sy - - add_offset x13, w12, x13, x5, x10 - add_offset x15, w14, x15, x5, x10 - add_offset x17, w16, x17, x5, x10 - add_offset x5, w8, x11, x5, x10 - - add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - - ldr w13, 
[sp, #64] // type - - movrel x16, overlap_coeffs_\sx - adr x14, L(fguv_loop_sx\sx\()_tbl) - - ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs - tst w13, #1 - ldrh w13, [x14, w13, uxtw #1] - - b.eq 1f - // y overlap - sub w12, w9, #(2 >> \sy) // backup remaining h - mov w9, #(2 >> \sy) - -1: - sub x13, x14, w13, uxtw - -.if \sy - movi v25.16b, #23 - movi v26.16b, #22 -.else - movi v25.16b, #27 - movi v26.16b, #17 -.endif - -.if \sy - add x7, x7, x7 // luma_stride *= 2 -.endif - - br x13 -endfunc -.endm - -fguv 420, 1, 1 -fguv 422, 1, 0 -fguv 444, 0, 0 - -function fguv_loop_sx0_neon -.macro fguv_loop_sx0 csfl, ox, oy -L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): -1: - ld1 {v0.16b, v1.16b}, [x6], x7 // luma - ld1 {v6.16b, v7.16b}, [x1], x2 // src -.if \ox - ld1 {v20.8b}, [x4], x10 // grain_lut old -.endif -.if \oy - ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top -.endif -.if \ox && \oy - ld1 {v21.8b}, [x11], x10 // grain_lut top old -.endif - ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut - -.if !\csfl - uxtl v2.8h, v0.8b - uxtl2 v3.8h, v0.16b - uxtl v4.8h, v1.8b - uxtl2 v5.8h, v1.16b - uxtl v0.8h, v6.8b - uxtl2 v1.8h, v6.16b - uxtl v16.8h, v7.8b - uxtl2 v17.8h, v7.16b - mul v2.8h, v2.8h, v8.h[0] - mul v3.8h, v3.8h, v8.h[0] - mul v4.8h, v4.8h, v8.h[0] - mul v5.8h, v5.8h, v8.h[0] - mul v0.8h, v0.8h, v8.h[1] - mul v1.8h, v1.8h, v8.h[1] - mul v16.8h, v16.8h, v8.h[1] - mul v17.8h, v17.8h, v8.h[1] - sqadd v2.8h, v2.8h, v0.8h - sqadd v3.8h, v3.8h, v1.8h - sqadd v4.8h, v4.8h, v16.8h - sqadd v5.8h, v5.8h, v17.8h - sshr v2.8h, v2.8h, #6 - sshr v3.8h, v3.8h, #6 - sshr v4.8h, v4.8h, #6 - sshr v5.8h, v5.8h, #6 - add v2.8h, v2.8h, v24.8h - add v3.8h, v3.8h, v24.8h - add v4.8h, v4.8h, v24.8h - add v5.8h, v5.8h, v24.8h - sqxtun v0.8b, v2.8h - sqxtun2 v0.16b, v3.8h - sqxtun v1.8b, v4.8h - sqxtun2 v1.16b, v5.8h -.endif - - bl gather32_neon - -.if \ox - smull v20.8h, v20.8b, v27.8b - smlal v20.8h, v18.8b, v28.8b -.endif - -.if \oy -.if \ox - smull v21.8h, v21.8b, v27.8b - smlal v21.8h, v22.8b, v28.8b - sqrshrn v20.8b, v20.8h, #5 - sqrshrn v21.8b, v21.8h, #5 -.endif - -.if \ox - smull v16.8h, v20.8b, v26.8b -.else - smull v16.8h, v18.8b, v26.8b -.endif - smull2 v17.8h, v18.16b, v26.16b - smull v18.8h, v19.8b, v26.8b - smull2 v19.8h, v19.16b, v26.16b -.if \ox - smlal v16.8h, v21.8b, v25.8b -.else - smlal v16.8h, v22.8b, v25.8b -.endif - smlal2 v17.8h, v22.16b, v25.16b - smlal v18.8h, v23.8b, v25.8b - smlal2 v19.8h, v23.16b, v25.16b - sqrshrn v22.8b, v16.8h, #5 - sqrshrn2 v22.16b, v17.8h, #5 - sqrshrn v23.8b, v18.8h, #5 - sqrshrn2 v23.16b, v19.8h, #5 -.endif - - // sxtl of grain -.if \oy - sxtl v16.8h, v22.8b - sxtl2 v17.8h, v22.16b - sxtl v18.8h, v23.8b - sxtl2 v19.8h, v23.16b -.elseif \ox - sqrshrn v20.8b, v20.8h, #5 - sxtl2 v17.8h, v18.16b - sxtl v18.8h, v19.8b - sxtl2 v19.8h, v19.16b - sxtl v16.8h, v20.8b -.else - sxtl v16.8h, v18.8b - sxtl2 v17.8h, v18.16b - sxtl v18.8h, v19.8b - sxtl2 v19.8h, v19.16b -.endif - - uxtl v2.8h, v4.8b // scaling - uxtl2 v3.8h, v4.16b - uxtl v4.8h, v5.8b - uxtl2 v5.8h, v5.16b - - mul v16.8h, v16.8h, v2.8h // scaling * grain - mul v17.8h, v17.8h, v3.8h - mul v18.8h, v18.8h, v4.8h - mul v19.8h, v19.8h, v5.8h - - srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) - srshl v17.8h, v17.8h, v29.8h - srshl v18.8h, v18.8h, v29.8h - srshl v19.8h, v19.8h, v29.8h - - uaddw v16.8h, v16.8h, v6.8b // *src + noise - uaddw2 v17.8h, v17.8h, v6.16b - uaddw v18.8h, v18.8h, v7.8b - uaddw2 v19.8h, v19.8h, v7.16b - - sqxtun v0.8b, v16.8h - sqxtun2 v0.16b, v17.8h - sqxtun v1.8b, v18.8h - 
sqxtun2 v1.16b, v19.8h - - umax v0.16b, v0.16b, v30.16b - umax v1.16b, v1.16b, v30.16b - umin v0.16b, v0.16b, v31.16b - umin v1.16b, v1.16b, v31.16b - - subs w9, w9, #1 -.if \oy - dup v25.16b, v28.b[0] - dup v26.16b, v28.b[1] -.endif - st1 {v0.16b, v1.16b}, [x0], x2 // dst - b.gt 1b - -.if \oy - cmp w12, #0 - mov w9, w12 // restore actual remaining h - b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) -.endif - b 9f -.endm - fguv_loop_sx0 0, 0, 0 - fguv_loop_sx0 0, 0, 1 - fguv_loop_sx0 0, 1, 0 - fguv_loop_sx0 0, 1, 1 - fguv_loop_sx0 1, 0, 0 - fguv_loop_sx0 1, 0, 1 - fguv_loop_sx0 1, 1, 0 - fguv_loop_sx0 1, 1, 1 - -9: - ldr d8, [sp, #16] - ldr x30, [sp], #32 - ret - -L(fguv_loop_sx0_tbl): - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) -endfunc - -function fguv_loop_sx1_neon -.macro fguv_loop_sx1 csfl, ox, oy -L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): -1: - ld1 {v0.16b, v1.16b}, [x6], x7 // luma - ld1 {v6.16b}, [x1], x2 // src -.if \ox - ld1 {v20.8b}, [x4], x10 // grain_lut old -.endif -.if \oy - ld1 {v22.16b}, [x8], x10 // grain_lut top -.endif -.if \ox && \oy - ld1 {v21.8b}, [x11], x10 // grain_lut top old -.endif - ld1 {v18.16b}, [x5], x10 // grain_lut - - uaddlp v2.8h, v0.16b - uaddlp v3.8h, v1.16b -.if \csfl - rshrn v0.8b, v2.8h, #1 - rshrn2 v0.16b, v3.8h, #1 -.else - urshr v2.8h, v2.8h, #1 - urshr v3.8h, v3.8h, #1 - uxtl v0.8h, v6.8b - uxtl2 v1.8h, v6.16b - mul v2.8h, v2.8h, v8.h[0] - mul v3.8h, v3.8h, v8.h[0] - mul v0.8h, v0.8h, v8.h[1] - mul v1.8h, v1.8h, v8.h[1] - sqadd v2.8h, v2.8h, v0.8h - sqadd v3.8h, v3.8h, v1.8h - sshr v2.8h, v2.8h, #6 - sshr v3.8h, v3.8h, #6 - add v2.8h, v2.8h, v24.8h - add v3.8h, v3.8h, v24.8h - sqxtun v0.8b, v2.8h - sqxtun2 v0.16b, v3.8h -.endif - - bl gather16_neon - -.if \ox - smull v20.8h, v20.8b, v27.8b - smlal v20.8h, v18.8b, v28.8b -.endif - -.if \oy -.if \ox - smull v21.8h, v21.8b, v27.8b - smlal v21.8h, v22.8b, v28.8b - sqrshrn v20.8b, v20.8h, #5 - sqrshrn v21.8b, v21.8h, #5 -.endif - -.if \ox - smull v16.8h, v20.8b, v26.8b -.else - smull v16.8h, v18.8b, v26.8b -.endif - smull2 v17.8h, v18.16b, v26.16b -.if \ox - smlal v16.8h, v21.8b, v25.8b -.else - smlal v16.8h, v22.8b, v25.8b -.endif - smlal2 v17.8h, v22.16b, v25.16b - sqrshrn v22.8b, v16.8h, #5 - sqrshrn2 v22.16b, v17.8h, #5 -.endif - - // sxtl of grain -.if \oy - sxtl v16.8h, v22.8b - sxtl2 v17.8h, v22.16b -.elseif \ox - sqrshrn v20.8b, v20.8h, #5 - sxtl2 v17.8h, v18.16b - sxtl v16.8h, v20.8b -.else - sxtl v16.8h, v18.8b - sxtl2 v17.8h, v18.16b -.endif - - uxtl v2.8h, v4.8b // scaling - uxtl2 v3.8h, v4.16b - - mul v16.8h, v16.8h, v2.8h // scaling * grain - mul v17.8h, v17.8h, v3.8h - - srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) - srshl v17.8h, v17.8h, v29.8h - - uaddw v16.8h, v16.8h, v6.8b // *src + noise - uaddw2 v17.8h, v17.8h, v6.16b - - sqxtun v0.8b, v16.8h - sqxtun2 v0.16b, v17.8h - - umax v0.16b, v0.16b, v30.16b - umin v0.16b, v0.16b, v31.16b - -.if \oy - mov v16.16b, v25.16b -.endif - subs w9, w9, #1 -.if \oy - mov v25.16b, v26.16b - mov v26.16b, v16.16b -.endif - st1 {v0.16b}, [x0], x2 // dst - b.gt 1b - -.if \oy - cmp w12, #0 - mov w9, w12 // restore actual 
remaining h - b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) -.endif - - b 9f -.endm - fguv_loop_sx1 0, 0, 0 - fguv_loop_sx1 0, 0, 1 - fguv_loop_sx1 0, 1, 0 - fguv_loop_sx1 0, 1, 1 - fguv_loop_sx1 1, 0, 0 - fguv_loop_sx1 1, 0, 1 - fguv_loop_sx1 1, 1, 0 - fguv_loop_sx1 1, 1, 1 - -9: - ldr d8, [sp, #16] - ldr x30, [sp], #32 - ret - -L(fguv_loop_sx1_tbl): - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) -endfunc diff -Nru dav1d-0.9.2/src/arm/64/filmgrain.S dav1d-1.0.0/src/arm/64/filmgrain.S --- dav1d-0.9.2/src/arm/64/filmgrain.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/arm/64/filmgrain.S 2022-03-18 14:31:55.974356000 +0000 @@ -0,0 +1,2010 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro get_grain_row r0, r1, r2, r3, r4, r5 + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r0\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r0\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r1\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r1\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r2\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r2\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r3\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r3\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r4\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r4\().16b, \r5\().8h + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {\r5\().h}[0], [x14] + ld1 {\r5\().h}[1], [x15] + srshl v0.4h, \r5\().4h, v31.4h + xtn \r5\().8b, v0.8h +.endm + +.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b,\r3\().16b}, [x0], #32 + st1 {\r4\().16b}, [x0], #16 + st1 {\r5\().h}[0], [x0], #2 +.endm + +.macro get_grain_row_44 r0, r1, r2 + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r0\().8b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn2 \r0\().16b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r1\().8b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn2 \r1\().16b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r2\().8b, \r2\().8h + + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + 
read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + xtn2 \r2\().16b, v0.8h +.endm + +.macro store_grain_row_44 r0, r1, r2 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b}, [x0] + add x0, x0, #GRAIN_WIDTH-32 +.endm + +function get_grain_2_neon + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + ld1 {v0.h}[1], [x15] + srshl v0.4h, v0.4h, v31.4h + xtn v0.8b, v0.8h + ret +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, v0 + mov \dst\().8b, v0.8b +.endif +.endm + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #1 +.if \n == 1 + madd w11, w14, w4, w11 // sum (above) + *coeff * prev output +.elseif \n == 2 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 +.else + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 +.endif + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + add w12, w12, w10 // 1 << (4 + grain_scale_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + asr w12, w12, w9 // >> (4 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.b[15], w14 + b.gt 1b + ret +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + smull v2.8h, v3.8b, v28.8b + smull2 v3.8h, v3.16b, v28.16b + smull v4.8h, v0.8b, v27.8b + smull2 v5.8h, v0.16b, v27.16b + smull v6.8h, v1.8b, v29.8b + smull2 v7.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v4.4h + saddl2 v1.4s, v2.8h, v4.8h + saddl v2.4s, v3.4h, v5.4h + saddl2 v3.4s, v3.8h, v5.8h + saddw v4.4s, v0.4s, v6.4h + saddw2 v5.4s, v1.4s, v6.8h + saddw v6.4s, v2.4s, v7.4h + saddw2 v7.4s, v3.4s, v7.8h + ret +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff + bl sum_\lag\()_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH + ld1 {v22.16b, v23.16b}, [x19], #32 + ld1 {v24.16b, v25.16b}, [x12] + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + saddlp v24.8h, v24.16b + saddlp v25.8h, v25.16b + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + rshrn v0.8b, v22.8h, #2 + rshrn2 v0.16b, v23.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.16b, v23.16b}, [x19], #32 + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + rshrn v0.8b, v22.8h, #1 + rshrn2 v0.16b, v23.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.16b}, [x19], #16 +.endif +.if \uv_layout +.ifnb \uv_coeff + dup v1.16b, \uv_coeff + smull v2.8h, v0.8b, v1.8b + smull2 v3.8h, v0.16b, v1.16b +.else + smull v2.8h, v0.8b, v30.8b + smull2 v3.8h, v0.16b, v30.16b +.endif + saddw v4.4s, v4.4s, v2.4h + saddw2 v5.4s, v5.4s, v2.8h + saddw v6.4s, v6.4s, v3.4h + saddw2 v7.4s, v7.4s, v3.8h +.endif +.if \uv_layout && \elems == 16 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 15 + b 
sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + xtn2 v0.16b, v0.8h + ext v4.16b, v4.16b, v4.16b, #12 +.ifc \lag, lag3 + smov w17, v0.b[13] +.endif +.ifnc \lag, lag1 + smov w16, v0.b[14] +.endif + smov w14, v0.b[15] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b + mov w15, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + mov v1.16b, v6.16b +.if \elems == 9 + mov w15, #1 + bl output_\lag\()_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + xtn v1.8b, v1.8h + ext v0.16b, v0.16b, v1.16b, #7 +.else + mov w15, #4 + bl output_\lag\()_neon + + increment_seed 4, shift=0 + mov v1.16b, v7.16b + +.ifc \edge, right + mov w15, #3 + bl output_\lag\()_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #1 +.else + mov w15, #4 + bl output_\lag\()_neon +.endif +.endif +.if \store + st1 {v0.16b}, [x0], #16 +.endif + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag1_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+ sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems, store=0 +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 15 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 15 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 9 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 9 + +.macro sum_lag1 type, dst, left, mid, right, edge=mid + mov v3.16b, \mid\().16b + ext v0.16b, \left\().16b, \mid\().16b, #15 + ext v1.16b, \mid\().16b, \right\().16b, #1 + bl sum_\type\()_lag1_\edge\()_neon + mov \dst\().16b, v0.16b +.endm + +.macro sum_y_lag1 dst, left, mid, right, edge=mid + sum_lag1 y, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_444, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_422, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_420, \dst, \left, \mid, \right, \edge +.endm + + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v18.16b}, [x12] // load top right + ld1 {v21.16b}, [x13] + + ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid + dup v26.16b, v30.b[0] + ext v23.16b, v16.16b, v17.16b, #15 + dup v27.16b, v30.b[1] + ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v28.16b, v30.b[3] + ext v1.16b, v17.16b, v18.16b, #2 + dup v29.16b, v30.b[4] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v4.8h, v23.8b, v27.8b + smull2 v5.8h, v23.16b, v27.16b + smull v6.8h, v0.8b, v28.8b + smull2 v7.8h, v0.16b, v28.16b + smull v0.8h, v1.8b, v29.8b + smull2 v1.8h, v1.16b, v29.16b + saddl v22.4s, v2.4h, v4.4h + saddl2 v23.4s, v2.8h, v4.8h + saddl v26.4s, v3.4h, v5.4h + saddl2 v27.4s, v3.8h, v5.8h + saddl v2.4s, v0.4h, v6.4h + saddl2 v3.4s, v0.8h, v6.8h + saddl v6.4s, v1.4h, v7.4h + saddl2 v7.4s, v1.8h, v7.8h + add v4.4s, v22.4s, v2.4s + add v5.4s, v23.4s, v3.4s + add v6.4s, v26.4s, v6.4s + add v7.4s, v27.4s, v7.4s + + ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid + dup v26.16b, v30.b[5] + ext v23.16b, v19.16b, v20.16b, #15 + dup v27.16b, v30.b[6] + ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v28.16b, v30.b[8] + ext v1.16b, v20.16b, v21.16b, #2 + dup v29.16b, v30.b[9] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v22.8h, v23.8b, v27.8b + smull2 v23.8h, v23.16b, v27.16b + smull v26.8h, v0.8b, v28.8b + smull2 v27.8h, v0.16b, v28.16b + smull v28.8h, v1.8b, v29.8b + smull2 v29.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v22.4h + saddl2 v1.4s, v2.8h, v22.8h + saddl v2.4s, v3.4h, v23.4h + saddl2 v3.4s, v3.8h, v23.8h + saddl v22.4s, v26.4h, v28.4h + saddl2 v23.4s, v26.8h, v28.8h + saddl v26.4s, v27.4h, v29.4h + saddl2 v27.4s, v27.8h, v29.8h + add v0.4s, v0.4s, v22.4s + add v1.4s, v1.4s, v23.4s + add v2.4s, v2.4s, v26.4s + add v3.4s, v3.4s, v27.4s + dup v26.16b, v30.b[2] + dup v27.16b, v30.b[7] + smull v22.8h, v17.8b, v26.8b + smull2 v23.8h, v17.16b, v26.16b + smull v24.8h, v20.8b, v27.8b + smull2 v25.8h, v20.16b, v27.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + saddl v0.4s, v22.4h, v24.4h + saddl2 v1.4s, v22.8h, v24.8h + saddl v2.4s, v23.4h, v25.4h + saddl2 v3.4s, 
v23.8h, v25.8h + mov v19.16b, v20.16b + mov v20.16b, v21.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag2_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v17.16b}, [x12] // load the previous block right above + ld1 {v20.16b}, [x13] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 15 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 15 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 9 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 9 + + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH - 16 + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v15.16b}, [x11] // load top right + ld1 {v18.16b}, [x12] + ld1 {v21.16b}, [x13] + + ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid + dup v22.16b, v29.b[0] + ext v9.16b, v13.16b, v14.16b, #14 + dup v23.16b, v29.b[1] + ext v10.16b, v13.16b, v14.16b, #15 + dup v24.16b, v29.b[2] + dup v25.16b, v29.b[3] + ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right + dup v26.16b, v29.b[4] + ext v12.16b, v14.16b, v15.16b, #2 + dup v27.16b, v29.b[5] + ext v13.16b, v14.16b, v15.16b, #3 + dup v28.16b, v29.b[6] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v14.8b, v25.8b + smull2 v13.8h, v14.16b, v25.16b + add v4.4s, v22.4s, v0.4s + add v5.4s, v23.4s, v1.4s + add v6.4s, v24.4s, v2.4s + add v7.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid + dup v22.16b, v29.b[7] + ext v9.16b, v16.16b, v17.16b, #14 + dup v23.16b, v29.b[8] + ext v10.16b, v16.16b, v17.16b, #15 + dup v24.16b, v29.b[9] + dup v25.16b, v29.b[10] + ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v26.16b, v29.b[11] + ext v12.16b, v17.16b, v18.16b, #2 + dup v27.16b, v29.b[12] + ext v13.16b, v17.16b, v18.16b, #3 + dup v28.16b, v29.b[13] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + 
saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v17.8b, v25.8b + smull2 v13.8h, v17.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid + dup v22.16b, v29.b[14] + ext v9.16b, v19.16b, v20.16b, #14 + dup v23.16b, v29.b[15] + ext v10.16b, v19.16b, v20.16b, #15 + dup v24.16b, v30.b[0] + dup v25.16b, v30.b[1] + ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v26.16b, v30.b[2] + ext v12.16b, v20.16b, v21.16b, #2 + dup v27.16b, v30.b[3] + ext v13.16b, v20.16b, v21.16b, #3 + dup v28.16b, v30.b[4] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v20.8b, v25.8b + smull2 v19.8h, v20.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + mov v13.16b, v14.16b + mov v14.16b, v15.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + mov v16.16b, v17.16b + mov v17.16b, v18.16b + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v19.4h + saddw2 v7.4s, v7.4s, v19.8h + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag3_\edge\()_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v14.16b}, [x11] // load the previous block right above + ld1 {v17.16b}, [x12] + ld1 {v20.16b}, [x13] +.endif + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 15 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 15 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 9 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 9 + +function generate_grain_rows_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + get_grain_row v16, v17, v18, v19, v20, v21 + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function generate_grain_rows_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! +1: + get_grain_row_44 v16, v17, v18 + subs w1, w1, #1 + store_grain_row_44 v16, v17, v18 + b.gt 1b + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function get_grain_row_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + get_grain_row v16, v17, v18, v19, v20, v21 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function get_grain_row_44_neon + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! + get_grain_row_44 v16, v17, v18 + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +endfunc + +function add_uv_444_coeff_lag0_neon +add_coeff_lag0_start: + smull v2.8h, v0.8b, v27.8b + smull2 v3.8h, v0.16b, v27.16b + srshl v2.8h, v2.8h, v28.8h + srshl v3.8h, v3.8h, v28.8h + saddw v2.8h, v2.8h, v1.8b + saddw2 v3.8h, v3.8h, v1.16b + sqxtn v2.8b, v2.8h + sqxtn2 v2.16b, v3.8h + ret +endfunc + +function add_uv_420_coeff_lag0_neon + ld1 {v4.16b, v5.16b}, [x19], #32 + ld1 {v6.16b, v7.16b}, [x12], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + saddlp v6.8h, v6.16b + saddlp v7.8h, v7.16b + add v4.8h, v4.8h, v6.8h + add v5.8h, v5.8h, v7.8h + rshrn v4.8b, v4.8h, #2 + rshrn2 v4.16b, v5.8h, #2 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +function add_uv_422_coeff_lag0_neon + ld1 {v4.16b, v5.16b}, [x19], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + rshrn v4.8b, v4.8h, #1 + rshrn2 v4.16b, v5.8h, #1 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
+ +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH + mov x1, x2 + mul w13, w13, w14 +.endif + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #1 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 + bl get_grain_row_neon + and v0.16b, v22.16b, v29.16b + mov v1.16b, v16.16b + bl add_uv_444_coeff_lag0_neon + mov v0.16b, v23.16b + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + ld1 {v26.16b}, [x19], #16 + mov v0.16b, v24.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + add x19, x19, #2 + mov v0.16b, v25.16b + mov v1.16b, v19.16b + mov v18.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + and v0.16b, v26.16b, v30.16b + mov v1.16b, v20.16b + mov v19.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + mov v20.16b, v2.16b + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.16b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[3] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.16b}, [x4] // ar_coeffs_uv[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + sum_\type\()_lag1 v22, v16, v16, v17, left + sum_\type\()_lag1 v23, v16, v17, v18 + sum_\type\()_lag1 v24, v17, v18, v19 + sum_\type\()_lag1 v25, v18, v19, v20 + sum_\type\()_lag1 v20, v19, v20, v21, right + get_grain_2 v21 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + store_grain_row v22, v23, v24, v25, v20, v21 + mov v16.16b, v22.16b + mov v17.16b, v23.16b + mov v18.16b, v24.16b + mov v19.16b, v25.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl 
sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + AARCH64_VALID_JUMP_TARGET + ld1 {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH-(3*32) +.else + sub \reg, \reg, #3*32-GRAIN_WIDTH +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x30, x19, [sp, #-96]! 
+ + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH-3 + mov x1, x2 + mul w13, w13, w14 + + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + + eor w2, w2, w11 + + br x16 + +L(generate_grain_\type\()_lag0): + AARCH64_VALID_JUMP_TARGET + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #7 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + bl get_grain_row_44_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH +.endif + mov v0.16b, v29.16b + mov v1.16b, v16.16b + bl add_\type\()_coeff_lag0_neon + movi v0.16b, #255 + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_\type\()_coeff_lag0_neon + mov v0.16b, v30.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_\type\()_coeff_lag0_neon + mov v18.16b, v2.16b + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v16, v17, v18 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag1): + AARCH64_VALID_JUMP_TARGET + ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.16b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.16b}, [x4] // ar_coeffs_u4[4] + ldursb w4, [x4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + sum_\type\()_lag1 v20, v16, v16, v17, left + sum_\type\()_lag1 v21, v16, v17, v18 + sum_\type\()_lag1 v18, v17, v18, v18, right + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v20, v21, v18 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag2): + AARCH64_VALID_JUMP_TARGET + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH-48 + b.gt 1b + + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(generate_grain_\type\()_lag3): + AARCH64_VALID_JUMP_TARGET + ldr q29, [x4] // ar_coeffs_uv[0-15] + ldr q30, [x4, #16] // ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, 
#GRAIN_WIDTH-48 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0+\off] + umov w15, \src2[8+\off] + umov w16, \src1[2+\off] + add x14, x14, x3 + umov w17, \src2[10+\off] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4+\off] + add x16, x16, x3 + ld1 {\dst2}[8+\off], [x15] + umov w15, \src2[12+\off] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6+\off] + add x14, x14, x3 + ld1 {\dst2}[10+\off], [x17] + umov w17, \src2[14+\off] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[12+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[14+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2 + gather_interleaved \dst1, \dst2, \src1, \src2, 0 + gather_interleaved \dst2, \dst1, \src2, \src1, 0 + gather_interleaved \dst1, \dst2, \src1, \src2, 1 + gather_interleaved \dst2, \dst1, \src2, \src1, 1 +.endm + +function gather32_neon + gather v4.b, v5.b, v0.b, v1.b + ret +endfunc + +function gather16_neon + gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 + gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 + ins v4.d[1], v5.d[1] + ret +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-16]! 
+ ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w6, [x6] // offsets[0][0] + ldr w8, [sp, #16] // clip + mov x9, #GRAIN_WIDTH // grain_lut stride + + neg w4, w4 + dup v29.8h, w4 // -scaling_shift + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + + add x5, x5, #9 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #24] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v6.16b, v27.b[0] + dup v7.16b, v27.b[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x8], x9 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v7.8b +.else + smull v16.8h, v18.8b, v7.8b +.endif + smull2 v17.8h, v18.16b, v7.16b + smull v18.8h, v19.8b, v7.8b + smull2 v19.8h, v19.16b, v7.16b +.if \ox + smlal v16.8h, v21.8b, v6.8b +.else + smlal v16.8h, v22.8b, v6.8b +.endif + smlal2 v17.8h, v22.16b, v6.16b + smlal v18.8h, v23.8b, v6.8b + smlal2 v19.8h, v23.16b, v6.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v0.8b // *src + noise + uaddw2 v17.8h, v17.8h, v0.16b + uaddw v18.8h, v18.8h, v1.8b + uaddw2 v19.8h, v19.8h, v1.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax 
v0.16b, v0.16b, v30.16b + umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w7, w7, #1 +.if \oy + dup v6.16b, v28.b[0] + dup v7.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox\()0) +.endif + ldr x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_8bpc_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + str x30, [sp, #-32]! + str d8, [sp, #16] + ldp x8, x9, [sp, #32] // offsets, h + ldp x10, x11, [sp, #48] // uv, is_id + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg w13, w13 // -scaling_shift + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + ld1 {v8.h}[0], [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1 {v8.h}[1], [x15] // uv_mult + + dup v29.8h, w13 // -scaling_shift + + cbz w12, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #240 + cbz w11, 2f + // is_id + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH // grain_lut stride + + add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #64] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx\()_tbl) + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.16b, #23 + movi v26.16b, #22 +.else + movi v25.16b, #27 + movi v26.16b, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + 
+function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b, v7.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut + +.if !\csfl + uxtl v2.8h, v0.8b + uxtl2 v3.8h, v0.16b + uxtl v4.8h, v1.8b + uxtl2 v5.8h, v1.16b + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + uxtl v16.8h, v7.8b + uxtl2 v17.8h, v7.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v4.8h, v4.8h, v8.h[0] + mul v5.8h, v5.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + mul v16.8h, v16.8h, v8.h[1] + mul v17.8h, v17.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v17.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + sshr v4.8h, v4.8h, #6 + sshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + add v4.8h, v4.8h, v24.8h + add v5.8h, v5.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h + sqxtun v1.8b, v4.8h + sqxtun2 v1.16b, v5.8h +.endif + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b + smull v18.8h, v19.8b, v26.8b + smull2 v19.8h, v19.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + smlal v18.8h, v23.8b, v25.8b + smlal2 v19.8h, v23.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + uaddw v18.8h, v18.8h, v7.8b + uaddw2 v19.8h, v19.8h, v7.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax v0.16b, v0.16b, v30.16b + umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w9, w9, #1 +.if \oy + dup v25.16b, v28.b[0] + dup v26.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + 
fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldr d8, [sp, #16] + ldr x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): + AARCH64_VALID_JUMP_TARGET +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b}, [x5], x10 // grain_lut + + uaddlp v2.8h, v0.16b + uaddlp v3.8h, v1.16b +.if \csfl + rshrn v0.8b, v2.8h, #1 + rshrn2 v0.16b, v3.8h, #1 +.else + urshr v2.8h, v2.8h, #1 + urshr v3.8h, v3.8h, #1 + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h +.endif + + bl gather16_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + + umax v0.16b, v0.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldr d8, [sp, #16] + ldr x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword 
L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff -Nru dav1d-0.9.2/src/arm/64/ipred16.S dav1d-1.0.0/src/arm/64/ipred16.S --- dav1d-0.9.2/src/arm/64/ipred16.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/ipred16.S 2022-03-18 14:31:55.974356000 +0000 @@ -46,6 +46,7 @@ urshr v0.8h, v0.8h, #1 br x5 4: + AARCH64_VALID_JUMP_TARGET st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 @@ -54,6 +55,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 @@ -62,6 +64,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 16: st1 {v0.8h, v1.8h}, [x0], x1 @@ -72,6 +75,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b @@ -84,6 +88,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b @@ -124,6 +129,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] 4: st1 {v0.4h}, [x0], x1 @@ -134,6 +140,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] 8: st1 {v0.8h}, [x0], x1 @@ -144,6 +151,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] 16: st1 {v0.8h, v1.8h}, [x0], x1 @@ -154,6 +162,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 @@ -164,6 +173,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 sub x1, x1, #64 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] @@ -204,6 +214,7 @@ lsl x1, x1, #1 br x5 4: + AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.4h}, [x0], x1 st1 {v2.4h}, [x6], x1 @@ -213,6 +224,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 @@ -222,6 +234,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -235,6 +248,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -252,6 +266,7 @@ b.gt 32b ret 64: + AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -300,6 +315,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 @@ -313,6 +329,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 @@ -326,6 +343,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h @@ -341,6 +359,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h @@ -360,6 +379,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] @@ -417,12 +437,14 @@ br x5 L(ipred_dc_left_h4): + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w4): + 
AARCH64_VALID_JUMP_TARGET st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 @@ -432,12 +454,14 @@ ret L(ipred_dc_left_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w8): + AARCH64_VALID_JUMP_TARGET st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 @@ -447,6 +471,7 @@ ret L(ipred_dc_left_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h @@ -455,6 +480,7 @@ dup v1.8h, v2.h[0] br x3 L(ipred_dc_left_w16): + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 1: st1 {v0.8h, v1.8h}, [x0], x1 @@ -466,6 +492,7 @@ ret L(ipred_dc_left_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h @@ -476,6 +503,7 @@ dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w32): + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b @@ -489,6 +517,7 @@ ret L(ipred_dc_left_h64): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] @@ -503,6 +532,7 @@ dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w64): + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b @@ -560,11 +590,13 @@ br x5 L(ipred_dc_h4): + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x3 L(ipred_dc_w4): + AARCH64_VALID_JUMP_TARGET ld1 {v1.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.4h @@ -592,11 +624,13 @@ ret L(ipred_dc_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x3 L(ipred_dc_w8): + AARCH64_VALID_JUMP_TARGET ld1 {v1.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.8h @@ -624,12 +658,14 @@ ret L(ipred_dc_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2], #32 addp v0.8h, v0.8h, v1.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w16): + AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -659,6 +695,7 @@ ret L(ipred_dc_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h @@ -667,6 +704,7 @@ uaddlv s0, v0.8h br x3 L(ipred_dc_w32): + AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -700,6 +738,7 @@ ret L(ipred_dc_h64): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 @@ -713,6 +752,7 @@ uaddlv s0, v0.8h br x3 L(ipred_dc_w64): + AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -786,6 +826,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] sub v6.8h, v5.8h, v4.8h // top - topleft 4: @@ -821,6 +862,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET ld1 {v5.8h}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 @@ -920,6 +962,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v6.2d}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #8 @@ -963,6 +1006,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor sub x2, x2, #8 @@ -1024,6 +1068,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw #1 sub x1, x1, w3, uxtw #1 ld1r {v5.8h}, [x12] // right @@ -1120,6 +1165,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r 
{v6.2d}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 4: @@ -1140,6 +1186,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 8: @@ -1166,6 +1213,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 @@ -1243,6 +1291,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 @@ -1265,6 +1314,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 @@ -1293,6 +1343,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET sub x2, x2, #8 mov x7, #-8 // Set up pointers for four rows in parallel; x0, x6, x5, x10 @@ -1387,6 +1438,7 @@ .endif br x5 40: + AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #2] // top (0-3) sub x2, x2, #4 mov x7, #-4 @@ -1428,6 +1480,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ldur q0, [x2, #2] // top (0-7) sub x2, x2, #4 mov x7, #-4 @@ -1497,6 +1550,7 @@ ret 160: 320: + AARCH64_VALID_JUMP_TARGET add x8, x2, #2 sub x2, x2, #4 mov x7, #-4 @@ -1675,6 +1729,7 @@ sub x6, x6, w9, uxtw br x6 40: + AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 4: @@ -1695,6 +1750,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 8: @@ -1721,6 +1777,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 16: @@ -1761,6 +1818,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 32: @@ -1799,6 +1857,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET add x2, x0, #64 64: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 @@ -1863,16 +1922,17 @@ movi v30.8h, #0 br x7 L(ipred_cfl_splat_w4): + AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #4 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h - sshr v16.4s, v2.4s, #31 // sign = diff >> 31 - sshr v17.4s, v3.4s, #31 - sshr v18.4s, v4.4s, #31 - sshr v19.4s, v5.4s, #31 + cmlt v16.4s, v2.4s, #0 // sign + cmlt v17.4s, v3.4s, #0 + cmlt v18.4s, v4.4s, #0 + cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s @@ -1894,16 +1954,17 @@ b.gt L(ipred_cfl_splat_w4) ret L(ipred_cfl_splat_w8): + AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #2 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h - sshr v16.4s, v2.4s, #31 // sign = diff >> 31 - sshr v17.4s, v3.4s, #31 - sshr v18.4s, v4.4s, #31 - sshr v19.4s, v5.4s, #31 + cmlt v16.4s, v2.4s, #0 // sign + cmlt v17.4s, v3.4s, #0 + cmlt v18.4s, v4.4s, #0 + cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s @@ -1923,6 +1984,7 @@ b.gt L(ipred_cfl_splat_w8) ret L(ipred_cfl_splat_w16): + AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw #1 mov w9, w3 @@ -1938,14 +2000,14 @@ smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h - sshr v20.4s, v16.4s, #31 // sign = diff >> 31 - sshr v21.4s, v17.4s, #31 - sshr v22.4s, v18.4s, #31 - sshr v23.4s, v19.4s, #31 - sshr v24.4s, v2.4s, #31 - sshr v25.4s, v3.4s, #31 - sshr v26.4s, v4.4s, #31 - sshr v27.4s, v5.4s, #31 + cmlt v20.4s, v16.4s, #0 // sign + cmlt v21.4s, v17.4s, #0 + cmlt v22.4s, v18.4s, #0 + cmlt v23.4s, v19.4s, #0 + cmlt v24.4s, v2.4s, #0 + cmlt v25.4s, v3.4s, #0 + cmlt v26.4s, v4.4s, 
#0 + cmlt v27.4s, v5.4s, #0 add v16.4s, v16.4s, v20.4s // diff + sign add v17.4s, v17.4s, v21.4s add v18.4s, v18.4s, v22.4s @@ -2013,18 +2075,21 @@ movi v30.8h, #0 br x7 4: + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h @@ -2032,6 +2097,7 @@ dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h @@ -2073,6 +2139,7 @@ br x7 L(ipred_cfl_left_h4): + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 @@ -2080,6 +2147,7 @@ br x9 L(ipred_cfl_left_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 @@ -2087,6 +2155,7 @@ br x9 L(ipred_cfl_left_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h @@ -2095,6 +2164,7 @@ br x9 L(ipred_cfl_left_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h @@ -2142,11 +2212,13 @@ br x7 L(ipred_cfl_h4): + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x9 L(ipred_cfl_w4): + AARCH64_VALID_JUMP_TARGET ld1 {v2.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.4h @@ -2167,11 +2239,13 @@ b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x9 L(ipred_cfl_w8): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.8h @@ -2192,12 +2266,14 @@ b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2], #32 addp v0.8h, v2.8h, v3.8h add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w16): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v2.8h, v2.8h, v3.8h @@ -2219,6 +2295,7 @@ b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h @@ -2227,6 +2304,7 @@ uaddlv s0, v0.8h br x9 L(ipred_cfl_w32): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] add v0.4s, v0.4s, v16.4s addp v2.8h, v2.8h, v3.8h @@ -2287,6 +2365,7 @@ br x7 L(ipred_cfl_ac_420_w4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 @@ -2333,6 +2412,7 @@ ret L(ipred_cfl_ac_420_w8): + AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 @@ -2402,12 +2482,14 @@ b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): + AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_420_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_420_w16_wpad0): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 @@ -2445,6 +2527,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 @@ -2490,6 +2573,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): + AARCH64_VALID_JUMP_TARGET 1: // Copy and 
subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 @@ -2521,6 +2605,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 @@ -2617,6 +2702,7 @@ br x7 L(ipred_cfl_ac_422_w4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 @@ -2638,6 +2724,7 @@ b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): + AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 @@ -2701,12 +2788,14 @@ b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): + AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_422_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_422_w16_wpad0): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 @@ -2734,6 +2823,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 @@ -2767,6 +2857,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 @@ -2792,6 +2883,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 @@ -2858,6 +2950,7 @@ br x7 L(ipred_cfl_ac_444_w4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.4h}, [x1], x2 ld1 {v0.d}[1], [x10], x2 @@ -2877,6 +2970,7 @@ b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 @@ -2902,6 +2996,7 @@ b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): + AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 @@ -2949,6 +3044,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): + AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_444_w32_tbl) ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 lsr x2, x2, #1 // Restore the stride to one line increments @@ -2956,6 +3052,7 @@ br x7 L(ipred_cfl_ac_444_w32_wpad0): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 shl v0.8h, v0.8h, #3 @@ -2976,6 +3073,7 @@ b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 shl v2.8h, v2.8h, #3 @@ -2996,6 +3094,7 @@ b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v0.8h, v1.8h}, [x1], x2 shl v1.8h, v1.8h, #3 @@ -3016,6 +3115,7 @@ b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8h}, [x1], x2 shl v0.8h, v0.8h, #3 diff -Nru dav1d-0.9.2/src/arm/64/ipred.S dav1d-1.0.0/src/arm/64/ipred.S --- dav1d-0.9.2/src/arm/64/ipred.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/ipred.S 2022-03-18 14:31:55.974356000 +0000 @@ -43,6 +43,7 @@ lsl x1, x1, #1 br x5 4: + AARCH64_VALID_JUMP_TARGET st1 
{v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 @@ -51,6 +52,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 @@ -59,6 +61,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 @@ -67,6 +70,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET movi v1.16b, #128 32: st1 {v0.16b, v1.16b}, [x0], x1 @@ -77,6 +81,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET movi v1.16b, #128 movi v2.16b, #128 movi v3.16b, #128 @@ -112,6 +117,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2] 4: st1 {v0.s}[0], [x0], x1 @@ -122,6 +128,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] 8: st1 {v0.8b}, [x0], x1 @@ -132,6 +139,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] 16: st1 {v0.16b}, [x0], x1 @@ -142,6 +150,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] 32: st1 {v0.16b, v1.16b}, [x0], x1 @@ -152,6 +161,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 64: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 @@ -186,6 +196,7 @@ lsl x1, x1, #1 br x5 4: + AARCH64_VALID_JUMP_TARGET ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.s}[0], [x0], x1 st1 {v2.s}[0], [x6], x1 @@ -195,6 +206,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.8b}, [x0], x1 st1 {v2.8b}, [x6], x1 @@ -204,6 +216,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 @@ -213,6 +226,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -226,6 +240,7 @@ b.gt 32b ret 64: + AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] @@ -266,6 +281,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 @@ -279,6 +295,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 @@ -292,6 +309,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 @@ -305,6 +323,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b @@ -321,6 +340,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b @@ -371,12 +391,14 @@ br x5 L(ipred_dc_left_h4): + AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w4): + AARCH64_VALID_JUMP_TARGET st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 @@ -386,12 +408,14 @@ ret L(ipred_dc_left_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w8): + AARCH64_VALID_JUMP_TARGET st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 @@ -401,12 +425,14 @@ ret L(ipred_dc_left_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w16): + AARCH64_VALID_JUMP_TARGET st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 @@ -416,6 +442,7 @@ ret L(ipred_dc_left_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b @@ -424,6 
+451,7 @@ dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w32): + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 1: st1 {v0.16b, v1.16b}, [x0], x1 @@ -435,6 +463,7 @@ ret L(ipred_dc_left_h64): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b @@ -447,6 +476,7 @@ dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w64): + AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b @@ -499,12 +529,14 @@ br x5 L(ipred_dc_h4): + AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr uaddlv h0, v0.8b add x2, x2, #1 br x3 L(ipred_dc_w4): + AARCH64_VALID_JUMP_TARGET ld1 {v1.s}[0], [x2] ins v1.s[1], wzr add v0.4h, v0.4h, v16.4h @@ -532,11 +564,13 @@ ret L(ipred_dc_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b add x2, x2, #1 br x3 L(ipred_dc_w8): + AARCH64_VALID_JUMP_TARGET ld1 {v1.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.8b @@ -563,11 +597,13 @@ ret L(ipred_dc_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b add x2, x2, #1 br x3 L(ipred_dc_w16): + AARCH64_VALID_JUMP_TARGET ld1 {v1.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -594,6 +630,7 @@ ret L(ipred_dc_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2], #32 uaddlv h0, v0.16b uaddlv h1, v1.16b @@ -601,6 +638,7 @@ add v0.4h, v0.4h, v1.4h br x3 L(ipred_dc_w32): + AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -630,6 +668,7 @@ ret L(ipred_dc_h64): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 uaddlv h0, v0.16b uaddlv h1, v1.16b @@ -641,6 +680,7 @@ add v0.4h, v0.4h, v2.4h br x3 L(ipred_dc_w64): + AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -705,6 +745,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v5.4s}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 4: @@ -732,6 +773,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 8: @@ -772,6 +814,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET ld1 {v5.16b}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 @@ -884,6 +927,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v6.2s}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #4 @@ -922,6 +966,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v6.8b}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor sub x2, x2, #4 @@ -974,6 +1019,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw sub x2, x2, #2 mov x7, #-2 @@ -1061,6 +1107,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v6.2s}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 4: @@ -1083,6 +1130,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v6.8b}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 8: @@ -1113,6 +1161,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 @@ -1198,6 +1247,7 @@ lsl x1, x1, #1 br x5 40: + AARCH64_VALID_JUMP_TARGET ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 @@ -1222,6 +1272,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 @@ -1254,6 +1305,7 @@ 160: 320: 640: + AARCH64_VALID_JUMP_TARGET sub x2, x2, #4 mov x7, #-4 // Set up pointers for four rows in parallel; x0, x6, x5, x10 @@ -1350,6 +1402,7 @@ sxtl v22.8h, v22.8b br x5 40: + 
AARCH64_VALID_JUMP_TARGET ldur s0, [x2, #1] // top (0-3) sub x2, x2, #2 mov x7, #-2 @@ -1373,6 +1426,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #1] // top (0-7) sub x2, x2, #2 mov x7, #-2 @@ -1406,6 +1460,7 @@ ret 160: 320: + AARCH64_VALID_JUMP_TARGET add x8, x2, #1 sub x2, x2, #2 mov x7, #-2 @@ -1500,6 +1555,7 @@ lsl x1, x1, #1 br x6 4: + AARCH64_VALID_JUMP_TARGET ld1 {v1.16b}, [x3], #16 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b @@ -1510,6 +1566,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x3], #32 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b @@ -1521,6 +1578,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b @@ -1534,6 +1592,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #4 @@ -1552,6 +1611,7 @@ b.gt 32b ret 64: + AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #2 @@ -1592,11 +1652,12 @@ lsl x1, x1, #1 br x7 L(ipred_cfl_splat_w4): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x5], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h - sshr v4.8h, v2.8h, #15 // sign = diff >> 15 - sshr v5.8h, v3.8h, #15 + cmlt v4.8h, v2.8h, #0 // sign + cmlt v5.8h, v3.8h, #0 add v2.8h, v2.8h, v4.8h // diff + sign add v3.8h, v3.8h, v5.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() @@ -1613,15 +1674,16 @@ b.gt L(ipred_cfl_splat_w4) ret L(ipred_cfl_splat_w8): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h - sshr v16.8h, v2.8h, #15 // sign = diff >> 15 - sshr v17.8h, v3.8h, #15 - sshr v18.8h, v4.8h, #15 - sshr v19.8h, v5.8h, #15 + cmlt v16.8h, v2.8h, #0 // sign + cmlt v17.8h, v3.8h, #0 + cmlt v18.8h, v4.8h, #0 + cmlt v19.8h, v5.8h, #0 add v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h @@ -1646,6 +1708,7 @@ b.gt L(ipred_cfl_splat_w8) ret L(ipred_cfl_splat_w16): + AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw mov w9, w3 @@ -1656,10 +1719,10 @@ mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h - sshr v16.8h, v2.8h, #15 // sign = diff >> 15 - sshr v17.8h, v3.8h, #15 - sshr v18.8h, v4.8h, #15 - sshr v19.8h, v5.8h, #15 + cmlt v16.8h, v2.8h, #0 // sign + cmlt v17.8h, v3.8h, #0 + cmlt v18.8h, v4.8h, #0 + cmlt v19.8h, v5.8h, #0 add v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h @@ -1713,24 +1776,28 @@ lsl x1, x1, #1 br x7 4: + AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: + AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] uaddlv h2, v2.16b uaddlv h3, v3.16b @@ -1768,6 +1835,7 @@ br x7 L(ipred_cfl_left_h4): + AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 @@ -1775,6 +1843,7 @@ br x9 L(ipred_cfl_left_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b urshr 
v0.4h, v0.4h, #3 @@ -1782,6 +1851,7 @@ br x9 L(ipred_cfl_left_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b urshr v0.4h, v0.4h, #4 @@ -1789,6 +1859,7 @@ br x9 L(ipred_cfl_left_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] uaddlv h2, v2.16b uaddlv h3, v3.16b @@ -1832,12 +1903,14 @@ br x7 L(ipred_cfl_h4): + AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr add x2, x2, #1 uaddlv h0, v0.8b br x9 L(ipred_cfl_w4): + AARCH64_VALID_JUMP_TARGET ld1 {v2.s}[0], [x2] ins v2.s[1], wzr add v0.4h, v0.4h, v16.4h @@ -1858,11 +1931,13 @@ b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b add x2, x2, #1 br x9 L(ipred_cfl_w8): + AARCH64_VALID_JUMP_TARGET ld1 {v2.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.8b @@ -1882,11 +1957,13 @@ b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b add x2, x2, #1 br x9 L(ipred_cfl_w16): + AARCH64_VALID_JUMP_TARGET ld1 {v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b @@ -1906,6 +1983,7 @@ b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): + AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2], #32 uaddlv h2, v2.16b uaddlv h3, v3.16b @@ -1913,6 +1991,7 @@ add v0.4h, v2.4h, v3.4h br x9 L(ipred_cfl_w32): + AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b @@ -1971,6 +2050,7 @@ br x7 L(ipred_cfl_ac_420_w4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 @@ -2011,6 +2091,7 @@ ret L(ipred_cfl_ac_420_w8): + AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 @@ -2093,12 +2174,14 @@ ret L(ipred_cfl_ac_420_w16): + AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_420_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_420_w16_wpad0): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 @@ -2132,6 +2215,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 @@ -2173,6 +2257,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 @@ -2200,6 +2285,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 @@ -2288,6 +2374,7 @@ br x7 L(ipred_cfl_ac_422_w4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x10], x2 @@ -2307,6 +2394,7 @@ b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): + AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 @@ -2362,12 +2450,14 @@ b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): + AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_422_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_422_w16_wpad0): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 @@ -2391,6 +2481,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): + AARCH64_VALID_JUMP_TARGET 1: // Copy and 
subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 @@ -2420,6 +2511,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 @@ -2441,6 +2533,7 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): + AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 @@ -2503,6 +2596,7 @@ br x7 L(ipred_cfl_ac_444_w4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.s}[0], [x1], x2 ld1 {v0.s}[1], [x10], x2 @@ -2520,6 +2614,7 @@ b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 @@ -2541,6 +2636,7 @@ b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): + AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.16b}, [x1], x2 @@ -2606,12 +2702,14 @@ b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): + AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_444_w32_tbl) ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_444_w32_wpad0): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v2.16b, v3.16b}, [x1], x2 ld1 {v6.16b, v7.16b}, [x10], x2 @@ -2638,6 +2736,7 @@ b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ldr d2, [x1, #16] ld1 {v1.16b}, [x1], x2 @@ -2666,6 +2765,7 @@ b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v1.16b}, [x1], x2 ld1 {v5.16b}, [x10], x2 @@ -2692,6 +2792,7 @@ b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): + AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8b}, [x1], x2 ld1 {v4.8b}, [x10], x2 diff -Nru dav1d-0.9.2/src/arm/64/itx16.S dav1d-1.0.0/src/arm/64/itx16.S --- dav1d-0.9.2/src/arm/64/itx16.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/itx16.S 2022-03-18 14:31:55.978356000 +0000 @@ -416,6 +416,7 @@ .endm function inv_dct_4s_x4_neon + AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s}, [x16] idct_4 v16, v17, v18, v19 @@ -449,16 +450,19 @@ .endm function inv_adst_4s_x4_neon + AARCH64_VALID_CALL_TARGET iadst_4x4 v16, v17, v18, v19 ret endfunc function inv_flipadst_4s_x4_neon + AARCH64_VALID_CALL_TARGET iadst_4x4 v19, v18, v17, v16 ret endfunc function inv_identity_4s_x4_neon + AARCH64_VALID_CALL_TARGET movz w16, #(5793-4096)*8, lsl #16 dup v0.2s, w16 sqrdmulh v4.4s, v16.4s, v0.s[0] @@ -541,7 +545,7 @@ st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 - br x15 + ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 @@ -626,6 +630,7 @@ .endm function inv_dct_4s_x8_neon + AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s, v1.4s}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23 @@ -702,16 +707,19 @@ .endm function inv_adst_4s_x8_neon + AARCH64_VALID_CALL_TARGET iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 ret endfunc function inv_flipadst_4s_x8_neon + AARCH64_VALID_CALL_TARGET iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 ret endfunc function inv_identity_4s_x8_neon + AARCH64_VALID_CALL_TARGET sqshl v16.4s, v16.4s, #1 sqshl v17.4s, v17.4s, #1 sqshl v18.4s, v18.4s, #1 @@ -784,7 +792,7 @@ blr x5 load_add_store_8x8 x0, x7 - br x15 + ret x15 endfunc .macro def_fn_8x8 txfm1, txfm2, eob_half @@ -853,7 
+861,7 @@ blr x5 load_add_store_8x4 x0, x7 - br x15 + ret x15 endfunc function inv_txfm_add_4x8_neon @@ -902,7 +910,7 @@ blr x5 load_add_store_4x8 x0, x7 - br x15 + ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2, eob_half @@ -945,6 +953,7 @@ function inv_dct_4s_x16_neon + AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s, v1.4s}, [x16], #32 @@ -1206,16 +1215,19 @@ .endm function inv_adst_4s_x16_neon + AARCH64_VALID_CALL_TARGET iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 ret endfunc function inv_flipadst_4s_x16_neon + AARCH64_VALID_CALL_TARGET iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 ret endfunc function inv_identity_4s_x16_neon + AARCH64_VALID_CALL_TARGET movz w16, #2*(5793-4096)*8, lsl #16 dup v0.2s, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1282,7 +1294,7 @@ st1 {\i}, [x6], #16 .endr - br x14 + ret x14 endfunc .endm @@ -1296,7 +1308,7 @@ .endr blr x5 load_add_store_8x16 x6, x7 - br x14 + ret x14 endfunc function inv_txfm_add_16x16_neon @@ -1338,7 +1350,7 @@ .endr add sp, sp, #512 - br x15 + ret x15 endfunc const eob_16x16 @@ -1423,7 +1435,7 @@ add x6, x0, #16 load_add_store_8x4 x6, x7 - br x15 + ret x15 endfunc function inv_txfm_add_4x16_neon @@ -1517,7 +1529,7 @@ load_add_store_4x16 x0, x6 - br x15 + ret x15 endfunc const eob_4x16 @@ -1698,7 +1710,7 @@ ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x15 + ret x15 endfunc function inv_txfm_add_8x16_neon @@ -1839,7 +1851,7 @@ ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x20 - br x15 + ret x15 endfunc const eob_8x16 @@ -2141,7 +2153,7 @@ store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift .purgem store2 - br x14 + ret x14 endfunc .endm @@ -2216,7 +2228,7 @@ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 .purgem combine - br x14 + ret x14 endfunc const eob_32x32 @@ -2533,7 +2545,7 @@ .endr add sp, sp, #2048 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 @@ -2582,7 +2594,7 @@ .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 @@ -2632,7 +2644,7 @@ .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 @@ -2692,7 +2704,7 @@ bl inv_txfm_add_vert_dct_8x32_neon add sp, sp, #512 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 @@ -2743,7 +2755,7 @@ b.lt 1b add sp, sp, #512 - br x15 + ret x15 endfunc function inv_dct64_step1_neon @@ -3070,7 +3082,7 @@ bl inv_dct64_step2_neon - br x14 + ret x14 endfunc .endm @@ -3127,7 +3139,7 @@ cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_vert_dct_8x64_neon @@ -3184,7 +3196,7 @@ cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 @@ -3238,7 +3250,7 @@ .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 @@ -3291,7 +3303,7 @@ .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 @@ -3341,7 +3353,7 @@ .endr add sp, x5, #32*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 @@ -3395,7 +3407,7 @@ .endr add sp, x4, #64*16*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 @@ -3448,5 +3460,5 @@ .endr add sp, x5, #16*32*2 - br x15 + ret x15 endfunc diff -Nru 
dav1d-0.9.2/src/arm/64/itx.S dav1d-1.0.0/src/arm/64/itx.S --- dav1d-0.9.2/src/arm/64/itx.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/itx.S 2022-03-18 14:31:55.978356000 +0000 @@ -133,10 +133,10 @@ .endif .endm -.macro rshrn_sz d0, s0, s1, shift, sz - rshrn \d0\().4h, \s0\().4s, \shift +.macro sqrshrn_sz d0, s0, s1, shift, sz + sqrshrn \d0\().4h, \s0\().4s, \shift .ifc \sz, .8h - rshrn2 \d0\().8h, \s1\().4s, \shift + sqrshrn2 \d0\().8h, \s1\().4s, \shift .endif .endm @@ -438,11 +438,11 @@ smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz - rshrn_sz v6, v6, v7, #12, \sz - rshrn_sz v7, v4, v5, #12, \sz + sqrshrn_sz v6, v6, v7, #12, \sz + sqrshrn_sz v7, v4, v5, #12, \sz smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz - rshrn_sz v2, v2, v3, #12, \sz - rshrn_sz v3, v4, v5, #12, \sz + sqrshrn_sz v2, v2, v3, #12, \sz + sqrshrn_sz v3, v4, v5, #12, \sz sqadd \r0\sz, v2\sz, v6\sz sqsub \r3\sz, v2\sz, v6\sz sqadd \r1\sz, v3\sz, v7\sz @@ -660,7 +660,7 @@ st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x0], x1 - br x15 + ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 @@ -714,11 +714,11 @@ smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a - rshrn_sz \r1, v2, v3, #12, \sz // t4a - rshrn_sz \r7, v4, v5, #12, \sz // t7a + sqrshrn_sz \r1, v2, v3, #12, \sz // t4a + sqrshrn_sz \r7, v4, v5, #12, \sz // t7a smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a - rshrn_sz \r3, v6, v7, #12, \sz // t5a - rshrn_sz \r5, v2, v3, #12, \sz // t6a + sqrshrn_sz \r3, v6, v7, #12, \sz // t5a + sqrshrn_sz \r5, v2, v3, #12, \sz // t6a sqadd v2\sz, \r1\sz, \r3\sz // t4 sqsub \r1\sz, \r1\sz, \r3\sz // t5a @@ -727,8 +727,8 @@ smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5 smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6 - rshrn_sz v4, v4, v5, #12, \sz // t5 - rshrn_sz v5, v6, v7, #12, \sz // t6 + sqrshrn_sz v4, v4, v5, #12, \sz // t5 + sqrshrn_sz v5, v6, v7, #12, \sz // t6 sqsub \r7\sz, \r0\sz, v3\sz // out7 sqadd \r0\sz, \r0\sz, v3\sz // out0 @@ -762,19 +762,19 @@ smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz - rshrn_sz v16, v2, v3, #12, \sz // t0a - rshrn_sz v23, v4, v5, #12, \sz // t1a + sqrshrn_sz v16, v2, v3, #12, \sz // t0a + sqrshrn_sz v23, v4, v5, #12, \sz // t1a smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz - rshrn_sz v18, v6, v7, #12, \sz // t2a - rshrn_sz v21, v2, v3, #12, \sz // t3a + sqrshrn_sz v18, v6, v7, #12, \sz // t2a + sqrshrn_sz v21, v2, v3, #12, \sz // t3a smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz - rshrn_sz v20, v4, v5, #12, \sz // t4a - rshrn_sz v19, v6, v7, #12, \sz // t5a + sqrshrn_sz v20, v4, v5, #12, \sz // t4a + sqrshrn_sz v19, v6, v7, #12, \sz // t5a smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz - rshrn_sz v22, v2, v3, #12, \sz // t6a - rshrn_sz v17, v4, v5, #12, \sz // t7a + sqrshrn_sz v22, v2, v3, #12, \sz // t6a + sqrshrn_sz v17, v4, v5, #12, \sz // t7a sqadd v2\sz, v16\sz, v20\sz // t0 sqsub v3\sz, v16\sz, v20\sz // t4 @@ -789,13 +789,13 @@ smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz - rshrn_sz v3, v16, v17, #12, \sz 
// t4a - rshrn_sz v5, v20, v21, #12, \sz // t5a + sqrshrn_sz v3, v16, v17, #12, \sz // t4a + sqrshrn_sz v5, v20, v21, #12, \sz // t5a smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz - rshrn_sz v7, v22, v23, #12, \sz // t6a - rshrn_sz v19, v16, v17, #12, \sz // t7a + sqrshrn_sz v7, v22, v23, #12, \sz // t6a + sqrshrn_sz v19, v16, v17, #12, \sz // t7a sqadd \o0\()\sz, v2\sz, v6\sz // out0 sqsub v2\sz, v2\sz, v6\sz // t2 @@ -812,11 +812,11 @@ smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20) smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19) smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18) - rshrn_sz v2, v18, v19, #12, \sz // out3 + sqrshrn_sz v2, v18, v19, #12, \sz // out3 smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21) - rshrn_sz v3, v20, v21, #12, \sz // out5 - rshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) - rshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) + sqrshrn_sz v3, v20, v21, #12, \sz // out5 + sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) + sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) sqneg \o3\()\sz, v2\sz // out3 sqneg \o5\()\sz, v3\sz // out5 @@ -897,7 +897,7 @@ blr x5 load_add_store_8x8 x0, x7 - br x15 + ret x15 endfunc .endm @@ -962,7 +962,7 @@ blr x5 load_add_store_8x4 x0, x7 - br x15 + ret x15 endfunc function inv_txfm_add_4x8_neon @@ -988,7 +988,7 @@ blr x5 load_add_store_4x8 x0, x7 - br x15 + ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2 @@ -1033,19 +1033,19 @@ smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a - rshrn_sz v17, v2, v3, #12, \sz // t8a - rshrn_sz v31, v4, v5, #12, \sz // t15a + sqrshrn_sz v17, v2, v3, #12, \sz // t8a + sqrshrn_sz v31, v4, v5, #12, \sz // t15a smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a - rshrn_sz v23, v6, v7, #12, \sz // t9a - rshrn_sz v25, v2, v3, #12, \sz // t14a + sqrshrn_sz v23, v6, v7, #12, \sz // t9a + sqrshrn_sz v25, v2, v3, #12, \sz // t14a smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a - rshrn_sz v21, v4, v5, #12, \sz // t10a - rshrn_sz v27, v6, v7, #12, \sz // t13a + sqrshrn_sz v21, v4, v5, #12, \sz // t10a + sqrshrn_sz v27, v6, v7, #12, \sz // t13a smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a - rshrn_sz v19, v2, v3, #12, \sz // t11a - rshrn_sz v29, v4, v5, #12, \sz // t12a + sqrshrn_sz v19, v2, v3, #12, \sz // t11a + sqrshrn_sz v29, v4, v5, #12, \sz // t12a sqsub v2\sz, v17\sz, v23\sz // t9 sqadd v17\sz, v17\sz, v23\sz // t8 @@ -1058,17 +1058,17 @@ smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a - rshrn_sz v21, v4, v5, #12, \sz // t9a - rshrn_sz v27, v6, v7, #12, \sz // t14a + sqrshrn_sz v21, v4, v5, #12, \sz // t9a + sqrshrn_sz v27, v6, v7, #12, \sz // t14a smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a - rshrn_sz v29, v4, v5, #12, \sz // t13a + sqrshrn_sz v29, v4, v5, #12, \sz // t13a neg v6.4s, v6.4s .ifc \sz, .8h neg v7.4s, v7.4s .endif - rshrn_sz v23, v6, v7, #12, \sz // t10a + sqrshrn_sz v23, v6, v7, #12, \sz // t10a sqsub v2\sz, v17\sz, v19\sz // t11a sqadd v17\sz, v17\sz, v19\sz // t8a @@ -1083,11 +1083,11 @@ smull_smlal 
v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12 smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a - rshrn_sz v4, v4, v5, #12, \sz // t11 - rshrn_sz v5, v6, v7, #12, \sz // t12 + sqrshrn_sz v4, v4, v5, #12, \sz // t11 + sqrshrn_sz v5, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a - rshrn_sz v2, v2, v3, #12, \sz // t10a - rshrn_sz v3, v6, v7, #12, \sz // t13a + sqrshrn_sz v2, v2, v3, #12, \sz // t10a + sqrshrn_sz v3, v6, v7, #12, \sz // t13a sqadd v6\sz, v16\sz, v31\sz // out0 sqsub v31\sz, v16\sz, v31\sz // out15 @@ -1132,35 +1132,35 @@ smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0 smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2 - rshrn_sz v16, v2, v3, #12, \sz // t0 - rshrn_sz v31, v4, v5, #12, \sz // t1 + sqrshrn_sz v16, v2, v3, #12, \sz // t0 + sqrshrn_sz v31, v4, v5, #12, \sz // t1 smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3 smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4 - rshrn_sz v18, v6, v7, #12, \sz // t2 - rshrn_sz v29, v2, v3, #12, \sz // t3 + sqrshrn_sz v18, v6, v7, #12, \sz // t2 + sqrshrn_sz v29, v2, v3, #12, \sz // t3 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5 smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6 - rshrn_sz v20, v4, v5, #12, \sz // t4 - rshrn_sz v27, v6, v7, #12, \sz // t5 + sqrshrn_sz v20, v4, v5, #12, \sz // t4 + sqrshrn_sz v27, v6, v7, #12, \sz // t5 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7 smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8 - rshrn_sz v22, v2, v3, #12, \sz // t6 - rshrn_sz v25, v4, v5, #12, \sz // t7 + sqrshrn_sz v22, v2, v3, #12, \sz // t6 + sqrshrn_sz v25, v4, v5, #12, \sz // t7 smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9 smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10 - rshrn_sz v23, v6, v7, #12, \sz // t8 - rshrn_sz v24, v2, v3, #12, \sz // t9 + sqrshrn_sz v23, v6, v7, #12, \sz // t8 + sqrshrn_sz v24, v2, v3, #12, \sz // t9 smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11 smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12 - rshrn_sz v21, v4, v5, #12, \sz // t10 - rshrn_sz v26, v6, v7, #12, \sz // t11 + sqrshrn_sz v21, v4, v5, #12, \sz // t10 + sqrshrn_sz v26, v6, v7, #12, \sz // t11 smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13 smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14 - rshrn_sz v19, v2, v3, #12, \sz // t12 - rshrn_sz v28, v4, v5, #12, \sz // t13 + sqrshrn_sz v19, v2, v3, #12, \sz // t12 + sqrshrn_sz v28, v4, v5, #12, \sz // t13 smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15 - rshrn_sz v17, v6, v7, #12, \sz // t14 - rshrn_sz v30, v2, v3, #12, \sz // t15 + sqrshrn_sz v17, v6, v7, #12, \sz // t14 + sqrshrn_sz v30, v2, v3, #12, \sz // t15 ld1 {v0.8h}, [x16] @@ -1184,19 +1184,19 @@ smull_smlal v4, v5, v2, v3, v0.h[5], v0.h[4], \sz // -> t8 smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9 smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10 - rshrn_sz v17, v4, v5, #12, \sz // t8 - rshrn_sz v30, v6, v7, #12, \sz // t9 + sqrshrn_sz v17, v4, v5, #12, \sz // t8 + sqrshrn_sz v30, v6, v7, #12, \sz // t9 smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12 - rshrn_sz v18, v2, v3, #12, \sz // t10 - rshrn_sz v29, v4, v5, #12, \sz // t11 + sqrshrn_sz v18, v2, v3, #12, \sz // t10 + sqrshrn_sz v29, v4, v5, #12, \sz // t11 smull_smlal 
v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14 - rshrn_sz v27, v6, v7, #12, \sz // t12 - rshrn_sz v20, v2, v3, #12, \sz // t13 + sqrshrn_sz v27, v6, v7, #12, \sz // t12 + sqrshrn_sz v20, v2, v3, #12, \sz // t13 smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15 - rshrn_sz v25, v4, v5, #12, \sz // t14 - rshrn_sz v22, v6, v7, #12, \sz // t15 + sqrshrn_sz v25, v4, v5, #12, \sz // t14 + sqrshrn_sz v22, v6, v7, #12, \sz // t15 sqsub v2\sz, v16\sz, v21\sz // t4 sqadd v16\sz, v16\sz, v21\sz // t0 @@ -1218,19 +1218,19 @@ smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a - rshrn_sz v22, v4, v5, #12, \sz // t4a - rshrn_sz v25, v6, v7, #12, \sz // t5a + sqrshrn_sz v22, v4, v5, #12, \sz // t4a + sqrshrn_sz v25, v6, v7, #12, \sz // t5a smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12 - rshrn_sz v24, v2, v3, #12, \sz // t6a - rshrn_sz v23, v4, v5, #12, \sz // t7a + sqrshrn_sz v24, v2, v3, #12, \sz // t6a + sqrshrn_sz v23, v4, v5, #12, \sz // t7a smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13 smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14 - rshrn_sz v17, v6, v7, #12, \sz // t12 + sqrshrn_sz v17, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15 - rshrn_sz v29, v2, v3, #12, \sz // t13 - rshrn_sz v30, v4, v5, #12, \sz // t14 - rshrn_sz v18, v6, v7, #12, \sz // t15 + sqrshrn_sz v29, v2, v3, #12, \sz // t13 + sqrshrn_sz v30, v4, v5, #12, \sz // t14 + sqrshrn_sz v18, v6, v7, #12, \sz // t15 sqsub v2\sz, v16\sz, v21\sz // t2a .ifc \o0, v16 @@ -1267,21 +1267,21 @@ smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24) smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26) - rshrn_sz v24, v24, v25, #12, \sz // out8 - rshrn_sz v4, v4, v5, #12, \sz // out7 - rshrn_sz v5, v6, v7, #12, \sz // out5 + sqrshrn_sz v24, v24, v25, #12, \sz // out8 + sqrshrn_sz v4, v4, v5, #12, \sz // out7 + sqrshrn_sz v5, v6, v7, #12, \sz // out5 smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21) smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27) - rshrn_sz v26, v6, v7, #12, \sz // out10 + sqrshrn_sz v26, v6, v7, #12, \sz // out10 smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20) smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25) smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22) - rshrn_sz \o4, v2, v3, #12, \sz // out4 - rshrn_sz v6, v6, v7, #12, \sz // out11 - rshrn_sz v7, v21, v25, #12, \sz // out9 - rshrn_sz \o6, v22, v23, #12, \sz // out6 + sqrshrn_sz \o4, v2, v3, #12, \sz // out4 + sqrshrn_sz v6, v6, v7, #12, \sz // out11 + sqrshrn_sz v7, v21, v25, #12, \sz // out9 + sqrshrn_sz \o6, v22, v23, #12, \sz // out6 .ifc \o8, v23 mov \o8\szb, v24\szb @@ -1370,6 +1370,7 @@ .macro def_horz_16 scale=0, identity=0, shift=2, suffix function inv_txfm_horz\suffix\()_16x8_neon + AARCH64_VALID_CALL_TARGET mov x14, x30 movi v7.8h, #0 .if \identity @@ -1404,7 +1405,7 @@ st1 {\i}, [x6], #16 .endr - br x14 + ret x14 endfunc .endm @@ -1419,7 +1420,7 @@ .endr blr x5 load_add_store_8x16 x6, x7 - br x14 + ret x14 endfunc function inv_txfm_add_16x16_neon @@ -1453,7 +1454,7 @@ .endr add sp, sp, #512 - br x15 + ret x15 endfunc .macro def_fn_16x16 
txfm1, txfm2, eob_half @@ -1553,7 +1554,7 @@ add x6, x0, #8 load_add_store_8x4 x6, x7 - br x15 + ret x15 endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1621,7 +1622,7 @@ load_add_store_4x16 x0, x6 - br x15 + ret x15 endfunc .endm @@ -1731,7 +1732,7 @@ add x0, x0, #8 load_add_store_8x8 x0, x7 - br x15 + ret x15 endfunc function inv_txfm_\variant\()add_8x16_neon @@ -1804,7 +1805,7 @@ load_add_store_8x16 x0, x6 - br x15 + ret x15 endfunc .endm @@ -1859,35 +1860,35 @@ smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a - rshrn_sz v16, v2, v3, #12, .8h // t16a - rshrn_sz v31, v4, v5, #12, .8h // t31a + sqrshrn_sz v16, v2, v3, #12, .8h // t16a + sqrshrn_sz v31, v4, v5, #12, .8h // t31a smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a - rshrn_sz v24, v6, v7, #12, .8h // t17a - rshrn_sz v23, v2, v3, #12, .8h // t30a + sqrshrn_sz v24, v6, v7, #12, .8h // t17a + sqrshrn_sz v23, v2, v3, #12, .8h // t30a smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a - rshrn_sz v20, v4, v5, #12, .8h // t18a - rshrn_sz v27, v6, v7, #12, .8h // t29a + sqrshrn_sz v20, v4, v5, #12, .8h // t18a + sqrshrn_sz v27, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a - rshrn_sz v28, v2, v3, #12, .8h // t19a - rshrn_sz v19, v4, v5, #12, .8h // t28a + sqrshrn_sz v28, v2, v3, #12, .8h // t19a + sqrshrn_sz v19, v4, v5, #12, .8h // t28a smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a smull_smlsl v4, v5, v26, v21, v1.h[2], v1.h[3], .8h // -> t21a - rshrn_sz v18, v6, v7, #12, .8h // t20a - rshrn_sz v29, v2, v3, #12, .8h // t27a + sqrshrn_sz v18, v6, v7, #12, .8h // t20a + sqrshrn_sz v29, v2, v3, #12, .8h // t27a smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a - rshrn_sz v26, v4, v5, #12, .8h // t21a - rshrn_sz v21, v6, v7, #12, .8h // t26a + sqrshrn_sz v26, v4, v5, #12, .8h // t21a + sqrshrn_sz v21, v6, v7, #12, .8h // t26a smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a - rshrn_sz v22, v2, v3, #12, .8h // t22a - rshrn_sz v25, v4, v5, #12, .8h // t25a + sqrshrn_sz v22, v2, v3, #12, .8h // t22a + sqrshrn_sz v25, v4, v5, #12, .8h // t25a smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a - rshrn_sz v30, v6, v7, #12, .8h // t23a - rshrn_sz v17, v2, v3, #12, .8h // t24a + sqrshrn_sz v30, v6, v7, #12, .8h // t23a + sqrshrn_sz v17, v2, v3, #12, .8h // t24a ld1 {v0.8h}, [x16] @@ -1911,23 +1912,23 @@ smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a - rshrn_sz v21, v4, v5, #12, .8h // t17a - rshrn_sz v27, v6, v7, #12, .8h // t30a + sqrshrn_sz v21, v4, v5, #12, .8h // t17a + sqrshrn_sz v27, v6, v7, #12, .8h // t30a neg v2.4s, v2.4s // -> t18a neg v3.4s, v3.4s // -> t18a smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a - rshrn_sz v19, v2, v3, #12, .8h // t18a - rshrn_sz v24, v4, v5, #12, .8h // t29a + sqrshrn_sz v19, v2, v3, #12, .8h // t18a + sqrshrn_sz v24, v4, 
v5, #12, .8h // t29a smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a - rshrn_sz v22, v6, v7, #12, .8h // t21a - rshrn_sz v18, v2, v3, #12, .8h // t26a + sqrshrn_sz v22, v6, v7, #12, .8h // t21a + sqrshrn_sz v18, v2, v3, #12, .8h // t26a neg v4.4s, v4.4s // -> t22a neg v5.4s, v5.4s // -> t22a smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a - rshrn_sz v17, v4, v5, #12, .8h // t22a - rshrn_sz v20, v6, v7, #12, .8h // t25a + sqrshrn_sz v17, v4, v5, #12, .8h // t22a + sqrshrn_sz v20, v6, v7, #12, .8h // t25a sqsub v2.8h, v27.8h, v24.8h // t29 sqadd v27.8h, v27.8h, v24.8h // t30 @@ -1949,23 +1950,23 @@ smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19 - rshrn_sz v18, v4, v5, #12, .8h // t18a - rshrn_sz v25, v6, v7, #12, .8h // t29a + sqrshrn_sz v18, v4, v5, #12, .8h // t18a + sqrshrn_sz v25, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28 smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20 - rshrn_sz v29, v2, v3, #12, .8h // t19 - rshrn_sz v24, v4, v5, #12, .8h // t28 + sqrshrn_sz v29, v2, v3, #12, .8h // t19 + sqrshrn_sz v24, v4, v5, #12, .8h // t28 neg v6.4s, v6.4s // -> t20 neg v7.4s, v7.4s // -> t20 smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27 smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a - rshrn_sz v26, v6, v7, #12, .8h // t20 - rshrn_sz v19, v2, v3, #12, .8h // t27 + sqrshrn_sz v26, v6, v7, #12, .8h // t20 + sqrshrn_sz v19, v2, v3, #12, .8h // t27 neg v4.4s, v4.4s // -> t21a neg v5.4s, v5.4s // -> t21a smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a - rshrn_sz v20, v4, v5, #12, .8h // t21a - rshrn_sz v28, v6, v7, #12, .8h // t26a + sqrshrn_sz v20, v4, v5, #12, .8h // t21a + sqrshrn_sz v28, v6, v7, #12, .8h // t26a sqsub v2.8h, v16.8h, v30.8h // t23 sqadd v16.8h, v16.8h, v30.8h // t16 = out16 @@ -1987,24 +1988,24 @@ smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20 smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27 - rshrn_sz v20, v4, v5, #12, .8h // t20 - rshrn_sz v22, v6, v7, #12, .8h // t27 + sqrshrn_sz v20, v4, v5, #12, .8h // t20 + sqrshrn_sz v22, v6, v7, #12, .8h // t27 smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a mov v27.16b, v22.16b // t27 - rshrn_sz v26, v4, v5, #12, .8h // t26a + sqrshrn_sz v26, v4, v5, #12, .8h // t26a smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22 smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25 - rshrn_sz v21, v6, v7, #12, .8h // t21a - rshrn_sz v22, v24, v25, #12, .8h // t22 - rshrn_sz v25, v4, v5, #12, .8h // t25 + sqrshrn_sz v21, v6, v7, #12, .8h // t21a + sqrshrn_sz v22, v24, v25, #12, .8h // t22 + sqrshrn_sz v25, v4, v5, #12, .8h // t25 smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a - rshrn_sz v23, v4, v5, #12, .8h // t23a - rshrn_sz v24, v6, v7, #12, .8h // t24a + sqrshrn_sz v23, v4, v5, #12, .8h // t23a + sqrshrn_sz v24, v6, v7, #12, .8h // t24a ret endfunc @@ -2089,7 +2090,7 @@ store2 v25.8h, v17.8h, \shift store2 v24.8h, v16.8h, \shift .purgem store2 - br x14 + ret x14 endfunc .endm @@ -2163,7 +2164,7 @@ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 .purgem combine - br x14 + ret x14 endfunc const eob_32x32 @@ -2374,7 
+2375,7 @@ .endr add sp, sp, #2048 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 @@ -2423,7 +2424,7 @@ .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 @@ -2468,7 +2469,7 @@ .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 @@ -2525,7 +2526,7 @@ bl inv_txfm_add_vert_dct_8x32_neon add sp, sp, #512 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 @@ -2559,7 +2560,7 @@ b.lt 1b add sp, sp, #512 - br x15 + ret x15 endfunc function inv_dct64_step1_neon @@ -2593,11 +2594,11 @@ neg v2.4s, v2.4s // t34a neg v3.4s, v3.4s // t34a smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a - rshrn_sz v26, v2, v3, #12, .8h // t34a + sqrshrn_sz v26, v2, v3, #12, .8h // t34a smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a - rshrn_sz v29, v4, v5, #12, .8h // t61a - rshrn_sz v25, v6, v7, #12, .8h // t33a - rshrn_sz v30, v2, v3, #12, .8h // t62a + sqrshrn_sz v29, v4, v5, #12, .8h // t61a + sqrshrn_sz v25, v6, v7, #12, .8h // t33a + sqrshrn_sz v30, v2, v3, #12, .8h // t62a sqadd v16.8h, v24.8h, v27.8h // t32a sqsub v19.8h, v24.8h, v27.8h // t35a @@ -2611,11 +2612,11 @@ smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60 - rshrn_sz v21, v2, v3, #12, .8h // t61a - rshrn_sz v18, v4, v5, #12, .8h // t34a + sqrshrn_sz v21, v2, v3, #12, .8h // t61a + sqrshrn_sz v18, v4, v5, #12, .8h // t34a smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35 - rshrn_sz v20, v6, v7, #12, .8h // t60 - rshrn_sz v19, v2, v3, #12, .8h // t35 + sqrshrn_sz v20, v6, v7, #12, .8h // t60 + sqrshrn_sz v19, v2, v3, #12, .8h // t35 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 @@ -2652,13 +2653,13 @@ smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a - rshrn_sz v25, v2, v3, #12, .8h // t56a - rshrn_sz v27, v4, v5, #12, .8h // t39a + sqrshrn_sz v25, v2, v3, #12, .8h // t56a + sqrshrn_sz v27, v4, v5, #12, .8h // t39a neg v6.4s, v6.4s // t40a neg v7.4s, v7.4s // t40a smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a - rshrn_sz v31, v6, v7, #12, .8h // t40a - rshrn_sz v28, v2, v3, #12, .8h // t55a + sqrshrn_sz v31, v6, v7, #12, .8h // t40a + sqrshrn_sz v28, v2, v3, #12, .8h // t55a sqadd v16.8h, v24.8h, v29.8h // t32a sqsub v19.8h, v24.8h, v29.8h // t47a @@ -2672,11 +2673,11 @@ smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47 - rshrn_sz v18, v2, v3, #12, .8h // t40a - rshrn_sz v21, v4, v5, #12, .8h // t55a + sqrshrn_sz v18, v2, v3, #12, .8h // t40a + sqrshrn_sz v21, v4, v5, #12, .8h // t55a smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48 - rshrn_sz v19, v6, v7, #12, .8h // t47 - rshrn_sz v20, v2, v3, #12, .8h // t48 + sqrshrn_sz v19, v6, v7, #12, .8h // t47 + sqrshrn_sz v20, v2, v3, #12, .8h // t48 str q16, [x6, #2*8*0] // t32a str q17, [x9, #2*8*0] // t39 @@ -2886,7 +2887,7 @@ bl inv_dct64_step2_neon - br x14 + ret x14 endfunc .endm @@ -2943,7 +2944,7 @@ cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_vert_dct_8x64_neon @@ -2999,7 +3000,7 
@@ cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 @@ -3053,7 +3054,7 @@ .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 @@ -3106,7 +3107,7 @@ .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 @@ -3158,7 +3159,7 @@ .endr add sp, x5, #32*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 @@ -3212,7 +3213,7 @@ .endr add sp, x4, #64*16*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 @@ -3265,5 +3266,5 @@ .endr add sp, x5, #16*32*2 - br x15 + ret x15 endfunc diff -Nru dav1d-0.9.2/src/arm/64/loopfilter16.S dav1d-1.0.0/src/arm/64/loopfilter16.S --- dav1d-0.9.2/src/arm/64/loopfilter16.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/loopfilter16.S 2022-03-18 14:31:55.978356000 +0000 @@ -364,16 +364,16 @@ .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - br x13 + ret x13 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - br x14 + ret x14 .endif 9: // Return directly without writing back any pixels - br x15 + ret x15 endfunc .endm @@ -418,7 +418,7 @@ st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_4_8_neon @@ -453,7 +453,7 @@ st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc function lpf_v_6_8_neon @@ -477,7 +477,7 @@ st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_6_8_neon @@ -512,7 +512,7 @@ st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc function lpf_v_8_8_neon @@ -540,7 +540,7 @@ st1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -549,7 +549,7 @@ st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_8_8_neon @@ -584,7 +584,7 @@ st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 - br x15 + ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 @@ -600,7 +600,7 @@ st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc function lpf_v_16_8_neon @@ -643,7 +643,7 @@ st1 {v11.8h}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 - br x15 + ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 @@ -655,7 +655,7 @@ st1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -664,7 +664,7 @@ st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_16_8_neon @@ -714,7 +714,7 @@ st1 {v30.8h}, [x0], x1 st1 {v5.8h}, [x16], x1 st1 {v31.8h}, [x0], x1 - br x15 + ret x15 7: sub x16, x0, x1, lsl #3 @@ -731,7 +731,7 @@ st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 - br x15 + ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 @@ -747,7 +747,7 @@ st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc // void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -892,7 +892,7 @@ ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x11 + ret x11 endfunc .endm diff -Nru dav1d-0.9.2/src/arm/64/loopfilter.S dav1d-1.0.0/src/arm/64/loopfilter.S --- 
dav1d-0.9.2/src/arm/64/loopfilter.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/loopfilter.S 2022-03-18 14:31:55.978356000 +0000 @@ -478,16 +478,16 @@ .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - br x13 + ret x13 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - br x14 + ret x14 .endif 9: // Return directly without writing back any pixels - br x15 + ret x15 endfunc .endm @@ -532,7 +532,7 @@ st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_4_16_neon @@ -583,7 +583,7 @@ st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc function lpf_v_6_16_neon @@ -607,7 +607,7 @@ st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_6_16_neon @@ -658,7 +658,7 @@ st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc function lpf_v_8_16_neon @@ -686,7 +686,7 @@ st1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -695,7 +695,7 @@ st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_8_16_neon @@ -746,7 +746,7 @@ st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 @@ -770,7 +770,7 @@ st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc function lpf_v_16_16_neon @@ -813,7 +813,7 @@ st1 {v11.16b}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 - br x15 + ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 @@ -825,7 +825,7 @@ st1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -834,7 +834,7 @@ st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_16_16_neon @@ -916,7 +916,7 @@ st1 {v30.d}[1], [x0], x1 st1 {v5.d}[1], [x16], x1 st1 {v31.d}[1], [x0], x1 - br x15 + ret x15 7: sub x16, x0, x1, lsl #4 @@ -941,7 +941,7 @@ st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 @@ -965,7 +965,7 @@ st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc // void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -1096,7 +1096,7 @@ ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x11 + ret x11 endfunc .endm diff -Nru dav1d-0.9.2/src/arm/64/looprestoration16.S dav1d-1.0.0/src/arm/64/looprestoration16.S --- dav1d-0.9.2/src/arm/64/looprestoration16.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/looprestoration16.S 2022-03-18 14:31:55.978356000 +0000 @@ -45,36 +45,31 @@ endconst // void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges, // const int bitdepth_max); function wiener_filter7_16bpc_neon, export=1 ldr w8, [sp] -#ifdef __APPLE__ - ldr w9, [sp, #4] -#else - ldr w9, [sp, #8] -#endif + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-32]! 
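// Editor's note, not part of the upstream patch: the AARCH64_SIGN_LINK_REGISTER /
// AARCH64_VALIDATE_LINK_REGISTER pair added around prologues like the one above
// guards the return address that gets spilled to the stack. A minimal sketch of
// the intended pattern, assuming the macros expand to the paciasp/autiasp hint
// instructions when return-address signing (PAC) is enabled and to nothing
// otherwise:
        paciasp                         // sign x30 using sp as the modifier
        stp     x29, x30, [sp, #-32]!   // spill the signed return address
        mov     x29, sp
        // ... function body; x30 may be clobbered by bl and reloaded ...
        ldp     x29, x30, [sp], #32
        autiasp                         // authenticate x30 before returning
        ret
// On cores without PAC the hint instructions execute as NOPs, so the same
// binary still runs unchanged there.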
stp d8, d9, [sp, #16] mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*6 - dup v28.8h, w9 // bitdepth_max - clz w9, w9 + dup v28.8h, w8 // bitdepth_max + clz w8, w8 movi v30.4s, #1 - sub w10, w9, #38 // -(bitdepth + 6) - sub w11, w9, #11 // round_bits_v - sub w9, w9, #25 // -round_bits_h + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h neg w10, w10 // bitdepth + 6 neg w11, w11 // -round_bits_v dup v2.4s, w10 - dup v29.4s, w9 // -round_bits_h + dup v29.4s, w8 // -round_bits_h dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) @@ -94,50 +89,48 @@ mov x16, x2 // backup left mov x2, #0 bl wiener_filter7_h_16bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x9, x14 // t6 mov x10, x14 // t5 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon mov x13, x14 // t2 - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_7): add x15, x14, #384*2 // t0 = t1 + 384*2 L(main_loop_7): bl wiener_filter7_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_7) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v3_7) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - p_stride bl wiener_filter7_hv_16bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter7_hv_16bpc_neon L(v1_7): bl wiener_filter7_v_16bpc_neon @@ -145,15 +138,16 @@ mov sp, x29 ldp d8, d9, [sp, #16] ldp x29, x30, [sp], #32 + AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_7): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x9, x14 // t6 mov x10, x14 // t5 mov x11, x14 // t4 @@ -163,22 +157,22 @@ add x3, x3, x1 // src += p_stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v2_7) add x3, x3, x1 // src += p_stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) add x3, x3, x1 // src += p_stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter7_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) add x15, x15, #384*2*4 // t0 += 384*2*4 bl wiener_filter7_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_7) L(v3_7): bl wiener_filter7_v_16bpc_neon @@ -189,11 
+183,11 @@ function wiener_filter7_h_16bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -227,13 +221,13 @@ 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -242,14 +236,14 @@ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -314,20 +308,20 @@ sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc @@ -337,7 +331,7 @@ stp x10, x11, [sp, #-64]! stp x12, x13, [sp, #16] stp x14, x14, [sp, #32] - stp x0, x5, [sp, #48] + stp x0, x4, [sp, #48] 1: ld1 {v16.8h, v17.8h}, [x9], #32 ld1 {v18.8h, v19.8h}, [x10], #32 @@ -384,11 +378,11 @@ sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v28.8h // bitdepth_max umin v3.8h, v3.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.8h, v3.8h}, [x0], #32 b.gt 1b - ldp x0, x5, [sp, #48] + ldp x0, x4, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #64 @@ -404,10 +398,10 @@ stp x12, x13, [sp, #16] stp x14, x15, [sp, #32] stp x10, x0, [sp, #48] - stp x3, x5, [sp, #64] + stp x3, x4, [sp, #64] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -440,13 +434,13 @@ 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -455,14 +449,14 @@ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
- movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -571,19 +565,19 @@ st1 {v6.8h, v7.8h}, [x15], #32 umin v18.8h, v18.8h, v28.8h // bitdepth_max umin v19.8h, v19.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v18.8h, v19.8h}, [x0], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: - ldp x3, x5, [sp, #64] + ldp x3, x4, [sp, #64] ldp x15, x0, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] @@ -596,36 +590,31 @@ endfunc // void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges, // const int bitdepth_max); function wiener_filter5_16bpc_neon, export=1 ldr w8, [sp] -#ifdef __APPLE__ - ldr w9, [sp, #4] -#else - ldr w9, [sp, #8] -#endif + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-32]! stp d8, d9, [sp, #16] mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*4 - dup v28.8h, w9 // bitdepth_max - clz w9, w9 + dup v28.8h, w8 // bitdepth_max + clz w8, w8 movi v30.4s, #1 - sub w10, w9, #38 // -(bitdepth + 6) - sub w11, w9, #11 // round_bits_v - sub w9, w9, #25 // -round_bits_h + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h neg w10, w10 // bitdepth + 6 neg w11, w11 // -round_bits_v dup v2.4s, w10 - dup v29.4s, w9 // -round_bits_h + dup v29.4s, w8 // -round_bits_h dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) @@ -643,74 +632,73 @@ mov x16, x2 // backup left mov x2, #0 bl wiener_filter5_h_16bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 mov x12, x14 // t3 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_5): mov x15, x11 // t0 = t4 L(main_loop_5): bl wiener_filter5_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_5) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v2_5) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - p_stride bl wiener_filter5_hv_16bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter5_hv_16bpc_neon L(end_5): mov sp, x29 ldp d8, d9, [sp, #16] ldp x29, x30, [sp], #32 + 
AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_5): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter5_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) add x15, x15, #384*2*3 // t0 += 384*2*3 bl wiener_filter5_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_5) L(v2_5): bl wiener_filter5_v_16bpc_neon @@ -725,11 +713,11 @@ function wiener_filter5_h_16bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -763,13 +751,13 @@ 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -778,14 +766,14 @@ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. - sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -839,27 +827,27 @@ sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc function wiener_filter5_v_16bpc_neon stp x11, x12, [sp, #-48]! stp x13, x14, [sp, #16] - stp x0, x5, [sp, #32] + stp x0, x4, [sp, #32] 1: ld1 {v16.8h, v17.8h}, [x11], #32 ld1 {v18.8h, v19.8h}, [x12], #32 @@ -897,11 +885,11 @@ umin v2.8h, v2.8h, v28.8h // bitdepth_max umin v3.8h, v3.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.8h, v3.8h}, [x0], #32 b.gt 1b - ldp x0, x5, [sp, #32] + ldp x0, x4, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #48 @@ -914,10 +902,10 @@ stp x12, x13, [sp, #-64]! 
stp x14, x15, [sp, #16] stp x12, x0, [sp, #32] - stp x3, x5, [sp, #48] + stp x3, x4, [sp, #48] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -950,13 +938,13 @@ 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -965,14 +953,14 @@ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. - sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -1059,19 +1047,19 @@ umin v8.8h, v8.8h, v28.8h // bitdepth_max umin v9.8h, v9.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v8.8h, v9.8h}, [x0], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: - ldp x3, x5, [sp, #48] + ldp x3, x4, [sp, #48] ldp x15, x0, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #64 diff -Nru dav1d-0.9.2/src/arm/64/looprestoration.S dav1d-1.0.0/src/arm/64/looprestoration.S --- dav1d-0.9.2/src/arm/64/looprestoration.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/looprestoration.S 2022-03-18 14:31:55.978356000 +0000 @@ -44,18 +44,17 @@ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst -// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter7_8bpc_neon, export=1 - ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-16]! 
mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*6 mov w17, #(1 << 14) - (1 << 2) @@ -75,90 +74,89 @@ mov x16, x2 // backup left mov x2, #0 bl wiener_filter7_h_8bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x9, x14 // t6 mov x10, x14 // t5 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon mov x13, x14 // t2 - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_7): add x15, x14, #384*2 // t0 = t1 + 384*2 L(main_loop_7): bl wiener_filter7_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_7) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v3_7) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - p_stride bl wiener_filter7_hv_8bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter7_hv_8bpc_neon L(v1_7): bl wiener_filter7_v_8bpc_neon mov sp, x29 ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_7): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x9, x14 // t6 mov x10, x14 // t5 mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v2_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter7_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) add x15, x15, #384*2*4 // t0 += 384*2*4 bl wiener_filter7_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_7) L(v3_7): bl wiener_filter7_v_8bpc_neon @@ -169,11 +167,11 @@ function wiener_filter7_h_8bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! 
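// Editor's note, not part of the upstream patch: the register renumbering in
// these hunks (w6->w5 for h, x7->x6 for filter, w8->w7 for edges, w5->w4 for w)
// follows directly from dropping the lpf_stride parameter. Under the standard
// AAPCS64 argument assignment the 8bpc prototype maps as follows (sketch, for
// orientation only):
//
//   old: wiener_filter7_8bpc(p=x0, p_stride=x1, left=x2, lpf=x3, lpf_stride=x4,
//                            w=w5, h=w6, filter=x7, edges=[sp])   -> ldr w8, [sp]
//   new: wiener_filter7_8bpc(p=x0, stride=x1,   left=x2, lpf=x3,
//                            w=w4, h=w5, filter=x6, edges=w7)     -> no stack load
//
// The 16bpc variants in looprestoration16.S additionally take bitdepth_max,
// which is now the only stack argument, so the __APPLE__-specific stack-offset
// handling seen earlier in this diff could be removed as well.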
str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -208,13 +206,13 @@ uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -223,14 +221,14 @@ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -280,14 +278,14 @@ add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. @@ -295,7 +293,7 @@ 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc @@ -305,7 +303,7 @@ stp x10, x11, [sp, #-64]! stp x12, x13, [sp, #16] stp x14, x14, [sp, #32] - stp x0, x5, [sp, #48] + stp x0, x4, [sp, #48] 1: ld1 {v20.8h, v21.8h}, [x11], #32 ld1 {v24.8h, v25.8h}, [x13], #32 @@ -345,11 +343,11 @@ sqrshrun2 v3.8h, v5.4s, #11 sqxtun v2.8b, v2.8h sqxtun2 v2.16b, v3.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.16b}, [x0], #16 b.gt 1b - ldp x0, x5, [sp, #48] + ldp x0, x4, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #64 @@ -365,10 +363,10 @@ stp x12, x13, [sp, #16] stp x14, x15, [sp, #32] stp x10, x0, [sp, #48] - stp x3, x5, [sp, #64] + stp x3, x4, [sp, #64] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -402,13 +400,13 @@ uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -417,14 +415,14 @@ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
- movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -510,21 +508,21 @@ st1 {v6.8h, v7.8h}, [x15], #32 sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v18.16b}, [x0], #16 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: - ldp x3, x5, [sp, #64] + ldp x3, x4, [sp, #64] ldp x15, x0, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] @@ -536,18 +534,17 @@ ret endfunc -// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter5_8bpc_neon, export=1 - ldr w8, [sp] + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-16]! mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*4 mov w17, #(1 << 14) - (1 << 2) @@ -565,73 +562,72 @@ mov x16, x2 // backup left mov x2, #0 bl wiener_filter5_h_8bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 mov x12, x14 // t3 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_5): mov x15, x11 // t0 = t4 L(main_loop_5): bl wiener_filter5_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_5) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v2_5) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - p_stride bl wiener_filter5_hv_8bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter5_hv_8bpc_neon L(end_5): mov sp, x29 ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_5): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x15, x14, 
#384*2 // t0 = t1 + 384*2 bl wiener_filter5_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) add x15, x15, #384*2*3 // t0 += 384*2*3 bl wiener_filter5_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_5) L(v2_5): bl wiener_filter5_v_8bpc_neon @@ -646,11 +642,11 @@ function wiener_filter5_h_8bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -685,13 +681,13 @@ uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -700,14 +696,14 @@ // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. - sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -748,14 +744,14 @@ add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. @@ -763,14 +759,14 @@ 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc function wiener_filter5_v_8bpc_neon stp x11, x12, [sp, #-48]! stp x13, x14, [sp, #16] - stp x0, x5, [sp, #32] + stp x0, x4, [sp, #32] 1: ld1 {v18.8h, v19.8h}, [x12], #32 ld1 {v22.8h, v23.8h}, [x14], #32 @@ -800,11 +796,11 @@ sqrshrun2 v3.8h, v5.4s, #11 sqxtun v2.8b, v2.8h sqxtun2 v2.16b, v3.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.16b}, [x0], #16 b.gt 1b - ldp x0, x5, [sp, #32] + ldp x0, x4, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #48 @@ -817,10 +813,10 @@ stp x12, x13, [sp, #-64]! stp x14, x15, [sp, #16] stp x12, x0, [sp, #32] - stp x3, x5, [sp, #48] + stp x3, x4, [sp, #48] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -854,13 +850,13 @@ uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -869,14 +865,14 @@ // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. - sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. 
- movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -944,21 +940,21 @@ st1 {v6.8h, v7.8h}, [x15], #32 sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v18.16b}, [x0], #16 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: - ldp x3, x5, [sp, #48] + ldp x3, x4, [sp, #48] ldp x15, x0, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #64 diff -Nru dav1d-0.9.2/src/arm/64/mc16.S dav1d-1.0.0/src/arm/64/mc16.S --- dav1d-0.9.2/src/arm/64/mc16.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/mc16.S 2022-03-18 14:31:55.978356000 +0000 @@ -65,10 +65,8 @@ saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h - xtn \d0\().4h, \d0\().4s - xtn2 \d0\().8h, \t0\().4s - xtn \d1\().4h, \d1\().4s - xtn2 \d1\().8h, \t1\().4s + uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 + uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits @@ -106,10 +104,8 @@ saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h - xtn \d0\().4h, \d0\().4s - xtn2 \d0\().8h, \t0\().4s - xtn \d1\().4h, \d1\().4s - xtn2 \d1\().8h, \t1\().4s + uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 + uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits @@ -156,6 +152,7 @@ sub x7, x7, w4, uxtw br x7 40: + AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 4: @@ -168,6 +165,7 @@ \type v4, v5, v0, v1, v2, v3 b 4b 80: + AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 8: @@ -178,6 +176,7 @@ \type v4, v5, v0, v1, v2, v3 b 8b 16: + AARCH64_VALID_JUMP_TARGET \type v6, v7, v0, v1, v2, v3 st1 {v4.8h, v5.8h}, [x0], x1 subs w5, w5, #2 @@ -186,6 +185,7 @@ \type v4, v5, v0, v1, v2, v3 b 16b 32: + AARCH64_VALID_JUMP_TARGET \type v6, v7, v0, v1, v2, v3 subs w5, w5, #1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 @@ -193,6 +193,7 @@ \type v4, v5, v0, v1, v2, v3 b 32b 640: + AARCH64_VALID_JUMP_TARGET add x7, x0, #64 64: \type v6, v7, v0, v1, v2, v3 @@ -205,6 +206,7 @@ \type v4, v5, v0, v1, v2, v3 b 64b 1280: + AARCH64_VALID_JUMP_TARGET add x7, x0, #64 mov x8, #128 sub x1, x1, #128 @@ -273,6 +275,7 @@ lsl x1, x1, #1 br x10 4: + AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) subs w5, w5, #4 @@ -313,8 +316,7 @@ umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h .if \type == 444 - xtn v20.8b, v20.8h // 64 - m - xtn2 v20.16b, v21.8h + uzp1 v20.16b, v20.16b, v21.16b // 64 - m sub v20.16b, v1.16b, v20.16b // m st1 {v20.16b}, [x6], #16 .elseif \type == 422 @@ -338,6 +340,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 // 
tmp1 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 subs w5, w5, #2 @@ -378,8 +381,7 @@ umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h .if \type == 444 - xtn v20.8b, v20.8h // 64 - m - xtn2 v20.16b, v21.8h + uzp1 v20.16b, v20.16b, v21.16b // 64 - m sub v20.16b, v1.16b, v20.16b // m st1 {v20.16b}, [x6], #16 .elseif \type == 422 @@ -402,6 +404,7 @@ 640: 320: 160: + AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw #1 .if \type == 444 @@ -496,10 +499,8 @@ umin v6.8h, v6.8h, v31.8h // iclip_pixel umin v7.8h, v7.8h, v31.8h .if \type == 444 - xtn v20.8b, v20.8h // 64 - m - xtn2 v20.16b, v21.8h - xtn v21.8b, v22.8h - xtn2 v21.16b, v23.8h + uzp1 v20.16b, v20.16b, v21.16b // 64 - m + uzp1 v21.16b, v22.16b, v23.16b sub v20.16b, v1.16b, v20.16b // m sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 @@ -564,6 +565,7 @@ add x8, x0, x1 br x6 40: + AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 4: ld1 {v2.8b}, [x5], #8 @@ -582,6 +584,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 8: ld1 {v4.16b}, [x5], #16 @@ -605,6 +608,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 16: ld1 {v16.16b, v17.16b}, [x5], #32 @@ -639,6 +643,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b}, [x5], #32 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 subs w4, w4, #1 @@ -688,6 +693,7 @@ sub x6, x6, w7, uxtw br x6 2: + AARCH64_VALID_JUMP_TARGET ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.4h}, [x2], #8 ext v2.8b, v2.8b, v3.8b, #6 @@ -705,6 +711,7 @@ b.gt 2b ret 4: + AARCH64_VALID_JUMP_TARGET ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.8h}, [x2], #16 ext v2.8b, v2.8b, v3.8b, #4 @@ -722,6 +729,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld2r {v4.8b, v5.8b}, [x5], #2 ld1 {v2.8h, v3.8h}, [x2], #32 neg v4.8b, v4.8b // -m @@ -744,6 +752,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ld2r {v16.8b, v17.8b}, [x5], #2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 neg v16.8b, v16.8b // -m @@ -774,6 +783,7 @@ 1280: 640: 320: + AARCH64_VALID_JUMP_TARGET sub x1, x1, w3, uxtw #1 add x7, x2, w3, uxtw #1 321: @@ -847,6 +857,7 @@ sub x6, x6, w3, uxtw br x6 20: + AARCH64_VALID_JUMP_TARGET ld1r {v2.8b}, [x5] neg v2.8b, v2.8b // -m sxtl v2.8h, v2.8b @@ -866,6 +877,7 @@ b.gt 2b ret 40: + AARCH64_VALID_JUMP_TARGET ld1r {v2.2s}, [x5] sub x1, x1, #4 neg v2.8b, v2.8b // -m @@ -886,6 +898,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1 {v4.8b}, [x5] sub x1, x1, #8 neg v4.8b, v4.8b // -m @@ -909,6 +922,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET ld1 {v16.16b}, [x5] sub x1, x1, #16 neg v17.16b, v16.16b // -m @@ -940,6 +954,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET ld1 {v24.16b, v25.16b}, [x5] neg v26.16b, v24.16b // -m neg v27.8b, v25.8b @@ -995,6 +1010,7 @@ br x10 2: + AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 @@ -1003,6 +1019,7 @@ b.gt 2b ret 4: + AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], x3 ld1 {v1.4h}, [x2], x3 subs w5, w5, #2 @@ -1011,6 +1028,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET add x8, x0, x1 lsl x1, x1, #1 add x9, x2, x3 @@ -1024,6 +1042,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -1034,6 +1053,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -1048,6 +1068,7 @@ b.gt 32b ret 64: + AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] @@ -1062,6 +1083,7 @@ b.gt 64b ret 128: + AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] @@ 
-1107,6 +1129,7 @@ br x10 40: + AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 4: @@ -1119,6 +1142,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 8: @@ -1133,6 +1157,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] add x1, x1, x2 sshl v0.8h, v0.8h, v31.8h @@ -1150,6 +1175,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] @@ -1166,6 +1192,7 @@ b.gt 32b ret 64: + AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h @@ -1196,6 +1223,7 @@ b.gt 64b ret 128: + AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h @@ -1389,11 +1417,9 @@ .endif .endm .macro xtn_h r0, r1, r2, r3 - xtn \r0\().4h, \r0\().4s - xtn2 \r0\().8h, \r1\().4s + uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2 .ifnb \r2 - xtn \r2\().4h, \r2\().4s - xtn2 \r2\().8h, \r3\().4s + uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto .endif .endm .macro srshl_s shift, r0, r1, r2, r3 @@ -1553,6 +1579,7 @@ br x10 20: // 2xN h + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] @@ -1587,6 +1614,7 @@ .endif 40: // 4xN h + AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #2 @@ -1621,8 +1649,7 @@ srshl v16.8h, v16.8h, v29.8h // -intermediate_bits umin v16.8h, v16.8h, v31.8h .else - xtn v16.4h, v16.4s - xtn2 v16.8h, v20.4s + uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2 sub v16.8h, v16.8h, v28.8h // PREP_BIAS .endif st1 {v16.d}[0], [\dst], \d_strd @@ -1635,6 +1662,7 @@ 320: 640: 1280: // 8xN, 16xN, 32xN, ... h + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #6 add \ds2, \dst, \d_strd @@ -1681,10 +1709,8 @@ umin v18.8h, v18.8h, v31.8h umin v22.8h, v22.8h, v31.8h .else - xtn v18.4h, v18.4s - xtn2 v18.8h, v19.4s - xtn v22.4h, v22.4s - xtn2 v22.8h, v23.4s + uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2 + uzp1 v22.8h, v22.8h, v23.8h // Ditto sub v18.8h, v18.8h, v28.8h // PREP_BIAS sub v22.8h, v22.8h, v28.8h // PREP_BIAS .endif @@ -1741,6 +1767,7 @@ br x10 20: // 2xN v + AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f @@ -1774,7 +1801,7 @@ st_s \d_strd, v16, 4 ret -28: // 2x8, 2x16 v +28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd @@ -1787,33 +1814,38 @@ interleave_1_s v1, v2, v3, v4, v5 interleave_1_s v5, v6, v7 216: - subs \h, \h, #8 + subs \h, \h, #4 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 - load_s \sr2, \src, \s_strd, v20, v21, v22, v23 interleave_1_s v7, v16, v17, v18, v19 - interleave_1_s v19, v20, v21, v22, v23 smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 - smull_smlal_8 v26, v5, v6, v7, v16, v17, v18, v19, v20 - smull_smlal_8 v27, v7, v16, v17, v18, v19, v20, v21, v22 - sqrshrun_h 6, v24, v25, v26, v27 - umin_h v31, .8h, v24, v26 + sqrshrun_h 6, v24, v25 + umin_h v31, .8h, v24 st_s \d_strd, v24, 4 - st_s \d_strd, v26, 4 b.le 0f - mov v1.16b, v17.16b - mov v2.16b, v18.16b - mov v3.16b, v19.16b - mov v4.16b, v20.16b - mov v5.16b, v21.16b - mov v6.16b, v22.16b - mov v7.16b, v23.16b + cmp \h, #2 + mov v1.16b, v5.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + mov v4.16b, v16.16b + mov v5.16b, v17.16b + mov v6.16b, v18.16b + mov v7.16b, v19.16b + b.eq 26f b 216b +26: + load_s \sr2, \src, \s_strd, v16, v17 + interleave_1_s v7, v16, v17 + smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 + sqrshrun_h 6, v24 + umin_h v31, .4h, v24 + st_s \d_strd, 
v24, 2 0: ret .endif 40: + AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v @@ -1839,7 +1871,7 @@ 0: ret -480: // 4x8, 4x16 v +480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd @@ -1859,6 +1891,7 @@ smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_4 \type, \d_strd, v1, v2, v3, v4 b.le 0f + cmp \h, #2 mov v16.8b, v20.8b mov v17.8b, v21.8b mov v18.8b, v22.8b @@ -1866,11 +1899,18 @@ mov v20.8b, v24.8b mov v21.8b, v25.8b mov v22.8b, v26.8b + b.eq 46f b 48b +46: + load_4h \sr2, \src, \s_strd, v23, v24 + smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 + smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 + shift_store_4 \type, \d_strd, v1, v2 0: ret 80: + AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v @@ -1905,6 +1945,7 @@ 320: // 32x8, 32x16, ... 640: 1280: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 @@ -1959,6 +2000,7 @@ ret 160: + AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v @@ -2022,6 +2064,7 @@ br x10 20: + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] @@ -2139,7 +2182,7 @@ b 28b 0: - br x15 + ret x15 L(\type\()_8tap_filter_2): ld1 {v25.8h}, [\sr2], \s_strd @@ -2160,6 +2203,7 @@ .endif 40: + AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 480f @@ -2304,7 +2348,7 @@ mov v22.8b, v25.8b b 48b 0: - br x15 + ret x15 L(\type\()_8tap_filter_4): ld1 {v24.8h}, [\sr2], \s_strd @@ -2332,6 +2376,7 @@ 80: 160: 320: + AARCH64_VALID_JUMP_TARGET b.gt 880f add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] @@ -2364,8 +2409,7 @@ // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). - xtn v16.4h, v24.4s - xtn2 v16.8h, v25.4s + uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_8tap_filter_8) mov v17.16b, v23.16b @@ -2432,6 +2476,7 @@ 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: 1280: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #6 @@ -2463,8 +2508,7 @@ // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). - xtn v16.4h, v24.4s - xtn2 v16.8h, v25.4s + uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_8tap_filter_8) mov v17.16b, v23.16b @@ -2554,7 +2598,7 @@ add \dst, \dst, #16 b 168b 0: - br x15 + ret x15 L(\type\()_8tap_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd @@ -2575,10 +2619,8 @@ srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits) - xtn v23.4h, v25.4s - xtn2 v23.8h, v26.4s - xtn v24.4h, v27.4s - xtn2 v24.8h, v28.4s + uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2 + uzp1 v24.8h, v27.8h, v28.8h // Ditto ret L(\type\()_8tap_hv_tbl): @@ -2639,6 +2681,7 @@ br x10 20: // 2xN h + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -2663,6 +2706,7 @@ .endif 40: // 4xN h + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 @@ -2689,6 +2733,7 @@ ret 80: // 8xN h + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 @@ -2722,6 +2767,7 @@ 320: 640: 1280: // 16xN, 32xN, ... 
h + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -2812,6 +2858,7 @@ br x10 20: // 2xN v + AARCH64_VALID_JUMP_TARGET .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd @@ -2822,6 +2869,7 @@ // 2x2 v ld1 {v16.s}[0], [\src], \s_strd b.gt 24f +22: ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s @@ -2832,11 +2880,12 @@ st1 {v4.s}[0], [\dst] st1 {v4.s}[1], [\ds2] ret -24: // 2x4, 2x8, ... v +24: // 2x4, 2x6, 2x8, ... v ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd ld1 {v19.s}[0], [\sr2], \s_strd ld1 {v20.s}[0], [\src], \s_strd + sub \h, \h, #4 trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s trn1 v18.2s, v18.2s, v19.2s @@ -2845,20 +2894,22 @@ trn1 v17.2d, v17.2d, v19.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h - subs \h, \h, #4 + cmp \h, #2 urshr v4.8h, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd st1 {v4.s}[2], [\dst], \d_strd st1 {v4.s}[3], [\ds2], \d_strd - b.le 0f + b.lt 0f mov v16.8b, v20.8b + b.eq 22b b 24b 0: ret .endif 40: // 4xN v + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -2887,6 +2938,7 @@ ret 80: // 8xN v + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -2921,6 +2973,7 @@ 320: 640: 1280: + AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \ds2, \dst, \d_strd @@ -3004,6 +3057,7 @@ br x10 20: // 2xN hv + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd @@ -3044,6 +3098,7 @@ .endif 40: // 4xN hv + AARCH64_VALID_JUMP_TARGET add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 @@ -3075,8 +3130,7 @@ .ifc \type, put urshl v4.4s, v4.4s, v30.4s urshl v5.4s, v5.4s, v30.4s - xtn v4.4h, v4.4s - xtn2 v4.8h, v5.4s + uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 .else rshrn v4.4h, v4.4s, #4 rshrn2 v4.8h, v5.4s, #4 @@ -3096,6 +3150,7 @@ 320: 640: 1280: + AARCH64_VALID_JUMP_TARGET mov \my, \h 1: @@ -3138,10 +3193,8 @@ urshl v5.4s, v5.4s, v30.4s urshl v6.4s, v6.4s, v30.4s urshl v7.4s, v7.4s, v30.4s - xtn v4.4h, v4.4s - xtn2 v4.8h, v5.4s - xtn v5.4h, v6.4s - xtn2 v5.8h, v7.4s + uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 + uzp1 v5.8h, v6.8h, v7.8h // Ditto .else rshrn v4.4h, v4.4s, #4 rshrn2 v4.8h, v5.4s, #4 @@ -3308,32 +3361,24 @@ .endif bl warp_filter_horz_neon - xtn v24.4h, v16.4s - xtn2 v24.8h, v17.4s + uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2 bl warp_filter_horz_neon - xtn v25.4h, v16.4s - xtn2 v25.8h, v17.4s + uzp1 v25.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon - xtn v26.4h, v16.4s - xtn2 v26.8h, v17.4s + uzp1 v26.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon - xtn v27.4h, v16.4s - xtn2 v27.8h, v17.4s + uzp1 v27.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon - xtn v28.4h, v16.4s - xtn2 v28.8h, v17.4s + uzp1 v28.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon - xtn v29.4h, v16.4s - xtn2 v29.8h, v17.4s + uzp1 v29.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon - xtn v30.4h, v16.4s - xtn2 v30.8h, v17.4s + uzp1 v30.8h, v16.8h, v17.8h // Ditto 1: add w14, w6, #512 bl warp_filter_horz_neon - xtn v31.4h, v16.4s - xtn2 v31.8h, v17.4s + uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 @@ -3398,7 +3443,7 @@ ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x15 + ret x15 endfunc .endm diff -Nru dav1d-0.9.2/src/arm/64/mc.S dav1d-1.0.0/src/arm/64/mc.S --- 
dav1d-0.9.2/src/arm/64/mc.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/mc.S 2022-03-18 14:31:55.978356000 +0000 @@ -86,6 +86,7 @@ sub x7, x7, w4, uxtw br x7 40: + AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 4: @@ -114,6 +115,7 @@ st1 {v5.s}[3], [x7], x1 ret 80: + AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 8: @@ -127,6 +129,7 @@ \type v4, v0, v1, v2, v3 b 8b 16: + AARCH64_VALID_JUMP_TARGET \type v5, v0, v1, v2, v3 st1 {v4.16b}, [x0], x1 \type v6, v0, v1, v2, v3 @@ -139,6 +142,7 @@ \type v4, v0, v1, v2, v3 b 16b 320: + AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 32: @@ -152,6 +156,7 @@ \type v4, v0, v1, v2, v3 b 32b 640: + AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 64: @@ -169,6 +174,7 @@ \type v4, v0, v1, v2, v3 b 64b 1280: + AARCH64_VALID_JUMP_TARGET add x7, x0, #64 128: \type v5, v0, v1, v2, v3 @@ -225,6 +231,7 @@ lsl x1, x1, #1 br x9 4: + AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) subs w5, w5, #4 @@ -245,8 +252,7 @@ sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 - xtn v18.8b, v18.8h - xtn2 v18.16b, v19.8h + uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 @@ -270,6 +276,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 subs w5, w5, #2 @@ -290,8 +297,7 @@ sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 - xtn v18.8b, v18.8h - xtn2 v18.16b, v19.8h + uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 @@ -314,6 +320,7 @@ 640: 320: 160: + AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw .if \type == 444 @@ -364,10 +371,8 @@ sqrshrun v26.8b, v26.8h, #4 sqrshrun v27.8b, v27.8h, #4 .if \type == 444 - xtn v20.8b, v20.8h - xtn2 v20.16b, v21.8h - xtn v21.8b, v22.8h - xtn2 v21.16b, v23.8h + uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 + uzp1 v21.16b, v22.16b, v23.16b // Ditto sub v20.16b, v1.16b, v20.16b sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 @@ -434,6 +439,7 @@ lsl x1, x1, #1 br x6 4: + AARCH64_VALID_JUMP_TARGET ld1 {v2.8b}, [x5], #8 ld1 {v1.d}[0], [x2], #8 ld1 {v0.s}[0], [x0] @@ -448,6 +454,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld1 {v2.16b}, [x5], #16 ld1 {v1.16b}, [x2], #16 ld1 {v0.d}[0], [x0] @@ -465,6 +472,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x5], #32 ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v0.16b}, [x0] @@ -489,6 +497,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v20.16b, v21.16b}, [x0] @@ -547,6 +556,7 @@ sub x6, x6, w7, uxtw br x6 2: + AARCH64_VALID_JUMP_TARGET ld1 {v0.h}[0], [x5], #2 ld1 {v1.s}[0], [x2], #4 subs w4, w4, #2 @@ -562,6 +572,7 @@ b.gt 2b ret 4: + AARCH64_VALID_JUMP_TARGET ld2r {v0.8b, v1.8b}, [x5], #2 ld1 {v2.8b}, [x2], #8 subs w4, w4, #2 @@ -577,6 +588,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b}, [x2], #16 ld1 {v3.d}[0], [x0] @@ -595,6 +607,7 @@ b.gt 8b ret 16: + AARCH64_VALID_JUMP_TARGET ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b, v3.16b}, [x2], #32 ld1 {v5.16b}, [x0] @@ -621,6 +634,7 @@ 1280: 640: 320: + AARCH64_VALID_JUMP_TARGET sub x1, x1, w3, uxtw add x7, x2, w3, uxtw 321: @@ -691,6 +705,7 @@ sub x6, x6, w3, uxtw br x6 
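The "Same as xtn, xtn2" substitutions in the hunks above rely on a lane-layout identity rather than on arithmetic: on little-endian AArch64, uzp1 keeps the even-numbered sub-elements of its two sources, which are exactly the low halves that xtn (lower half) and xtn2 (upper half) would write. A minimal standalone C sketch of that identity; illustrative only, not part of the diff, test values are arbitrary and a little-endian host is assumed:

    /* Illustrative sketch only: why "uzp1 vD.16b, vA.16b, vB.16b" can stand in
     * for the "xtn + xtn2" pair. uzp1 takes the even-numbered bytes of each
     * source, i.e. (on little-endian) the low byte of every 16-bit lane. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        uint16_t a[8] = { 0x0011, 0x0122, 0x0233, 0x0344, 0x0455, 0x0566, 0x0677, 0x0788 };
        uint16_t b[8] = { 0x0899, 0x09aa, 0x0abb, 0x0bcc, 0x0cdd, 0x0dee, 0x0eff, 0x0f00 };
        uint8_t narrow[16], uzp1[16];

        for (int i = 0; i < 8; i++) {
            narrow[i]     = (uint8_t)a[i];   /* xtn  vD.8b,  vA.8h */
            narrow[8 + i] = (uint8_t)b[i];   /* xtn2 vD.16b, vB.8h */
        }
        const uint8_t *ab = (const uint8_t *)a, *bb = (const uint8_t *)b;
        for (int i = 0; i < 8; i++) {
            uzp1[i]     = ab[2 * i];         /* even bytes of the first source  */
            uzp1[8 + i] = bb[2 * i];         /* even bytes of the second source */
        }
        printf("uzp1 matches xtn/xtn2: %s\n", memcmp(narrow, uzp1, 16) ? "no" : "yes");
        return 0;
    }

The same reasoning covers the 16-bit variants in mc16.S, where "uzp1 vD.8h, vA.8h, vB.8h" reinterprets the 32-bit intermediates and keeps the low halfword of every lane.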
20: + AARCH64_VALID_JUMP_TARGET ld1r {v0.8b}, [x5] sub v1.8b, v4.8b, v0.8b 2: @@ -708,6 +723,7 @@ b.gt 2b ret 40: + AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x5] sub x1, x1, #2 sub v1.8b, v4.8b, v0.8b @@ -726,6 +742,7 @@ b.gt 4b ret 80: + AARCH64_VALID_JUMP_TARGET ld1r {v0.2d}, [x5] sub x1, x1, #4 sub v1.16b, v4.16b, v0.16b @@ -747,6 +764,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x5] sub x1, x1, #8 sub v2.16b, v4.16b, v0.16b @@ -774,6 +792,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x5] sub x1, x1, #16 sub v2.16b, v4.16b, v0.16b @@ -825,6 +844,7 @@ br x9 2: + AARCH64_VALID_JUMP_TARGET ld1 {v0.h}[0], [x2], x3 ld1 {v1.h}[0], [x2], x3 subs w5, w5, #2 @@ -833,6 +853,7 @@ b.gt 2b ret 4: + AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 @@ -841,6 +862,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x2], x3 subs w5, w5, #2 @@ -849,6 +871,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET add x8, x0, x1 lsl x1, x1, #1 add x9, x2, x3 @@ -862,6 +885,7 @@ b.gt 16b ret 32: + AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -872,6 +896,7 @@ b.gt 32b ret 64: + AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -886,6 +911,7 @@ b.gt 64b ret 128: + AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] @@ -920,6 +946,7 @@ br x9 4: + AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x1], x2 ld1 {v1.s}[0], [x1], x2 subs w4, w4, #2 @@ -929,6 +956,7 @@ b.gt 4b ret 8: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x1], x2 subs w4, w4, #2 @@ -938,6 +966,7 @@ b.gt 8b ret 160: + AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 16: @@ -952,6 +981,7 @@ b.gt 16b ret 320: + AARCH64_VALID_JUMP_TARGET add x8, x0, w3, uxtw 32: ld1 {v0.16b, v1.16b}, [x1], x2 @@ -972,6 +1002,7 @@ b.gt 32b ret 640: + AARCH64_VALID_JUMP_TARGET add x8, x0, #32 mov x6, #64 64: @@ -994,6 +1025,7 @@ b.gt 64b ret 1280: + AARCH64_VALID_JUMP_TARGET add x8, x0, #64 mov x6, #128 128: @@ -1131,6 +1163,26 @@ // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
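The Cortex-A53 note above also applies to the two mul_mla_8_0* macros added just below: each computes one output vector as a single tightly chained 8-tap multiply-accumulate, with the filter taps held in v0.h[0..7]. As a reference point only, a scalar C model of what one such chain computes per lane; the function name and test values are hypothetical, not part of the diff:

    /* Illustrative sketch only: per-lane model of mul_mla_8_0 / mul_mla_8_0_4h,
     * an 8-tap FIR accumulated in 16-bit lanes (same wrap-around as mla). */
    #include <stdint.h>
    #include <stdio.h>

    static void mul_mla_8_0_model(int16_t *d0, const int16_t *const s[8],
                                  const int16_t coeff[8], const int lanes)
    {
        for (int i = 0; i < lanes; i++) {                         /* lanes = 4 (.4h) or 8 (.8h) */
            uint16_t acc = (uint16_t)(s[0][i] * coeff[0]);        /* mul */
            for (int k = 1; k < 8; k++)                           /* 7x mla */
                acc = (uint16_t)(acc + (uint16_t)(s[k][i] * coeff[k]));
            d0[i] = (int16_t)acc;                                 /* 16-bit wrap, as in mla */
        }
    }

    int main(void) {
        /* four lanes, eight input rows r0..r7, and a made-up 8-tap filter */
        int16_t r[8][4] = { {1,2,3,4}, {2,3,4,5}, {3,4,5,6}, {4,5,6,7},
                            {5,6,7,8}, {6,7,8,9}, {7,8,9,10}, {8,9,10,11} };
        const int16_t *s[8] = { r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7] };
        const int16_t coeff[8] = { -1, 3, -10, 35, 105, -7, 4, -1 };
        int16_t d0[4];
        mul_mla_8_0_model(d0, s, coeff, 4);
        for (int i = 0; i < 4; i++)
            printf("d0[%d] = %d\n", i, d0[i]);
        return 0;
    }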
+.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().4h, \s0\().4h, v0.h[0] + mla \d0\().4h, \s1\().4h, v0.h[1] + mla \d0\().4h, \s2\().4h, v0.h[2] + mla \d0\().4h, \s3\().4h, v0.h[3] + mla \d0\().4h, \s4\().4h, v0.h[4] + mla \d0\().4h, \s5\().4h, v0.h[5] + mla \d0\().4h, \s6\().4h, v0.h[6] + mla \d0\().4h, \s7\().4h, v0.h[7] +.endm +.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 + mul \d0\().8h, \s0\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] +.endm .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] @@ -1167,24 +1219,6 @@ mla \d1\().8h, \s8\().8h, v0.h[6] mla \d1\().8h, \s9\().8h, v0.h[7] .endm -.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 - mul \d0\().8h, \s0\().8h, v0.h[0] - mla \d0\().8h, \s1\().8h, v0.h[1] - mla \d0\().8h, \s2\().8h, v0.h[2] - mla \d0\().8h, \s3\().8h, v0.h[3] - mla \d0\().8h, \s4\().8h, v0.h[4] - mla \d0\().8h, \s5\().8h, v0.h[5] - mla \d0\().8h, \s6\().8h, v0.h[6] - mla \d0\().8h, \s7\().8h, v0.h[7] - mul \d1\().8h, \s4\().8h, v0.h[0] - mla \d1\().8h, \s5\().8h, v0.h[1] - mla \d1\().8h, \s6\().8h, v0.h[2] - mla \d1\().8h, \s7\().8h, v0.h[3] - mla \d1\().8h, \s8\().8h, v0.h[4] - mla \d1\().8h, \s9\().8h, v0.h[5] - mla \d1\().8h, \s10\().8h, v0.h[6] - mla \d1\().8h, \s11\().8h, v0.h[7] -.endm .macro sqrshrun_b shift, r0, r1, r2, r3 sqrshrun \r0\().8b, \r0\().8h, #\shift .ifnb \r1 @@ -1342,6 +1376,7 @@ br x9 20: // 2xN h + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] @@ -1376,6 +1411,7 @@ .endif 40: // 4xN h + AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #1 @@ -1419,6 +1455,7 @@ ret 80: // 8xN h + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #3 add \ds2, \dst, \d_strd @@ -1460,6 +1497,7 @@ 320: 640: 1280: // 16xN, 32xN, ... 
h + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #3 add \ds2, \dst, \d_strd @@ -1563,6 +1601,7 @@ br x9 20: // 2xN v + AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f @@ -1596,7 +1635,7 @@ st_h \d_strd, v6, 4 ret -28: // 2x8, 2x16 v +28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd @@ -1611,33 +1650,38 @@ interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 216: - subs \h, \h, #8 + subs \h, \h, #4 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 - load_h \sr2, \src, \s_strd, v20, v21, v22, v23 interleave_1_h v7, v16, v17, v18, v19 - interleave_1_h v19, v20, v21, v22, v23 interleave_2_s v5, v6, v7, v16, v17, v18 - interleave_2_s v17, v18, v19, v20, v21, v22 uxtl_b v5, v6, v7, v16 - uxtl_b v17, v18, v19, v20 - mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20 - sqrshrun_b 6, v30, v31 + mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 + sqrshrun_b 6, v30 st_h \d_strd, v30, 4 - st_h \d_strd, v31, 4 b.le 0f - mov v1.16b, v17.16b - mov v2.16b, v18.16b - mov v3.16b, v19.16b - mov v4.16b, v20.16b - mov v5.16b, v21.16b - mov v6.16b, v22.16b - mov v7.16b, v23.16b + cmp \h, #2 + mov v1.16b, v5.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + mov v4.16b, v16.16b + mov v5.16b, v17.16b + mov v6.16b, v18.16b + mov v7.16b, v19.16b + b.eq 26f b 216b +26: + load_h \sr2, \src, \s_strd, v16, v17 + interleave_1_h v7, v16, v17 + uxtl_b v5, v6, v7, v16 + mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 + sqrshrun_b 6, v30 + st_h \d_strd, v30, 2 0: ret .endif 40: + AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v @@ -1665,7 +1709,7 @@ 0: ret -480: // 4x8, 4x16 v +480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd @@ -1688,12 +1732,19 @@ mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 shift_store_4 \type, \d_strd, v1, v2 b.le 0f - subs \h, \h, #4 - load_s \sr2, \src, \s_strd, v27, v16, v17, v18 - interleave_1_s v26, v27, v16, v17, v18 - uxtl_b v26, v27, v16, v17 - mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17 - shift_store_4 \type, \d_strd, v1, v2 + load_s \sr2, \src, \s_strd, v27, v16 + subs \h, \h, #2 + interleave_1_s v26, v27, v16 + uxtl_b v26, v27 + mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 + shift_store_4 \type, \d_strd, v1 + b.le 0f + load_s \sr2, \src, \s_strd, v17, v18 + subs \h, \h, #2 + interleave_1_s v16, v17, v18 + uxtl_b v16, v17 + mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 + shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 @@ -1706,6 +1757,7 @@ ret 80: + AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v @@ -1738,6 +1790,7 @@ 320: // 32x8, 32x16, ... 
640: 1280: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 @@ -1804,6 +1857,7 @@ ret 160: + AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v @@ -1872,6 +1926,7 @@ br x9 20: + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] @@ -1979,7 +2034,7 @@ b 28b 0: - br x15 + ret x15 L(\type\()_8tap_filter_2): ld1 {v28.8b}, [\sr2], \s_strd @@ -2001,6 +2056,7 @@ .endif 40: + AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 480f @@ -2135,7 +2191,7 @@ mov v22.8b, v29.8b b 48b 0: - br x15 + ret x15 L(\type\()_8tap_filter_4): ld1 {v26.8b}, [\sr2], \s_strd @@ -2163,6 +2219,7 @@ 80: 160: 320: + AARCH64_VALID_JUMP_TARGET b.gt 880f add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] @@ -2242,6 +2299,7 @@ 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: 1280: + AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #3 @@ -2343,7 +2401,7 @@ .endif b 168b 0: - br x15 + ret x15 L(\type\()_8tap_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd @@ -2426,6 +2484,7 @@ br x9 20: // 2xN h + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -2449,6 +2508,7 @@ .endif 40: // 4xN h + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 @@ -2475,6 +2535,7 @@ ret 80: // 8xN h + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 @@ -2504,6 +2565,7 @@ 320: 640: 1280: // 16xN, 32xN, ... h + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -2581,6 +2643,7 @@ br x9 20: // 2xN v + AARCH64_VALID_JUMP_TARGET .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd @@ -2591,6 +2654,7 @@ // 2x2 v ld1 {v16.h}[0], [\src], \s_strd b.gt 24f +22: ld1 {v17.h}[0], [\sr2], \s_strd ld1 {v18.h}[0], [\src], \s_strd trn1 v16.4h, v16.4h, v17.4h @@ -2601,11 +2665,12 @@ st1 {v4.h}[0], [\dst] st1 {v4.h}[1], [\ds2] ret -24: // 2x4, 2x8, ... v +24: // 2x4, 2x6, 2x8, ... 
v ld1 {v17.h}[0], [\sr2], \s_strd ld1 {v18.h}[0], [\src], \s_strd ld1 {v19.h}[0], [\sr2], \s_strd ld1 {v20.h}[0], [\src], \s_strd + sub \h, \h, #4 trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h trn1 v18.4h, v18.4h, v19.4h @@ -2614,20 +2679,22 @@ trn1 v17.2s, v17.2s, v19.2s umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b - subs \h, \h, #4 + cmp \h, #2 uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd st1 {v4.h}[2], [\dst], \d_strd st1 {v4.h}[3], [\ds2], \d_strd - b.le 0f + b.lt 0f mov v16.8b, v20.8b + b.eq 22b b 24b 0: ret .endif 40: // 4xN v + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -2656,6 +2723,7 @@ ret 80: // 8xN v + AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 @@ -2688,6 +2756,7 @@ 320: 640: 1280: + AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \ds2, \dst, \d_strd @@ -2760,6 +2829,7 @@ br x9 20: // 2xN hv + AARCH64_VALID_JUMP_TARGET .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd @@ -2797,6 +2867,7 @@ .endif 40: // 4xN hv + AARCH64_VALID_JUMP_TARGET add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 @@ -2842,6 +2913,7 @@ 320: 640: 1280: + AARCH64_VALID_JUMP_TARGET mov \my, \h 1: @@ -3072,7 +3144,7 @@ add w6, w6, w4 b.gt 1b - br x15 + ret x15 endfunc .endm diff -Nru dav1d-0.9.2/src/arm/64/refmvs.S dav1d-1.0.0/src/arm/64/refmvs.S --- dav1d-0.9.2/src/arm/64/refmvs.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/64/refmvs.S 2022-03-18 14:31:55.978356000 +0000 @@ -51,26 +51,32 @@ br x3 10: + AARCH64_VALID_JUMP_TARGET st1 {v0.8b}, [x1] str s2, [x1, #8] b.gt 1b ret 20: + AARCH64_VALID_JUMP_TARGET st1 {v0.16b}, [x1] str d1, [x1, #16] b.gt 1b ret 320: + AARCH64_VALID_JUMP_TARGET st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 160: + AARCH64_VALID_JUMP_TARGET st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 80: + AARCH64_VALID_JUMP_TARGET st1 {v0.16b, v1.16b, v2.16b}, [x1], #48 40: + AARCH64_VALID_JUMP_TARGET st1 {v0.16b, v1.16b, v2.16b}, [x1] b.gt 1b ret diff -Nru dav1d-0.9.2/src/arm/asm.S dav1d-1.0.0/src/arm/asm.S --- dav1d-0.9.2/src/arm/asm.S 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/asm.S 2022-03-18 14:31:55.982356000 +0000 @@ -30,6 +30,135 @@ #include "config.h" +#if ARCH_AARCH64 +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 + +/* Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * + * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to + * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be + * used immediately before saving the LR register (x30) to the stack. + * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring + * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone + * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also + * have the same value at the two points. For example: + * + * .global f + * f: + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... 
+ * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or + * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an + * indirect call target. In particular, all symbols exported from a file must + * begin with one of these macros. For example, a leaf function that does not + * save LR can instead use |AARCH64_VALID_CALL_TARGET|: + * + * .globl return_zero + * return_zero: + * AARCH64_VALID_CALL_TARGET + * mov x0, #0 + * ret + * + * A non-leaf function which does not immediately save LR may need both macros + * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function + * may jump to an alternate implementation before setting up the stack: + * + * .globl with_early_jump + * with_early_jump: + * AARCH64_VALID_CALL_TARGET + * cmp x0, #128 + * b.lt .Lwith_early_jump_128 + * AARCH64_SIGN_LINK_REGISTER + * stp x29, x30, [sp, #-96]! + * mov x29, sp + * ... + * ldp x29, x30, [sp], #96 + * AARCH64_VALIDATE_LINK_REGISTER + * ret + * + * .Lwith_early_jump_128: + * ... + * ret + * + * These annotations are only required with indirect calls. Private symbols that + * are only the target of direct calls do not require annotations. Also note + * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not + * indirect jumps (BR). Indirect jumps in assembly are supported through + * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and + * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. + * + * Although not necessary, it is safe to use these macros in 32-bit ARM + * assembly. This may be used to simplify dual 32-bit and 64-bit files. + * + * References: + * - "ELF for the Arm® 64-bit Architecture" + * https: *github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst + * - "Providing protection for complex software" + * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + */ +#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) +#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification +#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc' +#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' +#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j' +#else +#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification +#define AARCH64_VALID_JUMP_CALL_TARGET +#define AARCH64_VALID_CALL_TARGET +#define AARCH64_VALID_JUMP_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) + +#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A +#define AARCH64_SIGN_LINK_REGISTER paciasp +#define AARCH64_VALIDATE_LINK_REGISTER autiasp +#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B +#define AARCH64_SIGN_LINK_REGISTER pacibsp +#define AARCH64_VALIDATE_LINK_REGISTER autibsp +#else +#error Pointer authentication defines no valid key! +#endif +#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions +#error Authentication of leaf functions is enabled but not supported in dav1d! 
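The __ARM_FEATURE_BTI_DEFAULT and __ARM_FEATURE_PAC_DEFAULT tests above are ordinary compiler-predefined macros (set, for instance, when building with -mbranch-protection=standard on recent GCC/Clang), so the same selection can be observed from C. A small sketch, illustrative only and not part of the diff:

    /* Illustrative sketch only: report how the branch-protection feature macros
     * tested in asm.S above are set for this compilation unit. Bit meanings
     * follow the comments above: PAC bit 0 = key A, bit 1 = key B,
     * bit 2 = signing of leaf functions. */
    #include <stdio.h>

    int main(void) {
    #if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
        puts("BTI: on - every indirect-branch target needs a landing pad (hint #36, bti j)");
    #else
        puts("BTI: off - AARCH64_VALID_JUMP_TARGET expands to nothing");
    #endif
    #if defined(__ARM_FEATURE_PAC_DEFAULT)
        printf("PAC: on - key %s, leaf functions %s\n",
               (__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) ? "B" : "A",
               (__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) ? "signed" : "not signed");
    #else
        puts("PAC: off - return addresses are not signed");
    #endif
        return 0;
    }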
+#endif +#define GNU_PROPERTY_AARCH64_PAC (1 << 1) + +#else /* __ARM_FEATURE_PAC_DEFAULT */ + +#define GNU_PROPERTY_AARCH64_PAC 0 +#define AARCH64_SIGN_LINK_REGISTER +#define AARCH64_VALIDATE_LINK_REGISTER + +#endif /* !__ARM_FEATURE_PAC_DEFAULT */ + + +#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) + .pushsection .note.gnu.property, "a" + .balign 8 + .long 4 + .long 0x10 + .long 0x5 + .asciz "GNU" + .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4 + .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) + .long 0 + .popsection +#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */ +#endif /* ARCH_AARCH64 */ + #if ARCH_ARM .syntax unified #ifdef __ELF__ @@ -38,7 +167,7 @@ .eabi_attribute 10, 0 // suppress Tag_FP_arch .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch .section .note.GNU-stack,"",%progbits // Mark stack as non-executable -#endif +#endif /* __ELF__ */ #ifdef _WIN32 #define CONFIG_THUMB 1 @@ -53,8 +182,8 @@ #else #define A #define T @ -#endif -#endif +#endif /* CONFIG_THUMB */ +#endif /* ARCH_ARM */ #if !defined(PIC) #if defined(__PIC__) @@ -110,6 +239,11 @@ #endif .endif \name: +#if ARCH_AARCH64 + .if \export + AARCH64_VALID_CALL_TARGET + .endif +#endif .endm .macro const name, export=0, align=2 @@ -147,9 +281,5 @@ #define X(x) CONCAT(EXTERN, x) -#if ARCH_AARCH64 -#define x18 do_not_use_x18 -#define w18 do_not_use_w18 -#endif #endif /* DAV1D_SRC_ARM_ASM_S */ diff -Nru dav1d-0.9.2/src/arm/cdef_init_tmpl.c dav1d-1.0.0/src/arm/cdef_init_tmpl.c --- dav1d-0.9.2/src/arm/cdef_init_tmpl.c 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/cdef_init_tmpl.c 2022-03-18 14:31:55.982356000 +0000 @@ -31,11 +31,13 @@ void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src, ptrdiff_t src_stride, const pixel (*left)[2], - const pixel *const top, int h, + const pixel *const top, + const pixel *const bottom, int h, enum CdefEdgeFlags edges); void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src, ptrdiff_t src_stride, const pixel (*left)[2], - const pixel *const top, int h, + const pixel *const top, + const pixel *const bottom, int h, enum CdefEdgeFlags edges); // Passing edges to this function, to allow it to switch to a more @@ -52,9 +54,10 @@ #define DEFINE_FILTER(w, h, tmp_stride) \ static void \ -cdef_filter_##w##x##h##_neon(pixel *dst, \ - const ptrdiff_t stride, \ - const pixel (*left)[2], const pixel *const top, \ +cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ const int pri_strength, const int sec_strength, \ const int dir, const int damping, \ const enum CdefEdgeFlags edges \ @@ -62,7 +65,8 @@ { \ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \ - BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, left, top, h, edges); \ + BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, \ + left, top, bottom, h, edges); \ BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength, \ sec_strength, dir, damping, h, edges \ HIGHBD_TAIL_SUFFIX); \ diff -Nru dav1d-0.9.2/src/arm/film_grain_init_tmpl.c dav1d-1.0.0/src/arm/film_grain_init_tmpl.c --- dav1d-0.9.2/src/arm/film_grain_init_tmpl.c 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/film_grain_init_tmpl.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,218 +0,0 @@ -/* - * Copyright © 2018, Niklas Haas - * Copyright © 2018, VideoLAN and 
dav1d authors - * Copyright © 2018, Two Orioles, LLC - * Copyright © 2021, Martin Storsjo - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "src/cpu.h" -#include "src/film_grain.h" -#include "asm-offsets.h" - -CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); -CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); -CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); -CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV); -CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT); -CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT); - -CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT); -CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT); -CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT); -CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET); -CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE); - -void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH], - const Dav1dFilmGrainData *const data - HIGHBD_DECL_SUFFIX); - -#define GEN_GRAIN_UV(suff) \ -void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ - const entry buf_y[][GRAIN_WIDTH], \ - const Dav1dFilmGrainData *const data, \ - const intptr_t uv \ - HIGHBD_DECL_SUFFIX) - -GEN_GRAIN_UV(420); -GEN_GRAIN_UV(422); -GEN_GRAIN_UV(444); - -// Use ptrdiff_t instead of int for the last few parameters, to get the -// same layout of parameters on the stack across platforms. -void BF(dav1d_fgy_32x32, neon)(pixel *const dst, - const pixel *const src, - const ptrdiff_t stride, - const uint8_t scaling[SCALING_SIZE], - const int scaling_shift, - const entry grain_lut[][GRAIN_WIDTH], - const int offsets[][2], - const int h, const ptrdiff_t clip, - const ptrdiff_t type - HIGHBD_DECL_SUFFIX); - -// Use ptrdiff_t instead of int for the last few parameters, to get the -// parameters on the stack with the same layout across platforms. 
-#define FGUV(suff) \ -void BF(dav1d_fguv_32x32_ ## suff, neon)(pixel *const dst, \ - const pixel *const src, \ - const ptrdiff_t stride, \ - const uint8_t scaling[SCALING_SIZE], \ - const Dav1dFilmGrainData *const data, \ - const entry grain_lut[][GRAIN_WIDTH], \ - const pixel *const luma_row, \ - const ptrdiff_t luma_stride, \ - const int offsets[][2], \ - const ptrdiff_t h, const ptrdiff_t uv, \ - const ptrdiff_t is_id, \ - const ptrdiff_t type \ - HIGHBD_DECL_SUFFIX) - -FGUV(420); -FGUV(422); -FGUV(444); - -static inline int get_random_number(const int bits, unsigned *const state) { - const int r = *state; - unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; - *state = (r >> 1) | (bit << 15); - - return (*state >> (16 - bits)) & ((1 << bits) - 1); -} - -static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, - const ptrdiff_t stride, - const Dav1dFilmGrainData *const data, const size_t pw, - const uint8_t scaling[SCALING_SIZE], - const entry grain_lut[][GRAIN_WIDTH], - const int bh, const int row_num HIGHBD_DECL_SUFFIX) -{ - const int rows = 1 + (data->overlap_flag && row_num > 0); - - // seed[0] contains the current row, seed[1] contains the previous - unsigned seed[2]; - for (int i = 0; i < rows; i++) { - seed[i] = data->seed; - seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; - seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); - } - - int offsets[2 /* col offset */][2 /* row offset */]; - - // process this row in BLOCK_SIZE^2 blocks - for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { - - if (data->overlap_flag && bx) { - // shift previous offsets left - for (int i = 0; i < rows; i++) - offsets[1][i] = offsets[0][i]; - } - - // update current offsets - for (int i = 0; i < rows; i++) - offsets[0][i] = get_random_number(8, &seed[i]); - - int type = 0; - if (data->overlap_flag && row_num) - type |= 1; /* overlap y */ - if (data->overlap_flag && bx) - type |= 2; /* overlap x */ - - BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride, - scaling, data->scaling_shift, - grain_lut, offsets, bh, - data->clip_to_restricted_range, type - HIGHBD_TAIL_SUFFIX); - } -} - -#define fguv_ss_fn(nm, sx, sy) \ -static void \ -fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ - const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \ - const int pw, const uint8_t scaling[SCALING_SIZE], \ - const entry grain_lut[][GRAIN_WIDTH], const int bh, \ - const int row_num, const pixel *const luma_row, \ - const ptrdiff_t luma_stride, const int uv, const int is_id \ - HIGHBD_DECL_SUFFIX) \ -{ \ - const int rows = 1 + (data->overlap_flag && row_num > 0); \ - \ - /* seed[0] contains the current row, seed[1] contains the previous */ \ - unsigned seed[2]; \ - for (int i = 0; i < rows; i++) { \ - seed[i] = data->seed; \ - seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \ - seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \ - } \ - \ - int offsets[2 /* col offset */][2 /* row offset */]; \ - \ - /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \ - for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \ - if (data->overlap_flag && bx) { \ - /* shift previous offsets left */ \ - for (int i = 0; i < rows; i++) \ - offsets[1][i] = offsets[0][i]; \ - } \ - \ - /* update current offsets */ \ - for (int i = 0; i < rows; i++) \ - offsets[0][i] = get_random_number(8, &seed[i]); \ - \ - int type = 0; \ - if (data->overlap_flag && row_num) \ - type |= 1; /* overlap y */ \ - if (data->overlap_flag && bx) \ - type |= 2; /* overlap x */ \ - 
if (data->chroma_scaling_from_luma) \ - type |= 4; \ - \ - BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \ - scaling, data, grain_lut, \ - luma_row + (bx << sx), luma_stride, \ - offsets, bh, uv, is_id, type \ - HIGHBD_TAIL_SUFFIX); \ - } \ -} - -fguv_ss_fn(420, 1, 1); -fguv_ss_fn(422, 1, 0); -fguv_ss_fn(444, 0, 0); - -COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) { - const unsigned flags = dav1d_get_cpu_flags(); - - if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; - - c->generate_grain_y = BF(dav1d_generate_grain_y, neon); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon); - - c->fgy_32x32xn = fgy_32x32xn_neon; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; -} diff -Nru dav1d-0.9.2/src/arm/filmgrain_init_tmpl.c dav1d-1.0.0/src/arm/filmgrain_init_tmpl.c --- dav1d-0.9.2/src/arm/filmgrain_init_tmpl.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/arm/filmgrain_init_tmpl.c 2022-03-18 14:31:55.982356000 +0000 @@ -0,0 +1,218 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/filmgrain.h" +#include "asm-offsets.h" + +CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT); + +CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET); +CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE); + +void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX); + +#define GEN_GRAIN_UV(suff) \ +void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, \ + const intptr_t uv \ + HIGHBD_DECL_SUFFIX) + +GEN_GRAIN_UV(420); +GEN_GRAIN_UV(422); +GEN_GRAIN_UV(444); + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// same layout of parameters on the stack across platforms. +void BF(dav1d_fgy_32x32, neon)(pixel *const dst, + const pixel *const src, + const ptrdiff_t stride, + const uint8_t scaling[SCALING_SIZE], + const int scaling_shift, + const entry grain_lut[][GRAIN_WIDTH], + const int offsets[][2], + const int h, const ptrdiff_t clip, + const ptrdiff_t type + HIGHBD_DECL_SUFFIX); + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// parameters on the stack with the same layout across platforms. 
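fgy_32x32xn_neon further down in this file keeps the per-row seeding and per-block offset generation in C and only hands the 32x32 blocks to the NEON kernels. A standalone driver for that seeding logic is sketched here; get_random_number and the seed derivation are copied from this hunk, while the seed value, row number and block count are made up:

    /* Illustrative sketch only: the 16-bit LFSR and per-row seed derivation
     * used by fgy_32x32xn_neon, driven with hypothetical inputs to show the
     * stream of per-block grain offsets. */
    #include <stdio.h>

    static int get_random_number(const int bits, unsigned *const state) {
        const int r = *state;
        unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        *state = (r >> 1) | (bit << 15);
        return (*state >> (16 - bits)) & ((1 << bits) - 1);
    }

    int main(void) {
        const unsigned data_seed = 0x1234;  /* hypothetical Dav1dFilmGrainData.seed */
        const int row_num = 3;              /* hypothetical superblock row */
        unsigned seed = data_seed;
        seed ^= ((row_num * 37 + 178) & 0xFF) << 8;   /* current row (i = 0) */
        seed ^= ((row_num * 173 + 105) & 0xFF);
        for (int bx = 0; bx < 4; bx++)      /* one 8-bit offset per 32-pixel block */
            printf("bx=%d offset=%d\n", 32 * bx, get_random_number(8, &seed));
        return 0;
    }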
+#define FGUV(suff) \ +void BF(dav1d_fguv_32x32_ ## suff, neon)(pixel *const dst, \ + const pixel *const src, \ + const ptrdiff_t stride, \ + const uint8_t scaling[SCALING_SIZE], \ + const Dav1dFilmGrainData *const data, \ + const entry grain_lut[][GRAIN_WIDTH], \ + const pixel *const luma_row, \ + const ptrdiff_t luma_stride, \ + const int offsets[][2], \ + const ptrdiff_t h, const ptrdiff_t uv, \ + const ptrdiff_t is_id, \ + const ptrdiff_t type \ + HIGHBD_DECL_SUFFIX) + +FGUV(420); +FGUV(422); +FGUV(444); + +static inline int get_random_number(const int bits, unsigned *const state) { + const int r = *state; + unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; + *state = (r >> 1) | (bit << 15); + + return (*state >> (16 - bits)) & ((1 << bits) - 1); +} + +static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + int type = 0; + if (data->overlap_flag && row_num) + type |= 1; /* overlap y */ + if (data->overlap_flag && bx) + type |= 2; /* overlap x */ + + BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride, + scaling, data->scaling_shift, + grain_lut, offsets, bh, + data->clip_to_restricted_range, type + HIGHBD_TAIL_SUFFIX); + } +} + +#define fguv_ss_fn(nm, sx, sy) \ +static void \ +fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \ + const int pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], const int bh, \ + const int row_num, const pixel *const luma_row, \ + const ptrdiff_t luma_stride, const int uv, const int is_id \ + HIGHBD_DECL_SUFFIX) \ +{ \ + const int rows = 1 + (data->overlap_flag && row_num > 0); \ + \ + /* seed[0] contains the current row, seed[1] contains the previous */ \ + unsigned seed[2]; \ + for (int i = 0; i < rows; i++) { \ + seed[i] = data->seed; \ + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \ + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \ + } \ + \ + int offsets[2 /* col offset */][2 /* row offset */]; \ + \ + /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \ + for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \ + if (data->overlap_flag && bx) { \ + /* shift previous offsets left */ \ + for (int i = 0; i < rows; i++) \ + offsets[1][i] = offsets[0][i]; \ + } \ + \ + /* update current offsets */ \ + for (int i = 0; i < rows; i++) \ + offsets[0][i] = get_random_number(8, &seed[i]); \ + \ + int type = 0; \ + if (data->overlap_flag && row_num) \ + type |= 1; /* overlap y */ \ + if (data->overlap_flag && bx) \ + type |= 2; /* overlap x */ \ + 
if (data->chroma_scaling_from_luma) \ + type |= 4; \ + \ + BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \ + scaling, data, grain_lut, \ + luma_row + (bx << sx), luma_stride, \ + offsets, bh, uv, is_id, type \ + HIGHBD_TAIL_SUFFIX); \ + } \ +} + +fguv_ss_fn(420, 1, 1); +fguv_ss_fn(422, 1, 0); +fguv_ss_fn(444, 0, 0); + +COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon); + + c->fgy_32x32xn = fgy_32x32xn_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; +} diff -Nru dav1d-0.9.2/src/arm/looprestoration_init_tmpl.c dav1d-1.0.0/src/arm/looprestoration_init_tmpl.c --- dav1d-0.9.2/src/arm/looprestoration_init_tmpl.c 2021-09-03 15:51:24.401037200 +0000 +++ dav1d-1.0.0/src/arm/looprestoration_init_tmpl.c 2022-03-18 14:31:55.982356000 +0000 @@ -29,16 +29,14 @@ #include "src/looprestoration.h" #if ARCH_AARCH64 -void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); -void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges @@ -76,9 +74,8 @@ const int16_t fv[8], enum LrEdgeFlags edges, ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX); -static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) @@ -88,20 +85,20 @@ int mid_stride = (w + 7) & ~7; // Horizontal filter - BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride, + BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride, filter[0], w, h, edges HIGHBD_TAIL_SUFFIX); if (edges & LR_HAVE_TOP) - BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride, + BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); if (edges & LR_HAVE_BOTTOM) BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL, - lpf + 6 * PXSTRIDE(lpf_stride), - lpf_stride, filter[0], w, 2, edges + lpf + 6 * PXSTRIDE(stride), + stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); // Vertical filter - BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride], + BF(dav1d_wiener_filter_v, neon)(dst, stride, 
&mid[2*mid_stride], w, h, filter[1], edges, mid_stride * sizeof(*mid) HIGHBD_TAIL_SUFFIX); @@ -127,8 +124,7 @@ /* filter with a 3x3 box (radius=1) */ static void dav1d_sgr_filter1_neon(int16_t *tmp, const pixel *src, const ptrdiff_t stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, + const pixel (*left)[4], const pixel *lpf, const int w, const int h, const int strength, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) @@ -141,12 +137,12 @@ BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges); if (edges & LR_HAVE_TOP) BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], - NULL, lpf, lpf_stride, w, 2, edges); + NULL, lpf, stride, w, 2, edges); if (edges & LR_HAVE_BOTTOM) BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], - NULL, lpf + 6 * PXSTRIDE(lpf_stride), - lpf_stride, w, 2, edges); + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges); dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX); @@ -172,8 +168,7 @@ /* filter with a 5x5 box (radius=2) */ static void dav1d_sgr_filter2_neon(int16_t *tmp, const pixel *src, const ptrdiff_t stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, + const pixel (*left)[4], const pixel *lpf, const int w, const int h, const int strength, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) @@ -186,12 +181,12 @@ BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges); if (edges & LR_HAVE_TOP) BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], - NULL, lpf, lpf_stride, w, 2, edges); + NULL, lpf, stride, w, 2, edges); if (edges & LR_HAVE_BOTTOM) BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], - NULL, lpf + 6 * PXSTRIDE(lpf_stride), - lpf_stride, w, 2, edges); + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX); @@ -208,49 +203,46 @@ const int w, const int h, const int16_t wt[2] HIGHBD_DECL_SUFFIX); -static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int16_t, tmp, 64 * 384,); - dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf, w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); - BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride, + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX); } -static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int16_t, tmp, 64 * 384,); - dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf, w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); - BF(dav1d_sgr_weighted1, 
neon)(dst, dst_stride, dst, dst_stride, + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX); } -static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int16_t, tmp1, 64 * 384,); ALIGN_STK_16(int16_t, tmp2, 64 * 384,); - dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf, w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); - dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf, w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 }; - BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride, + BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride, tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); } diff -Nru dav1d-0.9.2/src/cdef_apply.h dav1d-1.0.0/src/cdef_apply.h --- dav1d-0.9.2/src/cdef_apply.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cdef_apply.h 2022-03-18 14:31:55.982356000 +0000 @@ -32,7 +32,8 @@ #include "src/internal.h" -void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *f, pixel *const p[3], - const Av1Filter *lflvl, int by_start, int by_end); +void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *tc, pixel *const p[3], + const Av1Filter *lflvl, int by_start, int by_end, + int sbrow_start, int sby); #endif /* DAV1D_SRC_CDEF_APPLY_H */ diff -Nru dav1d-0.9.2/src/cdef_apply_tmpl.c dav1d-1.0.0/src/cdef_apply_tmpl.c --- dav1d-0.9.2/src/cdef_apply_tmpl.c 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cdef_apply_tmpl.c 2022-03-18 14:31:55.982356000 +0000 @@ -33,7 +33,6 @@ #include "src/cdef_apply.h" - enum Backup2x8Flags { BACKUP_2X8_Y = 1 << 0, BACKUP_2X8_UV = 1 << 1, @@ -95,11 +94,13 @@ return (strength * (4 + i) + 8) >> 4; } -void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, +void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *const tc, pixel *const p[3], const Av1Filter *const lflvl, - const int by_start, const int by_end) + const int by_start, const int by_end, + const int sbrow_start, const int sby) { + Dav1dFrameContext *const f = (Dav1dFrameContext *)tc->f; const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8; const Dav1dDSPContext *const dsp = f->dsp; enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? 
CDEF_HAVE_TOP : 0); @@ -114,14 +115,28 @@ static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 }, { 7, 0, 2, 4, 5, 6, 6, 6 } }; const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422]; + const int have_tt = f->c->n_tc > 1; + const int sb128 = f->seq_hdr->sb128; + const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; + const ptrdiff_t y_stride = PXSTRIDE(f->cur.stride[0]); + const ptrdiff_t uv_stride = PXSTRIDE(f->cur.stride[1]); for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) { - const int tf = f->lf.top_pre_cdef_toggle; + const int tf = tc->top_pre_cdef_toggle; const int by_idx = (by & 30) >> 1; if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM; - if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration - backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout); + if ((!have_tt || sbrow_start || by + 2 < by_end) && + edges & CDEF_HAVE_BOTTOM) + { + // backup pre-filter data for next iteration + pixel *const cdef_top_bak[3] = { + f->lf.cdef_line[!tf][0] + have_tt * sby * 4 * y_stride, + f->lf.cdef_line[!tf][1] + have_tt * sby * 8 * uv_stride, + f->lf.cdef_line[!tf][2] + have_tt * sby * 8 * uv_stride + }; + backup2lines(cdef_top_bak, ptrs, f->cur.stride, layout); + } ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]); pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] }; @@ -190,29 +205,87 @@ dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0], &variance HIGHBD_CALL_SUFFIX); + const pixel *top, *bot; + ptrdiff_t offset; + + if (!have_tt) goto st_y; + if (sbrow_start && by == by_start) { + if (resize) { + offset = (sby - 1) * 4 * y_stride + bx * 4; + top = &f->lf.cdef_lpf_line[0][offset]; + } else { + offset = (sby * (4 << sb128) - 4) * y_stride + bx * 4; + top = &f->lf.lr_lpf_line[0][offset]; + } + bot = bptrs[0] + 8 * y_stride; + } else if (!sbrow_start && by + 2 >= by_end) { + top = &f->lf.cdef_line[tf][0][sby * 4 * y_stride + bx * 4]; + if (resize) { + offset = (sby * 4 + 2) * y_stride + bx * 4; + bot = &f->lf.cdef_lpf_line[0][offset]; + } else { + const int line = sby * (4 << sb128) + 4 * sb128 + 2; + offset = line * y_stride + bx * 4; + bot = &f->lf.lr_lpf_line[0][offset]; + } + } else { + st_y:; + offset = sby * 4 * y_stride; + top = &f->lf.cdef_line[tf][0][have_tt * offset + bx * 4]; + bot = bptrs[0] + 8 * y_stride; + } if (y_pri_lvl) { const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance); if (adj_y_pri_lvl || y_sec_lvl) dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0], - &f->lf.cdef_line[tf][0][bx * 4], - adj_y_pri_lvl, y_sec_lvl, dir, - damping, edges HIGHBD_CALL_SUFFIX); + top, bot, adj_y_pri_lvl, y_sec_lvl, + dir, damping, edges HIGHBD_CALL_SUFFIX); } else if (y_sec_lvl) dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0], - &f->lf.cdef_line[tf][0][bx * 4], - 0, y_sec_lvl, 0, - damping, edges HIGHBD_CALL_SUFFIX); - if (uv_lvl) { - assert(layout != DAV1D_PIXEL_LAYOUT_I400); - const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0; - for (int pl = 1; pl <= 2; pl++) { - dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl], - &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor], - uv_pri_lvl, uv_sec_lvl, uvdir, - damping - 1, edges HIGHBD_CALL_SUFFIX); + top, bot, 0, y_sec_lvl, 0, damping, + edges HIGHBD_CALL_SUFFIX); + + if (!uv_lvl) goto skip_uv; + assert(layout != DAV1D_PIXEL_LAYOUT_I400); + + const int uvdir = uv_pri_lvl ? 
uv_dir[dir] : 0; + for (int pl = 1; pl <= 2; pl++) { + if (!have_tt) goto st_uv; + if (sbrow_start && by == by_start) { + if (resize) { + offset = (sby - 1) * 4 * uv_stride + (bx * 4 >> ss_hor); + top = &f->lf.cdef_lpf_line[pl][offset]; + } else { + const int line = sby * (4 << sb128) - 4; + offset = line * uv_stride + (bx * 4 >> ss_hor); + top = &f->lf.lr_lpf_line[pl][offset]; + } + bot = bptrs[pl] + (8 >> ss_ver) * uv_stride; + } else if (!sbrow_start && by + 2 >= by_end) { + const ptrdiff_t top_offset = sby * 8 * uv_stride + + (bx * 4 >> ss_hor); + top = &f->lf.cdef_line[tf][pl][top_offset]; + if (resize) { + offset = (sby * 4 + 2) * uv_stride + (bx * 4 >> ss_hor); + bot = &f->lf.cdef_lpf_line[pl][offset]; + } else { + const int line = sby * (4 << sb128) + 4 * sb128 + 2; + offset = line * uv_stride + (bx * 4 >> ss_hor); + bot = &f->lf.lr_lpf_line[pl][offset]; + } + } else { + st_uv:; + const ptrdiff_t offset = sby * 8 * uv_stride; + top = &f->lf.cdef_line[tf][pl][have_tt * offset + (bx * 4 >> ss_hor)]; + bot = bptrs[pl] + (8 >> ss_ver) * uv_stride; } + dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], + lr_bak[bit][pl], top, bot, + uv_pri_lvl, uv_sec_lvl, uvdir, + damping - 1, edges HIGHBD_CALL_SUFFIX); } + skip_uv: bit ^= 1; last_skip = 0; @@ -231,6 +304,6 @@ ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]); ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; - f->lf.top_pre_cdef_toggle ^= 1; + tc->top_pre_cdef_toggle ^= 1; } } diff -Nru dav1d-0.9.2/src/cdef.h dav1d-1.0.0/src/cdef.h --- dav1d-0.9.2/src/cdef.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cdef.h 2022-03-18 14:31:55.982356000 +0000 @@ -52,7 +52,8 @@ // order to get access to pre-filter top pixels, use $top. #define decl_cdef_fn(name) \ void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \ - const pixel *top, int pri_strength, int sec_strength, \ + const pixel *top, const pixel *bottom, \ + int pri_strength, int sec_strength, \ int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) typedef decl_cdef_fn(*cdef_fn); diff -Nru dav1d-0.9.2/src/cdef_tmpl.c dav1d-1.0.0/src/cdef_tmpl.c --- dav1d-0.9.2/src/cdef_tmpl.c 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cdef_tmpl.c 2022-03-18 14:31:55.982356000 +0000 @@ -55,9 +55,9 @@ static void padding(int16_t *tmp, const ptrdiff_t tmp_stride, const pixel *src, const ptrdiff_t src_stride, - const pixel (*left)[2], const pixel *top, - const int w, const int h, - const enum CdefEdgeFlags edges) + const pixel (*left)[2], + const pixel *top, const pixel *bottom, + const int w, const int h, const enum CdefEdgeFlags edges) { // fill extended input buffer int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2; @@ -86,17 +86,25 @@ for (int y = 0; y < h; y++) for (int x = x_start; x < 0; x++) tmp[x + y * tmp_stride] = left[y][2 + x]; - for (int y = 0; y < y_end; y++) { + for (int y = 0; y < h; y++) { for (int x = (y < h) ? 
0 : x_start; x < x_end; x++) tmp[x] = src[x]; src += PXSTRIDE(src_stride); tmp += tmp_stride; } + for (int y = h; y < y_end; y++) { + for (int x = x_start; x < x_end; x++) + tmp[x] = bottom[x]; + bottom += PXSTRIDE(src_stride); + tmp += tmp_stride; + } + } static NOINLINE void cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride, - const pixel (*left)[2], const pixel *const top, + const pixel (*left)[2], + const pixel *const top, const pixel *const bottom, const int pri_strength, const int sec_strength, const int dir, const int damping, const int w, int h, const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX) @@ -106,7 +114,7 @@ int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4) int16_t *tmp = tmp_buf + 2 * tmp_stride + 2; - padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges); + padding(tmp, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); if (pri_strength) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; @@ -211,6 +219,7 @@ const ptrdiff_t stride, \ const pixel (*left)[2], \ const pixel *const top, \ + const pixel *const bottom, \ const int pri_strength, \ const int sec_strength, \ const int dir, \ @@ -218,8 +227,8 @@ const enum CdefEdgeFlags edges \ HIGHBD_DECL_SUFFIX) \ { \ - cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \ - dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \ + cdef_filter_block_c(dst, stride, left, top, bottom, \ + pri_strength, sec_strength, dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \ } cdef_fn(4, 4); diff -Nru dav1d-0.9.2/src/cdf.c dav1d-1.0.0/src/cdf.c --- dav1d-0.9.2/src/cdf.c 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cdf.c 2022-03-18 14:31:55.982356000 +0000 @@ -4096,16 +4096,15 @@ } int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf, - struct thread_data *const t) + const int have_frame_mt) { cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool, sizeof(CdfContext) + sizeof(atomic_uint)); if (!cdf->ref) return DAV1D_ERR(ENOMEM); cdf->data.cdf = cdf->ref->data; - if (t) { + if (have_frame_mt) { cdf->progress = (atomic_uint *) &cdf->data.cdf[1]; atomic_init(cdf->progress, 0); - cdf->t = t; } return 0; } @@ -4123,22 +4122,3 @@ dav1d_ref_dec(&cdf->ref); memset(cdf, 0, sizeof(*cdf)); } - -void dav1d_cdf_thread_wait(CdfThreadContext *const cdf) { - if (!cdf->t) return; - - if (atomic_load(cdf->progress)) return; - pthread_mutex_lock(&cdf->t->lock); - while (!atomic_load(cdf->progress)) - pthread_cond_wait(&cdf->t->cond, &cdf->t->lock); - pthread_mutex_unlock(&cdf->t->lock); -} - -void dav1d_cdf_thread_signal(CdfThreadContext *const cdf) { - if (!cdf->t) return; - - pthread_mutex_lock(&cdf->t->lock); - atomic_store(cdf->progress, 1); - pthread_cond_broadcast(&cdf->t->cond); - pthread_mutex_unlock(&cdf->t->lock); -} diff -Nru dav1d-0.9.2/src/cdf.h dav1d-1.0.0/src/cdf.h --- dav1d-0.9.2/src/cdf.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cdf.h 2022-03-18 14:31:55.982356000 +0000 @@ -135,23 +135,16 @@ CdfContext *cdf; // if ref != NULL unsigned qcat; // if ref == NULL, from static CDF tables } data; - struct thread_data *t; atomic_uint *progress; } CdfThreadContext; void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx); int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf, - struct thread_data *t); + const int have_frame_mt); void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src); void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src); void 
dav1d_cdf_thread_unref(CdfThreadContext *cdf); void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst, const CdfContext *src); -/* - * These are binary signals (so a signal is either "done" or "not done"). - */ -void dav1d_cdf_thread_wait(CdfThreadContext *cdf); -void dav1d_cdf_thread_signal(CdfThreadContext *cdf); - #endif /* DAV1D_SRC_CDF_H */ diff -Nru dav1d-0.9.2/src/cpu.c dav1d-1.0.0/src/cpu.c --- dav1d-0.9.2/src/cpu.c 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cpu.c 2022-03-18 14:31:55.982356000 +0000 @@ -29,21 +29,31 @@ #include #include "src/cpu.h" +#include "src/log.h" -static unsigned flags = 0; - -#if __has_feature(memory_sanitizer) -// memory sanitizer is inherently incompatible with asm -static unsigned flags_mask = 0; -#elif ARCH_X86 -/* Disable AVX-512 by default for the time being */ -static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL; +#ifdef _WIN32 +#include +#elif defined(__APPLE__) +#include +#include #else -static unsigned flags_mask = -1; +#include +#include #endif +#ifdef HAVE_PTHREAD_NP_H +#include +#endif +#if defined(__FreeBSD__) +#define cpu_set_t cpuset_t +#endif + +static unsigned flags = 0; +static unsigned flags_mask = -1; + COLD void dav1d_init_cpu(void) { -#if HAVE_ASM +#if HAVE_ASM && !__has_feature(memory_sanitizer) +// memory sanitizer is inherently incompatible with asm #if ARCH_AARCH64 || ARCH_ARM flags = dav1d_get_cpu_flags_arm(); #elif ARCH_PPC64LE @@ -61,3 +71,34 @@ COLD void dav1d_set_cpu_flags_mask(const unsigned mask) { flags_mask = mask; } + +COLD int dav1d_num_logical_processors(Dav1dContext *const c) { +#ifdef _WIN32 +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + GROUP_AFFINITY affinity; + if (GetThreadGroupAffinity(GetCurrentThread(), &affinity)) { + int num_processors = 1; + while (affinity.Mask &= affinity.Mask - 1) + num_processors++; + return num_processors; + } +#else + SYSTEM_INFO system_info; + GetNativeSystemInfo(&system_info); + return system_info.dwNumberOfProcessors; +#endif +#elif defined(HAVE_PTHREAD_GETAFFINITY_NP) && defined(CPU_COUNT) + cpu_set_t affinity; + if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) + return CPU_COUNT(&affinity); +#elif defined(__APPLE__) + int num_processors; + size_t length = sizeof(num_processors); + if (!sysctlbyname("hw.logicalcpu", &num_processors, &length, NULL, 0)) + return num_processors; +#elif defined(_SC_NPROCESSORS_ONLN) + return (int)sysconf(_SC_NPROCESSORS_ONLN); +#endif + dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n"); + return 1; +} diff -Nru dav1d-0.9.2/src/cpu.h dav1d-1.0.0/src/cpu.h --- dav1d-0.9.2/src/cpu.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/cpu.h 2022-03-18 14:31:55.982356000 +0000 @@ -33,6 +33,7 @@ #include "common/attributes.h" #include "dav1d/common.h" +#include "dav1d/dav1d.h" #if ARCH_AARCH64 || ARCH_ARM #include "src/arm/cpu.h" @@ -45,5 +46,6 @@ void dav1d_init_cpu(void); unsigned dav1d_get_cpu_flags(void); DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask); +int dav1d_num_logical_processors(Dav1dContext *c); #endif /* DAV1D_SRC_CPU_H */ diff -Nru dav1d-0.9.2/src/data.c dav1d-1.0.0/src/data.c --- dav1d-0.9.2/src/data.c 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/data.c 2022-03-18 14:31:55.986356000 +0000 @@ -116,11 +116,17 @@ void dav1d_data_props_set_defaults(Dav1dDataProps *const props) { assert(props != NULL); + memset(props, 0, sizeof(*props)); props->timestamp = INT64_MIN; - props->duration = 0; props->offset = -1; - 
props->user_data.data = NULL; - props->user_data.ref = NULL; +} + +void dav1d_data_props_unref_internal(Dav1dDataProps *const props) { + validate_input(props != NULL); + + struct Dav1dRef *user_data_ref = props->user_data.ref; + dav1d_data_props_set_defaults(props); + dav1d_ref_dec(&user_data_ref); } void dav1d_data_unref_internal(Dav1dData *const buf) { @@ -132,5 +138,6 @@ dav1d_ref_dec(&buf->ref); } memset(buf, 0, sizeof(*buf)); + dav1d_data_props_set_defaults(&buf->m); dav1d_ref_dec(&user_data_ref); } diff -Nru dav1d-0.9.2/src/data.h dav1d-1.0.0/src/data.h --- dav1d-0.9.2/src/data.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/data.h 2022-03-18 14:31:55.986356000 +0000 @@ -33,7 +33,7 @@ void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src); /** - * Copy the source properties to the destitionatin and increase the + * Copy the source properties to the destination and increase the * user_data's reference count (if it's not NULL). */ void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src); @@ -51,5 +51,6 @@ void *cookie), void *cookie); void dav1d_data_unref_internal(Dav1dData *buf); +void dav1d_data_props_unref_internal(Dav1dDataProps *props); #endif /* DAV1D_SRC_DATA_H */ diff -Nru dav1d-0.9.2/src/decode.c dav1d-1.0.0/src/decode.c --- dav1d-0.9.2/src/decode.c 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/decode.c 2022-03-18 14:31:55.986356000 +0000 @@ -42,7 +42,7 @@ #include "src/decode.h" #include "src/dequant_tables.h" #include "src/env.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" #include "src/log.h" #include "src/qm.h" #include "src/recon.h" @@ -73,7 +73,7 @@ } } -static int read_mv_component_diff(Dav1dTileContext *const t, +static int read_mv_component_diff(Dav1dTaskContext *const t, CdfMvComponent *const mv_comp, const int have_fp) { @@ -117,7 +117,7 @@ return sign ? -diff : diff; } -static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv, +static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv, CdfMvContext *const mv_cdf, const int have_fp) { switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint, @@ -138,7 +138,7 @@ } } -static void read_tx_tree(Dav1dTileContext *const t, +static void read_tx_tree(Dav1dTaskContext *const t, const enum RectTxfmSize from, const int depth, uint16_t *const masks, const int x_off, const int y_off) @@ -216,7 +216,7 @@ } } -static void find_matching_ref(const Dav1dTileContext *const t, +static void find_matching_ref(const Dav1dTaskContext *const t, const enum EdgeFlags intra_edge_flags, const int bw4, const int bh4, const int w4, const int h4, @@ -289,7 +289,7 @@ #undef matches } -static void derive_warpmv(const Dav1dTileContext *const t, +static void derive_warpmv(const Dav1dTaskContext *const t, const int bw4, const int bh4, const uint64_t masks[2], const union mv mv, Dav1dWarpedMotionParams *const wmp) @@ -370,7 +370,7 @@ return 0; } -static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, +static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b, const int pl, const int sz_ctx, const int bx4, const int by4) { @@ -425,7 +425,7 @@ const int n_used_cache = i; // parse new entries - uint16_t *const pal = f->frame_thread.pass ? + uint16_t *const pal = t->frame_thread.pass ? 
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl]; if (i < pal_sz) { @@ -473,7 +473,7 @@ } } -static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b, +static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b, const int sz_ctx, const int bx4, const int by4) { read_pal_plane(t, b, 1, sz_ctx, bx4, by4); @@ -481,7 +481,7 @@ // V pal coding Dav1dTileState *const ts = t->ts; const Dav1dFrameContext *const f = t->f; - uint16_t *const pal = f->frame_thread.pass ? + uint16_t *const pal = t->frame_thread.pass ? f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2]; if (dav1d_msac_decode_bool_equi(&ts->msac)) { @@ -575,7 +575,7 @@ } } -static void read_pal_indices(Dav1dTileContext *const t, +static void read_pal_indices(Dav1dTaskContext *const t, uint8_t *const pal_idx, const Av1Block *const b, const int pl, const int w4, const int h4, @@ -612,7 +612,7 @@ } } -static void read_vartx_tree(Dav1dTileContext *const t, +static void read_vartx_tree(Dav1dTaskContext *const t, Av1Block *const b, const enum BlockSize bs, const int bx4, const int by4) { @@ -674,11 +674,6 @@ const ptrdiff_t stride) { assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE); - if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame], - (by + h4) * 4, PLANE_TYPE_BLOCK)) - { - return 8; - } unsigned seg_id = 8; ref_seg_map += by * stride + bx; @@ -693,7 +688,7 @@ } static inline void splat_oneref_mv(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b, const int bw4, const int bh4) @@ -709,7 +704,7 @@ } static inline void splat_intrabc_mv(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b, const int bw4, const int bh4) @@ -724,7 +719,7 @@ } static inline void splat_tworef_mv(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b, const int bw4, const int bh4) @@ -741,7 +736,7 @@ } static inline void splat_intraref(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const int bw4, const int bh4) { @@ -754,7 +749,98 @@ c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4); } -static int decode_b(Dav1dTileContext *const t, +static inline void mc_lowest_px(int *const dst, const int by4, const int bh4, + const int mvy, const int ss_ver, + const struct ScalableMotionParams *const smp) +{ + const int v_mul = 4 >> ss_ver; + if (!smp->scale) { + const int my = mvy >> (3 + ss_ver), dy = mvy & (15 >> !ss_ver); + *dst = imax(*dst, (by4 + bh4) * v_mul + my + 4 * !!dy); + } else { + int y = (by4 * v_mul << 4) + mvy * (1 << !ss_ver); + const int64_t tmp = (int64_t)(y) * smp->scale + (smp->scale - 0x4000) * 8; + y = apply_sign64((int)((llabs(tmp) + 128) >> 8), tmp) + 32; + const int bottom = ((y + (bh4 * v_mul - 1) * smp->step) >> 10) + 1 + 4; + *dst = imax(*dst, bottom); + } +} + +static inline void affine_lowest_px(Dav1dTaskContext *const t, + int *const dst, const int is_chroma, + const uint8_t *const b_dim, + const Dav1dWarpedMotionParams *const wmp) +{ + const Dav1dFrameContext *const f = t->f; + const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = is_chroma && f->cur.p.layout 
!= DAV1D_PIXEL_LAYOUT_I444; + const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; + assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7)); + const int32_t *const mat = wmp->matrix; + const int y = b_dim[1] * v_mul - 8; // lowest y + + const int src_y = t->by * 4 + ((y + 4) << ss_ver); + const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; + // check left- and right-most blocks + for (int x = 0; x < b_dim[0] * h_mul; x += imax(8, b_dim[0] * h_mul - 8)) { + // calculate transformation relative to center of 8x8 block in + // luma pixel units + const int src_x = t->bx * 4 + ((x + 4) << ss_hor); + const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; + const int dy = (int) (mvy >> 16) - 4; + *dst = imax(*dst, dy + 4 + 8); + } +} + +static void obmc_lowest_px(Dav1dTaskContext *const t, + int (*const dst)[2], const int is_chroma, + const uint8_t *const b_dim, + const int bx4, const int by4, const int w4, const int h4) +{ + assert(!(t->bx & 1) && !(t->by & 1)); + const Dav1dFrameContext *const f = t->f; + /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5]; + const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; + + if (t->by > t->ts->tiling.row_start && + (!is_chroma || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16)) + { + for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) { + // only odd blocks are considered for overlap handling, hence +1 + const refmvs_block *const a_r = &r[-1][t->bx + x + 1]; + const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs]; + + if (a_r->ref.ref[0] > 0) { + const int oh4 = imin(b_dim[1], 16) >> 1; + mc_lowest_px(&dst[a_r->ref.ref[0] - 1][is_chroma], t->by, + (oh4 * 3 + 3) >> 2, a_r->mv.mv[0].y, ss_ver, + &f->svc[a_r->ref.ref[0] - 1][1]); + i++; + } + x += imax(a_b_dim[0], 2); + } + } + + if (t->bx > t->ts->tiling.col_start) + for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) { + // only odd blocks are considered for overlap handling, hence +1 + const refmvs_block *const l_r = &r[y + 1][t->bx - 1]; + const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs]; + + if (l_r->ref.ref[0] > 0) { + const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]); + mc_lowest_px(&dst[l_r->ref.ref[0] - 1][is_chroma], + t->by + y, oh4, l_r->mv.mv[0].y, ss_ver, + &f->svc[l_r->ref.ref[0] - 1][1]); + i++; + } + y += imax(l_b_dim[1], 2); + } +} + +static int decode_b(Dav1dTaskContext *const t, const enum BlockLevel bl, const enum BlockSize bs, const enum BlockPartition bp, @@ -762,7 +848,7 @@ { Dav1dTileState *const ts = t->ts; const Dav1dFrameContext *const f = t->f; - Av1Block b_mem, *const b = f->frame_thread.pass ? + Av1Block b_mem, *const b = t->frame_thread.pass ? 
&f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem; const uint8_t *const b_dim = dav1d_block_dimensions[bs]; const int bx4 = t->bx & 31, by4 = t->by & 31; @@ -778,7 +864,7 @@ (bw4 > ss_hor || t->bx & 1) && (bh4 > ss_ver || t->by & 1); - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { if (b->intra) { f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b); @@ -1156,6 +1242,7 @@ if (DEBUG_BLOCK_INFO) printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng); + b->uv_angle = 0; if (b->uv_mode == CFL_PRED) { #define SIGN(a) (!!(a) + ((a) > 0)) const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac, @@ -1188,8 +1275,6 @@ uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED]; const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6); b->uv_angle = angle - 3; - } else { - b->uv_angle = 0; } } @@ -1236,10 +1321,11 @@ if (b->pal_sz[0]) { uint8_t *pal_idx; - if (f->frame_thread.pass) { - assert(ts->frame_thread.pal_idx); - pal_idx = ts->frame_thread.pal_idx; - ts->frame_thread.pal_idx += bw4 * bh4 * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + assert(ts->frame_thread[p].pal_idx); + pal_idx = ts->frame_thread[p].pal_idx; + ts->frame_thread[p].pal_idx += bw4 * bh4 * 16; } else pal_idx = t->scratch.pal_idx; read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4); @@ -1249,10 +1335,11 @@ if (has_chroma && b->pal_sz[1]) { uint8_t *pal_idx; - if (f->frame_thread.pass) { - assert(ts->frame_thread.pal_idx); - pal_idx = ts->frame_thread.pal_idx; - ts->frame_thread.pal_idx += cbw4 * cbh4 * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + assert(ts->frame_thread[p].pal_idx); + pal_idx = ts->frame_thread[p].pal_idx; + ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16; } else pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4); @@ -1284,7 +1371,7 @@ } // reconstruction - if (f->frame_thread.pass == 1) { + if (t->frame_thread.pass == 1) { f->bd_fn.read_coef_blocks(t, bs, b); } else { f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b); @@ -1328,7 +1415,7 @@ case_set(bw4, a->, 0, bx4); #undef set_ctx if (b->pal_sz[0]) { - uint16_t *const pal = f->frame_thread.pass ? + uint16_t *const pal = t->frame_thread.pass ? f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0]; for (int x = 0; x < bw4; x++) @@ -1343,7 +1430,7 @@ case_set(cbw4, a->, 0, cbx4); #undef set_ctx if (b->pal_sz[1]) { - const uint16_t (*const pal)[8] = f->frame_thread.pass ? + const uint16_t (*const pal)[8] = t->frame_thread.pass ? 
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] : t->scratch.pal; @@ -1446,7 +1533,7 @@ read_vartx_tree(t, b, bs, bx4, by4); // reconstruction - if (f->frame_thread.pass == 1) { + if (t->frame_thread.pass == 1) { f->bd_fn.read_coef_blocks(t, bs, b); b->filter2d = FILTER_2D_BILINEAR; } else { @@ -1910,7 +1997,7 @@ signabs(t->warpmv.u.p.delta), b->mv[0].y, b->mv[0].x); #undef signabs - if (f->frame_thread.pass) { + if (t->frame_thread.pass) { if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) { b->matrix[0] = t->warpmv.matrix[2] - 0x10000; b->matrix[1] = t->warpmv.matrix[3]; @@ -1923,8 +2010,8 @@ } if (DEBUG_BLOCK_INFO) - printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIu64 "x/0x%" - PRIu64 "x]\n", b->motion_mode, ts->msac.rng, mask[0], + printf("Post-motionmode[%d]: r=%d [mask: 0x%" PRIx64 "/0x%" + PRIx64 "]\n", b->motion_mode, ts->msac.rng, mask[0], mask[1]); } else { b->motion_mode = MM_TRANSLATION; @@ -1970,7 +2057,7 @@ read_vartx_tree(t, b, bs, bx4, by4); // reconstruction - if (f->frame_thread.pass == 1) { + if (t->frame_thread.pass == 1) { f->bd_fn.read_coef_blocks(t, bs, b); } else { if (f->bd_fn.recon_b_inter(t, bs, b)) return -1; @@ -2052,6 +2139,110 @@ } } + if (t->frame_thread.pass == 1 && !b->intra && IS_INTER_OR_SWITCH(f->frame_hdr)) { + const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift; + int (*const lowest_px)[2] = ts->lowest_pixel[sby]; + + // keep track of motion vectors for each reference + if (b->comp_type == COMP_INTER_NONE) { + // y + if (imin(bw4, bh4) > 1 && + ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || + (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) + { + affine_lowest_px(t, &lowest_px[b->ref[0]][0], 0, b_dim, + b->motion_mode == MM_WARP ? &t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); + } else { + mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y, + 0, &f->svc[b->ref[0]][1]); + if (b->motion_mode == MM_OBMC) { + obmc_lowest_px(t, lowest_px, 0, b_dim, bx4, by4, w4, h4); + } + } + + // uv + if (has_chroma) { + // sub8x8 derivation + int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver; + refmvs_block *const *r; + if (is_sub8x8) { + assert(ss_hor == 1); + r = &t->rt.r[(t->by & 31) + 5]; + if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0; + if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0; + if (bw4 == 1 && bh4 == ss_ver) + is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0; + } + + // chroma prediction + if (is_sub8x8) { + assert(ss_hor == 1); + if (bw4 == 1 && bh4 == ss_ver) { + const refmvs_block *const rr = &r[-1][t->bx - 1]; + mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1], + t->by - 1, bh4, rr->mv.mv[0].y, ss_ver, + &f->svc[rr->ref.ref[0] - 1][1]); + } + if (bw4 == 1) { + const refmvs_block *const rr = &r[0][t->bx - 1]; + mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1], + t->by, bh4, rr->mv.mv[0].y, ss_ver, + &f->svc[rr->ref.ref[0] - 1][1]); + } + if (bh4 == ss_ver) { + const refmvs_block *const rr = &r[-1][t->bx]; + mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1], + t->by - 1, bh4, rr->mv.mv[0].y, ss_ver, + &f->svc[rr->ref.ref[0] - 1][1]); + } + mc_lowest_px(&lowest_px[b->ref[0]][1], t->by, bh4, + b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]); + } else { + if (imin(cbw4, cbh4) > 1 && + ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || + (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) + { + affine_lowest_px(t, &lowest_px[b->ref[0]][1], 1, b_dim, + b->motion_mode == MM_WARP ? 
&t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); + } else { + mc_lowest_px(&lowest_px[b->ref[0]][1], + t->by & ~ss_ver, bh4 << (bh4 == ss_ver), + b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]); + if (b->motion_mode == MM_OBMC) { + obmc_lowest_px(t, lowest_px, 1, b_dim, bx4, by4, w4, h4); + } + } + } + } + } else { + // y + for (int i = 0; i < 2; i++) { + if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) { + affine_lowest_px(t, &lowest_px[b->ref[i]][0], 0, b_dim, + &f->frame_hdr->gmv[b->ref[i]]); + } else { + mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4, + b->mv[i].y, 0, &f->svc[b->ref[i]][1]); + } + } + + // uv + if (has_chroma) for (int i = 0; i < 2; i++) { + if (b->inter_mode == GLOBALMV_GLOBALMV && + imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]]) + { + affine_lowest_px(t, &lowest_px[b->ref[i]][1], 1, b_dim, + &f->frame_hdr->gmv[b->ref[i]]); + } else { + mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4, + b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]); + } + } + } + } + return 0; } @@ -2059,7 +2250,7 @@ #include -static int checked_decode_b(Dav1dTileContext *const t, +static int checked_decode_b(Dav1dTaskContext *const t, const enum BlockLevel bl, const enum BlockSize bs, const enum BlockPartition bp, @@ -2068,7 +2259,7 @@ const Dav1dFrameContext *const f = t->f; const int err = decode_b(t, bl, bs, bp, intra_edge_flags); - if (err == 0 && !(f->frame_thread.pass & 1)) { + if (err == 0 && !(t->frame_thread.pass & 1)) { const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; const uint8_t *const b_dim = dav1d_block_dimensions[bs]; @@ -2108,10 +2299,11 @@ #endif /* defined(__has_feature) */ -static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, +static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, const EdgeNode *const node) { const Dav1dFrameContext *const f = t->f; + Dav1dTileState *const ts = t->ts; const int hsz = 16 >> bl; const int have_h_split = f->bw > t->bx + hsz; const int have_v_split = f->bh > t->by + hsz; @@ -2124,22 +2316,22 @@ uint16_t *pc; enum BlockPartition bp; int ctx, bx8, by8; - if (f->frame_thread.pass != 2) { + if (t->frame_thread.pass != 2) { if (0 && bl == BL_64X64) printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n", - f->frame_hdr->frame_offset, t->by, t->bx, bl, t->ts->msac.rng); + f->frame_hdr->frame_offset, t->by, t->bx, bl, ts->msac.rng); bx8 = (t->bx & 31) >> 1; by8 = (t->by & 31) >> 1; ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8); - pc = t->ts->cdf.m.partition[bl][ctx]; + pc = ts->cdf.m.partition[bl][ctx]; } if (have_h_split && have_v_split) { - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; bp = b->bl == bl ? b->bp : PARTITION_SPLIT; } else { - bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, + bp = dav1d_msac_decode_symbol_adapt16(&ts->msac, pc, dav1d_partition_type_count[bl]); if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && (bp == PARTITION_V || bp == PARTITION_V4 || @@ -2150,7 +2342,7 @@ if (DEBUG_BLOCK_INFO) printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n", f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp, - t->ts->msac.rng); + ts->msac.rng); } const uint8_t *const b = dav1d_block_sizes[bl][bp]; @@ -2195,6 +2387,16 @@ return -1; t->bx--; t->by--; +#if ARCH_X86_64 + if (t->frame_thread.pass) { + /* In 8-bit mode with 2-pass decoding the coefficient buffer + * can end up misaligned due to skips here. 
Work around + * the issue by explicitly realigning the buffer. */ + const int p = t->frame_thread.pass & 1; + ts->frame_thread[p].cf = + (void*)(((uintptr_t)ts->frame_thread[p].cf + 63) & ~63); + } +#endif } else { const EdgeBranch *const branch = (const EdgeBranch *) node; if (decode_sb(t, bl + 1, branch->split[0])) @@ -2307,16 +2509,16 @@ } } else if (have_h_split) { unsigned is_split; - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; is_split = b->bl != bl; } else { - is_split = dav1d_msac_decode_bool(&t->ts->msac, + is_split = dav1d_msac_decode_bool(&ts->msac, gather_top_partition_prob(pc, bl)); if (DEBUG_BLOCK_INFO) printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n", f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, - is_split ? PARTITION_SPLIT : PARTITION_H, t->ts->msac.rng); + is_split ? PARTITION_SPLIT : PARTITION_H, ts->msac.rng); } assert(bl < BL_8X8); @@ -2336,18 +2538,18 @@ } else { assert(have_v_split); unsigned is_split; - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; is_split = b->bl != bl; } else { - is_split = dav1d_msac_decode_bool(&t->ts->msac, + is_split = dav1d_msac_decode_bool(&ts->msac, gather_left_partition_prob(pc, bl)); if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split) return 1; if (DEBUG_BLOCK_INFO) printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n", f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, - is_split ? PARTITION_SPLIT : PARTITION_V, t->ts->msac.rng); + is_split ? PARTITION_SPLIT : PARTITION_V, ts->msac.rng); } assert(bl < BL_8X8); @@ -2366,7 +2568,7 @@ } } - if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) { + if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) { #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \ rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp]) @@ -2426,14 +2628,15 @@ const int sb_shift = f->sb_shift; const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout]; - ts->frame_thread.pal_idx = f->frame_thread.pal_idx ? - &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] : - NULL; - - ts->frame_thread.cf = f->frame_thread.cf ? - (uint8_t*)f->frame_thread.cf + - (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : - NULL; + for (int p = 0; p < 2; p++) { + ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ? + &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] : + NULL; + ts->frame_thread[p].cf = f->frame_thread.cf ? 
+ (uint8_t*)f->frame_thread.cf + + (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : + NULL; + } dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf); ts->last_qidx = f->frame_hdr->quant.yac; @@ -2488,11 +2691,13 @@ ts->lr_ref[p]->sgr_weights[1] = 31; } - if (f->n_tc > 1) - atomic_init(&ts->progress, row_sb_start); + if (f->c->n_tc > 1) { + for (int p = 0; p < 2; p++) + atomic_init(&ts->progress[p], row_sb_start); + } } -static void read_restoration_info(Dav1dTileContext *const t, +static void read_restoration_info(Dav1dTaskContext *const t, Av1RestorationUnit *const lr, const int p, const enum Dav1dRestorationType frame_type) { @@ -2558,7 +2763,7 @@ } } -int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { +int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { const Dav1dFrameContext *const f = t->f; const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64; Dav1dTileState *const ts = t->ts; @@ -2572,13 +2777,22 @@ dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start, ts->tiling.col_end, ts->tiling.row_start, ts->tiling.row_end, t->by >> f->sb_shift, - ts->tiling.row); + ts->tiling.row, t->frame_thread.pass); + } + + if (IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) { + const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift; + int (*const lowest_px)[2] = ts->lowest_pixel[sby]; + for (int n = 0; n < 7; n++) + for (int m = 0; m < 2; m++) + lowest_px[n][m] = INT_MIN; } - reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass); - if (f->frame_thread.pass == 2) { + reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), t->frame_thread.pass); + if (t->frame_thread.pass == 2) { + const int off_2pass = c->n_tc > 1 ? f->sb128w * f->frame_hdr->tiling.rows : 0; for (t->bx = ts->tiling.col_start, - t->a = f->a + col_sb128_start + tile_row * f->sb128w; + t->a = f->a + off_2pass + col_sb128_start + tile_row * f->sb128w; t->bx < ts->tiling.col_end; t->bx += sb_step) { if (atomic_load_explicit(c->flush, memory_order_acquire)) @@ -2595,13 +2809,7 @@ // error out on symbol decoder overread if (ts->msac.cnt < -15) return 1; - if (f->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { - if (c->n_fc > 1) for (int n = 0; n < 7; n++) - if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step), - PLANE_TYPE_BLOCK)) - { - return 1; - } + if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); @@ -2686,14 +2894,14 @@ } } - if (f->seq_hdr->ref_frame_mvs && f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { + if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { dav1d_refmvs_save_tmvs(&t->rt, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); } // backup pre-loopfilter pixels for intra prediction of the next sbrow - if (f->frame_thread.pass != 1) + if (t->frame_thread.pass != 1) f->bd_fn.backup_ipred_edge(t); // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix" @@ -2709,50 +2917,24 @@ return 0; } -int dav1d_decode_frame(Dav1dFrameContext *const f) { +int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const Dav1dContext *const c = f->c; int retval = DAV1D_ERR(ENOMEM); - if (f->n_tc > 1) { - const int titsati_sz = f->frame_hdr->tiling.cols * f->sbh; - if (titsati_sz != f->tile_thread.titsati_sz) { - freep(&f->tile_thread.task_idx_to_sby_and_tile_idx); - f->tile_thread.task_idx_to_sby_and_tile_idx = - 
malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) * - titsati_sz); - if (!f->tile_thread.task_idx_to_sby_and_tile_idx) { - f->tile_thread.titsati_sz = 0; - goto error; - } - f->tile_thread.titsati_sz = titsati_sz; - } - if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols || - f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows || - memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows, - sizeof(*f->tile_thread.titsati_index_rows) * - (f->frame_hdr->tiling.rows + 1))) - { - for (int tile_row = 0, task_idx = 0; - tile_row < f->frame_hdr->tiling.rows; tile_row++) - { - for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; - sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) - { - for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; - tile_col++, task_idx++) - { - f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0] = sby; - f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1] = - tile_row * f->frame_hdr->tiling.cols + tile_col; - } - } - } - f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols; - f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows; - memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb, - sizeof(*f->tile_thread.titsati_index_rows) * - (f->frame_hdr->tiling.rows + 1)); + if (f->sbh > f->lf.start_of_tile_row_sz) { + free(f->lf.start_of_tile_row); + f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t)); + if (!f->lf.start_of_tile_row) { + f->lf.start_of_tile_row_sz = 0; + goto error; } + f->lf.start_of_tile_row_sz = f->sbh; + } + int sby = 0; + for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { + f->lf.start_of_tile_row[sby++] = tile_row; + while (sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]) + f->lf.start_of_tile_row[sby++] = 0; } const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; @@ -2762,45 +2944,17 @@ f->frame_thread.tile_start_off = malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts); if (!f->frame_thread.tile_start_off) { - for (int n = 0; n < f->n_ts; n++) { - Dav1dTileState *const ts = &f->ts[n]; - pthread_cond_destroy(&ts->tile_thread.cond); - pthread_mutex_destroy(&ts->tile_thread.lock); - } f->n_ts = 0; goto error; } } - Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32); - if (!ts_new) goto error; - if (n_ts > f->n_ts) { - if (f->ts) { - memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts); - dav1d_free_aligned(f->ts); - } - f->ts = ts_new; - for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) { - Dav1dTileState *const ts = &f->ts[n]; - if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error; - if (pthread_cond_init(&ts->tile_thread.cond, NULL)) { - pthread_mutex_destroy(&ts->tile_thread.lock); - goto error; - } - } - } else { - for (int n = n_ts; n < f->n_ts; n++) { - Dav1dTileState *const ts = &f->ts[n]; - pthread_cond_destroy(&ts->tile_thread.cond); - pthread_mutex_destroy(&ts->tile_thread.lock); - } - memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts); - dav1d_free_aligned(f->ts); - f->n_ts = n_ts; - f->ts = ts_new; - } + dav1d_free_aligned(f->ts); + f->ts = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32); + if (!f->ts) goto error; + f->n_ts = n_ts; } - const int a_sz = f->sb128w * f->frame_hdr->tiling.rows; + const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1)); if (a_sz != f->a_sz) { freep(&f->a); f->a = malloc(sizeof(*f->a) * a_sz); @@ -2827,11 +2981,34 @@ } } + const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh; + 
if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) { + free(f->tile_thread.lowest_pixel_mem); + f->tile_thread.lowest_pixel_mem = + malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem)); + if (!f->tile_thread.lowest_pixel_mem) { + f->tile_thread.lowest_pixel_mem_sz = 0; + goto error; + } + f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz; + } + int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem; + for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.rows; + tile_row++, tile_row_base += f->frame_hdr->tiling.cols) + { + const int tile_row_sb_h = f->frame_hdr->tiling.row_start_sb[tile_row + 1] - + f->frame_hdr->tiling.row_start_sb[tile_row]; + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { + f->ts[tile_row_base + tile_col].lowest_pixel = lowest_pixel_ptr; + lowest_pixel_ptr += tile_row_sb_h; + } + } + const int cf_sz = (num_sb128 * size_mul[0]) << hbd; if (cf_sz != f->frame_thread.cf_sz) { dav1d_freep_aligned(&f->frame_thread.cf); f->frame_thread.cf = - dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 32); + dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 64); if (!f->frame_thread.cf) { f->frame_thread.cf_sz = 0; goto error; @@ -2845,7 +3022,7 @@ dav1d_freep_aligned(&f->frame_thread.pal); f->frame_thread.pal = dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) * - num_sb128 * 16 * 16, 32); + num_sb128 * 16 * 16, 64); if (!f->frame_thread.pal) { f->frame_thread.pal_sz = 0; goto error; @@ -2858,7 +3035,7 @@ dav1d_freep_aligned(&f->frame_thread.pal_idx); f->frame_thread.pal_idx = dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) * - pal_idx_sz * 128 * 128 / 4, 32); + pal_idx_sz * 128 * 128 / 4, 64); if (!f->frame_thread.pal_idx) { f->frame_thread.pal_idx_sz = 0; goto error; @@ -2873,33 +3050,38 @@ } // update allocation of block contexts for above - const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1]; - if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) { + ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1]; + const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; + const int need_cdef_lpf_copy = c->n_tc > 1 && has_resize; + if (y_stride * f->sbh * 4 != f->lf.cdef_buf_plane_sz[0] || + uv_stride * f->sbh * 8 != f->lf.cdef_buf_plane_sz[1] || + need_cdef_lpf_copy != f->lf.need_cdef_lpf_copy || + f->sbh != f->lf.cdef_buf_sbh) + { dav1d_free_aligned(f->lf.cdef_line_buf); size_t alloc_sz = 64; - alloc_sz += (y_stride < 0 ? -y_stride : y_stride ) * 4; - alloc_sz += (uv_stride < 0 ? 
-uv_stride : uv_stride) * 8; + alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy; + alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy; uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32); if (!ptr) { - f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0; + f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0; goto error; } ptr += 32; if (y_stride < 0) { - f->lf.cdef_line[0][0] = ptr - y_stride * 1; - f->lf.cdef_line[1][0] = ptr - y_stride * 3; - ptr -= y_stride * 4; + f->lf.cdef_line[0][0] = ptr - y_stride * (f->sbh * 4 - 1); + f->lf.cdef_line[1][0] = ptr - y_stride * (f->sbh * 4 - 3); } else { f->lf.cdef_line[0][0] = ptr + y_stride * 0; f->lf.cdef_line[1][0] = ptr + y_stride * 2; - ptr += y_stride * 4; } + ptr += llabs(y_stride) * f->sbh * 4; if (uv_stride < 0) { - f->lf.cdef_line[0][1] = ptr - uv_stride * 1; - f->lf.cdef_line[0][2] = ptr - uv_stride * 3; - f->lf.cdef_line[1][1] = ptr - uv_stride * 5; - f->lf.cdef_line[1][2] = ptr - uv_stride * 7; + f->lf.cdef_line[0][1] = ptr - uv_stride * (f->sbh * 8 - 1); + f->lf.cdef_line[0][2] = ptr - uv_stride * (f->sbh * 8 - 3); + f->lf.cdef_line[1][1] = ptr - uv_stride * (f->sbh * 8 - 5); + f->lf.cdef_line[1][2] = ptr - uv_stride * (f->sbh * 8 - 7); } else { f->lf.cdef_line[0][1] = ptr + uv_stride * 0; f->lf.cdef_line[0][2] = ptr + uv_stride * 2; @@ -2907,27 +3089,61 @@ f->lf.cdef_line[1][2] = ptr + uv_stride * 6; } - f->lf.cdef_line_sz[0] = (int) y_stride; - f->lf.cdef_line_sz[1] = (int) uv_stride; + if (need_cdef_lpf_copy) { + ptr += llabs(uv_stride) * f->sbh * 8; + if (y_stride < 0) + f->lf.cdef_lpf_line[0] = ptr - y_stride * (f->sbh * 4 - 1); + else + f->lf.cdef_lpf_line[0] = ptr; + ptr += llabs(y_stride) * f->sbh * 4; + if (uv_stride < 0) { + f->lf.cdef_lpf_line[1] = ptr - uv_stride * (f->sbh * 4 - 1); + f->lf.cdef_lpf_line[2] = ptr - uv_stride * (f->sbh * 8 - 1); + } else { + f->lf.cdef_lpf_line[1] = ptr; + f->lf.cdef_lpf_line[2] = ptr + uv_stride * f->sbh * 4; + } + } + + f->lf.cdef_buf_plane_sz[0] = (int) y_stride * f->sbh * 4; + f->lf.cdef_buf_plane_sz[1] = (int) uv_stride * f->sbh * 8; + f->lf.need_cdef_lpf_copy = need_cdef_lpf_copy; + f->lf.cdef_buf_sbh = f->sbh; } - const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd; - if (lr_line_sz != f->lf.lr_line_sz) { - dav1d_freep_aligned(&f->lf.lr_lpf_line[0]); - const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12; + const int sb128 = f->seq_hdr->sb128; + const int num_lines = c->n_tc > 1 ? 
f->sbh * 4 << sb128 : 12; + y_stride = f->sr_cur.p.stride[0], uv_stride = f->sr_cur.p.stride[1]; + if (y_stride * num_lines != f->lf.lr_buf_plane_sz[0] || + uv_stride * num_lines * 2 != f->lf.lr_buf_plane_sz[1]) + { + dav1d_free_aligned(f->lf.lr_line_buf); // lr simd may overread the input, so slightly over-allocate the lpf buffer - uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3 + 64, 32); - if (!lr_ptr) { - f->lf.lr_line_sz = 0; + size_t alloc_sz = 128; + alloc_sz += (size_t)llabs(y_stride) * num_lines; + alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2; + uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(alloc_sz, 64); + if (!ptr) { + f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0; goto error; } - for (int pl = 0; pl <= 2; pl++) { - f->lf.lr_lpf_line[pl] = lr_ptr; - lr_ptr += lr_line_sz * num_lines; + ptr += 64; + if (y_stride < 0) + f->lf.lr_lpf_line[0] = ptr - y_stride * (num_lines - 1); + else + f->lf.lr_lpf_line[0] = ptr; + ptr += llabs(y_stride) * num_lines; + if (uv_stride < 0) { + f->lf.lr_lpf_line[1] = ptr - uv_stride * (num_lines * 1 - 1); + f->lf.lr_lpf_line[2] = ptr - uv_stride * (num_lines * 2 - 1); + } else { + f->lf.lr_lpf_line[1] = ptr; + f->lf.lr_lpf_line[2] = ptr + uv_stride * num_lines; } - f->lf.lr_line_sz = lr_line_sz; + f->lf.lr_buf_plane_sz[0] = (int) y_stride * num_lines; + f->lf.lr_buf_plane_sz[1] = (int) uv_stride * num_lines * 2; } // update allocation for loopfilter masks @@ -2983,7 +3199,7 @@ if (ipred_edge_sz != f->ipred_edge_sz) { dav1d_freep_aligned(&f->ipred_edge[0]); uint8_t *ptr = f->ipred_edge[0] = - dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 32); + dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 64); if (!ptr) { f->ipred_edge_sz = 0; goto error; @@ -3009,17 +3225,11 @@ if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { const int ret = dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr, - f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc); + f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, + f->c->n_tc, f->c->n_fc); if (ret < 0) goto error; } - // create post-filtering tasks - if (c->n_pfc > 1) - if (dav1d_task_create_filter_sbrow(f)) - goto error; - - retval = DAV1D_ERR(EINVAL); - // setup dequant tables init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq); if (f->frame_hdr->quant.qm) @@ -3081,14 +3291,22 @@ f->lf.sr_p[0] = f->sr_cur.p.data[0]; f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0]; f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0]; - f->lf.tile_row = 1; - dav1d_cdf_thread_wait(&f->in_cdf); + retval = 0; +error: + return retval; +} + +int dav1d_decode_frame_init_cdf(Dav1dFrameContext *const f) { + const Dav1dContext *const c = f->c; + int retval = DAV1D_ERR(EINVAL); + if (f->frame_hdr->refresh_context) dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf); // parse individual tiles per tile group - int update_set = 0, tile_row = 0, tile_col = 0; + int tile_row = 0, tile_col = 0; + f->task_thread.update_set = 0; for (int i = 0; i < f->n_tile_data; i++) { const uint8_t *data = f->tile[i].data.data; size_t size = f->tile[i].data.sz; @@ -3115,213 +3333,79 @@ tile_row++; } if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context) - update_set = 1; + f->task_thread.update_set = 1; data += tile_sz; size -= tile_sz; } } - // 2-pass decoding: - // - enabled for frame-threading, so that one frame can do symbol parsing - // as another (or multiple) are doing reconstruction. 
One advantage here - // is that although reconstruction is limited by reference availability, - // symbol parsing is not. Therefore, symbol parsing can effectively use - // row and col tile threading, but reconstruction only col tile threading; - // - pass 0 means no 2-pass; - // - pass 1 means symbol parsing only; - // - pass 2 means reconstruction and loop filtering. - - const int uses_2pass = c->n_fc > 1 && f->frame_hdr->refresh_context; - for (f->frame_thread.pass = uses_2pass; - f->frame_thread.pass <= 2 * uses_2pass; f->frame_thread.pass++) - { - const enum PlaneType progress_plane_type = - f->frame_thread.pass == 0 ? PLANE_TYPE_ALL : - f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y; - - for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++) - reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass); - - if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) { - Dav1dTileContext *const t = f->tc; - - // no tile threading - we explicitly interleave tile/sbrow decoding - // and post-filtering, so that the full process runs in-line, so - // that frame threading is still possible - for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { - const int sbh_end = - imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh); - for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; - sby < sbh_end; sby++) - { - t->by = sby << (4 + f->seq_hdr->sb128); - const int by_end = (t->by + f->sb_step) >> 1; - if (f->frame_thread.pass <= 1 && f->frame_hdr->use_ref_frame_mvs) { - if (c->n_fc > 1) for (int n = 0; n < 7; n++) - if (dav1d_thread_picture_wait(&f->refp[n], - 4 * (t->by + f->sb_step), - PLANE_TYPE_BLOCK)) - { - goto error; - } - dav1d_refmvs_load_tmvs(&f->rf, tile_row, - 0, f->bw >> 1, t->by >> 1, by_end); - } - for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { - t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; - if (dav1d_decode_tile_sbrow(t)) goto error; - } - if (f->seq_hdr->ref_frame_mvs && f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { - dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); - } + if (c->n_tc > 1) { + const int uses_2pass = c->n_fc > 1; + for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows * (1 + uses_2pass); n++) + reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), + uses_2pass ? 1 + (n >= f->sb128w * f->frame_hdr->tiling.rows) : 0); + } - // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) { - if (c->n_pfc == 1) - f->bd_fn.filter_sbrow(f, sby); - else { - pthread_mutex_lock(&f->lf.thread.pftd->lock); - if (f->lf.thread.npf != 0 && !f->lf.thread.done) { - Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; - t->start = 1; - if (t->status == DAV1D_TASK_READY) - dav1d_task_schedule(f->lf.thread.pftd, t); - } - pthread_mutex_unlock(&f->lf.thread.pftd->lock); - } - } - if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) - dav1d_thread_picture_signal(&f->sr_cur, - (sby + 1) * f->sb_step * 4, - progress_plane_type); - } - } - } else { - // signal available tasks to worker threads - int num_tasks; - - pthread_mutex_lock(&f->tile_thread.lock); - assert(!f->tile_thread.tasks_left); - if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) { - // we can (or in fact, if >, we need to) do full tile decoding. 
- // loopfilter happens below - num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; - } else { - // we need to interleave sbrow decoding for all tile cols in a - // tile row, since otherwise subsequent threads will be blocked - // waiting for the post-filter to complete - num_tasks = f->sbh * f->frame_hdr->tiling.cols; - } - f->tile_thread.num_tasks = f->tile_thread.tasks_left = num_tasks; - pthread_cond_broadcast(&f->tile_thread.cond); - pthread_mutex_unlock(&f->tile_thread.lock); - - for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { - for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; - sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) - { - for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; - tile_col++) - { - int progress; - Dav1dTileState *const ts = - &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; - - if ((progress = atomic_load(&ts->progress)) <= sby) { - pthread_mutex_lock(&ts->tile_thread.lock); - while ((progress = atomic_load(&ts->progress)) <= sby) - pthread_cond_wait(&ts->tile_thread.cond, - &ts->tile_thread.lock); - pthread_mutex_unlock(&ts->tile_thread.lock); - } - if (progress == TILE_ERROR) { - dav1d_thread_picture_signal(&f->sr_cur, FRAME_ERROR, - PLANE_TYPE_ALL); - const uint64_t all_mask = ~0ULL >> (64 - f->n_tc); - pthread_mutex_lock(&f->tile_thread.lock); - while (f->tile_thread.available != all_mask) - pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock); - pthread_mutex_unlock(&f->tile_thread.lock); - goto error; - } - } + retval = 0; +error: + return retval; +} - // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) { - if (c->n_pfc == 1) - f->bd_fn.filter_sbrow(f, sby); - else { - pthread_mutex_lock(&f->lf.thread.pftd->lock); - if (f->lf.thread.npf != 0 && !f->lf.thread.done) { - Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; - t->start = 1; - if (t->status == DAV1D_TASK_READY) - dav1d_task_schedule(f->lf.thread.pftd, t); - } - pthread_mutex_unlock(&f->lf.thread.pftd->lock); - } - } - if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) - dav1d_thread_picture_signal(&f->sr_cur, - (sby + 1) * f->sb_step * 4, - progress_plane_type); - } - } +int dav1d_decode_frame_main(Dav1dFrameContext *const f) { + const Dav1dContext *const c = f->c; + int retval = DAV1D_ERR(EINVAL); - const uint64_t all_mask = ~0ULL >> (64 - f->n_tc); - pthread_mutex_lock(&f->tile_thread.lock); - while (f->tile_thread.available != all_mask) - pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock); - pthread_mutex_unlock(&f->tile_thread.lock); - } + assert(f->c->n_tc == 1); - if (f->frame_thread.pass <= 1 && f->frame_hdr->refresh_context) { - // cdf update - if (update_set) - dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf, - &f->ts[f->frame_hdr->tiling.update].cdf); - dav1d_cdf_thread_signal(&f->out_cdf); - } - if (f->frame_thread.pass == 1) { - assert(c->n_fc > 1); - for (int tile_idx = 0; - tile_idx < f->frame_hdr->tiling.rows * f->frame_hdr->tiling.cols; - tile_idx++) - { - Dav1dTileState *const ts = &f->ts[tile_idx]; - const size_t tile_start_off = - (size_t) f->frame_thread.tile_start_off[tile_idx]; - ts->frame_thread.pal_idx = f->frame_thread.pal_idx ? - &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] : - NULL; - ts->frame_thread.cf = f->frame_thread.cf ? 
- (uint8_t*)f->frame_thread.cf + - ((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : - NULL; - if (f->n_tc > 0) { - const unsigned row_sb_start = - f->frame_hdr->tiling.row_start_sb[ts->tiling.row]; - atomic_init(&ts->progress, row_sb_start); - } + Dav1dTaskContext *const t = &c->tc[f - c->fc]; + t->f = f; + t->frame_thread.pass = 0; + + for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++) + reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), 0); + + // no threading - we explicitly interleave tile/sbrow decoding + // and post-filtering, so that the full process runs in-line + for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { + const int sbh_end = + imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh); + for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; + sby < sbh_end; sby++) + { + t->by = sby << (4 + f->seq_hdr->sb128); + const int by_end = (t->by + f->sb_step) >> 1; + if (f->frame_hdr->use_ref_frame_mvs) { + dav1d_refmvs_load_tmvs(&f->rf, tile_row, + 0, f->bw >> 1, t->by >> 1, by_end); } + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { + t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; + if (dav1d_decode_tile_sbrow(t)) goto error; + } + if (IS_INTER_OR_SWITCH(f->frame_hdr)) { + dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); + } + + // loopfilter + cdef + restoration + f->bd_fn.filter_sbrow(f, sby); } } retval = 0; error: - if (c->n_pfc > 1) { - pthread_mutex_lock(&f->lf.thread.pftd->lock); - if (!f->lf.thread.done) { - if (retval != 0) { - f->lf.thread.done = -1; - pthread_cond_signal(&f->lf.thread.pftd->cond); - } - pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock); - } - pthread_mutex_unlock(&f->lf.thread.pftd->lock); + return retval; +} + +void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { + const Dav1dContext *const c = f->c; + + if (f->sr_cur.p.data[0]) + atomic_init(&f->task_thread.error, 0); + + if (c->n_fc > 1 && retval && f->frame_thread.cf) { + memset(f->frame_thread.cf, 0, + (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); } - dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR, - PLANE_TYPE_ALL); for (int i = 0; i < 7; i++) { if (f->refp[i].p.data[0]) dav1d_thread_picture_unref(&f->refp[i]); @@ -3331,8 +3415,9 @@ dav1d_picture_unref_internal(&f->cur); dav1d_thread_picture_unref(&f->sr_cur); dav1d_cdf_thread_unref(&f->in_cdf); - if (f->frame_hdr->refresh_context) { - dav1d_cdf_thread_signal(&f->out_cdf); + if (f->frame_hdr && f->frame_hdr->refresh_context) { + if (f->out_cdf.progress) + atomic_store(f->out_cdf.progress, retval == 0 ? 1 : TILE_ERROR); dav1d_cdf_thread_unref(&f->out_cdf); } dav1d_ref_dec(&f->cur_segmap_ref); @@ -3343,8 +3428,43 @@ for (int i = 0; i < f->n_tile_data; i++) dav1d_data_unref_internal(&f->tile[i].data); + f->task_thread.retval = retval; +} - return retval; +int dav1d_decode_frame(Dav1dFrameContext *const f) { + assert(f->c->n_fc == 1); + // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task + // threads also. Not sure it makes a measurable difference. 
+ int res = dav1d_decode_frame_init(f); + if (!res) res = dav1d_decode_frame_init_cdf(f); + // wait until all threads have completed + if (!res) { + if (f->c->n_tc > 1) { + pthread_mutex_lock(&f->task_thread.ttd->lock); + res = dav1d_task_create_tile_sbrow(f, 0, 1); + if (!res) { + const int uses_2pass = f->c->n_fc > 1; + while (!f->task_thread.done[0] || + (uses_2pass && !f->task_thread.done[1]) || + f->task_thread.task_counter > 0) + { + pthread_cond_wait(&f->task_thread.cond, + &f->task_thread.ttd->lock); + } + } + pthread_mutex_unlock(&f->task_thread.ttd->lock); + res = f->task_thread.retval; + } else { + res = dav1d_decode_frame_main(f); + if (!res && f->frame_hdr->refresh_context && f->task_thread.update_set) { + dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf, + &f->ts[f->frame_hdr->tiling.update].cdf); + } + } + } + dav1d_decode_frame_exit(f, res); + f->n_tile_data = 0; + return res; } static int get_upscale_x0(const int in_w, const int out_w, const int step) { @@ -3360,21 +3480,37 @@ // wait for c->out_delayed[next] and move into c->out if visible Dav1dThreadPicture *out_delayed; if (c->n_fc > 1) { + pthread_mutex_lock(&c->task_thread.lock); const unsigned next = c->frame_thread.next++; if (c->frame_thread.next == c->n_fc) c->frame_thread.next = 0; f = &c->fc[next]; - pthread_mutex_lock(&f->frame_thread.td.lock); while (f->n_tile_data > 0) - pthread_cond_wait(&f->frame_thread.td.cond, - &f->frame_thread.td.lock); + pthread_cond_wait(&f->task_thread.cond, + &c->task_thread.lock); out_delayed = &c->frame_thread.out_delayed[next]; - if (out_delayed->p.data[0]) { + if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) { + if (atomic_load(&c->task_thread.first) + 1U < c->n_fc) + atomic_fetch_add(&c->task_thread.first, 1U); + else + atomic_store(&c->task_thread.first, 0); + if (c->task_thread.cur && c->task_thread.cur < c->n_fc) + c->task_thread.cur--; + } + const int error = f->task_thread.retval; + if (error) { + f->task_thread.retval = 0; + c->cached_error = error; + dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m); + dav1d_thread_picture_unref(out_delayed); + } else if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); - if (out_delayed->visible && progress != FRAME_ERROR) { - dav1d_picture_ref(&c->out, &out_delayed->p); + if ((out_delayed->visible || c->output_invisible_frames) && + progress != FRAME_ERROR) + { + dav1d_thread_picture_ref(&c->out, out_delayed); c->event_flags |= dav1d_picture_get_event_flags(out_delayed); } dav1d_thread_picture_unref(out_delayed); @@ -3429,7 +3565,8 @@ f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \ f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \ f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \ - f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \ + f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \ + f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \ f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \ f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \ f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \ @@ -3484,7 +3621,7 @@ f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4; f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4; } else { - f->svc[i][0].scale = 0; + f->svc[i][0].scale = f->svc[i][1].scale = 0; } f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION && 
!f->frame_hdr->force_integer_mv && @@ -3501,7 +3638,7 @@ dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]); } if (f->frame_hdr->refresh_context) { - res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL); + res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1); if (res < 0) goto error; } @@ -3546,8 +3683,8 @@ // move f->cur into output queue if (c->n_fc == 1) { - if (f->frame_hdr->show_frame) { - dav1d_picture_ref(&c->out, &f->sr_cur.p); + if (f->frame_hdr->show_frame || c->output_invisible_frames) { + dav1d_thread_picture_ref(&c->out, &f->sr_cur); c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur); } } else { @@ -3565,6 +3702,11 @@ f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift; f->b4_stride = (f->bw + 31) & ~31; f->bitdepth_max = (1 << f->cur.p.bpc) - 1; + atomic_init(&f->task_thread.error, 0); + const int uses_2pass = c->n_fc > 1; + const int cols = f->frame_hdr->tiling.cols; + const int rows = f->frame_hdr->tiling.rows; + f->task_thread.task_counter = (cols * rows + f->sbh) << uses_2pass; // ref_mvs if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { @@ -3694,7 +3836,7 @@ if (c->n_fc == 1) { if ((res = dav1d_decode_frame(f)) < 0) { - dav1d_picture_unref_internal(&c->out); + dav1d_thread_picture_unref(&c->out); for (int i = 0; i < 8; i++) { if (refresh_frame_flags & (1 << i)) { if (c->refs[i].p.p.data[0]) @@ -3704,15 +3846,16 @@ dav1d_ref_dec(&c->refs[i].refmvs); } } - return res; + goto error; } } else { - pthread_cond_signal(&f->frame_thread.td.cond); - pthread_mutex_unlock(&f->frame_thread.td.lock); + dav1d_task_frame_init(f); + pthread_mutex_unlock(&c->task_thread.lock); } return 0; error: + atomic_init(&f->task_thread.error, 1); dav1d_cdf_thread_unref(&f->in_cdf); if (f->frame_hdr->refresh_context) dav1d_cdf_thread_unref(&f->out_cdf); @@ -3722,7 +3865,7 @@ dav1d_ref_dec(&f->ref_mvs_ref[i]); } if (c->n_fc == 1) - dav1d_picture_unref_internal(&c->out); + dav1d_thread_picture_unref(&c->out); else dav1d_thread_picture_unref(out_delayed); dav1d_picture_unref_internal(&f->cur); @@ -3730,15 +3873,14 @@ dav1d_ref_dec(&f->mvs_ref); dav1d_ref_dec(&f->seq_hdr_ref); dav1d_ref_dec(&f->frame_hdr_ref); + dav1d_data_props_copy(&c->cached_error_props, &c->in.m); for (int i = 0; i < f->n_tile_data; i++) dav1d_data_unref_internal(&f->tile[i].data); f->n_tile_data = 0; - if (c->n_fc > 1) { - pthread_cond_signal(&f->frame_thread.td.cond); - pthread_mutex_unlock(&f->frame_thread.td.lock); - } + if (c->n_fc > 1) + pthread_mutex_unlock(&c->task_thread.lock); return res; } diff -Nru dav1d-0.9.2/src/ext/x86/x86inc.asm dav1d-1.0.0/src/ext/x86/x86inc.asm --- dav1d-0.9.2/src/ext/x86/x86inc.asm 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/ext/x86/x86inc.asm 2022-03-18 14:31:55.986356000 +0000 @@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x86 abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2021 x264 project +;* Copyright (C) 2005-2022 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner @@ -238,6 +238,16 @@ %endif %endmacro +; Repeats an instruction/operation for multiple arguments. 
+; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" +%macro REPX 2-* ; operation, args + %xdefine %%f(x) %1 + %rep %0 - 1 + %rotate 1 + %%f(%1) + %endrep +%endmacro + %macro PUSH 1 push %1 %ifidn rstk, rsp @@ -1342,7 +1352,20 @@ %1 %6, __src2 %endif %elif %0 >= 9 - __instr %6, %7, %8, %9 + %if avx_enabled && __sizeofreg >= 16 && %4 == 1 + %ifnnum regnumof%7 + %if %3 + vmovaps %6, %7 + %else + vmovdqa %6, %7 + %endif + __instr %6, %6, %8, %9 + %else + __instr %6, %7, %8, %9 + %endif + %else + __instr %6, %7, %8, %9 + %endif %elif %0 == 8 %if avx_enabled && __sizeofreg >= 16 && %4 == 0 %xdefine __src1 %7 @@ -1379,7 +1402,7 @@ %else vmovdqa %6, %7 %endif - __instr %6, %8 + __instr %6, %6, %8 %else __instr %6, __src1, __src2 %endif @@ -1448,8 +1471,8 @@ AVX_INSTR andps, sse, 1, 0, 1 AVX_INSTR blendpd, sse4, 1, 1, 0 AVX_INSTR blendps, sse4, 1, 1, 0 -AVX_INSTR blendvpd, sse4 ; can't be emulated -AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding +AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR cmpeqpd, sse2, 1, 0, 1 AVX_INSTR cmpeqps, sse, 1, 0, 1 AVX_INSTR cmpeqsd, sse2, 1, 0, 0 @@ -1582,7 +1605,7 @@ AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 -AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR pblendw, sse4, 0, 1, 0 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 diff -Nru dav1d-0.9.2/src/fg_apply.h dav1d-1.0.0/src/fg_apply.h --- dav1d-0.9.2/src/fg_apply.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/fg_apply.h 2022-03-18 14:31:55.986356000 +0000 @@ -32,10 +32,27 @@ #include "common/bitdepth.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" -bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp, - Dav1dPicture *const out, - Dav1dPicture *const in); +#ifdef BITDEPTH +# define array_decl(type, name, sz) type name sz +#else +# define array_decl(type, name, sz) void *name +#endif + +bitfn_decls(void dav1d_apply_grain, + const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, const Dav1dPicture *const in); +bitfn_decls(void dav1d_prep_grain, + const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, const Dav1dPicture *const in, + array_decl(uint8_t, scaling, [3][SCALING_SIZE]), + array_decl(entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH])); +bitfn_decls(void dav1d_apply_grain_row, + const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, const Dav1dPicture *const in, + array_decl(const uint8_t, scaling, [3][SCALING_SIZE]), + array_decl(const entry, grain_lut, [3][GRAIN_HEIGHT+1][GRAIN_WIDTH]), + const int row); #endif /* DAV1D_SRC_FG_APPLY_H */ diff -Nru dav1d-0.9.2/src/fg_apply_tmpl.c dav1d-1.0.0/src/fg_apply_tmpl.c --- dav1d-0.9.2/src/fg_apply_tmpl.c 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/fg_apply_tmpl.c 2022-03-18 14:31:55.986356000 +0000 @@ -30,13 +30,13 @@ #include +#include "dav1d/common.h" #include "dav1d/picture.h" -#include "common.h" #include "common/intops.h" #include "common/bitdepth.h" -#include "fg_apply.h" +#include "src/fg_apply.h" static void generate_scaling(const int bitdepth, const uint8_t points[][2], const int num, @@ -44,14 +44,15 @@ { #if BITDEPTH == 8 const int shift_x = 0; + const int scaling_size = SCALING_SIZE; #else + assert(bitdepth > 8); const int shift_x = bitdepth - 8; 
-#endif const int scaling_size = 1 << bitdepth; +#endif // Fill up the preceding entries with the initial value - for (int i = 0; i < points[0][0] << shift_x; i++) - scaling[i] = points[0][1]; + memset(scaling, points[0][1], points[0][0] << shift_x); // Linearly interpolate the values in the middle for (int i = 0; i < num - 1; i++) { @@ -61,16 +62,17 @@ const int ey = points[i+1][1]; const int dx = ex - bx; const int dy = ey - by; + assert(dx > 0); const int delta = dy * ((0x10000 + (dx >> 1)) / dx); - for (int x = 0; x < dx; x++) { - const int v = by + ((x * delta + 0x8000) >> 16); - scaling[(bx + x) << shift_x] = v; + for (int x = 0, d = 0x8000; x < dx; x++) { + scaling[(bx + x) << shift_x] = by + (d >> 16); + d += delta; } } // Fill up the remaining entries with the final value - for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++) - scaling[i] = points[num - 1][1]; + const int n = points[num - 1][0] << shift_x; + memset(&scaling[n], points[num - 1][1], scaling_size - n); #if BITDEPTH != 8 const int pad = 1 << shift_x, rnd = pad >> 1; @@ -80,8 +82,9 @@ const int dx = ex - bx; for (int x = 0; x < dx; x += pad) { const int range = scaling[bx + x + pad] - scaling[bx + x]; - for (int n = 1; n < pad; n++) { - scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x); + for (int n = 1, r = rnd; n < pad; n++) { + r += range; + scaling[bx + x + n] = scaling[bx + x] + (r >> shift_x); } } } @@ -89,14 +92,13 @@ } #ifndef UNIT_TEST -void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp, - Dav1dPicture *const out, - Dav1dPicture *const in) +void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, + const Dav1dPicture *const in, + uint8_t scaling[3][SCALING_SIZE], + entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH]) { const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data; - - entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH]; - uint8_t scaling[3][SCALING_SIZE]; #if BITDEPTH != 8 const int bitdepth_max = (1 << out->p.bpc) - 1; #endif @@ -150,60 +152,86 @@ memcpy(out->data[2], in->data[2], sz); } } +} +void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, + const Dav1dPicture *const in, + const uint8_t scaling[3][SCALING_SIZE], + const entry grain_lut[3][GRAIN_HEIGHT+1][GRAIN_WIDTH], + const int row) +{ // Synthesize grain for the affected planes - const int rows = (out->p.h + 31) >> 5; + const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data; const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420; const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444; const int cpw = (out->p.w + ss_x) >> ss_x; const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY; - for (int row = 0; row < rows; row++) { - pixel *const luma_src = - ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]); - - if (data->num_y_points) { - const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE); - dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]), - luma_src, out->stride[0], data, - out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX); - } + pixel *const luma_src = + ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]); +#if BITDEPTH != 8 + const int bitdepth_max = (1 << out->p.bpc) - 1; +#endif - if (!data->num_uv_points[0] && !data->num_uv_points[1] && - !data->chroma_scaling_from_luma) - { - continue; - } + if (data->num_y_points) { + const int bh = imin(out->p.h - row * 
BLOCK_SIZE, BLOCK_SIZE); + dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]), + luma_src, out->stride[0], data, + out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX); + } - const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y; + if (!data->num_uv_points[0] && !data->num_uv_points[1] && + !data->chroma_scaling_from_luma) + { + return; + } - // extend padding pixels - if (out->p.w & ss_x) { - pixel *ptr = luma_src; - for (int y = 0; y < bh; y++) { - ptr[out->p.w] = ptr[out->p.w - 1]; - ptr += PXSTRIDE(in->stride[0]) << ss_y; - } + const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y; + + // extend padding pixels + if (out->p.w & ss_x) { + pixel *ptr = luma_src; + for (int y = 0; y < bh; y++) { + ptr[out->p.w] = ptr[out->p.w - 1]; + ptr += PXSTRIDE(in->stride[0]) << ss_y; } + } - const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y; - if (data->chroma_scaling_from_luma) { - for (int pl = 0; pl < 2; pl++) + const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y; + if (data->chroma_scaling_from_luma) { + for (int pl = 0; pl < 2; pl++) + dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, + ((const pixel *) in->data[1 + pl]) + uv_off, + in->stride[1], data, cpw, + scaling[0], grain_lut[1 + pl], + bh, row, luma_src, in->stride[0], + pl, is_id HIGHBD_TAIL_SUFFIX); + } else { + for (int pl = 0; pl < 2; pl++) + if (data->num_uv_points[pl]) dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, ((const pixel *) in->data[1 + pl]) + uv_off, in->stride[1], data, cpw, - scaling[0], grain_lut[1 + pl], + scaling[1 + pl], grain_lut[1 + pl], bh, row, luma_src, in->stride[0], pl, is_id HIGHBD_TAIL_SUFFIX); - } else { - for (int pl = 0; pl < 2; pl++) - if (data->num_uv_points[pl]) - dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, - ((const pixel *) in->data[1 + pl]) + uv_off, - in->stride[1], data, cpw, - scaling[1 + pl], grain_lut[1 + pl], - bh, row, luma_src, in->stride[0], - pl, is_id HIGHBD_TAIL_SUFFIX); - } } } + +void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp, + Dav1dPicture *const out, + const Dav1dPicture *const in) +{ + ALIGN_STK_16(entry, grain_lut, 3,[GRAIN_HEIGHT + 1][GRAIN_WIDTH]); +#if ARCH_X86_64 && BITDEPTH == 8 + ALIGN_STK_64(uint8_t, scaling, 3,[SCALING_SIZE]); +#else + uint8_t scaling[3][SCALING_SIZE]; +#endif + const int rows = (out->p.h + 31) >> 5; + + bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut); + for (int row = 0; row < rows; row++) + bitfn(dav1d_apply_grain_row)(dsp, out, in, scaling, grain_lut, row); +} #endif diff -Nru dav1d-0.9.2/src/film_grain.h dav1d-1.0.0/src/film_grain.h --- dav1d-0.9.2/src/film_grain.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/film_grain.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,86 +0,0 @@ -/* - * Copyright © 2018-2021, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef DAV1D_SRC_FILM_GRAIN_H -#define DAV1D_SRC_FILM_GRAIN_H - -#include "common/bitdepth.h" - -#include "src/levels.h" - -#define GRAIN_WIDTH 82 -#define GRAIN_HEIGHT 73 -#define BLOCK_SIZE 32 -#if !defined(BITDEPTH) || BITDEPTH == 8 -#define SCALING_SIZE 256 -typedef int8_t entry; -#else -#define SCALING_SIZE 4096 -typedef int16_t entry; -#endif - -#define decl_generate_grain_y_fn(name) \ -void (name)(entry buf[][GRAIN_WIDTH], \ - const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX) -typedef decl_generate_grain_y_fn(*generate_grain_y_fn); - -#define decl_generate_grain_uv_fn(name) \ -void (name)(entry buf[][GRAIN_WIDTH], \ - const entry buf_y[][GRAIN_WIDTH], \ - const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX) -typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn); - -#define decl_fgy_32x32xn_fn(name) \ -void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ - const Dav1dFilmGrainData *data, \ - size_t pw, const uint8_t scaling[SCALING_SIZE], \ - const entry grain_lut[][GRAIN_WIDTH], \ - int bh, int row_num HIGHBD_DECL_SUFFIX) -typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); - -#define decl_fguv_32x32xn_fn(name) \ -void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ - const Dav1dFilmGrainData *data, int pw, \ - const uint8_t scaling[SCALING_SIZE], \ - const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \ - const pixel *luma_row, ptrdiff_t luma_stride, \ - int uv_pl, int is_id HIGHBD_DECL_SUFFIX) -typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn); - -typedef struct Dav1dFilmGrainDSPContext { - generate_grain_y_fn generate_grain_y; - generate_grain_uv_fn generate_grain_uv[3]; - - fgy_32x32xn_fn fgy_32x32xn; - fguv_32x32xn_fn fguv_32x32xn[3]; -} Dav1dFilmGrainDSPContext; - -bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); -bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c); -bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c); - -#endif /* DAV1D_SRC_FILM_GRAIN_H */ diff -Nru dav1d-0.9.2/src/filmgrain.h dav1d-1.0.0/src/filmgrain.h --- dav1d-0.9.2/src/filmgrain.h 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/filmgrain.h 2022-03-18 14:31:55.986356000 +0000 @@ -0,0 +1,86 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_FILM_GRAIN_H +#define DAV1D_SRC_FILM_GRAIN_H + +#include "common/bitdepth.h" + +#include "src/levels.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 +#define BLOCK_SIZE 32 +#if !defined(BITDEPTH) || BITDEPTH == 8 +#define SCALING_SIZE 256 +typedef int8_t entry; +#else +#define SCALING_SIZE 4096 +typedef int16_t entry; +#endif + +#define decl_generate_grain_y_fn(name) \ +void (name)(entry buf[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX) +typedef decl_generate_grain_y_fn(*generate_grain_y_fn); + +#define decl_generate_grain_uv_fn(name) \ +void (name)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX) +typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn); + +#define decl_fgy_32x32xn_fn(name) \ +void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ + const Dav1dFilmGrainData *data, \ + size_t pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], \ + int bh, int row_num HIGHBD_DECL_SUFFIX) +typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn); + +#define decl_fguv_32x32xn_fn(name) \ +void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \ + const Dav1dFilmGrainData *data, int pw, \ + const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \ + const pixel *luma_row, ptrdiff_t luma_stride, \ + int uv_pl, int is_id HIGHBD_DECL_SUFFIX) +typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn); + +typedef struct Dav1dFilmGrainDSPContext { + generate_grain_y_fn generate_grain_y; + generate_grain_uv_fn generate_grain_uv[3]; + + fgy_32x32xn_fn fgy_32x32xn; + fguv_32x32xn_fn fguv_32x32xn[3]; +} Dav1dFilmGrainDSPContext; + +bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); +bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c); +bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c); + +#endif /* DAV1D_SRC_FILM_GRAIN_H */ diff -Nru dav1d-0.9.2/src/film_grain_tmpl.c dav1d-1.0.0/src/film_grain_tmpl.c --- dav1d-0.9.2/src/film_grain_tmpl.c 2021-09-03 15:51:24.405037200 +0000 +++ 
dav1d-1.0.0/src/film_grain_tmpl.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,433 +0,0 @@ -/* - * Copyright © 2018, Niklas Haas - * Copyright © 2018, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "common/attributes.h" -#include "common/intops.h" - -#include "film_grain.h" -#include "tables.h" - -#define SUB_GRAIN_WIDTH 44 -#define SUB_GRAIN_HEIGHT 38 - -static inline int get_random_number(const int bits, unsigned *const state) { - const int r = *state; - unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; - *state = (r >> 1) | (bit << 15); - - return (*state >> (16 - bits)) & ((1 << bits) - 1); -} - -static inline int round2(const int x, const uint64_t shift) { - return (x + ((1 << shift) >> 1)) >> shift; -} - -static void generate_grain_y_c(entry buf[][GRAIN_WIDTH], - const Dav1dFilmGrainData *const data - HIGHBD_DECL_SUFFIX) -{ - const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; - unsigned seed = data->seed; - const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; - const int grain_ctr = 128 << bitdepth_min_8; - const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; - - for (int y = 0; y < GRAIN_HEIGHT; y++) { - for (int x = 0; x < GRAIN_WIDTH; x++) { - const int value = get_random_number(11, &seed); - buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); - } - } - - const int ar_pad = 3; - const int ar_lag = data->ar_coeff_lag; - - for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { - for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { - const int8_t *coeff = data->ar_coeffs_y; - int sum = 0; - for (int dy = -ar_lag; dy <= 0; dy++) { - for (int dx = -ar_lag; dx <= ar_lag; dx++) { - if (!dx && !dy) - break; - sum += *(coeff++) * buf[y + dy][x + dx]; - } - } - - const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); - buf[y][x] = iclip(grain, grain_min, grain_max); - } - } -} - -static NOINLINE void -generate_grain_uv_c(entry buf[][GRAIN_WIDTH], - const entry buf_y[][GRAIN_WIDTH], - const Dav1dFilmGrainData *const data, const intptr_t uv, - const int subx, const int suby HIGHBD_DECL_SUFFIX) -{ - const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; - unsigned seed = 
data->seed ^ (uv ? 0x49d8 : 0xb524); - const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; - const int grain_ctr = 128 << bitdepth_min_8; - const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; - - const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; - const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; - - for (int y = 0; y < chromaH; y++) { - for (int x = 0; x < chromaW; x++) { - const int value = get_random_number(11, &seed); - buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); - } - } - - const int ar_pad = 3; - const int ar_lag = data->ar_coeff_lag; - - for (int y = ar_pad; y < chromaH; y++) { - for (int x = ar_pad; x < chromaW - ar_pad; x++) { - const int8_t *coeff = data->ar_coeffs_uv[uv]; - int sum = 0; - for (int dy = -ar_lag; dy <= 0; dy++) { - for (int dx = -ar_lag; dx <= ar_lag; dx++) { - // For the final (current) pixel, we need to add in the - // contribution from the luma grain texture - if (!dx && !dy) { - if (!data->num_y_points) - break; - int luma = 0; - const int lumaX = ((x - ar_pad) << subx) + ar_pad; - const int lumaY = ((y - ar_pad) << suby) + ar_pad; - for (int i = 0; i <= suby; i++) { - for (int j = 0; j <= subx; j++) { - luma += buf_y[lumaY + i][lumaX + j]; - } - } - luma = round2(luma, subx + suby); - sum += luma * (*coeff); - break; - } - - sum += *(coeff++) * buf[y + dy][x + dx]; - } - } - - const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); - buf[y][x] = iclip(grain, grain_min, grain_max); - } - } -} - -#define gnuv_ss_fn(nm, ss_x, ss_y) \ -static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \ - generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \ -} - -gnuv_ss_fn(420, 1, 1); -gnuv_ss_fn(422, 1, 0); -gnuv_ss_fn(444, 0, 0); - -// samples from the correct block of a grain LUT, while taking into account the -// offsets provided by the offsets cache -static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH], - const int offsets[2][2], const int subx, const int suby, - const int bx, const int by, const int x, const int y) -{ - const int randval = offsets[bx][by]; - const int offx = 3 + (2 >> subx) * (3 + (randval >> 4)); - const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF)); - return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by] - [offx + x + (BLOCK_SIZE >> subx) * bx]; -} - -static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, - const ptrdiff_t stride, - const Dav1dFilmGrainData *const data, const size_t pw, - const uint8_t scaling[SCALING_SIZE], - const entry grain_lut[][GRAIN_WIDTH], - const int bh, const int row_num HIGHBD_DECL_SUFFIX) -{ - const int rows = 1 + (data->overlap_flag && row_num > 0); - const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; - const int grain_ctr = 128 << bitdepth_min_8; - const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; - - int min_value, max_value; - if (data->clip_to_restricted_range) { - min_value = 16 << bitdepth_min_8; - max_value = 235 << bitdepth_min_8; - } else { - min_value = 0; - max_value = BITDEPTH_MAX; - } - - // seed[0] contains the current row, seed[1] contains the previous - unsigned seed[2]; - for (int i = 0; i < rows; i++) { - seed[i] = data->seed; - seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; - seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); - } - - assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); - - int offsets[2 /* col offset */][2 /* row offset */]; - - // process this row in BLOCK_SIZE^2 blocks - for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { 
- const int bw = imin(BLOCK_SIZE, (int) pw - bx); - - if (data->overlap_flag && bx) { - // shift previous offsets left - for (int i = 0; i < rows; i++) - offsets[1][i] = offsets[0][i]; - } - - // update current offsets - for (int i = 0; i < rows; i++) - offsets[0][i] = get_random_number(8, &seed[i]); - - // x/y block offsets to compensate for overlapped regions - const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0; - const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0; - - static const int w[2][2] = { { 27, 17 }, { 17, 27 } }; - -#define add_noise_y(x, y, grain) \ - const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \ - pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \ - const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \ - *dst = iclip(*src + noise, min_value, max_value); - - for (int y = ystart; y < bh; y++) { - // Non-overlapped image region (straightforward) - for (int x = xstart; x < bw; x++) { - int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); - add_noise_y(x, y, grain); - } - - // Special case for overlapped column - for (int x = 0; x < xstart; x++) { - int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); - int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); - grain = round2(old * w[x][0] + grain * w[x][1], 5); - grain = iclip(grain, grain_min, grain_max); - add_noise_y(x, y, grain); - } - } - - for (int y = 0; y < ystart; y++) { - // Special case for overlapped row (sans corner) - for (int x = xstart; x < bw; x++) { - int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); - int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); - grain = round2(old * w[y][0] + grain * w[y][1], 5); - grain = iclip(grain, grain_min, grain_max); - add_noise_y(x, y, grain); - } - - // Special case for doubly-overlapped corner - for (int x = 0; x < xstart; x++) { - // Blend the top pixel with the top left block - int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); - int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y); - top = round2(old * w[x][0] + top * w[x][1], 5); - top = iclip(top, grain_min, grain_max); - - // Blend the current pixel with the left block - int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); - old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); - grain = round2(old * w[x][0] + grain * w[x][1], 5); - grain = iclip(grain, grain_min, grain_max); - - // Mix the row rows together and apply grain - grain = round2(top * w[y][0] + grain * w[y][1], 5); - grain = iclip(grain, grain_min, grain_max); - add_noise_y(x, y, grain); - } - } - } -} - -static NOINLINE void -fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, - const ptrdiff_t stride, const Dav1dFilmGrainData *const data, - const int pw, const uint8_t scaling[SCALING_SIZE], - const entry grain_lut[][GRAIN_WIDTH], const int bh, - const int row_num, const pixel *const luma_row, - const ptrdiff_t luma_stride, const int uv, const int is_id, - const int sx, const int sy HIGHBD_DECL_SUFFIX) -{ - const int rows = 1 + (data->overlap_flag && row_num > 0); - const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; - const int grain_ctr = 128 << bitdepth_min_8; - const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; - - int min_value, max_value; - if (data->clip_to_restricted_range) { - min_value = 16 << bitdepth_min_8; - max_value = (is_id ? 
235 : 240) << bitdepth_min_8; - } else { - min_value = 0; - max_value = BITDEPTH_MAX; - } - - // seed[0] contains the current row, seed[1] contains the previous - unsigned seed[2]; - for (int i = 0; i < rows; i++) { - seed[i] = data->seed; - seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; - seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); - } - - assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); - - int offsets[2 /* col offset */][2 /* row offset */]; - - // process this row in BLOCK_SIZE^2 blocks (subsampled) - for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { - const int bw = imin(BLOCK_SIZE >> sx, pw - bx); - if (data->overlap_flag && bx) { - // shift previous offsets left - for (int i = 0; i < rows; i++) - offsets[1][i] = offsets[0][i]; - } - - // update current offsets - for (int i = 0; i < rows; i++) - offsets[0][i] = get_random_number(8, &seed[i]); - - // x/y block offsets to compensate for overlapped regions - const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0; - const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0; - - static const int w[2 /* sub */][2 /* off */][2] = { - { { 27, 17 }, { 17, 27 } }, - { { 23, 22 } }, - }; - -#define add_noise_uv(x, y, grain) \ - const int lx = (bx + x) << sx; \ - const int ly = y << sy; \ - const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \ - pixel avg = luma[0]; \ - if (sx) \ - avg = (avg + luma[1] + 1) >> 1; \ - const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ - pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ - int val = avg; \ - if (!data->chroma_scaling_from_luma) { \ - const int combined = avg * data->uv_luma_mult[uv] + \ - *src * data->uv_mult[uv]; \ - val = iclip_pixel( (combined >> 6) + \ - (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \ - } \ - const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \ - *dst = iclip(*src + noise, min_value, max_value); - - for (int y = ystart; y < bh; y++) { - // Non-overlapped image region (straightforward) - for (int x = xstart; x < bw; x++) { - int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); - add_noise_uv(x, y, grain); - } - - // Special case for overlapped column - for (int x = 0; x < xstart; x++) { - int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); - int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); - grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); - grain = iclip(grain, grain_min, grain_max); - add_noise_uv(x, y, grain); - } - } - - for (int y = 0; y < ystart; y++) { - // Special case for overlapped row (sans corner) - for (int x = xstart; x < bw; x++) { - int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); - int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); - grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5); - grain = iclip(grain, grain_min, grain_max); - add_noise_uv(x, y, grain); - } - - // Special case for doubly-overlapped corner - for (int x = 0; x < xstart; x++) { - // Blend the top pixel with the top left block - int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); - int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y); - top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5); - top = iclip(top, grain_min, grain_max); - - // Blend the current pixel with the left block - int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); - old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); - grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); - 
grain = iclip(grain, grain_min, grain_max); - - // Mix the row rows together and apply to image - grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5); - grain = iclip(grain, grain_min, grain_max); - add_noise_uv(x, y, grain); - } - } - } -} - -#define fguv_ss_fn(nm, ss_x, ss_y) \ -static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \ - fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \ - row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \ - HIGHBD_TAIL_SUFFIX); \ -} - -fguv_ss_fn(420, 1, 1); -fguv_ss_fn(422, 1, 0); -fguv_ss_fn(444, 0, 0); - -COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { - c->generate_grain_y = generate_grain_y_c; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c; - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c; - - c->fgy_32x32xn = fgy_32x32xn_c; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c; - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c; - -#if HAVE_ASM -#if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_film_grain_dsp_init_arm)(c); -#elif ARCH_X86 - bitfn(dav1d_film_grain_dsp_init_x86)(c); -#endif -#endif -} diff -Nru dav1d-0.9.2/src/filmgrain_tmpl.c dav1d-1.0.0/src/filmgrain_tmpl.c --- dav1d-0.9.2/src/filmgrain_tmpl.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/filmgrain_tmpl.c 2022-03-18 14:31:55.986356000 +0000 @@ -0,0 +1,433 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "common/attributes.h" +#include "common/intops.h" + +#include "src/filmgrain.h" +#include "src/tables.h" + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +static inline int get_random_number(const int bits, unsigned *const state) { + const int r = *state; + unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; + *state = (r >> 1) | (bit << 15); + + return (*state >> (16 - bits)) & ((1 << bits) - 1); +} + +static inline int round2(const int x, const uint64_t shift) { + return (x + ((1 << shift) >> 1)) >> shift; +} + +static void generate_grain_y_c(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + unsigned seed = data->seed; + const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + for (int y = 0; y < GRAIN_HEIGHT; y++) { + for (int x = 0; x < GRAIN_WIDTH; x++) { + const int value = get_random_number(11, &seed); + buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); + } + } + + const int ar_pad = 3; + const int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { + for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_y; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + if (!dx && !dy) + break; + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + buf[y][x] = iclip(grain, grain_min, grain_max); + } + } +} + +static NOINLINE void +generate_grain_uv_c(entry buf[][GRAIN_WIDTH], + const entry buf_y[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data, const intptr_t uv, + const int subx, const int suby HIGHBD_DECL_SUFFIX) +{ + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524); + const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; + const int chromaH = suby ? 
SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; + + for (int y = 0; y < chromaH; y++) { + for (int x = 0; x < chromaW; x++) { + const int value = get_random_number(11, &seed); + buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); + } + } + + const int ar_pad = 3; + const int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < chromaH; y++) { + for (int x = ar_pad; x < chromaW - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_uv[uv]; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + // For the final (current) pixel, we need to add in the + // contribution from the luma grain texture + if (!dx && !dy) { + if (!data->num_y_points) + break; + int luma = 0; + const int lumaX = ((x - ar_pad) << subx) + ar_pad; + const int lumaY = ((y - ar_pad) << suby) + ar_pad; + for (int i = 0; i <= suby; i++) { + for (int j = 0; j <= subx; j++) { + luma += buf_y[lumaY + i][lumaX + j]; + } + } + luma = round2(luma, subx + suby); + sum += luma * (*coeff); + break; + } + + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + buf[y][x] = iclip(grain, grain_min, grain_max); + } + } +} + +#define gnuv_ss_fn(nm, ss_x, ss_y) \ +static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \ + generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \ +} + +gnuv_ss_fn(420, 1, 1); +gnuv_ss_fn(422, 1, 0); +gnuv_ss_fn(444, 0, 0); + +// samples from the correct block of a grain LUT, while taking into account the +// offsets provided by the offsets cache +static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH], + const int offsets[2][2], const int subx, const int suby, + const int bx, const int by, const int x, const int y) +{ + const int randval = offsets[bx][by]; + const int offx = 3 + (2 >> subx) * (3 + (randval >> 4)); + const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF)); + return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by] + [offx + x + (BLOCK_SIZE >> subx) * bx]; +} + +static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + int min_value, max_value; + if (data->clip_to_restricted_range) { + min_value = 16 << bitdepth_min_8; + max_value = 235 << bitdepth_min_8; + } else { + min_value = 0; + max_value = BITDEPTH_MAX; + } + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + const int bw = imin(BLOCK_SIZE, (int) pw - bx); + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + 
+ // x/y block offsets to compensate for overlapped regions + const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0; + const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0; + + static const int w[2][2] = { { 27, 17 }, { 17, 27 } }; + +#define add_noise_y(x, y, grain) \ + const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \ + pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \ + const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \ + *dst = iclip(*src + noise, min_value, max_value); + + for (int y = ystart; y < bh; y++) { + // Non-overlapped image region (straightforward) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + add_noise_y(x, y, grain); + } + + // Special case for overlapped column + for (int x = 0; x < xstart; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); + grain = round2(old * w[x][0] + grain * w[x][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + } + + for (int y = 0; y < ystart; y++) { + // Special case for overlapped row (sans corner) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); + grain = round2(old * w[y][0] + grain * w[y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + + // Special case for doubly-overlapped corner + for (int x = 0; x < xstart; x++) { + // Blend the top pixel with the top left block + int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); + int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y); + top = round2(old * w[x][0] + top * w[x][1], 5); + top = iclip(top, grain_min, grain_max); + + // Blend the current pixel with the left block + int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); + old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); + grain = round2(old * w[x][0] + grain * w[x][1], 5); + grain = iclip(grain, grain_min, grain_max); + + // Mix the row rows together and apply grain + grain = round2(top * w[y][0] + grain * w[y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_y(x, y, grain); + } + } + } +} + +static NOINLINE void +fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, + const int pw, const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], const int bh, + const int row_num, const pixel *const luma_row, + const ptrdiff_t luma_stride, const int uv, const int is_id, + const int sx, const int sy HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; + const int grain_ctr = 128 << bitdepth_min_8; + const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; + + int min_value, max_value; + if (data->clip_to_restricted_range) { + min_value = 16 << bitdepth_min_8; + max_value = (is_id ? 
235 : 240) << bitdepth_min_8; + } else { + min_value = 0; + max_value = BITDEPTH_MAX; + } + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks (subsampled) + for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { + const int bw = imin(BLOCK_SIZE >> sx, pw - bx); + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + // x/y block offsets to compensate for overlapped regions + const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0; + const int xstart = data->overlap_flag && bx ? imin(2 >> sx, bw) : 0; + + static const int w[2 /* sub */][2 /* off */][2] = { + { { 27, 17 }, { 17, 27 } }, + { { 23, 22 } }, + }; + +#define add_noise_uv(x, y, grain) \ + const int lx = (bx + x) << sx; \ + const int ly = y << sy; \ + const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \ + pixel avg = luma[0]; \ + if (sx) \ + avg = (avg + luma[1] + 1) >> 1; \ + const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ + pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ + int val = avg; \ + if (!data->chroma_scaling_from_luma) { \ + const int combined = avg * data->uv_luma_mult[uv] + \ + *src * data->uv_mult[uv]; \ + val = iclip_pixel( (combined >> 6) + \ + (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \ + } \ + const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \ + *dst = iclip(*src + noise, min_value, max_value); + + for (int y = ystart; y < bh; y++) { + // Non-overlapped image region (straightforward) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + add_noise_uv(x, y, grain); + } + + // Special case for overlapped column + for (int x = 0; x < xstart; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); + grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + } + + for (int y = 0; y < ystart; y++) { + // Special case for overlapped row (sans corner) + for (int x = xstart; x < bw; x++) { + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); + grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + + // Special case for doubly-overlapped corner + for (int x = 0; x < xstart; x++) { + // Blend the top pixel with the top left block + int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); + int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y); + top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5); + top = iclip(top, grain_min, grain_max); + + // Blend the current pixel with the left block + int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); + old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); + grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); + 
grain = iclip(grain, grain_min, grain_max); + + // Mix the row rows together and apply to image + grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5); + grain = iclip(grain, grain_min, grain_max); + add_noise_uv(x, y, grain); + } + } + } +} + +#define fguv_ss_fn(nm, ss_x, ss_y) \ +static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \ + fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \ + row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \ + HIGHBD_TAIL_SUFFIX); \ +} + +fguv_ss_fn(420, 1, 1); +fguv_ss_fn(422, 1, 0); +fguv_ss_fn(444, 0, 0); + +COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { + c->generate_grain_y = generate_grain_y_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c; + + c->fgy_32x32xn = fgy_32x32xn_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c; + +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_film_grain_dsp_init_arm)(c); +#elif ARCH_X86 + bitfn(dav1d_film_grain_dsp_init_x86)(c); +#endif +#endif +} diff -Nru dav1d-0.9.2/src/internal.h dav1d-1.0.0/src/internal.h --- dav1d-0.9.2/src/internal.h 2021-09-03 15:51:24.405037200 +0000 +++ dav1d-1.0.0/src/internal.h 2022-03-18 14:31:55.990356000 +0000 @@ -34,8 +34,7 @@ typedef struct Dav1dFrameContext Dav1dFrameContext; typedef struct Dav1dTileState Dav1dTileState; -typedef struct Dav1dTileContext Dav1dTileContext; -typedef struct Dav1dPostFilterContext Dav1dPostFilterContext; +typedef struct Dav1dTaskContext Dav1dTaskContext; typedef struct Dav1dTask Dav1dTask; #include "common/attributes.h" @@ -44,7 +43,7 @@ #include "src/cdf.h" #include "src/data.h" #include "src/env.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" #include "src/intra_edge.h" #include "src/ipred.h" #include "src/itx.h" @@ -74,12 +73,28 @@ int start, end; }; +enum TaskType { + DAV1D_TASK_TYPE_INIT, + DAV1D_TASK_TYPE_INIT_CDF, + DAV1D_TASK_TYPE_TILE_ENTROPY, + DAV1D_TASK_TYPE_ENTROPY_PROGRESS, + DAV1D_TASK_TYPE_TILE_RECONSTRUCTION, + DAV1D_TASK_TYPE_DEBLOCK_COLS, + DAV1D_TASK_TYPE_DEBLOCK_ROWS, + DAV1D_TASK_TYPE_CDEF, + DAV1D_TASK_TYPE_SUPER_RESOLUTION, + DAV1D_TASK_TYPE_LOOP_RESTORATION, + DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS, + DAV1D_TASK_TYPE_FG_PREP, + DAV1D_TASK_TYPE_FG_APPLY, +}; + struct Dav1dContext { Dav1dFrameContext *fc; unsigned n_fc; - Dav1dPostFilterContext *pfc; - unsigned n_pfc; + Dav1dTaskContext *tc; + unsigned n_tc; // cache of OBUs that make up a single frame before we submit them // to a frame worker to be decoded @@ -103,7 +118,7 @@ // decoded output picture queue Dav1dData in; - Dav1dPicture out; + Dav1dThreadPicture out, cache; // dummy is a pointer to prevent compiler errors about atomic_load() // not taking const arguments atomic_int flush_mem, *flush; @@ -112,14 +127,38 @@ unsigned next; } frame_thread; - // postfilter threading (refer to pfc[] for per_thread thingies) - struct PostFilterThreadData { + // task threading (refer to tc[] for per_thread thingies) + struct TaskThreadData { pthread_mutex_t lock; pthread_cond_t cond; - struct Dav1dTask *tasks; - int frame_cnt; + atomic_uint first; + unsigned cur; + // This is used for delayed reset of the task cur pointer when + // such operation is needed 
but the thread doesn't enter a critical + // section (typically when executing the next sbrow task locklessly). + // See src/thread_task.c:reset_task_cur(). + atomic_uint reset_task_cur; + atomic_int cond_signaled; + struct { + int exec; + pthread_cond_t cond; + const Dav1dPicture *in; + Dav1dPicture *out; + enum TaskType type; + atomic_int progress[2]; /* [0]=started, [1]=completed */ + union { + struct { + ALIGN(int8_t grain_lut_8bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16); + ALIGN(uint8_t scaling_8bpc[3][256], 64); + }; + struct { + ALIGN(int16_t grain_lut_16bpc[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH], 16); + ALIGN(uint8_t scaling_16bpc[3][4096], 64); + }; + }; + } delayed_fg; int inited; - } postfilter_thread; + } task_thread; // reference/entropy state Dav1dMemPool *segmap_pool; @@ -150,16 +189,33 @@ int operating_point; unsigned operating_point_idc; int all_layers; + int max_spatial_id; unsigned frame_size_limit; + int strict_std_compliance; + int output_invisible_frames; + enum Dav1dInloopFilterType inloop_filters; int drain; enum PictureFlags frame_flags; enum Dav1dEventFlags event_flags; + Dav1dDataProps cached_error_props; + int cached_error; Dav1dLogger logger; Dav1dMemPool *picture_pool; }; +struct Dav1dTask { + unsigned frame_idx; // frame thread id + enum TaskType type; // task work + int sby; // sbrow + + // task dependencies + int recon_progress, deblock_progress; + int deps_skip; + struct Dav1dTask *next; // only used in task queue +}; + struct Dav1dFrameContext { Dav1dRef *seq_hdr_ref; Dav1dSequenceHeader *seq_hdr; @@ -189,8 +245,6 @@ int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */]; const Dav1dContext *c; - Dav1dTileContext *tc; - int n_tc; Dav1dTileState *ts; int n_ts; const Dav1dDSPContext *dsp; @@ -198,8 +252,9 @@ recon_b_intra_fn recon_b_intra; recon_b_inter_fn recon_b_inter; filter_sbrow_fn filter_sbrow; - filter_sbrow_fn filter_sbrow_deblock; - filter_sbrow_fn filter_sbrow_cdef; + filter_sbrow_fn filter_sbrow_deblock_cols; + filter_sbrow_fn filter_sbrow_deblock_rows; + void (*filter_sbrow_cdef)(Dav1dTaskContext *tc, int sby); filter_sbrow_fn filter_sbrow_resize; filter_sbrow_fn filter_sbrow_lr; backup_ipred_edge_fn backup_ipred_edge; @@ -219,8 +274,10 @@ int bitdepth_max; struct { - struct thread_data td; - int pass, die; + int next_tile_row[2 /* 0: reconstruction, 1: entropy */]; + int entropy_progress; + atomic_int deblock_progress; // in sby units + atomic_uint *frame_progress, *copy_lpf_progress; // indexed using t->by * f->b4_stride + t->bx Av1Block *b; struct CodedBlockInfo { @@ -232,6 +289,7 @@ // iterated over inside tile state uint8_t *pal_idx; coef *cf; + int prog_sz; int pal_sz, pal_idx_sz, cf_sz; // start offsets per tile int *tile_start_off; @@ -242,44 +300,52 @@ uint8_t (*level)[4]; Av1Filter *mask; Av1Restoration *lr_mask; - int top_pre_cdef_toggle; - int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */; - int lr_line_sz, re_sz /* h */; + int mask_sz /* w*h */, lr_mask_sz; + int cdef_buf_plane_sz[2]; /* stride*sbh*4 */ + int cdef_buf_sbh; + int lr_buf_plane_sz[2]; /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */ + int re_sz /* h */; ALIGN(Av1FilterLUT lim_lut, 16); int last_sharpness; uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */]; uint8_t *tx_lpf_right_edge[2]; - uint8_t *cdef_line_buf; + uint8_t *cdef_line_buf, *lr_line_buf; pixel *cdef_line[2 /* pre, post */][3 /* plane */]; + pixel *cdef_lpf_line[3 /* plane */]; pixel *lr_lpf_line[3 /* plane */]; // in-loop filter per-frame state keeping - int 
tile_row; // for carry-over at tile row edges + uint8_t *start_of_tile_row; + int start_of_tile_row_sz; + int need_cdef_lpf_copy; pixel *p[3], *sr_p[3]; Av1Filter *mask_ptr, *prev_mask_ptr; int restore_planes; // enum LrRestorePlanes - - struct { - pthread_cond_t cond; - struct PostFilterThreadData *pftd; - struct Dav1dTask *tasks; - int num_tasks; - int npf; - int done; - int inited; - } thread; } lf; + struct { + pthread_cond_t cond; + struct TaskThreadData *ttd; + struct Dav1dTask *tasks, *tile_tasks[2], init_task; + int num_tasks, num_tile_tasks; + int init_done; + int done[2]; + int retval; + int update_set; // whether we need to update CDF reference + atomic_int error; + int task_counter; + struct Dav1dTask *task_head, *task_tail; + // Points to the task directly before the cur pointer in the queue. + // This cur pointer is theoretical here, we actually keep track of the + // "prev_t" variable. This is needed to not loose the tasks in + // [head;cur-1] when picking one for execution. + struct Dav1dTask *task_cur_prev; + } task_thread; + // threading (refer to tc[] for per-thread things) struct FrameTileThreadData { - uint64_t available; - pthread_mutex_t lock; - pthread_cond_t cond, icond; - int tasks_left, num_tasks; - int (*task_idx_to_sby_and_tile_idx)[2]; - int titsati_sz, titsati_init[2]; - uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS]; - int inited; + int (*lowest_pixel_mem)[7][2]; + int lowest_pixel_mem_sz; } tile_thread; }; @@ -292,15 +358,16 @@ int col, row; // in tile units } tiling; - atomic_int progress; // in sby units, TILE_ERROR after a decoding error - struct { - pthread_mutex_t lock; - pthread_cond_t cond; - } tile_thread; + // in sby units, TILE_ERROR after a decoding error + atomic_int progress[2 /* 0: reconstruction, 1: entropy */]; struct { uint8_t *pal_idx; coef *cf; - } frame_thread; + } frame_thread[2 /* 0: reconstruction, 1: entropy */]; + + // in fullpel units, [0] = Y, [1] = UV, used for progress requirements + // each entry is one tile-sbrow; middle index is refidx + int (*lowest_pixel)[7][2]; uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */]; const uint16_t (*dq)[3][2]; @@ -313,12 +380,14 @@ Av1RestorationUnit *lr_ref[3]; }; -struct Dav1dTileContext { +struct Dav1dTaskContext { + const Dav1dContext *c; const Dav1dFrameContext *f; Dav1dTileState *ts; int bx, by; BlockContext l, *a; - ALIGN(union, 32) { + refmvs_tile rt; + ALIGN(union, 64) { int16_t cf_8bpc [32 * 32]; int32_t cf_16bpc[32 * 32]; }; @@ -327,7 +396,6 @@ uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */]; uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */]; uint8_t txtp_map[32 * 32]; // inter-only - refmvs_tile rt; ALIGN(union, 64) { struct { union { @@ -355,7 +423,7 @@ int16_t ac[32 * 32]; uint8_t pal_idx[2 * 64 * 64]; uint16_t pal[3 /* plane */][8 /* palette_idx */]; - ALIGN(union, 32) { + ALIGN(union, 64) { struct { uint8_t interintra_8bpc[64 * 64]; uint8_t edge_8bpc[257]; @@ -370,6 +438,7 @@ Dav1dWarpedMotionParams warpmv; Av1Filter *lf_mask; + int top_pre_cdef_toggle; int8_t *cur_sb_cdef_idx_ptr; // for chroma sub8x8, we need to know the filter for all 4 subblocks in // a 4x4 area, but the top/left one can go out of cache already, so this @@ -377,17 +446,15 @@ enum Filter2d tl_4x4_filter; struct { + int pass; + } frame_thread; + struct { struct thread_data td; + struct TaskThreadData *ttd; struct FrameTileThreadData *fttd; + int flushed; int die; - } tile_thread; -}; - -struct Dav1dPostFilterContext { - Dav1dContext *c; - struct thread_data 
td; - int flushed; - int die; + } task_thread; }; #endif /* DAV1D_SRC_INTERNAL_H */ diff -Nru dav1d-0.9.2/src/lf_apply.h dav1d-1.0.0/src/lf_apply.h --- dav1d-0.9.2/src/lf_apply.h 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/lf_apply.h 2022-03-18 14:31:55.990356000 +0000 @@ -35,8 +35,14 @@ #include "src/internal.h" #include "src/levels.h" -void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *f, - pixel *const p[3], Av1Filter *lflvl, - int sby, int start_of_tile_row); +void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *f, + pixel *const p[3], Av1Filter *lflvl, + int sby, int start_of_tile_row); +void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *f, + pixel *const p[3], Av1Filter *lflvl, + int sby); + +void bytefn(dav1d_copy_lpf)(Dav1dFrameContext *const f, + /*const*/ pixel *const src[3], int sby); #endif /* DAV1D_SRC_LF_APPLY_H */ diff -Nru dav1d-0.9.2/src/lf_apply_tmpl.c dav1d-1.0.0/src/lf_apply_tmpl.c --- dav1d-0.9.2/src/lf_apply_tmpl.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/lf_apply_tmpl.c 2022-03-18 14:31:55.990356000 +0000 @@ -32,6 +32,146 @@ #include "common/intops.h" #include "src/lf_apply.h" +#include "src/lr_apply.h" + +// The loop filter buffer stores 12 rows of pixels. A superblock block will +// contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above +// and 2 below) the final 4 rows are used to swap the bottom of the last +// stripe with the top of the next super block row. +static void backup_lpf(const Dav1dFrameContext *const f, + pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int ss_ver, const int sb128, + int row, const int row_h, const int src_w, + const int h, const int ss_hor, const int lr_backup) +{ + const int cdef_backup = !lr_backup; + const int dst_w = f->frame_hdr->super_res.enabled ? + (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w; + + // The first stripe of the frame is shorter by 8 luma pixel rows. + int stripe_h = ((64 << (cdef_backup & sb128)) - 8 * !row) >> ss_ver; + src += (stripe_h - 2) * PXSTRIDE(src_stride); + + if (f->c->n_tc == 1) { + if (row) { + const int top = 4 << sb128; + // Copy the top part of the stored loop filtered pixels from the + // previous sb row needed above the first stripe of this sb row. + pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], + &dst[PXSTRIDE(dst_stride) * top], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], + &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], + &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], + &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); + } + dst += 4 * PXSTRIDE(dst_stride); + } + + if (lr_backup && (f->frame_hdr->width[0] != f->frame_hdr->width[1])) { + while (row + stripe_h <= row_h) { + const int n_lines = 4 - (row + stripe_h + 1 == h); + f->dsp->mc.resize(dst, dst_stride, src, src_stride, + dst_w, n_lines, src_w, f->resize_step[ss_hor], + f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX); + row += stripe_h; // unmodified stripe_h for the 1st stripe + stripe_h = 64 >> ss_ver; + src += stripe_h * PXSTRIDE(src_stride); + dst += n_lines * PXSTRIDE(dst_stride); + if (n_lines == 3) { + pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w); + dst += PXSTRIDE(dst_stride); + } + } + } else { + while (row + stripe_h <= row_h) { + const int n_lines = 4 - (row + stripe_h + 1 == h); + for (int i = 0; i < 4; i++) { + pixel_copy(dst, i == n_lines ? 
&dst[-PXSTRIDE(dst_stride)] : + src, src_w); + dst += PXSTRIDE(dst_stride); + src += PXSTRIDE(src_stride); + } + row += stripe_h; // unmodified stripe_h for the 1st stripe + stripe_h = 64 >> ss_ver; + src += (stripe_h - 4) * PXSTRIDE(src_stride); + } + } +} + +void bytefn(dav1d_copy_lpf)(Dav1dFrameContext *const f, + /*const*/ pixel *const src[3], const int sby) +{ + const int have_tt = f->c->n_tc > 1; + const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; + const int offset = 8 * !!sby; + const ptrdiff_t *const src_stride = f->cur.stride; + const ptrdiff_t *const lr_stride = f->sr_cur.p.stride; + const int tt_off = have_tt * sby * (4 << f->seq_hdr->sb128); + pixel *const dst[3] = { + f->lf.lr_lpf_line[0] + tt_off * PXSTRIDE(lr_stride[0]), + f->lf.lr_lpf_line[1] + tt_off * PXSTRIDE(lr_stride[1]), + f->lf.lr_lpf_line[2] + tt_off * PXSTRIDE(lr_stride[1]) + }; + + // TODO Also check block level restore type to reduce copying. + const int restore_planes = f->lf.restore_planes; + + if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_Y) { + const int h = f->cur.p.h; + const int w = f->bw << 2; + const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); + const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; + if (restore_planes & LR_RESTORE_Y || !resize) + backup_lpf(f, dst[0], lr_stride[0], + src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], + 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 1); + if (have_tt && resize) { + const ptrdiff_t cdef_off_y = sby * 4 * PXSTRIDE(src_stride[0]); + backup_lpf(f, f->lf.cdef_lpf_line[0] + cdef_off_y, src_stride[0], + src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], + 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, 0); + } + } + if ((f->seq_hdr->cdef || restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) && + f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) + { + const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h = (f->cur.p.h + ss_ver) >> ss_ver; + const int w = f->bw << (2 - ss_hor); + const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); + const int offset_uv = offset >> ss_ver; + const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; + const ptrdiff_t cdef_off_uv = sby * 4 * PXSTRIDE(src_stride[1]); + if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_U) { + if (restore_planes & LR_RESTORE_U || !resize) + backup_lpf(f, dst[1], lr_stride[1], + src[1] - offset_uv * PXSTRIDE(src_stride[1]), + src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, + row_h, w, h, ss_hor, 1); + if (have_tt && resize) + backup_lpf(f, f->lf.cdef_lpf_line[1] + cdef_off_uv, src_stride[1], + src[1] - offset_uv * PXSTRIDE(src_stride[1]), + src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, + row_h, w, h, ss_hor, 0); + } + if (f->seq_hdr->cdef || restore_planes & LR_RESTORE_V) { + if (restore_planes & LR_RESTORE_V || !resize) + backup_lpf(f, dst[2], lr_stride[1], + src[2] - offset_uv * PXSTRIDE(src_stride[1]), + src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, + row_h, w, h, ss_hor, 1); + if (have_tt && resize) + backup_lpf(f, f->lf.cdef_lpf_line[2] + cdef_off_uv, src_stride[1], + src[2] - offset_uv * PXSTRIDE(src_stride[1]), + src_stride[1], ss_ver, f->seq_hdr->sb128, y_stripe, + row_h, w, h, ss_hor, 0); + } + } +} static inline void filter_plane_cols_y(const Dav1dFrameContext *const f, const int have_left, @@ -170,13 +310,12 @@ } } -void bytefn(dav1d_loopfilter_sbrow)(const 
Dav1dFrameContext *const f, - pixel *const p[3], Av1Filter *const lflvl, - int sby, const int start_of_tile_row) +void bytefn(dav1d_loopfilter_sbrow_cols)(const Dav1dFrameContext *const f, + pixel *const p[3], Av1Filter *const lflvl, + int sby, const int start_of_tile_row) { int x, have_left; // Don't filter outside the frame - const int have_top = sby > 0; const int is_sb64 = !f->seq_hdr->sb128; const int starty4 = (sby & is_sb64) << 4; const int sbsz = 32 >> is_sb64; @@ -271,13 +410,6 @@ imin(32, f->w4 - x * 32), starty4, endy4); } - level_ptr = f->lf.level + f->b4_stride * sby * sbsz; - for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { - filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride, - lflvl[x].filter_y[1], ptr, f->cur.stride[0], - imin(32, f->w4 - x * 32), starty4, endy4); - } - if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) return; @@ -292,7 +424,35 @@ (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor, starty4 >> ss_ver, uv_endy4, ss_ver); } +} + +void bytefn(dav1d_loopfilter_sbrow_rows)(const Dav1dFrameContext *const f, + pixel *const p[3], Av1Filter *const lflvl, + int sby) +{ + int x; + // Don't filter outside the frame + const int have_top = sby > 0; + const int is_sb64 = !f->seq_hdr->sb128; + const int starty4 = (sby & is_sb64) << 4; + const int sbsz = 32 >> is_sb64; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz); + const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; + pixel *ptr; + uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz; + for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { + filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride, + lflvl[x].filter_y[1], ptr, f->cur.stride[0], + imin(32, f->w4 - x * 32), starty4, endy4); + } + + if (!f->frame_hdr->loopfilter.level_u && !f->frame_hdr->loopfilter.level_v) + return; + + ptrdiff_t uv_off; level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver); for (uv_off = 0, x = 0; x < f->sb128w; x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor) diff -Nru dav1d-0.9.2/src/lib.c dav1d-1.0.0/src/lib.c --- dav1d-0.9.2/src/lib.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/lib.c 2022-03-18 14:31:55.994356000 +0000 @@ -63,9 +63,8 @@ } COLD void dav1d_default_settings(Dav1dSettings *const s) { - s->n_frame_threads = 1; - s->n_tile_threads = 1; - s->n_postfilter_threads = 1; + s->n_threads = 0; + s->max_frame_delay = 0; s->apply_grain = 1; s->allocator.cookie = NULL; s->allocator.alloc_picture_callback = dav1d_default_picture_alloc; @@ -75,6 +74,9 @@ s->operating_point = 0; s->all_layers = 1; // just until the tests are adjusted s->frame_size_limit = 0; + s->strict_std_compliance = 0; + s->output_invisible_frames = 0; + s->inloop_filters = DAV1D_INLOOPFILTER_ALL; } static void close_internal(Dav1dContext **const c_out, int flush); @@ -101,12 +103,10 @@ validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL)); - validate_input_or_ret(s->n_postfilter_threads >= 1 && - s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL)); - validate_input_or_ret(s->n_tile_threads >= 1 && - s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL)); - validate_input_or_ret(s->n_frame_threads >= 1 && - s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->n_threads >= 0 && 
+ s->n_threads <= DAV1D_MAX_THREADS, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->max_frame_delay >= 0 && + s->max_frame_delay <= DAV1D_MAX_FRAME_DELAY, DAV1D_ERR(EINVAL)); validate_input_or_ret(s->allocator.alloc_picture_callback != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(s->allocator.release_picture_callback != NULL, @@ -120,7 +120,7 @@ pthread_attr_setstacksize(&thread_attr, stack_size); - Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32); + Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64); if (!c) goto error; memset(c, 0, sizeof(*c)); @@ -130,6 +130,11 @@ c->operating_point = s->operating_point; c->all_layers = s->all_layers; c->frame_size_limit = s->frame_size_limit; + c->strict_std_compliance = s->strict_std_compliance; + c->output_invisible_frames = s->output_invisible_frames; + c->inloop_filters = s->inloop_filters; + + dav1d_data_props_set_defaults(&c->cached_error_props); if (dav1d_mem_pool_init(&c->seq_hdr_pool) || dav1d_mem_pool_init(&c->frame_hdr_pool) || @@ -166,44 +171,43 @@ c->flush = &c->flush_mem; atomic_init(c->flush, 0); - c->n_pfc = s->n_postfilter_threads; - c->n_fc = s->n_frame_threads; - c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32); + c->n_tc = s->n_threads ? s->n_threads : + iclip(dav1d_num_logical_processors(c), 1, DAV1D_MAX_THREADS); + /* ceil(sqrt(n)) */ + static const uint8_t fc_lut[49] = { + 1, /* 1 */ + 2, 2, 2, /* 2- 4 */ + 3, 3, 3, 3, 3, /* 5- 9 */ + 4, 4, 4, 4, 4, 4, 4, /* 10-16 */ + 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 17-25 */ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 26-36 */ + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 37-49 */ + }; + c->n_fc = s->max_frame_delay ? umin(s->max_frame_delay, c->n_tc) : + c->n_tc < 50 ? fc_lut[c->n_tc - 1] : 8; // min(8, ceil(sqrt(n))) + + c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32); if (!c->fc) goto error; - memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads); + memset(c->fc, 0, sizeof(*c->fc) * c->n_fc); - if (c->n_pfc > 1) { - c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32); - if (!c->pfc) goto error; - memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads); - if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error; - if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) { - pthread_mutex_destroy(&c->postfilter_thread.lock); + c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64); + if (!c->tc) goto error; + memset(c->tc, 0, sizeof(*c->tc) * c->n_tc); + if (c->n_tc > 1) { + if (pthread_mutex_init(&c->task_thread.lock, NULL)) goto error; + if (pthread_cond_init(&c->task_thread.cond, NULL)) { + pthread_mutex_destroy(&c->task_thread.lock); goto error; } - c->postfilter_thread.inited = 1; - for (int n = 0; n < s->n_frame_threads; n++) { - Dav1dFrameContext *const f = &c->fc[n]; - if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error; - f->lf.thread.pftd = &c->postfilter_thread; - f->lf.thread.done = 1; - f->lf.thread.inited = 1; - } - for (int n = 0; n < s->n_postfilter_threads; ++n) { - Dav1dPostFilterContext *const pf = &c->pfc[n]; - pf->c = c; - if (pthread_mutex_init(&pf->td.lock, NULL)) goto error; - if (pthread_cond_init(&pf->td.cond, NULL)) { - pthread_mutex_destroy(&pf->td.lock); - goto error; - } - if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) { - pthread_cond_destroy(&c->postfilter_thread.cond); - pthread_mutex_destroy(&c->postfilter_thread.lock); - goto error; - } - pf->td.inited = 1; + if (pthread_cond_init(&c->task_thread.delayed_fg.cond, NULL)) 
{ + pthread_cond_destroy(&c->task_thread.cond); + pthread_mutex_destroy(&c->task_thread.lock); + goto error; } + c->task_thread.cur = c->n_fc; + atomic_init(&c->task_thread.reset_task_cur, UINT_MAX); + atomic_init(&c->task_thread.cond_signaled, 0); + c->task_thread.inited = 1; } if (c->n_fc > 1) { @@ -211,59 +215,34 @@ calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed)); if (!c->frame_thread.out_delayed) goto error; } - for (int n = 0; n < s->n_frame_threads; n++) { + for (unsigned n = 0; n < c->n_fc; n++) { Dav1dFrameContext *const f = &c->fc[n]; + if (c->n_tc > 1) + if (pthread_cond_init(&f->task_thread.cond, NULL)) goto error; f->c = c; + f->task_thread.ttd = &c->task_thread; f->lf.last_sharpness = -1; - f->n_tc = s->n_tile_threads; - f->tc = dav1d_alloc_aligned(sizeof(*f->tc) * s->n_tile_threads, 64); - if (!f->tc) goto error; - memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads); - if (f->n_tc > 1) { - if (pthread_mutex_init(&f->tile_thread.lock, NULL)) goto error; - if (pthread_cond_init(&f->tile_thread.cond, NULL)) { - pthread_mutex_destroy(&f->tile_thread.lock); - goto error; - } - if (pthread_cond_init(&f->tile_thread.icond, NULL)) { - pthread_mutex_destroy(&f->tile_thread.lock); - pthread_cond_destroy(&f->tile_thread.cond); - goto error; - } - f->tile_thread.inited = 1; - } - for (int m = 0; m < s->n_tile_threads; m++) { - Dav1dTileContext *const t = &f->tc[m]; - t->f = f; - memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc)); - if (f->n_tc > 1) { - if (pthread_mutex_init(&t->tile_thread.td.lock, NULL)) goto error; - if (pthread_cond_init(&t->tile_thread.td.cond, NULL)) { - pthread_mutex_destroy(&t->tile_thread.td.lock); - goto error; - } - t->tile_thread.fttd = &f->tile_thread; - if (pthread_create(&t->tile_thread.td.thread, &thread_attr, dav1d_tile_task, t)) { - pthread_cond_destroy(&t->tile_thread.td.cond); - pthread_mutex_destroy(&t->tile_thread.td.lock); - goto error; - } - t->tile_thread.td.inited = 1; - } - } dav1d_refmvs_init(&f->rf); - if (c->n_fc > 1) { - if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error; - if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) { - pthread_mutex_destroy(&f->frame_thread.td.lock); + } + + for (unsigned m = 0; m < c->n_tc; m++) { + Dav1dTaskContext *const t = &c->tc[m]; + t->f = &c->fc[0]; + t->task_thread.ttd = &c->task_thread; + t->c = c; + memset(t->cf_16bpc, 0, sizeof(t->cf_16bpc)); + if (c->n_tc > 1) { + if (pthread_mutex_init(&t->task_thread.td.lock, NULL)) goto error; + if (pthread_cond_init(&t->task_thread.td.cond, NULL)) { + pthread_mutex_destroy(&t->task_thread.td.lock); goto error; } - if (pthread_create(&f->frame_thread.td.thread, &thread_attr, dav1d_frame_task, f)) { - pthread_cond_destroy(&f->frame_thread.td.cond); - pthread_mutex_destroy(&f->frame_thread.td.lock); + if (pthread_create(&t->task_thread.td.thread, &thread_attr, dav1d_worker_task, t)) { + pthread_cond_destroy(&t->task_thread.td.cond); + pthread_mutex_destroy(&t->task_thread.td.lock); goto error; } - f->frame_thread.td.inited = 1; + t->task_thread.td.inited = 1; } } dav1d_refmvs_dsp_init(&c->refmvs_dsp); @@ -298,6 +277,7 @@ Dav1dSettings s; dav1d_default_settings(&s); + s.n_threads = 1; s.logger.callback = NULL; Dav1dContext *c; @@ -333,61 +313,53 @@ return res; } -static int output_image(Dav1dContext *const c, Dav1dPicture *const out, - Dav1dPicture *const in) +static int has_grain(const Dav1dPicture *const pic) { - const Dav1dFilmGrainData *fgdata = &in->frame_hdr->film_grain.data; - int has_grain = fgdata->num_y_points || fgdata->num_uv_points[0] 
|| - fgdata->num_uv_points[1]; - - // If there is nothing to be done, skip the allocation/copy - if (!c->apply_grain || !has_grain) { - dav1d_picture_move_ref(out, in); - return 0; - } + const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data; + return fgdata->num_y_points || fgdata->num_uv_points[0] || + fgdata->num_uv_points[1]; +} - // Apply film grain to a new copy of the image to avoid corrupting refs - int res = dav1d_picture_alloc_copy(c, out, in->p.w, in); - if (res < 0) { - dav1d_picture_unref_internal(in); - dav1d_picture_unref_internal(out); - return res; - } +static int output_image(Dav1dContext *const c, Dav1dPicture *const out) +{ + int res = 0; - switch (out->p.bpc) { -#if CONFIG_8BPC - case 8: - dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in); - break; -#endif -#if CONFIG_16BPC - case 10: - case 12: - dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in); - break; -#endif - default: - assert(0); + Dav1dThreadPicture *const in = (c->all_layers || !c->max_spatial_id) + ? &c->out : &c->cache; + if (!c->apply_grain || !has_grain(&in->p)) { + dav1d_picture_move_ref(out, &in->p); + dav1d_thread_picture_unref(in); + goto end; } - dav1d_picture_unref_internal(in); - return 0; + res = dav1d_apply_grain(c, out, &in->p); + dav1d_thread_picture_unref(in); +end: + if (!c->all_layers && c->max_spatial_id && c->out.p.data[0]) { + dav1d_thread_picture_move_ref(in, &c->out); + } + return res; } -static int output_picture_ready(Dav1dContext *const c) { - - if (!c->out.data[0]) return 0; - - // skip lower spatial layers - if (c->operating_point_idc && !c->all_layers) { - const int max_spatial_id = ulog2(c->operating_point_idc >> 8); - if (max_spatial_id > c->out.frame_hdr->spatial_id) { - dav1d_picture_unref_internal(&c->out); +static int output_picture_ready(Dav1dContext *const c, const int drain) { + if (c->cached_error) return 1; + if (!c->all_layers && c->max_spatial_id) { + if (c->out.p.data[0] && c->cache.p.data[0]) { + if (c->max_spatial_id == c->cache.p.frame_hdr->spatial_id || + c->out.flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT) + return 1; + dav1d_thread_picture_unref(&c->cache); + dav1d_thread_picture_move_ref(&c->cache, &c->out); + return 0; + } else if (c->cache.p.data[0] && drain) { + return 1; + } else if (c->out.p.data[0]) { + dav1d_thread_picture_move_ref(&c->cache, &c->out); return 0; } } - return 1; + return !!c->out.p.data[0]; } static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) { @@ -395,29 +367,49 @@ do { const unsigned next = c->frame_thread.next; Dav1dFrameContext *const f = &c->fc[next]; - pthread_mutex_lock(&f->frame_thread.td.lock); + pthread_mutex_lock(&c->task_thread.lock); while (f->n_tile_data > 0) - pthread_cond_wait(&f->frame_thread.td.cond, - &f->frame_thread.td.lock); - pthread_mutex_unlock(&f->frame_thread.td.lock); + pthread_cond_wait(&f->task_thread.cond, + &f->task_thread.ttd->lock); Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next]; + if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) { + if (atomic_load(&c->task_thread.first) + 1U < c->n_fc) + atomic_fetch_add(&c->task_thread.first, 1U); + else + atomic_store(&c->task_thread.first, 0); + if (c->task_thread.cur && c->task_thread.cur < c->n_fc) + c->task_thread.cur--; + } if (++c->frame_thread.next == c->n_fc) c->frame_thread.next = 0; + pthread_mutex_unlock(&c->task_thread.lock); + const int error = f->task_thread.retval; + if (error) { + f->task_thread.retval = 0; + dav1d_data_props_copy(&c->cached_error_props, 
&out_delayed->p.m); + dav1d_thread_picture_unref(out_delayed); + return error; + } if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); - if (out_delayed->visible && progress != FRAME_ERROR) { - dav1d_picture_ref(&c->out, &out_delayed->p); + if ((out_delayed->visible || c->output_invisible_frames) && + progress != FRAME_ERROR) + { + dav1d_thread_picture_ref(&c->out, out_delayed); c->event_flags |= dav1d_picture_get_event_flags(out_delayed); } dav1d_thread_picture_unref(out_delayed); - if (output_picture_ready(c)) - return output_image(c, out, &c->out); + if (output_picture_ready(c, 0)) + return output_image(c, out); } } while (++drain_count < c->n_fc); + if (output_picture_ready(c, 1)) + return output_image(c, out); + return DAV1D_ERR(EAGAIN); } @@ -426,7 +418,7 @@ int res; Dav1dData *const in = &c->in; - if (output_picture_ready(c)) + if (output_picture_ready(c, 0)) return 0; while (in->sz > 0) { @@ -439,7 +431,7 @@ in->data += res; if (!in->sz) dav1d_data_unref_internal(in); } - if (output_picture_ready(c)) + if (output_picture_ready(c, 0)) break; if (res < 0) return res; @@ -479,8 +471,14 @@ if (res < 0) return res; - if (output_picture_ready(c)) - return output_image(c, out, &c->out); + if (c->cached_error) { + const int res = c->cached_error; + c->cached_error = 0; + return res; + } + + if (output_picture_ready(c, c->n_fc == 1)) + return output_image(c, out); if (c->n_fc > 1 && drain) return drain_picture(c, out); @@ -488,9 +486,56 @@ return DAV1D_ERR(EAGAIN); } +int dav1d_apply_grain(Dav1dContext *const c, Dav1dPicture *const out, + const Dav1dPicture *const in) +{ + validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL)); + + if (!has_grain(in)) { + dav1d_picture_ref(out, in); + return 0; + } + + int res = dav1d_picture_alloc_copy(c, out, in->p.w, in); + if (res < 0) goto error; + + if (c->n_tc > 1) { + dav1d_task_delayed_fg(c, out, in); + } else { + switch (out->p.bpc) { +#if CONFIG_8BPC + case 8: + dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in); + break; +#endif +#if CONFIG_16BPC + case 10: + case 12: + dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in); + break; +#endif + default: abort(); + } + } + + return 0; + +error: + dav1d_picture_unref_internal(out); + return res; +} + void dav1d_flush(Dav1dContext *const c) { dav1d_data_unref_internal(&c->in); + if (c->out.p.data[0]) + dav1d_thread_picture_unref(&c->out); + if (c->cache.p.data[0]) + dav1d_thread_picture_unref(&c->cache); + c->drain = 0; + c->cached_error = 0; for (int i = 0; i < 8; i++) { if (c->refs[i].p.p.data[0]) @@ -510,51 +555,46 @@ dav1d_ref_dec(&c->content_light_ref); dav1d_ref_dec(&c->itut_t35_ref); - if (c->n_fc == 1 && c->n_pfc == 1) return; + dav1d_data_props_unref_internal(&c->cached_error_props); - // wait for threads to complete flushing - if (c->n_pfc > 1) - pthread_mutex_lock(&c->postfilter_thread.lock); + if (c->n_fc == 1 && c->n_tc == 1) return; atomic_store(c->flush, 1); - if (c->n_pfc > 1) { - pthread_cond_broadcast(&c->postfilter_thread.cond); - pthread_mutex_unlock(&c->postfilter_thread.lock); - } - if (c->n_fc == 1) goto skip_ft_flush; - for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) { - if (next == c->n_fc) next = 0; - Dav1dFrameContext *const f = &c->fc[next]; - pthread_mutex_lock(&f->frame_thread.td.lock); - if (f->n_tile_data > 0) { - while (f->n_tile_data > 0) - 
pthread_cond_wait(&f->frame_thread.td.cond, - &f->frame_thread.td.lock); - assert(!f->cur.data[0]); - } - pthread_mutex_unlock(&f->frame_thread.td.lock); - Dav1dThreadPicture *const out_delayed = - &c->frame_thread.out_delayed[next]; - if (out_delayed->p.data[0]) - dav1d_thread_picture_unref(out_delayed); + + // stop running tasks in worker threads + if (c->n_tc > 1) { + pthread_mutex_lock(&c->task_thread.lock); + for (unsigned i = 0; i < c->n_tc; i++) { + Dav1dTaskContext *const tc = &c->tc[i]; + while (!tc->task_thread.flushed) { + pthread_cond_wait(&tc->task_thread.td.cond, &c->task_thread.lock); + } + } + for (unsigned i = 0; i < c->n_fc; i++) { + c->fc[i].task_thread.task_head = NULL; + c->fc[i].task_thread.task_tail = NULL; + c->fc[i].task_thread.task_cur_prev = NULL; + } + atomic_init(&c->task_thread.first, 0); + c->task_thread.cur = c->n_fc; + atomic_store(&c->task_thread.reset_task_cur, UINT_MAX); + atomic_store(&c->task_thread.cond_signaled, 0); + pthread_mutex_unlock(&c->task_thread.lock); } - c->frame_thread.next = 0; -skip_ft_flush: - if (c->n_pfc > 1) { - for (unsigned i = 0; i < c->n_pfc; ++i) { - Dav1dPostFilterContext *const pf = &c->pfc[i]; - pthread_mutex_lock(&pf->td.lock); - if (!pf->flushed) - pthread_cond_wait(&pf->td.cond, &pf->td.lock); - pf->flushed = 0; - pthread_mutex_unlock(&pf->td.lock); - } - pthread_mutex_lock(&c->postfilter_thread.lock); - c->postfilter_thread.tasks = NULL; - pthread_mutex_unlock(&c->postfilter_thread.lock); - for (unsigned i = 0; i < c->n_fc; ++i) { - freep(&c->fc[i].lf.thread.tasks); - c->fc[i].lf.thread.num_tasks = 0; + + // wait for threads to complete flushing + if (c->n_fc > 1) { + for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) { + if (next == c->n_fc) next = 0; + Dav1dFrameContext *const f = &c->fc[next]; + dav1d_decode_frame_exit(f, -1); + f->n_tile_data = 0; + f->task_thread.retval = 0; + Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next]; + if (out_delayed->p.data[0]) { + dav1d_thread_picture_unref(out_delayed); + } } + c->frame_thread.next = 0; } atomic_store(c->flush, 0); } @@ -570,82 +610,48 @@ if (flush) dav1d_flush(c); - if (c->pfc) { - struct PostFilterThreadData *pftd = &c->postfilter_thread; - if (pftd->inited) { - pthread_mutex_lock(&pftd->lock); - for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) - c->pfc[n].die = 1; - pthread_cond_broadcast(&pftd->cond); - pthread_mutex_unlock(&pftd->lock); - for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) { - pthread_join(c->pfc[n].td.thread, NULL); - pthread_cond_destroy(&c->pfc[n].td.cond); - pthread_mutex_destroy(&c->pfc[n].td.lock); - } - pthread_cond_destroy(&pftd->cond); - pthread_mutex_destroy(&pftd->lock); + if (c->tc) { + struct TaskThreadData *ttd = &c->task_thread; + if (ttd->inited) { + pthread_mutex_lock(&ttd->lock); + for (unsigned n = 0; n < c->n_tc && c->tc[n].task_thread.td.inited; n++) + c->tc[n].task_thread.die = 1; + pthread_cond_broadcast(&ttd->cond); + pthread_mutex_unlock(&ttd->lock); + for (unsigned n = 0; n < c->n_tc; n++) { + Dav1dTaskContext *const pf = &c->tc[n]; + if (!pf->task_thread.td.inited) break; + pthread_join(pf->task_thread.td.thread, NULL); + pthread_cond_destroy(&pf->task_thread.td.cond); + pthread_mutex_destroy(&pf->task_thread.td.lock); + } + pthread_cond_destroy(&ttd->delayed_fg.cond); + pthread_cond_destroy(&ttd->cond); + pthread_mutex_destroy(&ttd->lock); } - dav1d_free_aligned(c->pfc); + dav1d_free_aligned(c->tc); } for (unsigned n = 0; c->fc && n < c->n_fc; n++) { 
Dav1dFrameContext *const f = &c->fc[n]; // clean-up threading stuff - if (c->n_fc > 1 && f->frame_thread.td.inited) { - pthread_mutex_lock(&f->frame_thread.td.lock); - f->frame_thread.die = 1; - pthread_cond_signal(&f->frame_thread.td.cond); - pthread_mutex_unlock(&f->frame_thread.td.lock); - pthread_join(f->frame_thread.td.thread, NULL); + if (c->n_fc > 1) { + freep(&f->tile_thread.lowest_pixel_mem); freep(&f->frame_thread.b); dav1d_freep_aligned(&f->frame_thread.pal_idx); dav1d_freep_aligned(&f->frame_thread.cf); freep(&f->frame_thread.tile_start_off); dav1d_freep_aligned(&f->frame_thread.pal); freep(&f->frame_thread.cbi); - pthread_mutex_destroy(&f->frame_thread.td.lock); - pthread_cond_destroy(&f->frame_thread.td.cond); } - if (f->n_tc > 1 && f->tc && f->tile_thread.inited) { - pthread_mutex_lock(&f->tile_thread.lock); - for (int m = 0; m < f->n_tc; m++) { - Dav1dTileContext *const t = &f->tc[m]; - t->tile_thread.die = 1; - // mark not created tile threads as available - if (!t->tile_thread.td.inited) - f->tile_thread.available |= 1ULL<tile_thread.cond); - while (f->tile_thread.available != ~0ULL >> (64 - f->n_tc)) - pthread_cond_wait(&f->tile_thread.icond, - &f->tile_thread.lock); - pthread_mutex_unlock(&f->tile_thread.lock); - for (int m = 0; m < f->n_tc; m++) { - Dav1dTileContext *const t = &f->tc[m]; - if (f->n_tc > 1 && t->tile_thread.td.inited) { - pthread_join(t->tile_thread.td.thread, NULL); - pthread_mutex_destroy(&t->tile_thread.td.lock); - pthread_cond_destroy(&t->tile_thread.td.cond); - } - } - pthread_mutex_destroy(&f->tile_thread.lock); - pthread_cond_destroy(&f->tile_thread.cond); - pthread_cond_destroy(&f->tile_thread.icond); - freep(&f->tile_thread.task_idx_to_sby_and_tile_idx); - } - for (int m = 0; f->ts && m < f->n_ts; m++) { - Dav1dTileState *const ts = &f->ts[m]; - pthread_cond_destroy(&ts->tile_thread.cond); - pthread_mutex_destroy(&ts->tile_thread.lock); - } - if (f->lf.thread.inited) { - freep(&f->lf.thread.tasks); - pthread_cond_destroy(&f->lf.thread.cond); + if (c->n_tc > 1) { + pthread_cond_destroy(&f->task_thread.cond); } + freep(&f->frame_thread.frame_progress); + freep(&f->task_thread.tasks); + freep(&f->task_thread.tile_tasks[0]); dav1d_free_aligned(f->ts); - dav1d_free_aligned(f->tc); dav1d_free_aligned(f->ipred_edge[0]); free(f->a); free(f->tile); @@ -653,12 +659,12 @@ free(f->lf.lr_mask); free(f->lf.level); free(f->lf.tx_lpf_right_edge[0]); + free(f->lf.start_of_tile_row); dav1d_refmvs_clear(&f->rf); dav1d_free_aligned(f->lf.cdef_line_buf); - dav1d_free_aligned(f->lf.lr_lpf_line[0]); + dav1d_free_aligned(f->lf.lr_line_buf); } dav1d_free_aligned(c->fc); - dav1d_data_unref_internal(&c->in); if (c->n_fc > 1 && c->frame_thread.out_delayed) { for (unsigned n = 0; n < c->n_fc; n++) if (c->frame_thread.out_delayed[n].p.data[0]) @@ -701,6 +707,17 @@ return 0; } +int dav1d_get_decode_error_data_props(Dav1dContext *const c, Dav1dDataProps *const out) { + validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); + + dav1d_data_props_unref_internal(out); + *out = c->cached_error_props; + dav1d_data_props_set_defaults(&c->cached_error_props); + + return 0; +} + void dav1d_picture_unref(Dav1dPicture *const p) { dav1d_picture_unref_internal(p); } @@ -733,3 +750,7 @@ void dav1d_data_unref(Dav1dData *const buf) { dav1d_data_unref_internal(buf); } + +void dav1d_data_props_unref(Dav1dDataProps *const props) { + dav1d_data_props_unref_internal(props); +} diff -Nru dav1d-0.9.2/src/looprestoration.h 
dav1d-1.0.0/src/looprestoration.h --- dav1d-0.9.2/src/looprestoration.h 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/looprestoration.h 2022-03-18 14:31:55.994356000 +0000 @@ -64,8 +64,8 @@ #define decl_lr_filter_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const_left_pixel_row left, \ - const pixel *lpf, ptrdiff_t lpf_stride, \ - int w, int h, const LooprestorationParams *params, \ + const pixel *lpf, int w, int h, \ + const LooprestorationParams *params, \ enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) typedef decl_lr_filter_fn(*looprestorationfilter_fn); diff -Nru dav1d-0.9.2/src/looprestoration_tmpl.c dav1d-1.0.0/src/looprestoration_tmpl.c --- dav1d-0.9.2/src/looprestoration_tmpl.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/looprestoration_tmpl.c 2022-03-18 14:31:55.994356000 +0000 @@ -40,9 +40,9 @@ // TODO Reuse p when no padding is needed (add and remove lpf pixels in p) // TODO Chroma only requires 2 rows of padding. static NOINLINE void -padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride, - const pixel (*left)[4], const pixel *lpf, const ptrdiff_t lpf_stride, - int unit_w, const int stripe_h, const enum LrEdgeFlags edges) +padding(pixel *dst, const pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, int unit_w, + const int stripe_h, const enum LrEdgeFlags edges) { const int have_left = !!(edges & LR_HAVE_LEFT); const int have_right = !!(edges & LR_HAVE_RIGHT); @@ -56,7 +56,7 @@ if (edges & LR_HAVE_TOP) { // Copy previous loop filtered rows const pixel *const above_1 = lpf; - const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride); + const pixel *const above_2 = above_1 + PXSTRIDE(stride); pixel_copy(dst_l, above_1, unit_w); pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w); pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w); @@ -75,14 +75,14 @@ pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE; if (edges & LR_HAVE_BOTTOM) { // Copy next loop filtered rows - const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride); - const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride); + const pixel *const below_1 = lpf + 6 * PXSTRIDE(stride); + const pixel *const below_2 = below_1 + PXSTRIDE(stride); pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w); pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w); pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w); } else { // Pad with last row - const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride); + const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(stride); pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w); pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w); pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w); @@ -97,7 +97,7 @@ for (int j = 0; j < stripe_h; j++) { pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left); dst_tl += REST_UNIT_STRIDE; - p += PXSTRIDE(p_stride); + p += PXSTRIDE(stride); } if (!have_right) { @@ -131,10 +131,9 @@ // (since first and last tops are always 0 for chroma) // FIXME Could implement a version that requires less temporary memory // (should be possible to implement with only 6 rows of temp storage) -static void wiener_c(pixel *p, const ptrdiff_t p_stride, +static void wiener_c(pixel *p, const ptrdiff_t stride, const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, - const int w, const int h, + const pixel *lpf, const int w, const int h, const LooprestorationParams *const 
params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { @@ -143,7 +142,7 @@ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; pixel *tmp_ptr = tmp; - padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + padding(tmp, p, stride, left, lpf, w, h, edges); // Values stored between horizontal and vertical filtering don't // fit in a uint8_t. @@ -184,7 +183,7 @@ sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k]; } - p[j * PXSTRIDE(p_stride) + i] = + p[j * PXSTRIDE(stride) + i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v); } } @@ -382,11 +381,11 @@ const unsigned p = imax(a * n - b * b, 0); const unsigned z = (p * s + (1 << 19)) >> 20; - const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)]; + const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)]; // This is where we invert A and B, so that B is of size coef. AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; - BB[i] = 256 - x; + BB[i] = x; } AA += step * REST_UNIT_STRIDE; BB += step * REST_UNIT_STRIDE; @@ -403,7 +402,7 @@ for (int i = 0; i < w; i++) { const int a = SIX_NEIGHBORS(B, i); const int b = SIX_NEIGHBORS(A, i); - dst[i] = (a * src[i] + b + (1 << 8)) >> 9; + dst[i] = (b - a * src[i] + (1 << 8)) >> 9; } dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */; src += REST_UNIT_STRIDE; @@ -412,7 +411,7 @@ for (int i = 0; i < w; i++) { const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5; const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5; - dst[i] = (a * src[i] + b + (1 << 7)) >> 8; + dst[i] = (b - a * src[i] + (1 << 7)) >> 8; } dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */; src += REST_UNIT_STRIDE; @@ -423,7 +422,7 @@ for (int i = 0; i < w; i++) { const int a = SIX_NEIGHBORS(B, i); const int b = SIX_NEIGHBORS(A, i); - dst[i] = (a * src[i] + b + (1 << 8)) >> 9; + dst[i] = (b - a * src[i] + (1 << 8)) >> 9; } } #undef SIX_NEIGHBORS @@ -436,7 +435,7 @@ for (int i = 0; i < w; i++) { const int a = EIGHT_NEIGHBORS(B, i); const int b = EIGHT_NEIGHBORS(A, i); - dst[i] = (a * src[i] + b + (1 << 8)) >> 9; + dst[i] = (b - a * src[i] + (1 << 8)) >> 9; } dst += 384; src += REST_UNIT_STRIDE; @@ -447,9 +446,9 @@ #undef EIGHT_NEIGHBORS } -static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride, +static void sgr_5x5_c(pixel *p, const ptrdiff_t stride, const pixel (*const left)[4], const pixel *lpf, - const ptrdiff_t lpf_stride, const int w, const int h, + const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { @@ -461,48 +460,46 @@ // maximum restoration width of 384 (256 * 1.5) coef dst[64 * 384]; - padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + padding(tmp, p, stride, left, lpf, w, h, edges); selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, params->sgr.s0 HIGHBD_TAIL_SUFFIX); const int w0 = params->sgr.w0; for (int j = 0; j < h; j++) { for (int i = 0; i < w; i++) { - const int u = (p[i] << 4); - const int v = (u << 7) + w0 * (dst[j * 384 + i] - u); - p[i] = iclip_pixel((v + (1 << 10)) >> 11); + const int v = w0 * dst[j * 384 + i]; + p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11)); } - p += PXSTRIDE(p_stride); + p += PXSTRIDE(stride); } } -static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride, +static void sgr_3x3_c(pixel *p, const ptrdiff_t stride, const pixel (*const left)[4], const pixel *lpf, - const ptrdiff_t lpf_stride, const int w, const int h, + const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { pixel tmp[70 /*(64 + 3 + 3)*/ * 
REST_UNIT_STRIDE]; coef dst[64 * 384]; - padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + padding(tmp, p, stride, left, lpf, w, h, edges); selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, params->sgr.s1 HIGHBD_TAIL_SUFFIX); const int w1 = params->sgr.w1; for (int j = 0; j < h; j++) { for (int i = 0; i < w; i++) { - const int u = (p[i] << 4); - const int v = (u << 7) + w1 * (dst[j * 384 + i] - u); - p[i] = iclip_pixel((v + (1 << 10)) >> 11); + const int v = w1 * dst[j * 384 + i]; + p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11)); } - p += PXSTRIDE(p_stride); + p += PXSTRIDE(stride); } } -static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride, +static void sgr_mix_c(pixel *p, const ptrdiff_t stride, const pixel (*const left)[4], const pixel *lpf, - const ptrdiff_t lpf_stride, const int w, const int h, + const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { @@ -510,7 +507,7 @@ coef dst0[64 * 384]; coef dst1[64 * 384]; - padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + padding(tmp, p, stride, left, lpf, w, h, edges); selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25, params->sgr.s0 HIGHBD_TAIL_SUFFIX); selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, @@ -520,12 +517,10 @@ const int w1 = params->sgr.w1; for (int j = 0; j < h; j++) { for (int i = 0; i < w; i++) { - const int u = (p[i] << 4); - const int v = (u << 7) + w0 * (dst0[j * 384 + i] - u) + - w1 * (dst1[j * 384 + i] - u); - p[i] = iclip_pixel((v + (1 << 10)) >> 11); + const int v = w0 * dst0[j * 384 + i] + w1 * dst1[j * 384 + i]; + p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11)); } - p += PXSTRIDE(p_stride); + p += PXSTRIDE(stride); } } diff -Nru dav1d-0.9.2/src/lr_apply.h dav1d-1.0.0/src/lr_apply.h --- dav1d-0.9.2/src/lr_apply.h 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/lr_apply.h 2022-03-18 14:31:55.994356000 +0000 @@ -35,8 +35,11 @@ #include "src/internal.h" -void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, - /*const*/ pixel *const src[3], int sby); +enum LrRestorePlanes { + LR_RESTORE_Y = 1 << 0, + LR_RESTORE_U = 1 << 1, + LR_RESTORE_V = 1 << 2, +}; void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], int sby); diff -Nru dav1d-0.9.2/src/lr_apply_tmpl.c dav1d-1.0.0/src/lr_apply_tmpl.c --- dav1d-0.9.2/src/lr_apply_tmpl.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/lr_apply_tmpl.c 2022-03-18 14:31:55.994356000 +0000 @@ -33,124 +33,6 @@ #include "src/lr_apply.h" -enum LrRestorePlanes { - LR_RESTORE_Y = 1 << 0, - LR_RESTORE_U = 1 << 1, - LR_RESTORE_V = 1 << 2, -}; - -// The loop filter buffer stores 12 rows of pixels. A superblock block will -// contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above -// and 2 below) the final 4 rows are used to swap the bottom of the last -// stripe with the top of the next super block row. -static void backup_lpf(const Dav1dFrameContext *const f, - pixel *dst, const ptrdiff_t dst_stride, - const pixel *src, const ptrdiff_t src_stride, - const int ss_ver, const int sb128, - int row, const int row_h, const int src_w, - const int h, const int ss_hor, const int pft) -{ - const int dst_w = f->frame_hdr->super_res.enabled ? - (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w; - - // The first stripe of the frame is shorter by 8 luma pixel rows. 
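[Aside, not part of the patch: the rewritten backup_lpf() in the lf_apply_tmpl.c hunk further up now serves both the loop-restoration and the CDEF line buffers, which is why its stripe height gained the "cdef_backup & sb128" shift compared to the fixed 64-row stripe in the old backup_lpf() being removed in this lr_apply_tmpl.c hunk. A minimal C restatement of just that computation; the helper name is ours.]

    /* Stripe height used by backup_lpf(), restated from the hunk above.
     * lr_backup is 1 when filling the loop-restoration line buffer and 0
     * when filling the CDEF line buffer; sb128/ss_ver as in the decoder. */
    static int backup_stripe_height(const int lr_backup, const int sb128,
                                    const int row, const int ss_ver)
    {
        const int cdef_backup = !lr_backup;
        /* LR stripes are 64 luma rows; the CDEF copy follows the superblock
         * size (128 when sb128 is set). The first stripe of the frame is
         * 8 luma rows shorter, and chroma is halved when subsampled. */
        return ((64 << (cdef_backup & sb128)) - 8 * !row) >> ss_ver;
    }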
- int stripe_h = (64 - 8 * !row) >> ss_ver; - src += (stripe_h - 2) * PXSTRIDE(src_stride); - - if (!pft) { - if (row) { - const int top = 4 << sb128; - // Copy the top part of the stored loop filtered pixels from the - // previous sb row needed above the first stripe of this sb row. - pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], - &dst[PXSTRIDE(dst_stride) * top], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], - &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], - &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], - &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); - } - dst += 4 * PXSTRIDE(dst_stride); - } - - if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { - while (row + stripe_h <= row_h) { - const int n_lines = 4 - (row + stripe_h + 1 == h); - f->dsp->mc.resize(dst, dst_stride, src, src_stride, - dst_w, n_lines, src_w, f->resize_step[ss_hor], - f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX); - row += stripe_h; // unmodified stripe_h for the 1st stripe - stripe_h = 64 >> ss_ver; - src += stripe_h * PXSTRIDE(src_stride); - dst += n_lines * PXSTRIDE(dst_stride); - if (n_lines == 3) { - pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w); - dst += PXSTRIDE(dst_stride); - } - } - } else { - while (row + stripe_h <= row_h) { - const int n_lines = 4 - (row + stripe_h + 1 == h); - for (int i = 0; i < 4; i++) { - pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] : - src, src_w); - dst += PXSTRIDE(dst_stride); - src += PXSTRIDE(src_stride); - } - row += stripe_h; // unmodified stripe_h for the 1st stripe - stripe_h = 64 >> ss_ver; - src += (stripe_h - 4) * PXSTRIDE(src_stride); - } - } -} - -void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, - /*const*/ pixel *const src[3], const int sby) -{ - const int pft = f->c->n_pfc > 1; - const int offset = 8 * !!sby; - const ptrdiff_t *const src_stride = f->cur.stride; - const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel); - pixel *const dst[3] = { - f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride), - f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride), - f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride) - }; - - // TODO Also check block level restore type to reduce copying. 
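[Aside, not part of the patch: the old dav1d_lr_copy_lpf() removed here only backed lines up for planes with active loop restoration; its replacement dav1d_copy_lpf() also does so whenever CDEF is enabled, and the LR_RESTORE_* bits move from this file into src/lr_apply.h (see the lr_apply.h hunk just above). A small sketch of the per-plane gate; the helper name is ours and the enum is repeated only to keep the sketch self-contained.]

    enum LrRestorePlanes {
        LR_RESTORE_Y = 1 << 0,
        LR_RESTORE_U = 1 << 1,
        LR_RESTORE_V = 1 << 2,
    };

    /* Outer condition used by dav1d_copy_lpf() before backing up a plane:
     * lines are now also saved when CDEF is enabled, not only when loop
     * restoration is active on that plane. */
    static int plane_needs_lpf_backup(const int cdef_enabled,
                                      const int restore_planes,
                                      const enum LrRestorePlanes plane_bit)
    {
        return cdef_enabled || (restore_planes & plane_bit);
    }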
- const int restore_planes = f->lf.restore_planes; - - if (restore_planes & LR_RESTORE_Y) { - const int h = f->cur.p.h; - const int w = f->bw << 2; - const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); - const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; - backup_lpf(f, dst[0], lr_stride, - src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], - 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft); - } - if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) { - const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; - const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; - const int h = (f->cur.p.h + ss_ver) >> ss_ver; - const int w = f->bw << (2 - ss_hor); - const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); - const int offset_uv = offset >> ss_ver; - const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; - if (restore_planes & LR_RESTORE_U) { - backup_lpf(f, dst[1], lr_stride, - src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], - ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft); - } - if (restore_planes & LR_RESTORE_V) { - backup_lpf(f, dst[2], lr_stride, - src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], - ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft); - } - } -} - static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, const pixel (*left)[4], int x, int y, const int plane, const int unit_w, const int row_h, @@ -159,10 +41,11 @@ const Dav1dDSPContext *const dsp = f->dsp; const int chroma = !!plane; const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420); - const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma]; - const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31); + const ptrdiff_t stride = f->sr_cur.p.stride[chroma]; const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128); - const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x; + const int have_tt = f->c->n_tc > 1; + const pixel *lpf = f->lf.lr_lpf_line[plane] + + have_tt * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(stride) + x; // The first stripe of the frame is shorter by 8 luma pixel rows. int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y); @@ -201,16 +84,15 @@ while (y + stripe_h <= row_h) { // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h) edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM; - lr_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h, - ¶ms, edges HIGHBD_CALL_SUFFIX); + lr_fn(p, stride, left, lpf, unit_w, stripe_h, ¶ms, edges HIGHBD_CALL_SUFFIX); left += stripe_h; y += stripe_h; - p += stripe_h * PXSTRIDE(p_stride); + p += stripe_h * PXSTRIDE(stride); edges |= LR_HAVE_TOP; stripe_h = imin(64 >> ss_ver, row_h - y); if (stripe_h == 0) break; - lpf += 4 * PXSTRIDE(lpf_stride); + lpf += 4 * PXSTRIDE(stride); } } @@ -246,7 +128,8 @@ // TODO Support chroma subsampling. const int shift_hor = 7 - ss_hor; - pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4]; + /* maximum sbrow height is 128 + 8 rows offset */ + ALIGN_STK_16(pixel, pre_lr_border, 2, [128 + 8][4]); const Av1RestorationUnit *lr[2]; enum LrEdgeFlags edges = (y > 0 ? 
LR_HAVE_TOP : 0) | LR_HAVE_RIGHT; diff -Nru dav1d-0.9.2/src/meson.build dav1d-1.0.0/src/meson.build --- dav1d-0.9.2/src/meson.build 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/meson.build 2022-03-18 14:31:55.994356000 +0000 @@ -58,7 +58,7 @@ 'cdef_apply_tmpl.c', 'cdef_tmpl.c', 'fg_apply_tmpl.c', - 'film_grain_tmpl.c', + 'filmgrain_tmpl.c', 'ipred_prepare_tmpl.c', 'ipred_tmpl.c', 'itx_tmpl.c', @@ -96,7 +96,7 @@ ) libdav1d_tmpl_sources += files( 'arm/cdef_init_tmpl.c', - 'arm/film_grain_init_tmpl.c', + 'arm/filmgrain_init_tmpl.c', 'arm/ipred_init_tmpl.c', 'arm/itx_init_tmpl.c', 'arm/loopfilter_init_tmpl.c', @@ -116,7 +116,7 @@ if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'arm/64/cdef.S', - 'arm/64/film_grain.S', + 'arm/64/filmgrain.S', 'arm/64/ipred.S', 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', @@ -127,7 +127,7 @@ if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( 'arm/64/cdef16.S', - 'arm/64/film_grain16.S', + 'arm/64/filmgrain16.S', 'arm/64/ipred16.S', 'arm/64/itx16.S', 'arm/64/loopfilter16.S', @@ -147,7 +147,7 @@ if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'arm/32/cdef.S', - 'arm/32/film_grain.S', + 'arm/32/filmgrain.S', 'arm/32/ipred.S', 'arm/32/loopfilter.S', 'arm/32/looprestoration.S', @@ -158,7 +158,7 @@ if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( 'arm/32/cdef16.S', - 'arm/32/film_grain16.S', + 'arm/32/filmgrain16.S', 'arm/32/ipred16.S', 'arm/32/itx16.S', 'arm/32/loopfilter16.S', @@ -183,7 +183,7 @@ libdav1d_tmpl_sources += files( 'x86/cdef_init_tmpl.c', - 'x86/film_grain_init_tmpl.c', + 'x86/filmgrain_init_tmpl.c', 'x86/ipred_init_tmpl.c', 'x86/itx_init_tmpl.c', 'x86/loopfilter_init_tmpl.c', @@ -206,12 +206,17 @@ if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'x86/cdef_avx512.asm', + 'x86/filmgrain_avx512.asm', + 'x86/ipred_avx512.asm', + 'x86/itx_avx512.asm', + 'x86/loopfilter_avx512.asm', + 'x86/looprestoration_avx512.asm', 'x86/mc_avx512.asm', - 'x86/mc_avx2.asm', - 'x86/film_grain_avx2.asm', + 'x86/filmgrain_avx2.asm', 'x86/ipred_avx2.asm', 'x86/loopfilter_avx2.asm', - 'x86/film_grain_sse.asm', + 'x86/mc_avx2.asm', + 'x86/filmgrain_sse.asm', 'x86/ipred_sse.asm', 'x86/loopfilter_sse.asm', 'x86/looprestoration_sse.asm', @@ -221,15 +226,19 @@ if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( + 'x86/filmgrain16_avx512.asm', + 'x86/ipred16_avx512.asm', + 'x86/looprestoration16_avx512.asm', + 'x86/mc16_avx512.asm', 'x86/cdef16_avx2.asm', - 'x86/film_grain16_avx2.asm', + 'x86/filmgrain16_avx2.asm', 'x86/ipred16_avx2.asm', 'x86/itx16_avx2.asm', 'x86/loopfilter16_avx2.asm', 'x86/looprestoration16_avx2.asm', 'x86/mc16_avx2.asm', 'x86/cdef16_sse.asm', - 'x86/film_grain16_sse.asm', + 'x86/filmgrain16_sse.asm', 'x86/ipred16_sse.asm', 'x86/itx16_sse.asm', 'x86/loopfilter16_sse.asm', @@ -254,26 +263,35 @@ +libdav1d_rc_obj = [] +libdav1d_flags = [stackalign_flag] api_export_flags = [] # # Windows .rc file and API export flags # -if host_machine.system() == 'windows' and get_option('default_library') != 'static' - rc_file = configure_file( - input : 'dav1d.rc.in', - output : 'dav1d.rc', - configuration : rc_data - ) +if host_machine.system() == 'windows' + if get_option('default_library') != 'static' + rc_file = configure_file( + input : 'dav1d.rc.in', + output : 'dav1d.rc', + configuration : rc_data + ) - libdav1d_rc_obj = winmod.compile_resources(rc_file) + libdav1d_rc_obj = winmod.compile_resources(rc_file) - api_export_flags = ['-DDAV1D_BUILDING_DLL'] -else - 
libdav1d_rc_obj = [] -endif + api_export_flags = ['-DDAV1D_BUILDING_DLL'] + endif + if (host_machine.cpu_family() == 'x86_64' and cc.get_id() == 'gcc') + # We don't expect to reference data members from other DLLs without + # dllimport attributes. Set the -mcmodel=small flag, which avoids + # generating indirection via .refptr. for all potentially + # dllimported variable references. + libdav1d_flags += '-mcmodel=small' + endif +endif @@ -288,7 +306,7 @@ include_directories : dav1d_inc_dirs, dependencies: [stdatomic_dependencies], - c_args : [stackalign_flag, stackrealign_flag, api_export_flags], + c_args : [libdav1d_flags, stackrealign_flag, api_export_flags], install : false, build_by_default : false, ).extract_all_objects(recursive: true) @@ -301,7 +319,7 @@ libdav1d_tmpl_sources, config_h_target, include_directories: dav1d_inc_dirs, dependencies : [stdatomic_dependencies], - c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag, + c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags, install : false, build_by_default : false, ).extract_all_objects(recursive: true) @@ -314,7 +332,7 @@ libdav1d_arch_tmpl_sources, config_h_target, include_directories: dav1d_inc_dirs, dependencies : [stdatomic_dependencies], - c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags, + c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags, install : false, build_by_default : false, ).extract_all_objects(recursive: true) @@ -344,7 +362,7 @@ thread_compat_dep, libdl_dependency, ], - c_args : [stackalign_flag, api_export_flags], + c_args : [libdav1d_flags, api_export_flags], version : dav1d_soname_version, soversion : dav1d_soversion, install : true, diff -Nru dav1d-0.9.2/src/obu.c dav1d-1.0.0/src/obu.c --- dav1d-0.9.2/src/obu.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/obu.c 2022-03-18 14:31:55.998355900 +0000 @@ -135,15 +135,18 @@ op->initial_display_delay = dav1d_get_bits(gb, 4) + 1; } } - const int op_idx = - c->operating_point < hdr->num_operating_points ? c->operating_point : 0; - c->operating_point_idc = hdr->operating_points[op_idx].idc; #if DEBUG_SEQ_HDR printf("SEQHDR: post-operating-points: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif } + const int op_idx = + c->operating_point < hdr->num_operating_points ? c->operating_point : 0; + c->operating_point_idc = hdr->operating_points[op_idx].idc; + const unsigned spatial_mask = c->operating_point_idc >> 8; + c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0; + hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1; hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1; hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1; @@ -263,6 +266,11 @@ hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ? dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN; } + if (c->strict_std_compliance && + hdr->mtrx == DAV1D_MC_IDENTITY && hdr->layout != DAV1D_PIXEL_LAYOUT_I444) + { + goto error; + } hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1); #if DEBUG_SEQ_HDR printf("SEQHDR: post-colorinfo: off=%u\n", @@ -378,7 +386,7 @@ if (seqhdr->frame_id_numbers_present) { hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits); Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr; - if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL); + if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) goto error; } return 0; } @@ -762,7 +770,7 @@ // segmentation data from the reference frame. 
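[Aside, not part of the patch: the "return DAV1D_ERR(EINVAL)" to "goto error" conversions in these obu.c hunks route every parse failure through the error label at the end of the OBU parsing function (visible further down), which copies the failing input's Dav1dDataProps into c->cached_error_props; together with the new dav1d_get_decode_error_data_props() and dav1d_data_props_unref() entry points in lib.c, this lets callers identify the offending input packet. A hedged usage sketch, assuming an already opened decoder ctx that has been fed data; the function name is ours.]

    #include <errno.h>
    #include <dav1d/dav1d.h>

    static void report_decode_error(Dav1dContext *const ctx) {
        Dav1dPicture pic = { 0 };
        const int res = dav1d_get_picture(ctx, &pic);
        if (res >= 0) {
            dav1d_picture_unref(&pic);
        } else if (res != DAV1D_ERR(EAGAIN)) {
            Dav1dDataProps props = { 0 };
            if (!dav1d_get_decode_error_data_props(ctx, &props)) {
                /* props now holds the metadata (timestamp, offset, ...) of
                 * the input packet whose parsing or decoding failed */
                dav1d_data_props_unref(&props);
            }
        }
    }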
assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE); const int pri_ref = hdr->refidx[hdr->primary_ref_frame]; - if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + if (!c->refs[pri_ref].p.p.frame_hdr) goto error; hdr->segmentation.seg_data = c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data; } @@ -824,7 +832,7 @@ hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas; } else { const int ref = hdr->refidx[hdr->primary_ref_frame]; - if (!c->refs[ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + if (!c->refs[ref].p.p.frame_hdr) goto error; hdr->loopfilter.mode_ref_deltas = c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas; } @@ -927,7 +935,7 @@ int off_after = -1; int off_before_idx, off_after_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL); + if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc); @@ -955,7 +963,7 @@ unsigned off_before2 = 0xFFFFFFFFU; int off_before2_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL); + if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; if (get_poc_diff(seqhdr->order_hint_n_bits, refpoc, off_before) < 0) { @@ -1010,7 +1018,7 @@ ref_gmv = &dav1d_default_wm_params; } else { const int pri_ref = hdr->refidx[hdr->primary_ref_frame]; - if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL); + if (!c->refs[pri_ref].p.p.frame_hdr) goto error; ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i]; } int32_t *const mat = hdr->gmv[i].matrix; @@ -1240,11 +1248,11 @@ memset(seq_hdr, 0, sizeof(*seq_hdr)); if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) { dav1d_ref_dec(&ref); - return res; + goto error; } if (check_for_overrun(c, &gb, init_bit_pos, len)) { dav1d_ref_dec(&ref); - return DAV1D_ERR(EINVAL); + goto error; } // If we have read a sequence header which is different from // the old one, this is a new video sequence and can't use any @@ -1302,7 +1310,7 @@ c->frame_hdr->spatial_id = spatial_id; if ((res = parse_frame_hdr(c, &gb)) < 0) { c->frame_hdr = NULL; - return res; + goto error; } for (int n = 0; n < c->n_tile_data; n++) dav1d_data_unref_internal(&c->tile[n].data); @@ -1314,7 +1322,7 @@ dav1d_get_bits(&gb, 1); if (check_for_overrun(c, &gb, init_bit_pos, len)) { c->frame_hdr = NULL; - return DAV1D_ERR(EINVAL); + goto error; } } @@ -1355,7 +1363,7 @@ // Align to the next byte boundary and check for overrun. dav1d_bytealign_get_bits(&gb); if (check_for_overrun(c, &gb, init_bit_pos, len)) - return DAV1D_ERR(EINVAL); + goto error; // The current bit position is a multiple of 8 (because we // just aligned it) and less than 8*pkt_bytelen because // otherwise the overrun check would have fired. 
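[Aside, not part of the patch: the DAV1D_OBU_TD case in the next hunk is what raises PICTURE_FLAG_NEW_TEMPORAL_UNIT, and output_picture_ready() in the lib.c hunk further up keys on it when all_layers is disabled: a cached lower spatial layer is emitted once it is itself the highest requested layer, or once a new temporal unit begins. A minimal restatement; the helper name is ours, and the flag value matches the picture.h hunk below.]

    #define PICTURE_FLAG_NEW_TEMPORAL_UNIT (1 << 2) /* as in src/picture.h */

    /* Mirrors the first branch of output_picture_ready() when both c->out
     * and c->cache hold a picture. */
    static int emit_cached_layer(const int cache_spatial_id,
                                 const int max_spatial_id,
                                 const unsigned out_flags)
    {
        return cache_spatial_id == max_spatial_id ||
               (out_flags & PICTURE_FLAG_NEW_TEMPORAL_UNIT);
    }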
@@ -1528,8 +1536,10 @@ break; } - case DAV1D_OBU_PADDING: case DAV1D_OBU_TD: + c->frame_flags |= PICTURE_FLAG_NEW_TEMPORAL_UNIT; + break; + case DAV1D_OBU_PADDING: // ignore OBUs we don't care about break; default: @@ -1540,30 +1550,46 @@ if (c->seq_hdr && c->frame_hdr) { if (c->frame_hdr->show_existing_frame) { - if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL); + if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error; if (c->n_fc == 1) { - dav1d_picture_ref(&c->out, - &c->refs[c->frame_hdr->existing_frame_idx].p.p); - dav1d_data_props_copy(&c->out.m, &in->m); + dav1d_thread_picture_ref(&c->out, + &c->refs[c->frame_hdr->existing_frame_idx].p); + dav1d_data_props_copy(&c->out.p.m, &in->m); c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p); } else { + pthread_mutex_lock(&c->task_thread.lock); // need to append this to the frame output queue const unsigned next = c->frame_thread.next++; if (c->frame_thread.next == c->n_fc) c->frame_thread.next = 0; Dav1dFrameContext *const f = &c->fc[next]; - pthread_mutex_lock(&f->frame_thread.td.lock); while (f->n_tile_data > 0) - pthread_cond_wait(&f->frame_thread.td.cond, - &f->frame_thread.td.lock); + pthread_cond_wait(&f->task_thread.cond, + &f->task_thread.ttd->lock); Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next]; - if (out_delayed->p.data[0]) { + if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) { + if (atomic_load(&c->task_thread.first) + 1U < c->n_fc) + atomic_fetch_add(&c->task_thread.first, 1U); + else + atomic_store(&c->task_thread.first, 0); + if (c->task_thread.cur && c->task_thread.cur < c->n_fc) + c->task_thread.cur--; + } + const int error = f->task_thread.retval; + if (error) { + c->cached_error = error; + f->task_thread.retval = 0; + dav1d_data_props_copy(&c->cached_error_props, &out_delayed->p.m); + dav1d_thread_picture_unref(out_delayed); + } else if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); - if (out_delayed->visible && progress != FRAME_ERROR) { - dav1d_picture_ref(&c->out, &out_delayed->p); + if ((out_delayed->visible || c->output_invisible_frames) && + progress != FRAME_ERROR) + { + dav1d_thread_picture_ref(&c->out, out_delayed); c->event_flags |= dav1d_picture_get_event_flags(out_delayed); } dav1d_thread_picture_unref(out_delayed); @@ -1572,7 +1598,7 @@ &c->refs[c->frame_hdr->existing_frame_idx].p); out_delayed->visible = 1; dav1d_data_props_copy(&out_delayed->p.m, &in->m); - pthread_mutex_unlock(&f->frame_thread.td.lock); + pthread_mutex_unlock(&c->task_thread.lock); } if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) { const int r = c->frame_hdr->existing_frame_idx; @@ -1596,7 +1622,7 @@ c->frame_hdr = NULL; } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) { if (!c->n_tile_data) - return DAV1D_ERR(EINVAL); + goto error; if ((res = dav1d_submit_frame(c)) < 0) return res; assert(!c->n_tile_data); @@ -1608,6 +1634,7 @@ return len + init_byte_pos; error: + dav1d_data_props_copy(&c->cached_error_props, &in->m); dav1d_log(c, "Error parsing OBU data\n"); return DAV1D_ERR(EINVAL); } diff -Nru dav1d-0.9.2/src/picture.c dav1d-1.0.0/src/picture.c --- dav1d-0.9.2/src/picture.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/picture.c 2022-03-18 14:31:55.998355900 +0000 @@ -176,7 +176,7 @@ const int bpc) { Dav1dThreadPicture *const p = 
&f->sr_cur; - p->t = c->n_fc > 1 ? &f->frame_thread.td : NULL; + const int have_frame_mt = c->n_fc > 1; const int res = picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, @@ -186,7 +186,7 @@ c->mastering_display, c->mastering_display_ref, c->itut_t35, c->itut_t35_ref, bpc, &f->tile[0].data.m, &c->allocator, - p->t != NULL ? sizeof(atomic_int) * 2 : 0, + have_frame_mt ? sizeof(atomic_int) * 2 : 0, (void **) &p->progress); if (res) return res; @@ -198,7 +198,7 @@ c->frame_flags = 0; p->visible = f->frame_hdr->show_frame; - if (p->t) { + if (have_frame_mt) { atomic_init(&p->progress[0], 0); atomic_init(&p->progress[1], 0); } @@ -254,12 +254,21 @@ const Dav1dThreadPicture *const src) { dav1d_picture_ref(&dst->p, &src->p); - dst->t = src->t; dst->visible = src->visible; dst->progress = src->progress; dst->flags = src->flags; } +void dav1d_thread_picture_move_ref(Dav1dThreadPicture *const dst, + Dav1dThreadPicture *const src) +{ + dav1d_picture_move_ref(&dst->p, &src->p); + dst->visible = src->visible; + dst->progress = src->progress; + dst->flags = src->flags; + memset(src, 0, sizeof(*src)); +} + void dav1d_picture_unref_internal(Dav1dPicture *const p) { validate_input(p != NULL); @@ -274,59 +283,15 @@ dav1d_ref_dec(&p->itut_t35_ref); } memset(p, 0, sizeof(*p)); + dav1d_data_props_set_defaults(&p->m); } void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) { dav1d_picture_unref_internal(&p->p); - p->t = NULL; p->progress = NULL; } -int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p, - int y_unclipped, const enum PlaneType plane_type) -{ - assert(plane_type != PLANE_TYPE_ALL); - - if (!p->t) - return 0; - - // convert to luma units; include plane delay from loopfilters; clip - const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420; - y_unclipped *= 1 << (plane_type & ss_ver); // we rely here on PLANE_TYPE_UV being 1 - y_unclipped += (plane_type != PLANE_TYPE_BLOCK) * 8; // delay imposed by loopfilter - const unsigned y = iclip(y_unclipped, 1, p->p.p.h); - atomic_uint *const progress = &p->progress[plane_type != PLANE_TYPE_BLOCK]; - unsigned state; - - if ((state = atomic_load_explicit(progress, memory_order_acquire)) >= y) - return state == FRAME_ERROR; - - pthread_mutex_lock(&p->t->lock); - while ((state = atomic_load_explicit(progress, memory_order_relaxed)) < y) - pthread_cond_wait(&p->t->cond, &p->t->lock); - pthread_mutex_unlock(&p->t->lock); - return state == FRAME_ERROR; -} - -void dav1d_thread_picture_signal(const Dav1dThreadPicture *const p, - const int y, // in pixel units - const enum PlaneType plane_type) -{ - assert(plane_type != PLANE_TYPE_UV); - - if (!p->t) - return; - - pthread_mutex_lock(&p->t->lock); - if (plane_type != PLANE_TYPE_Y) - atomic_store(&p->progress[0], y); - if (plane_type != PLANE_TYPE_BLOCK) - atomic_store(&p->progress[1], y); - pthread_cond_broadcast(&p->t->cond); - pthread_mutex_unlock(&p->t->lock); -} - enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *const p) { if (!p->flags) return 0; diff -Nru dav1d-0.9.2/src/picture.h dav1d-1.0.0/src/picture.h --- dav1d-0.9.2/src/picture.h 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/picture.h 2022-03-18 14:31:55.998355900 +0000 @@ -46,13 +46,13 @@ enum PictureFlags { PICTURE_FLAG_NEW_SEQUENCE = 1 << 0, PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1, + PICTURE_FLAG_NEW_TEMPORAL_UNIT = 1 << 2, }; typedef struct Dav1dThreadPicture { Dav1dPicture p; int visible; enum PictureFlags flags; - struct thread_data *t; // [0] block data 
(including segmentation map and motion vectors) // [1] pixel data atomic_uint *progress; @@ -84,6 +84,8 @@ void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src); void dav1d_thread_picture_ref(Dav1dThreadPicture *dst, const Dav1dThreadPicture *src); +void dav1d_thread_picture_move_ref(Dav1dThreadPicture *dst, + Dav1dThreadPicture *src); void dav1d_thread_picture_unref(Dav1dThreadPicture *p); /** @@ -91,31 +93,6 @@ */ void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src); -/** - * Wait for picture to reach a certain stage. - * - * y is in full-pixel units. If pt is not UV, this is in luma - * units, else it is in chroma units. - * plane_type is used to determine how many pixels delay are - * introduced by loopfilter processes. - * - * Returns 0 on success, and 1 if there was an error while decoding p - */ -int dav1d_thread_picture_wait(const Dav1dThreadPicture *p, int y, - enum PlaneType plane_type); - -/** - * Signal decoding progress. - * - * y is in full-pixel luma units. FRAME_ERROR is used to signal a decoding - * error to frames using this frame as reference frame. - * plane_type denotes whether we have completed block data (pass 1; - * PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no - * 2-pass decoding; PLANE_TYPE_ALL). - */ -void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y, - enum PlaneType plane_type); - int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie); void dav1d_default_picture_release(Dav1dPicture *p, void *cookie); void dav1d_picture_unref_internal(Dav1dPicture *p); diff -Nru dav1d-0.9.2/src/ppc/cdef_init_tmpl.c dav1d-1.0.0/src/ppc/cdef_init_tmpl.c --- dav1d-0.9.2/src/ppc/cdef_init_tmpl.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/ppc/cdef_init_tmpl.c 2022-03-18 14:31:55.998355900 +0000 @@ -32,7 +32,7 @@ #include "src/cdef.h" #include "src/cpu.h" -#include "src/ppc/types.h" +#include "src/ppc/dav1d_types.h" #if BITDEPTH == 8 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, @@ -54,7 +54,7 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride, const uint8_t *src, const ptrdiff_t src_stride, const uint8_t (*left)[2], const uint8_t *const top, - const int w, const int h, + const uint8_t *const bottom, const int w, const int h, const enum CdefEdgeFlags edges) { const u16x8 fill = vec_splats((uint16_t)INT16_MAX); @@ -82,8 +82,8 @@ l1 = fill; y_end -= 2; } else { - l0 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 0) * src_stride)); - l1 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 1) * src_stride)); + l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2)); + l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2)); } vec_st(l0, 0, tmp + (h + 0) * 8); @@ -116,7 +116,7 @@ static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride, const uint8_t *src, const ptrdiff_t src_stride, const uint8_t (*left)[2], const uint8_t *const top, - const int w, const int h, + const uint8_t *const bottom, const int w, const int h, const enum CdefEdgeFlags edges) { const u16x8 fill = vec_splats((uint16_t)INT16_MAX); @@ -154,8 +154,8 @@ l1l = fill; y_end -= 2; } else { - u8x16 l0 = vec_vsx_ld(0, src - 2 + (h + 0) * src_stride); - u8x16 l1 = vec_vsx_ld(0, src - 2 + (h + 1) * src_stride); + u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2); + u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2); l0h = u8h_to_u16(l0); l0l = u8l_to_u16(l0); l1h = u8h_to_u16(l1); @@ -276,8 +276,8 @@ static inline void filter_4xN(pixel *dst, const ptrdiff_t dst_stride, const pixel 
(*left)[2], const pixel *const top, - const int w, const int h, const int pri_strength, - const int sec_strength, const int dir, + const pixel *const bottom, const int w, const int h, + const int pri_strength, const int sec_strength, const int dir, const int damping, const enum CdefEdgeFlags edges, const ptrdiff_t tmp_stride, uint16_t *tmp) { @@ -302,8 +302,8 @@ const int off2_1 = cdef_directions[(dir + 2) & 7][1]; const int off3_1 = cdef_directions[(dir + 6) & 7][1]; + copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); - copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges); for (int y = 0; y < h / 2; y++) { LOAD_PIX4(tmp) @@ -365,8 +365,8 @@ static inline void filter_8xN(pixel *dst, const ptrdiff_t dst_stride, const pixel (*left)[2], const pixel *const top, - const int w, const int h, const int pri_strength, - const int sec_strength, const int dir, + const pixel *const bottom, const int w, const int h, + const int pri_strength, const int sec_strength, const int dir, const int damping, const enum CdefEdgeFlags edges, const ptrdiff_t tmp_stride, uint16_t *tmp) { @@ -393,7 +393,7 @@ const int off2_1 = cdef_directions[(dir + 2) & 7][1]; const int off3_1 = cdef_directions[(dir + 6) & 7][1]; - copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges); + copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges); for (int y = 0; y < h; y++) { LOAD_PIX(tmp) @@ -457,6 +457,7 @@ const ptrdiff_t dst_stride, \ const pixel (*left)[2], \ const pixel *const top, \ + const pixel *const bottom, \ const int pri_strength, \ const int sec_strength, \ const int dir, \ @@ -465,8 +466,8 @@ { \ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ - filter_##w##xN(dst, dst_stride, left, top, w, h, pri_strength, sec_strength, \ - dir, damping, edges, tmp_stride, tmp); \ + filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ + sec_strength, dir, damping, edges, tmp_stride, tmp); \ } cdef_fn(4, 4, 8); diff -Nru dav1d-0.9.2/src/ppc/dav1d_types.h dav1d-1.0.0/src/ppc/dav1d_types.h --- dav1d-0.9.2/src/ppc/dav1d_types.h 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/ppc/dav1d_types.h 2022-03-18 14:31:55.998355900 +0000 @@ -0,0 +1,54 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_PPC_TYPES_H +#define DAV1D_SRC_PPC_TYPES_H + +#include +#undef pixel + +#define u8x16 vector unsigned char +#define i8x16 vector signed char +#define b8x16 vector bool char +#define u16x8 vector unsigned short +#define i16x8 vector signed short +#define b16x8 vector bool short +#define u32x4 vector unsigned int +#define i32x4 vector signed int +#define b32x4 vector bool int +#define u64x2 vector unsigned long long +#define i64x2 vector signed long long +#define b64x2 vector bool long long + +#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0))) +#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0))) +#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0))) +#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v)) +#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0))) +#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v)) + +#endif /* DAV1D_SRC_PPC_TYPES_H */ diff -Nru dav1d-0.9.2/src/ppc/looprestoration_init_tmpl.c dav1d-1.0.0/src/ppc/looprestoration_init_tmpl.c --- dav1d-0.9.2/src/ppc/looprestoration_init_tmpl.c 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/ppc/looprestoration_init_tmpl.c 2022-03-18 14:31:55.998355900 +0000 @@ -26,7 +26,7 @@ */ #include "common/intops.h" -#include "src/ppc/types.h" +#include "src/ppc/dav1d_types.h" #include "src/cpu.h" #include "src/looprestoration.h" @@ -172,7 +172,7 @@ } while (0) static inline void wiener_filter_v_vsx(uint8_t *p, - const ptrdiff_t p_stride, + const ptrdiff_t stride, const int32_t *hor, const int16_t filterv[8], const int w, const int h) @@ -192,7 +192,7 @@ for (int i = 0; i <(w-w%16); i += 16) { u8x16 sum_pixel; LOAD_AND_APPLY_FILTER_V(sum_pixel, hor); - vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(p_stride) + i]); + vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(stride) + i]); } // remaining loop if (w & 0xf){ @@ -204,16 +204,15 @@ vec_vsx_st(sum_pixel, 0, tmp_out); for (int k=0; i -#undef pixel - -#define u8x16 vector unsigned char -#define i8x16 vector signed char -#define b8x16 vector bool char -#define u16x8 vector unsigned short -#define i16x8 vector signed short -#define b16x8 vector bool short -#define u32x4 vector unsigned int -#define i32x4 vector signed int -#define b32x4 vector bool int -#define u64x2 vector unsigned long long -#define i64x2 vector signed long long -#define b64x2 vector bool long long - -#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0))) -#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0))) -#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0))) -#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v)) -#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0))) -#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v)) - -#endif /* DAV1D_SRC_PPC_TYPES_H */ diff -Nru dav1d-0.9.2/src/recon.h dav1d-1.0.0/src/recon.h --- dav1d-0.9.2/src/recon.h 2021-09-03 15:51:24.409037000 +0000 +++ dav1d-1.0.0/src/recon.h 2022-03-18 
14:31:56.002356000 +0000 @@ -37,12 +37,12 @@ #define DEBUG_B_PIXELS 0 #define decl_recon_b_intra_fn(name) \ -void (name)(Dav1dTileContext *t, enum BlockSize bs, \ +void (name)(Dav1dTaskContext *t, enum BlockSize bs, \ enum EdgeFlags intra_edge_flags, const Av1Block *b) typedef decl_recon_b_intra_fn(*recon_b_intra_fn); #define decl_recon_b_inter_fn(name) \ -int (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b) +int (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b) typedef decl_recon_b_inter_fn(*recon_b_inter_fn); #define decl_filter_sbrow_fn(name) \ @@ -50,11 +50,11 @@ typedef decl_filter_sbrow_fn(*filter_sbrow_fn); #define decl_backup_ipred_edge_fn(name) \ -void (name)(Dav1dTileContext *t) +void (name)(Dav1dTaskContext *t) typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn); #define decl_read_coef_blocks_fn(name) \ -void (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b) +void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b) typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn); decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc); @@ -65,10 +65,12 @@ decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc); decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc); -decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc); -decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc); -decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc); -decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_cols_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_rows_16bpc); +void dav1d_filter_sbrow_cdef_8bpc(Dav1dTaskContext *tc, int sby); +void dav1d_filter_sbrow_cdef_16bpc(Dav1dTaskContext *tc, int sby); decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc); decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc); decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc); diff -Nru dav1d-0.9.2/src/recon_tmpl.c dav1d-1.0.0/src/recon_tmpl.c --- dav1d-0.9.2/src/recon_tmpl.c 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/recon_tmpl.c 2022-03-18 14:31:56.002356000 +0000 @@ -318,7 +318,7 @@ return offset + (mag > 512 ? 
4 : (mag + 64) >> 7); } -static int decode_coefs(Dav1dTileContext *const t, +static int decode_coefs(Dav1dTaskContext *const t, uint8_t *const a, uint8_t *const l, const enum RectTxfmSize tx, const enum BlockSize bs, const Av1Block *const b, const int intra, @@ -719,7 +719,7 @@ return eob; } -static void read_coef_tree(Dav1dTileContext *const t, +static void read_coef_tree(Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b, const enum RectTxfmSize ytx, const int depth, const uint16_t *const tx_split, @@ -768,15 +768,16 @@ coef *cf; struct CodedBlockInfo *cbi; - if (f->frame_thread.pass) { - assert(ts->frame_thread.cf); - cf = ts->frame_thread.cf; - ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + assert(ts->frame_thread[p].cf); + cf = ts->frame_thread[p].cf; + ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; } else { cf = bitfn(t->cf); } - if (f->frame_thread.pass != 2) { + if (t->frame_thread.pass != 2) { eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4], ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx); if (DEBUG_BLOCK_INFO) @@ -798,7 +799,7 @@ uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4]; case_set_upto16(txw,,,); #undef set_ctx - if (f->frame_thread.pass == 1) { + if (t->frame_thread.pass == 1) { cbi->eob[0] = eob; cbi->txtp[0] = txtp; } @@ -806,7 +807,7 @@ eob = cbi->eob[0]; txtp = cbi->txtp[0]; } - if (!(f->frame_thread.pass & 1)) { + if (!(t->frame_thread.pass & 1)) { assert(dst); if (eob >= 0) { if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) @@ -820,7 +821,7 @@ } } -void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t, +void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b) { const Dav1dFrameContext *const f = t->f; @@ -855,7 +856,7 @@ Dav1dTileState *const ts = t->ts; const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver; - assert(f->frame_thread.pass == 1); + assert(t->frame_thread.pass == 1); assert(!b->skip); const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? 
b->tx : b->max_ytx]; @@ -884,12 +885,12 @@ const int eob = cbi[t->bx].eob[0] = decode_coefs(t, &t->a->lcoef[bx4 + x], &t->l.lcoef[by4 + y], b->tx, bs, b, 1, - 0, ts->frame_thread.cf, &txtp, &cf_ctx); + 0, ts->frame_thread[1].cf, &txtp, &cf_ctx); if (DEBUG_BLOCK_INFO) printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", b->tx, txtp, eob, ts->msac.rng); cbi[t->bx].txtp[0] = txtp; - ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; + ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->dir lcoef, off, mul * cf_ctx) #define default_memset(dir, diridx, off, sz) \ @@ -927,14 +928,14 @@ const int eob = cbi[t->bx].eob[1 + pl] = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], &t->l.ccoef[pl][cby4 + y], b->uvtx, bs, - b, b->intra, 1 + pl, ts->frame_thread.cf, + b, b->intra, 1 + pl, ts->frame_thread[1].cf, &txtp, &cf_ctx); if (DEBUG_BLOCK_INFO) printf("Post-uv-cf-blk[pl=%d,tx=%d," "txtp=%d,eob=%d]: r=%d\n", pl, b->uvtx, txtp, eob, ts->msac.rng); cbi[t->bx].txtp[1 + pl] = txtp; - ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16; + ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16; #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) #define default_memset(dir, diridx, off, sz) \ @@ -956,7 +957,7 @@ } } -static int mc(Dav1dTileContext *const t, +static int mc(Dav1dTaskContext *const t, pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride, const int bw4, const int bh4, const int bx, const int by, const int pl, @@ -979,11 +980,6 @@ int w, h; if (refp->p.data[0] != f->cur.data[0]) { // i.e. not for intrabc - if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4, - PLANE_TYPE_Y + !!pl)) - { - return -1; - } w = (f->cur.p.w + ss_hor) >> ss_hor; h = (f->cur.p.h + ss_ver) >> ss_ver; } else { @@ -1034,8 +1030,6 @@ const int bottom = ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1; - if (dav1d_thread_picture_wait(refp, bottom + 4, PLANE_TYPE_Y + !!pl)) - return -1; if (DEBUG_BLOCK_INFO) printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n", left, top, orig_pos_x, f->svc[refidx][0].scale, refidx, @@ -1077,7 +1071,7 @@ return 0; } -static int obmc(Dav1dTileContext *const t, +static int obmc(Dav1dTaskContext *const t, pixel *const dst, const ptrdiff_t dst_stride, const uint8_t *const b_dim, const int pl, const int bx4, const int by4, const int w4, const int h4) @@ -1138,7 +1132,7 @@ return 0; } -static int warp_affine(Dav1dTileContext *const t, +static int warp_affine(Dav1dTaskContext *const t, pixel *dst8, int16_t *dst16, const ptrdiff_t dstride, const uint8_t *const b_dim, const int pl, const Dav1dThreadPicture *const refp, @@ -1176,11 +1170,6 @@ const pixel *ref_ptr; ptrdiff_t ref_stride = refp->p.stride[!!pl]; - if (dav1d_thread_picture_wait(refp, dy + 4 + 8, - PLANE_TYPE_Y + !!pl)) - { - return -1; - } if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) { pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge); f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3, @@ -1204,7 +1193,7 @@ return 0; } -void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs, +void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs, const enum EdgeFlags intra_edge_flags, const Av1Block *const b) { @@ -1239,14 +1228,15 @@ pixel *dst = ((pixel *) f->cur.data[0]) + 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); const uint8_t *pal_idx; - if 
(f->frame_thread.pass) { - assert(ts->frame_thread.pal_idx); - pal_idx = ts->frame_thread.pal_idx; - ts->frame_thread.pal_idx += bw4 * bh4 * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + assert(ts->frame_thread[p].pal_idx); + pal_idx = ts->frame_thread[p].pal_idx; + ts->frame_thread[p].pal_idx += bw4 * bh4 * 16; } else { pal_idx = t->scratch.pal_idx; } - const uint16_t *const pal = f->frame_thread.pass ? + const uint16_t *const pal = t->frame_thread.pass ? f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0]; f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal, @@ -1323,9 +1313,10 @@ coef *cf; int eob; enum TxfmType txtp; - if (f->frame_thread.pass) { - cf = ts->frame_thread.cf; - ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + cf = ts->frame_thread[p].cf; + ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; const struct CodedBlockInfo *const cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; eob = cbi->eob[0]; @@ -1362,7 +1353,7 @@ hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon"); } - } else if (!f->frame_thread.pass) { + } else if (!t->frame_thread.pass) { #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->dir lcoef, off, mul * 0x40) case_set_upto16(t_dim->h, l., 1, by4 + y); @@ -1435,12 +1426,13 @@ (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); const uint16_t (*pal)[8]; const uint8_t *pal_idx; - if (f->frame_thread.pass) { - assert(ts->frame_thread.pal_idx); + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + assert(ts->frame_thread[p].pal_idx); pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))]; - pal_idx = ts->frame_thread.pal_idx; - ts->frame_thread.pal_idx += cbw4 * cbh4 * 16; + pal_idx = ts->frame_thread[p].pal_idx; + ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16; } else { pal = t->scratch.pal; pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; @@ -1545,9 +1537,10 @@ enum TxfmType txtp; int eob; coef *cf; - if (f->frame_thread.pass) { - cf = ts->frame_thread.cf; - ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + cf = ts->frame_thread[p].cf; + ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16; const struct CodedBlockInfo *const cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; eob = cbi->eob[pl + 1]; @@ -1587,7 +1580,7 @@ hex_dump(dst, stride, uv_t_dim->w * 4, uv_t_dim->h * 4, "recon"); } - } else if (!f->frame_thread.pass) { + } else if (!t->frame_thread.pass) { #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->dir ccoef[pl], off, mul * 0x40) case_set_upto16(uv_t_dim->h, l., 1, cby4 + y); @@ -1604,7 +1597,7 @@ } } -int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs, +int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b) { Dav1dTileState *const ts = t->ts; @@ -1719,7 +1712,7 @@ r[-1][t->bx - 1].mv.mv[0], &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1], r[-1][t->bx - 1].ref.ref[0] - 1, - f->frame_thread.pass != 2 ? t->tl_4x4_filter : + t->frame_thread.pass != 2 ? 
t->tl_4x4_filter : f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d); if (res) return res; } @@ -1735,7 +1728,7 @@ t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0], &f->refp[r[0][t->bx - 1].ref.ref[0] - 1], r[0][t->bx - 1].ref.ref[0] - 1, - f->frame_thread.pass != 2 ? left_filter_2d : + t->frame_thread.pass != 2 ? left_filter_2d : f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d); if (res) return res; } @@ -1750,7 +1743,7 @@ 1 + pl, r[-1][t->bx].mv.mv[0], &f->refp[r[-1][t->bx].ref.ref[0] - 1], r[-1][t->bx].ref.ref[0] - 1, - f->frame_thread.pass != 2 ? top_filter_2d : + t->frame_thread.pass != 2 ? top_filter_2d : f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d); if (res) return res; } @@ -1994,9 +1987,10 @@ coef *cf; int eob; enum TxfmType txtp; - if (f->frame_thread.pass) { - cf = ts->frame_thread.cf; - ts->frame_thread.cf += uvtx->w * uvtx->h * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + cf = ts->frame_thread[p].cf; + ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16; const struct CodedBlockInfo *const cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; eob = cbi->eob[1 + pl]; @@ -2051,7 +2045,12 @@ return 0; } -void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) { +void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) { + if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) || + (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1])) + { + return; + } const int y = sby * f->sb_step * 4; const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; pixel *const p[3] = { @@ -2060,19 +2059,33 @@ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) }; Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; - if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) { - int start_of_tile_row = 0; - if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby) - start_of_tile_row = f->lf.tile_row++; - bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row); - } - if (f->lf.restore_planes) { - // Store loop filtered pixels required by loop restoration - bytefn(dav1d_lr_copy_lpf)(f, p, sby); + bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby, + f->lf.start_of_tile_row[sby]); +} + +void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) { + const int y = sby * f->sb_step * 4; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *const p[3] = { + f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) + }; + Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; + if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK && + (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1])) + { + bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby); + } + if (f->seq_hdr->cdef || f->lf.restore_planes) { + // Store loop filtered pixels required by CDEF / LR + bytefn(dav1d_copy_lpf)(f, p, sby); } } -void bytefn(dav1d_filter_sbrow_cdef)(Dav1dFrameContext *const f, const int sby) { +void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) { + const Dav1dFrameContext *const f = tc->f; + if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return; const int sbsz = f->sb_step; const int y = sby * sbsz * 4; const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; @@ 
-2091,11 +2104,11 @@ p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), }; - bytefn(dav1d_cdef_brow)(f, p_up, prev_mask, start - 2, start); + bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby); } const int n_blks = sbsz - 2 * (sby + 1 < f->sbh); const int end = imin(start + n_blks, f->bh); - bytefn(dav1d_cdef_brow)(f, p, mask, start, end); + bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby); } void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) { @@ -2134,6 +2147,7 @@ } void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) { + if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return; const int y = sby * f->sb_step * 4; const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; pixel *const sr_p[3] = { @@ -2145,16 +2159,17 @@ } void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { - bytefn(dav1d_filter_sbrow_deblock)(f, sby); + bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby); + bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby); if (f->seq_hdr->cdef) - bytefn(dav1d_filter_sbrow_cdef)(f, sby); + bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby); if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) bytefn(dav1d_filter_sbrow_resize)(f, sby); if (f->lf.restore_planes) bytefn(dav1d_filter_sbrow_lr)(f, sby); } -void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) { +void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) { const Dav1dFrameContext *const f = t->f; Dav1dTileState *const ts = t->ts; const int sby = t->by >> f->sb_shift; diff -Nru dav1d-0.9.2/src/refmvs.c dav1d-1.0.0/src/refmvs.c --- dav1d-0.9.2/src/refmvs.c 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/refmvs.c 2022-03-18 14:31:56.002356000 +0000 @@ -653,11 +653,14 @@ void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *const rf, const int tile_col_start4, const int tile_col_end4, const int tile_row_start4, const int tile_row_end4, - const int sby, int tile_row_idx) + const int sby, int tile_row_idx, const int pass) { if (rf->n_tile_threads == 1) tile_row_idx = 0; rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx]; - refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx]; + const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1; + const ptrdiff_t pass_off = (uses_2pass && pass == 2) ? + 35 * rf->r_stride * rf->n_tile_rows : 0; + refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off]; const int sbsz = rf->sbsz; const int off = (sbsz * sby) & 16; for (int i = 0; i < sbsz; i++, r += rf->r_stride) @@ -806,7 +809,7 @@ refmvs_temporal_block *const rp, const unsigned ref_ref_poc[7][7], /*const*/ refmvs_temporal_block *const rp_ref[7], - const int n_tile_threads) + const int n_tile_threads, const int n_frame_threads) { rf->sbsz = 16 << seq_hdr->sb128; rf->frm_hdr = frm_hdr; @@ -819,7 +822,8 @@ const int n_tile_rows = n_tile_threads > 1 ? 
frm_hdr->tiling.rows : 1; if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) { if (rf->r) dav1d_freep_aligned(&rf->r); - rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows, 64); + const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1; + rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64); if (!rf->r) return DAV1D_ERR(ENOMEM); rf->r_stride = r_stride; } @@ -833,6 +837,7 @@ } rf->n_tile_rows = n_tile_rows; rf->n_tile_threads = n_tile_threads; + rf->n_frame_threads = n_frame_threads; rf->rp = rp; rf->rp_ref = rp_ref; const unsigned poc = frm_hdr->frame_offset; diff -Nru dav1d-0.9.2/src/refmvs.h dav1d-1.0.0/src/refmvs.h --- dav1d-0.9.2/src/refmvs.h 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/refmvs.h 2022-03-18 14:31:56.002356000 +0000 @@ -79,7 +79,7 @@ refmvs_block *r; // 35 x r_stride memory ptrdiff_t r_stride; - int n_tile_rows, n_tile_threads; + int n_tile_rows, n_tile_threads, n_frame_threads; } refmvs_frame; typedef struct refmvs_tile { @@ -116,7 +116,7 @@ refmvs_temporal_block *rp, const unsigned ref_ref_poc[7][7], /*const*/ refmvs_temporal_block *const rp_ref[7], - int n_tile_threads); + int n_tile_threads, int n_frame_threads); // initialize temporal MVs; this can be done in any configuration, e.g. one // tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or @@ -136,7 +136,7 @@ void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf, int tile_col_start4, int tile_col_end4, int tile_row_start4, int tile_row_end4, - int sby, int tile_row_idx); + int sby, int tile_row_idx, int pass); // call for each block void dav1d_refmvs_find(const refmvs_tile *rt, diff -Nru dav1d-0.9.2/src/tables.c dav1d-1.0.0/src/tables.c --- dav1d-0.9.2/src/tables.c 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/tables.c 2022-03-18 14:31:56.002356000 +0000 @@ -419,7 +419,7 @@ { 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 }, }; -const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = { +const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 64) = { 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, @@ -440,7 +440,7 @@ 0 }; -const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = { +const int8_t ALIGN(dav1d_mc_subpel_filters[6][15][8], 8) = { [DAV1D_FILTER_8TAP_REGULAR] = { { 0, 1, -3, 63, 4, -1, 0, 0 }, { 0, 1, -5, 61, 9, -2, 0, 0 }, @@ -522,7 +522,6 @@ { 0, 0, 2, 20, 31, 11, 0, 0 }, { 0, 0, 2, 18, 31, 13, 0, 0 }, { 0, 0, 1, 17, 31, 15, 0, 0 } -#if ARCH_X86_64 /* Bilin scaled being very rarely used, add a new table entry * and use the put/prep_8tap_scaled code, thus acting as a * scaled bilinear filter. 
*/ @@ -542,7 +541,6 @@ { 0, 0, 0, 12, 52, 0, 0, 0 }, { 0, 0, 0, 8, 56, 0, 0, 0 }, { 0, 0, 0, 4, 60, 0, 0, 0 } -#endif } }; @@ -758,7 +756,7 @@ [1*idx+32] = f4, [1*idx+40] = f5, \ [1*idx+48] = f6 #endif -const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = { +const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 64) = { { F( 0, -6, 10, 0, 0, 0, 12, 0 ), F( 1, -5, 2, 10, 0, 0, 9, 0 ), diff -Nru dav1d-0.9.2/src/tables.h dav1d-1.0.0/src/tables.h --- dav1d-0.9.2/src/tables.h 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/tables.h 2022-03-18 14:31:56.002356000 +0000 @@ -110,7 +110,7 @@ extern const uint16_t dav1d_sgr_params[16][2]; extern const uint8_t dav1d_sgr_x_by_x[256]; -extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8]; +extern const int8_t dav1d_mc_subpel_filters[6][15][8]; extern const int8_t dav1d_mc_warp_filter[193][8]; extern const int8_t dav1d_resize_filter[64][8]; diff -Nru dav1d-0.9.2/src/thread_task.c dav1d-1.0.0/src/thread_task.c --- dav1d-0.9.2/src/thread_task.c 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/thread_task.c 2022-03-18 14:31:56.002356000 +0000 @@ -27,345 +27,798 @@ #include "config.h" +#include "common/frame.h" + #include "src/thread_task.h" +#include "src/fg_apply.h" -int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) { - struct PostFilterThreadData *const pftd = f->lf.thread.pftd; - const int frame_idx = (int)(f - f->c->fc); +// This function resets the cur pointer to the first frame theoretically +// executable after a task completed (ie. each time we update some progress or +// insert some tasks in the queue). +// When frame_idx is set, it can be either from a completed task, or from tasks +// inserted in the queue, in which case we have to make sure the cur pointer +// isn't past this insert. +// The special case where frame_idx is UINT_MAX is to handle the reset after +// completing a task and locklessly signaling progress. In this case we don't +// enter a critical section, which is needed for this function, so we set an +// atomic for a delayed handling, happening here. Meaning we can call this +// function without any actual update other than what's in the atomic, hence +// this special case. 
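The comment above describes the trick used when progress is signaled without taking the scheduler lock: the frame index is parked in an atomic and the actual reset is deferred until the next time the lock is held. Below is a minimal stdatomic sketch of that hand-off, with illustrative names rather than dav1d's fields (pending_reset stands in for ttd->reset_task_cur, and UINT_MAX means "nothing pending"); it models the idea only, not the real scheduler.

#include <stdatomic.h>
#include <limits.h>

static atomic_uint pending_reset = UINT_MAX;

/* Lock-free path: remember the smallest frame index that still needs a reset. */
static void note_reset_async(unsigned frame_idx) {
    unsigned last = frame_idx;
    do {
        frame_idx = last;
        last = atomic_exchange(&pending_reset, frame_idx);
    } while (last < frame_idx); /* a smaller index was stored concurrently: write it back */
}

/* Locked path: consume the deferred index in a single exchange. */
static unsigned consume_reset(void) {
    return atomic_exchange(&pending_reset, UINT_MAX);
}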
+static inline int reset_task_cur(const Dav1dContext *const c, + struct TaskThreadData *const ttd, + unsigned frame_idx) +{ + const unsigned first = atomic_load(&ttd->first); + if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL) + return 0; + unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX); + if (reset_frame_idx != UINT_MAX) { + if (frame_idx == UINT_MAX) { + if (reset_frame_idx > first + ttd->cur) + return 0; + ttd->cur = reset_frame_idx - first; + goto cur_found; + } + } else if (frame_idx == UINT_MAX) + return 0; + if (frame_idx < first) frame_idx += c->n_fc; + const unsigned min_frame_idx = umin(reset_frame_idx, frame_idx); + const unsigned cur_frame_idx = first + ttd->cur; + if (ttd->cur < c->n_fc && cur_frame_idx < min_frame_idx) + return 0; + for (ttd->cur = min_frame_idx - first; ttd->cur < c->n_fc; ttd->cur++) + if (c->fc[(first + ttd->cur) % c->n_fc].task_thread.task_head) + break; +cur_found: + for (unsigned i = ttd->cur; i < c->n_fc; i++) + c->fc[(first + i) % c->n_fc].task_thread.task_cur_prev = NULL; + return 1; +} +static inline void reset_task_cur_async(struct TaskThreadData *const ttd, + unsigned frame_idx, unsigned n_frames) +{ + if (frame_idx < (unsigned)atomic_load(&ttd->first)) frame_idx += n_frames; + unsigned last_idx = frame_idx; + do { + frame_idx = last_idx; + last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx); + } while (last_idx < frame_idx); +} + +static void insert_tasks_between(Dav1dFrameContext *const f, + Dav1dTask *const first, Dav1dTask *const last, + Dav1dTask *const a, Dav1dTask *const b, + const int cond_signal) +{ + struct TaskThreadData *const ttd = f->task_thread.ttd; + if (atomic_load(f->c->flush)) return; + assert(!a || a->next == b); + if (!a) f->task_thread.task_head = first; + else a->next = first; + if (!b) f->task_thread.task_tail = last; + last->next = b; + reset_task_cur(f->c, ttd, first->frame_idx); + if (cond_signal && !atomic_fetch_or(&ttd->cond_signaled, 1)) + pthread_cond_signal(&ttd->cond); +} + +static void insert_tasks(Dav1dFrameContext *const f, + Dav1dTask *const first, Dav1dTask *const last, + const int cond_signal) +{ + // insert task back into task queue + Dav1dTask *t_ptr, *prev_t = NULL; + for (t_ptr = f->task_thread.task_head; + t_ptr; prev_t = t_ptr, t_ptr = t_ptr->next) + { + // entropy coding precedes other steps + if (t_ptr->type == DAV1D_TASK_TYPE_TILE_ENTROPY) { + if (first->type > DAV1D_TASK_TYPE_TILE_ENTROPY) continue; + // both are entropy + if (first->sby > t_ptr->sby) continue; + if (first->sby < t_ptr->sby) { + insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal); + return; + } + // same sby + } else { + if (first->type == DAV1D_TASK_TYPE_TILE_ENTROPY) { + insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal); + return; + } + if (first->sby > t_ptr->sby) continue; + if (first->sby < t_ptr->sby) { + insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal); + return; + } + // same sby + if (first->type > t_ptr->type) continue; + if (first->type < t_ptr->type) { + insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal); + return; + } + // same task type + } + + // sort by tile-id + assert(first->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION || + first->type == DAV1D_TASK_TYPE_TILE_ENTROPY); + assert(first->type == t_ptr->type); + assert(t_ptr->sby == first->sby); + const int p = first->type == DAV1D_TASK_TYPE_TILE_ENTROPY; + const int t_tile_idx = (int) (first - f->task_thread.tile_tasks[p]); + const int p_tile_idx = (int) (t_ptr 
- f->task_thread.tile_tasks[p]); + assert(t_tile_idx != p_tile_idx); + if (t_tile_idx > p_tile_idx) continue; + insert_tasks_between(f, first, last, prev_t, t_ptr, cond_signal); + return; + } + // append at the end + insert_tasks_between(f, first, last, prev_t, NULL, cond_signal); +} + +static inline void insert_task(Dav1dFrameContext *const f, + Dav1dTask *const t, const int cond_signal) +{ + insert_tasks(f, t, t, cond_signal); +} + +static int create_filter_sbrow(Dav1dFrameContext *const f, + const int pass, Dav1dTask **res_t) +{ const int has_deblock = f->frame_hdr->loopfilter.level_y[0] || - f->frame_hdr->loopfilter.level_y[1] || - f->lf.restore_planes; + f->frame_hdr->loopfilter.level_y[1]; const int has_cdef = f->seq_hdr->cdef; const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; - const int has_lr = !!f->lf.restore_planes; - f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr; - if (f->lf.thread.npf == 0) return 0; - - pthread_mutex_lock(&pftd->lock); - - Dav1dTask *tasks = f->lf.thread.tasks; - int num_tasks = f->sbh * f->lf.thread.npf; - if (num_tasks > f->lf.thread.num_tasks) { + const int has_lr = f->lf.restore_planes; + + Dav1dTask *tasks = f->task_thread.tasks; + const int uses_2pass = f->c->n_fc > 1; + int num_tasks = f->sbh * (1 + uses_2pass); + if (num_tasks > f->task_thread.num_tasks) { const size_t size = sizeof(Dav1dTask) * num_tasks; - tasks = realloc(f->lf.thread.tasks, size); - if (!tasks) { - pthread_mutex_unlock(&pftd->lock); - return -1; - } + tasks = realloc(f->task_thread.tasks, size); + if (!tasks) return -1; memset(tasks, 0, size); - f->lf.thread.tasks = tasks; - f->lf.thread.num_tasks = num_tasks; + f->task_thread.tasks = tasks; + f->task_thread.num_tasks = num_tasks; } + tasks += f->sbh * (pass & 1); -#define create_task(task, ready_cond, start_cond) \ - do { \ - t = &tasks[num_tasks++]; \ - t->status = ready_cond ? 
DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \ - t->start = start_cond; \ - t->frame_id = frame_cnt; \ - t->frame_idx = frame_idx; \ - t->sby = sby; \ - t->fn = f->bd_fn.filter_sbrow_##task; \ - t->last_deps[0] = NULL; \ - t->last_deps[1] = NULL; \ - t->next_deps[0] = NULL; \ - t->next_deps[1] = NULL; \ - t->next_exec = NULL; \ - } while (0) - - Dav1dTask *last_sbrow_deblock = NULL; - Dav1dTask *last_sbrow_cdef = NULL; - Dav1dTask *last_sbrow_resize = NULL; - Dav1dTask *last_sbrow_lr = NULL; - num_tasks = 0; - const int frame_cnt = pftd->frame_cnt++; - - for (int sby = 0; sby < f->sbh; ++sby) { - Dav1dTask *t; - Dav1dTask *last = NULL; - if (has_deblock) { - create_task(deblock, sby == 0, 0); - if (sby) { - t->last_deps[1] = last_sbrow_deblock; - last_sbrow_deblock->next_deps[1] = t; - } - last = t; - last_sbrow_deblock = t; - } - if (has_cdef) { - create_task(cdef, sby == 0 && !has_deblock, has_deblock); - if (has_deblock) { - t->last_deps[0] = last; - last->next_deps[0] = t; - } - if (sby) { - t->last_deps[1] = last_sbrow_cdef; - last_sbrow_cdef->next_deps[1] = t; - } - last = t; - last_sbrow_cdef = t; - }; - if (has_resize) { - create_task(resize, sby == 0 && !last, !!last); - if (last) { - t->last_deps[0] = last; - last->next_deps[0] = t; - } - if (sby) { - t->last_deps[1] = last_sbrow_resize; - last_sbrow_resize->next_deps[1] = t; - } - last = t; - last_sbrow_resize = t; - } - if (has_lr) { - create_task(lr, sby == 0 && !last, !!last); - if (last) { - t->last_deps[0] = last; - last->next_deps[0] = t; - } - if (sby) { - t->last_deps[1] = last_sbrow_lr; - last_sbrow_lr->next_deps[1] = t; - } - last_sbrow_lr = t; + if (pass & 1) { + f->frame_thread.entropy_progress = 0; + } else { + const int prog_sz = ((f->sbh + 31) & ~31) >> 5; + if (prog_sz > f->frame_thread.prog_sz) { + atomic_uint *const prog = realloc(f->frame_thread.frame_progress, + prog_sz * 2 * sizeof(*prog)); + if (!prog) return -1; + f->frame_thread.frame_progress = prog; + f->frame_thread.copy_lpf_progress = prog + prog_sz; + f->frame_thread.prog_sz = prog_sz; } + memset(f->frame_thread.frame_progress, 0, prog_sz * 2 * sizeof(atomic_uint)); + atomic_store(&f->frame_thread.deblock_progress, 0); } - f->lf.thread.done = 0; - pthread_mutex_unlock(&pftd->lock); + f->frame_thread.next_tile_row[pass & 1] = 0; - return 0; -} + Dav1dTask *t = &tasks[0]; + t->sby = 0; + t->recon_progress = 1; + t->deblock_progress = 0; + t->type = pass == 1 ? DAV1D_TASK_TYPE_ENTROPY_PROGRESS : + has_deblock ? DAV1D_TASK_TYPE_DEBLOCK_COLS : + has_cdef || has_lr /* i.e. LR backup */ ? DAV1D_TASK_TYPE_DEBLOCK_ROWS : + has_resize ? 
DAV1D_TASK_TYPE_SUPER_RESOLUTION : + DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS; + t->frame_idx = (int)(f - f->c->fc); -void dav1d_task_schedule(struct PostFilterThreadData *const pftd, - Dav1dTask *const t) -{ - Dav1dTask **pt = &pftd->tasks; - while (*pt && - ((*pt)->sby < t->sby || - ((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id))) - pt = &(*pt)->next_exec; - t->next_exec = *pt; - *pt = t; - pthread_cond_signal(&pftd->cond); + *res_t = t; + return 0; } -static inline void update_task(Dav1dTask *const t, const int dep_type, - Dav1dFrameContext *const f) +int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass, + const int cond_signal) { - if (!t->last_deps[!dep_type] || - t->last_deps[!dep_type]->status == DAV1D_TASK_DONE) - { - t->status = DAV1D_TASK_READY; - if (t->start) - dav1d_task_schedule(f->lf.thread.pftd, t); + Dav1dTask *tasks = f->task_thread.tile_tasks[0]; + const int uses_2pass = f->c->n_fc > 1; + const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; + int alloc_num_tasks = num_tasks * (1 + uses_2pass); + if (alloc_num_tasks > f->task_thread.num_tile_tasks) { + const size_t size = sizeof(Dav1dTask) * alloc_num_tasks; + tasks = realloc(f->task_thread.tile_tasks[0], size); + if (!tasks) return -1; + memset(tasks, 0, size); + f->task_thread.tile_tasks[0] = tasks; + f->task_thread.num_tile_tasks = alloc_num_tasks; } -} - -void *dav1d_frame_task(void *const data) { - Dav1dFrameContext *const f = data; + f->task_thread.tile_tasks[1] = tasks + num_tasks; + tasks += num_tasks * (pass & 1); - dav1d_set_thread_name("dav1d-frame"); - pthread_mutex_lock(&f->frame_thread.td.lock); - for (;;) { - while (!f->n_tile_data && !f->frame_thread.die) { - pthread_cond_wait(&f->frame_thread.td.cond, - &f->frame_thread.td.lock); - } - if (f->frame_thread.die) break; - pthread_mutex_unlock(&f->frame_thread.td.lock); - - if (dav1d_decode_frame(f)) - memset(f->frame_thread.cf, 0, - (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); - - pthread_mutex_lock(&f->frame_thread.td.lock); - f->n_tile_data = 0; - pthread_cond_signal(&f->frame_thread.td.cond); + Dav1dTask *pf_t; + if (create_filter_sbrow(f, pass, &pf_t)) + return -1; + + Dav1dTask *prev_t = NULL; + for (int tile_idx = 0; tile_idx < num_tasks; tile_idx++) { + Dav1dTileState *const ts = &f->ts[tile_idx]; + Dav1dTask *t = &tasks[tile_idx]; + t->sby = ts->tiling.row_start >> f->sb_shift; + if (pf_t && t->sby) { + prev_t->next = pf_t; + prev_t = pf_t; + pf_t = NULL; + } + t->recon_progress = 0; + t->deblock_progress = 0; + t->deps_skip = 0; + t->type = pass != 1 ? 
DAV1D_TASK_TYPE_TILE_RECONSTRUCTION : + DAV1D_TASK_TYPE_TILE_ENTROPY; + t->frame_idx = (int)(f - f->c->fc); + if (prev_t) prev_t->next = t; + prev_t = t; + } + if (pf_t) { + prev_t->next = pf_t; + prev_t = pf_t; } - pthread_mutex_unlock(&f->frame_thread.td.lock); + insert_tasks(f, &tasks[0], prev_t, cond_signal); + f->task_thread.done[pass & 1] = 0; - return NULL; + return 0; } -void *dav1d_tile_task(void *const data) { - Dav1dTileContext *const t = data; - struct FrameTileThreadData *const fttd = t->tile_thread.fttd; - const Dav1dFrameContext *const f = t->f; - const int tile_thread_idx = (int) (t - f->tc); - const uint64_t mask = 1ULL << tile_thread_idx; - - dav1d_set_thread_name("dav1d-tile"); +void dav1d_task_frame_init(Dav1dFrameContext *const f) { + const Dav1dContext *const c = f->c; - for (;;) { - pthread_mutex_lock(&fttd->lock); - fttd->available |= mask; - int did_signal = 0; - while (!fttd->tasks_left && !t->tile_thread.die) { - if (!did_signal) { - did_signal = 1; - pthread_cond_signal(&fttd->icond); - } - pthread_cond_wait(&fttd->cond, &fttd->lock); - } - if (t->tile_thread.die) { - pthread_cond_signal(&fttd->icond); - pthread_mutex_unlock(&fttd->lock); - break; - } - fttd->available &= ~mask; - const int task_idx = fttd->num_tasks - fttd->tasks_left--; - pthread_mutex_unlock(&fttd->lock); - - if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) { - // we can (or in fact, if >, we need to) do full tile decoding. - // loopfilter happens in the main thread - Dav1dTileState *const ts = t->ts = &f->ts[task_idx]; - for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end; - t->by += f->sb_step) - { - const int error = dav1d_decode_tile_sbrow(t); - const int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift); + f->task_thread.init_done = 0; + // schedule init task, which will schedule the remaining tasks + Dav1dTask *const t = &f->task_thread.init_task; + t->type = DAV1D_TASK_TYPE_INIT; + t->frame_idx = (int)(f - c->fc); + t->sby = 0; + t->recon_progress = t->deblock_progress = 0; + insert_task(f, t, 1); +} - // signal progress - pthread_mutex_lock(&ts->tile_thread.lock); - atomic_store(&ts->progress, progress); - pthread_cond_signal(&ts->tile_thread.cond); - pthread_mutex_unlock(&ts->tile_thread.lock); - if (error) break; - } - } else { - const int sby = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0]; - const int tile_idx = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1]; - Dav1dTileState *const ts = &f->ts[tile_idx]; - int progress; +void dav1d_task_delayed_fg(Dav1dContext *const c, Dav1dPicture *const out, + const Dav1dPicture *const in) +{ + struct TaskThreadData *const ttd = &c->task_thread; + ttd->delayed_fg.in = in; + ttd->delayed_fg.out = out; + ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_PREP; + atomic_init(&ttd->delayed_fg.progress[0], 0); + atomic_init(&ttd->delayed_fg.progress[1], 0); + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 1; + pthread_cond_signal(&ttd->cond); + pthread_cond_wait(&ttd->delayed_fg.cond, &ttd->lock); + pthread_mutex_unlock(&ttd->lock); +} - // the interleaved decoding can sometimes cause dependency issues - // if one part of the frame decodes signifcantly faster than others. - // Ideally, we'd "skip" tile_sbrows where dependencies are missing, - // and resume them later as dependencies are met. This also would - // solve the broadcast() below and allow us to use signal(). However, - // for now, we use linear dependency tracking because it's simpler. 
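The dav1d_task_delayed_fg addition earlier in this hunk hands a film-grain job to the worker pool and blocks until the workers report back. The pthread sketch below shows that submit-and-wait shape in isolation; all names and the boolean flags are illustrative only, since the real code drives dedicated FG_PREP/FG_APPLY task types and per-row progress counters rather than a single flag.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  work_ready = PTHREAD_COND_INITIALIZER;
static pthread_cond_t  work_done  = PTHREAD_COND_INITIALIZER;
static bool have_job, job_finished;

/* Submitting thread: publish one job, wake a worker, sleep until it reports back. */
void submit_and_wait(void) {
    pthread_mutex_lock(&lock);
    have_job = true;
    job_finished = false;
    pthread_cond_signal(&work_ready);
    while (!job_finished)
        pthread_cond_wait(&work_done, &lock);
    pthread_mutex_unlock(&lock);
}

/* Worker thread: take the job, run it outside the lock, then report completion. */
void *worker(void *arg) {
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!have_job)
        pthread_cond_wait(&work_ready, &lock);
    have_job = false;
    pthread_mutex_unlock(&lock);
    /* ... do the actual work here, e.g. prepare and apply film grain ... */
    pthread_mutex_lock(&lock);
    job_finished = true;
    pthread_cond_signal(&work_done);
    pthread_mutex_unlock(&lock);
    return NULL;
}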
- if ((progress = atomic_load(&ts->progress)) < sby) { - pthread_mutex_lock(&ts->tile_thread.lock); - while ((progress = atomic_load(&ts->progress)) < sby) - pthread_cond_wait(&ts->tile_thread.cond, - &ts->tile_thread.lock); - pthread_mutex_unlock(&ts->tile_thread.lock); - } - if (progress == TILE_ERROR) continue; - - // we need to interleave sbrow decoding for all tile cols in a - // tile row, since otherwise subsequent threads will be blocked - // waiting for the post-filter to complete - t->ts = ts; - t->by = sby << f->sb_shift; - const int error = dav1d_decode_tile_sbrow(t); - progress = error ? TILE_ERROR : 1 + sby; +static inline int ensure_progress(struct TaskThreadData *const ttd, + Dav1dFrameContext *const f, + Dav1dTask *const t, const enum TaskType type, + atomic_int *const state, int *const target) +{ + // deblock_rows (non-LR portion) depends on deblock of previous sbrow, + // so ensure that completed. if not, re-add to task-queue; else, fall-through + int p1 = atomic_load(state); + if (p1 < t->sby) { + pthread_mutex_lock(&ttd->lock); + p1 = atomic_load(state); + if (p1 < t->sby) { + t->type = type; + t->recon_progress = t->deblock_progress = 0; + *target = t->sby; + insert_task(f, t, 0); + return 1; + } + pthread_mutex_unlock(&ttd->lock); + } + return 0; +} - // signal progress - pthread_mutex_lock(&ts->tile_thread.lock); - atomic_store(&ts->progress, progress); - pthread_cond_broadcast(&ts->tile_thread.cond); - pthread_mutex_unlock(&ts->tile_thread.lock); +static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f, + const int frame_mt) +{ + const int tp = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY; + const int tile_idx = (int)(t - f->task_thread.tile_tasks[tp]); + Dav1dTileState *const ts = &f->ts[tile_idx]; + const int p1 = atomic_load(&ts->progress[tp]); + if (p1 < t->sby) return 1; + int error = p1 == TILE_ERROR; + error |= atomic_fetch_or(&f->task_thread.error, error); + if (!error && frame_mt && !tp) { + const int p2 = atomic_load(&ts->progress[1]); + if (p2 <= t->sby) return 1; + error = p2 == TILE_ERROR; + error |= atomic_fetch_or(&f->task_thread.error, error); + } + if (!error && frame_mt && !IS_KEY_OR_INTRA(f->frame_hdr)) { + // check reference state + const Dav1dThreadPicture *p = &f->sr_cur; + const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const unsigned p_b = (t->sby + 1) << (f->sb_shift + 2); + const int tile_sby = t->sby - (ts->tiling.row_start >> f->sb_shift); + const int (*const lowest_px)[2] = ts->lowest_pixel[tile_sby]; + for (int n = t->deps_skip; n < 7; n++, t->deps_skip++) { + unsigned lowest; + if (tp) { + // if temporal mv refs are disabled, we only need this + // for the primary ref; if segmentation is disabled, we + // don't even need that + lowest = p_b; + } else { + // +8 is postfilter-induced delay + const int y = lowest_px[n][0] == INT_MIN ? INT_MIN : + lowest_px[n][0] + 8; + const int uv = lowest_px[n][1] == INT_MIN ? INT_MIN : + lowest_px[n][1] * (1 << ss_ver) + 8; + const int max = imax(y, uv); + if (max == INT_MIN) continue; + lowest = iclip(max, 1, f->refp[n].p.p.h); + } + const unsigned p3 = atomic_load(&f->refp[n].progress[!tp]); + if (p3 < lowest) return 1; + atomic_fetch_or(&f->task_thread.error, p3 == FRAME_ERROR); } } + return 0; +} - return NULL; +static inline void abort_frame(Dav1dFrameContext *const f, const int error) { + atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 
1 : -1); + f->task_thread.task_counter = 0; + f->task_thread.done[0] = 1; + f->task_thread.done[1] = 1; + atomic_store(&f->sr_cur.progress[0], FRAME_ERROR); + atomic_store(&f->sr_cur.progress[1], FRAME_ERROR); + dav1d_decode_frame_exit(f, error); + f->n_tile_data = 0; + pthread_cond_signal(&f->task_thread.cond); } -static inline int handle_abortion(Dav1dPostFilterContext *const pf, - Dav1dContext *const c, - struct PostFilterThreadData *const pftd) -{ - const int flush = atomic_load_explicit(c->flush, memory_order_acquire); - if (flush) { - pthread_mutex_lock(&pf->td.lock); - pf->flushed = 0; - pthread_mutex_unlock(&pf->td.lock); - } - for (unsigned i = 0; i < c->n_fc; i++) { - Dav1dFrameContext *const f = &c->fc[i]; - int send_signal; - if (flush) // TODO before merge, see if this can be safely merged - send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0; - else - send_signal = f->lf.thread.done == -1; - for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) { - Dav1dTask *const t = &f->lf.thread.tasks[j]; - if (t->status == DAV1D_TASK_RUNNING || - (t->status == DAV1D_TASK_DONE && t->start != -1)) - send_signal = 0; - } - if (send_signal) { - if (!flush) { - Dav1dTask **pt = &pftd->tasks; - while (*pt) { - if ((*pt)->frame_idx == i) - *pt = (*pt)->next_exec; - else - pt = &(*pt)->next_exec; - } - } - f->lf.thread.done = 1; - pthread_cond_signal(&f->lf.thread.cond); +static inline void delayed_fg_task(const Dav1dContext *const c, + struct TaskThreadData *const ttd) +{ + const Dav1dPicture *const in = ttd->delayed_fg.in; + Dav1dPicture *const out = ttd->delayed_fg.out; +#if CONFIG_16BPC + int off; + if (out->p.bpc != 8) + off = (out->p.bpc >> 1) - 4; +#endif + switch (ttd->delayed_fg.type) { + case DAV1D_TASK_TYPE_FG_PREP: + ttd->delayed_fg.exec = 0; + if (atomic_load(&ttd->cond_signaled)) + pthread_cond_signal(&ttd->cond); + pthread_mutex_unlock(&ttd->lock); + switch (out->p.bpc) { +#if CONFIG_8BPC + case 8: + dav1d_prep_grain_8bpc(&c->dsp[0].fg, out, in, + ttd->delayed_fg.scaling_8bpc, + ttd->delayed_fg.grain_lut_8bpc); + break; +#endif +#if CONFIG_16BPC + case 10: + case 12: + dav1d_prep_grain_16bpc(&c->dsp[off].fg, out, in, + ttd->delayed_fg.scaling_16bpc, + ttd->delayed_fg.grain_lut_16bpc); + break; +#endif + default: abort(); } + ttd->delayed_fg.type = DAV1D_TASK_TYPE_FG_APPLY; + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 1; + // fall-through + case DAV1D_TASK_TYPE_FG_APPLY:; + int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); + pthread_mutex_unlock(&ttd->lock); + int progmax = (out->p.h + 31) >> 5; + fg_apply_loop: + if (row + 1 < progmax) + pthread_cond_signal(&ttd->cond); + else if (row + 1 >= progmax) { + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 0; + if (row >= progmax) goto end_add; + pthread_mutex_unlock(&ttd->lock); + } + switch (out->p.bpc) { +#if CONFIG_8BPC + case 8: + dav1d_apply_grain_row_8bpc(&c->dsp[0].fg, out, in, + ttd->delayed_fg.scaling_8bpc, + ttd->delayed_fg.grain_lut_8bpc, row); + break; +#endif +#if CONFIG_16BPC + case 10: + case 12: + dav1d_apply_grain_row_16bpc(&c->dsp[off].fg, out, in, + ttd->delayed_fg.scaling_16bpc, + ttd->delayed_fg.grain_lut_16bpc, row); + break; +#endif + default: abort(); + } + row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); + int done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 1; + if (row < progmax) goto fg_apply_loop; + pthread_mutex_lock(&ttd->lock); + ttd->delayed_fg.exec = 0; + end_add: + done = atomic_fetch_add(&ttd->delayed_fg.progress[1], 1) + 
1; + progmax = atomic_load(&ttd->delayed_fg.progress[0]); + // signal for completion only once the last runner reaches this + if (done < progmax) + break; + pthread_cond_signal(&ttd->delayed_fg.cond); + break; + default: abort(); } - if (flush) { - pthread_mutex_lock(&pf->td.lock); - pf->flushed = 1; - pthread_cond_signal(&pf->td.cond); - pthread_mutex_unlock(&pf->td.lock); - } - return !flush; } -void *dav1d_postfilter_task(void *data) { - Dav1dPostFilterContext *const pf = data; - Dav1dContext *const c = pf->c; - struct PostFilterThreadData *pftd = &c->postfilter_thread; +void *dav1d_worker_task(void *data) { + Dav1dTaskContext *const tc = data; + const Dav1dContext *const c = tc->c; + struct TaskThreadData *const ttd = tc->task_thread.ttd; - dav1d_set_thread_name("dav1d-postfilter"); + dav1d_set_thread_name("dav1d-worker"); - int exec = 1; - pthread_mutex_lock(&pftd->lock); + pthread_mutex_lock(&ttd->lock); for (;;) { - if (!exec && !pf->die) - pthread_cond_wait(&pftd->cond, &pftd->lock); - if (!(exec = handle_abortion(pf, c, pftd))) continue; - if (pf->die) break; - - Dav1dTask *const t = pftd->tasks; - if (!t) { exec = 0; continue; } - pftd->tasks = t->next_exec; - t->status = DAV1D_TASK_RUNNING; - - pthread_mutex_unlock(&pftd->lock); - Dav1dFrameContext *const f = &c->fc[t->frame_idx]; - t->fn(f, t->sby); - exec = 1; - pthread_mutex_lock(&pftd->lock); - - if (t->next_deps[0]) - update_task(t->next_deps[0], 0, f); - if (t->next_deps[1]) - update_task(t->next_deps[1], 1, f); - t->status = DAV1D_TASK_DONE; - if (!t->next_deps[0]) { - const enum PlaneType progress_plane_type = - c->n_fc > 1 && f->frame_hdr->refresh_context ? - PLANE_TYPE_Y : PLANE_TYPE_ALL; - const int y = (t->sby + 1) * f->sb_step * 4; - dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type); - if (t->sby + 1 == f->sbh) { - f->lf.thread.done = 1; - pthread_cond_signal(&f->lf.thread.cond); + if (tc->task_thread.die) break; + if (atomic_load(c->flush)) goto park; + if (ttd->delayed_fg.exec) { // run delayed film grain first + delayed_fg_task(c, ttd); + continue; + } + Dav1dFrameContext *f; + Dav1dTask *t, *prev_t = NULL; + if (c->n_fc > 1) { // run init tasks second + for (unsigned i = 0; i < c->n_fc; i++) { + const unsigned first = atomic_load(&ttd->first); + f = &c->fc[(first + i) % c->n_fc]; + if (f->task_thread.init_done) continue; + t = f->task_thread.task_head; + if (!t) continue; + if (t->type == DAV1D_TASK_TYPE_INIT) goto found; + if (t->type == DAV1D_TASK_TYPE_INIT_CDF) { + const int p1 = f->in_cdf.progress ? + atomic_load(f->in_cdf.progress) : 1; + if (p1) { + atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR); + goto found; + } + } } } - t->start = -1; + while (ttd->cur < c->n_fc) { // run decoding tasks last + const unsigned first = atomic_load(&ttd->first); + f = &c->fc[(first + ttd->cur) % c->n_fc]; + prev_t = f->task_thread.task_cur_prev; + t = prev_t ? 
prev_t->next : f->task_thread.task_head; + while (t) { + if (t->type == DAV1D_TASK_TYPE_INIT_CDF) goto next; + else if (t->type == DAV1D_TASK_TYPE_TILE_ENTROPY || + t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION) + { + // if not bottom sbrow of tile, this task will be re-added + // after it's finished + if (!check_tile(t, f, c->n_fc > 1)) + goto found; + } else if (t->recon_progress) { + const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS; + int error = atomic_load(&f->task_thread.error); + assert(!f->task_thread.done[p] || error); + const int tile_row_base = f->frame_hdr->tiling.cols * + f->frame_thread.next_tile_row[p]; + if (p) { + const int p1 = f->frame_thread.entropy_progress; + if (p1 < t->sby) goto next; + atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR); + } + for (int tc = 0; tc < f->frame_hdr->tiling.cols; tc++) { + Dav1dTileState *const ts = &f->ts[tile_row_base + tc]; + const int p2 = atomic_load(&ts->progress[p]); + if (p2 < t->recon_progress) goto next; + atomic_fetch_or(&f->task_thread.error, p2 == TILE_ERROR); + } + if (t->sby + 1 < f->sbh) { + // add sby+1 to list to replace this one + Dav1dTask *next_t = &t[1]; + *next_t = *t; + next_t->sby++; + const int ntr = f->frame_thread.next_tile_row[p] + 1; + const int start = f->frame_hdr->tiling.row_start_sb[ntr]; + if (next_t->sby == start) + f->frame_thread.next_tile_row[p] = ntr; + next_t->recon_progress = next_t->sby + 1; + insert_task(f, next_t, 0); + } + goto found; + } else if (t->type == DAV1D_TASK_TYPE_CDEF) { + atomic_uint *prog = f->frame_thread.copy_lpf_progress; + const int p1 = atomic_load(&prog[(t->sby - 1) >> 5]); + if (p1 & (1U << ((t->sby - 1) & 31))) + goto found; + } else { + assert(t->deblock_progress); + const int p1 = atomic_load(&f->frame_thread.deblock_progress); + if (p1 >= t->deblock_progress) { + atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR); + goto found; + } + } + next: + prev_t = t; + t = t->next; + f->task_thread.task_cur_prev = prev_t; + } + ttd->cur++; + } + if (reset_task_cur(c, ttd, UINT_MAX)) continue; + park: + tc->task_thread.flushed = 1; + pthread_cond_signal(&tc->task_thread.td.cond); + // we want to be woken up next time progress is signaled + atomic_store(&ttd->cond_signaled, 0); + pthread_cond_wait(&ttd->cond, &ttd->lock); + tc->task_thread.flushed = 0; + reset_task_cur(c, ttd, UINT_MAX); + continue; + + found: + // remove t from list + if (prev_t) prev_t->next = t->next; + else f->task_thread.task_head = t->next; + if (!t->next) f->task_thread.task_tail = prev_t; + if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head) + ttd->cur++; + // we don't need to check cond_signaled here, since we found a task + // after the last signal so we want to re-signal the next waiting thread + // and again won't need to signal after that + atomic_store(&ttd->cond_signaled, 1); + pthread_cond_signal(&ttd->cond); + pthread_mutex_unlock(&ttd->lock); + found_unlocked:; + const int flush = atomic_load(c->flush); + int error = atomic_fetch_or(&f->task_thread.error, flush) | flush; + + // run it + tc->f = f; + int sby = t->sby; + switch (t->type) { + case DAV1D_TASK_TYPE_INIT: { + assert(c->n_fc > 1); + int res = dav1d_decode_frame_init(f); + int p1 = f->in_cdf.progress ? atomic_load(f->in_cdf.progress) : 1; + if (res || p1 == TILE_ERROR) { + pthread_mutex_lock(&ttd->lock); + abort_frame(f, res ? 
res : DAV1D_ERR(EINVAL)); + } else if (!res) { + t->type = DAV1D_TASK_TYPE_INIT_CDF; + if (p1) goto found_unlocked; + pthread_mutex_lock(&ttd->lock); + insert_task(f, t, 0); + } + reset_task_cur(c, ttd, t->frame_idx); + continue; + } + case DAV1D_TASK_TYPE_INIT_CDF: { + assert(c->n_fc > 1); + int res = DAV1D_ERR(EINVAL); + if (!atomic_load(&f->task_thread.error)) + res = dav1d_decode_frame_init_cdf(f); + pthread_mutex_lock(&ttd->lock); + if (f->frame_hdr->refresh_context && !f->task_thread.update_set) { + atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1); + } + if (!res) { + assert(c->n_fc > 1); + for (int p = 1; p <= 2; p++) { + const int res = dav1d_task_create_tile_sbrow(f, p, 0); + if (res) { + // memory allocation failed + f->task_thread.done[2 - p] = 1; + atomic_store(&f->task_thread.error, -1); + f->task_thread.task_counter -= f->sbh + + f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; + atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR); + if (p == 2 && f->task_thread.done[1]) { + assert(!f->task_thread.task_counter); + dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM)); + f->n_tile_data = 0; + pthread_cond_signal(&f->task_thread.cond); + } + } + } + } else abort_frame(f, res); + reset_task_cur(c, ttd, t->frame_idx); + f->task_thread.init_done = 1; + continue; + } + case DAV1D_TASK_TYPE_TILE_ENTROPY: + case DAV1D_TASK_TYPE_TILE_RECONSTRUCTION: { + const int p = t->type == DAV1D_TASK_TYPE_TILE_ENTROPY; + const int tile_idx = (int)(t - f->task_thread.tile_tasks[p]); + Dav1dTileState *const ts = &f->ts[tile_idx]; + + tc->ts = ts; + tc->by = sby << f->sb_shift; + const int uses_2pass = c->n_fc > 1; + tc->frame_thread.pass = !uses_2pass ? 0 : + 1 + (t->type == DAV1D_TASK_TYPE_TILE_RECONSTRUCTION); + if (!error) error = dav1d_decode_tile_sbrow(tc); + const int progress = error ? TILE_ERROR : 1 + sby; + + // signal progress + atomic_fetch_or(&f->task_thread.error, error); + if (((sby + 1) << f->sb_shift) < ts->tiling.row_end) { + t->sby++; + t->deps_skip = 0; + if (!check_tile(t, f, uses_2pass)) { + atomic_store(&ts->progress[p], progress); + reset_task_cur_async(ttd, t->frame_idx, c->n_fc); + if (!atomic_fetch_or(&ttd->cond_signaled, 1)) + pthread_cond_signal(&ttd->cond); + goto found_unlocked; + } + pthread_mutex_lock(&ttd->lock); + atomic_store(&ts->progress[p], progress); + reset_task_cur(c, ttd, t->frame_idx); + insert_task(f, t, 0); + } else { + pthread_mutex_lock(&ttd->lock); + atomic_store(&ts->progress[p], progress); + reset_task_cur(c, ttd, t->frame_idx); + error = atomic_load(&f->task_thread.error); + if (f->frame_hdr->refresh_context && + tc->frame_thread.pass <= 1 && f->task_thread.update_set && + f->frame_hdr->tiling.update == tile_idx) + { + if (!error) + dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf, + &f->ts[f->frame_hdr->tiling.update].cdf); + if (c->n_fc > 1) + atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1); + } + if (!--f->task_thread.task_counter && f->task_thread.done[0] && + (!uses_2pass || f->task_thread.done[1])) + { + dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : + error ? 
DAV1D_ERR(ENOMEM) : 0); + f->n_tile_data = 0; + pthread_cond_signal(&f->task_thread.cond); + } + assert(f->task_thread.task_counter >= 0); + if (!atomic_fetch_or(&ttd->cond_signaled, 1)) + pthread_cond_signal(&ttd->cond); + } + continue; + } + case DAV1D_TASK_TYPE_DEBLOCK_COLS: + if (!atomic_load(&f->task_thread.error)) + f->bd_fn.filter_sbrow_deblock_cols(f, sby); + if (ensure_progress(ttd, f, t, DAV1D_TASK_TYPE_DEBLOCK_ROWS, + &f->frame_thread.deblock_progress, + &t->deblock_progress)) continue; + // fall-through + case DAV1D_TASK_TYPE_DEBLOCK_ROWS: + if (!atomic_load(&f->task_thread.error)) + f->bd_fn.filter_sbrow_deblock_rows(f, sby); + // signal deblock progress + if (f->frame_hdr->loopfilter.level_y[0] || + f->frame_hdr->loopfilter.level_y[1]) + { + error = atomic_load(&f->task_thread.error); + atomic_store(&f->frame_thread.deblock_progress, + error ? TILE_ERROR : sby + 1); + reset_task_cur_async(ttd, t->frame_idx, c->n_fc); + if (!atomic_fetch_or(&ttd->cond_signaled, 1)) + pthread_cond_signal(&ttd->cond); + } else if (f->seq_hdr->cdef || f->lf.restore_planes) { + atomic_fetch_or(&f->frame_thread.copy_lpf_progress[sby >> 5], + 1U << (sby & 31)); + // CDEF needs the top buffer to be saved by lr_copy_lpf of the + // previous sbrow + if (sby) { + int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]); + if (~prog & (1U << ((sby - 1) & 31))) { + pthread_mutex_lock(&ttd->lock); + prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]); + if (~prog & (1U << ((sby - 1) & 31))) { + t->type = DAV1D_TASK_TYPE_CDEF; + t->recon_progress = t->deblock_progress = 0; + insert_task(f, t, 0); + continue; + } + pthread_mutex_unlock(&ttd->lock); + } + } + } + // fall-through + case DAV1D_TASK_TYPE_CDEF: + if (f->seq_hdr->cdef) { + if (!atomic_load(&f->task_thread.error)) + f->bd_fn.filter_sbrow_cdef(tc, sby); + reset_task_cur_async(ttd, t->frame_idx, c->n_fc); + if (!atomic_fetch_or(&ttd->cond_signaled, 1)) + pthread_cond_signal(&ttd->cond); + } + // fall-through + case DAV1D_TASK_TYPE_SUPER_RESOLUTION: + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) + if (!atomic_load(&f->task_thread.error)) + f->bd_fn.filter_sbrow_resize(f, sby); + // fall-through + case DAV1D_TASK_TYPE_LOOP_RESTORATION: + if (!atomic_load(&f->task_thread.error) && f->lf.restore_planes) + f->bd_fn.filter_sbrow_lr(f, sby); + // fall-through + case DAV1D_TASK_TYPE_RECONSTRUCTION_PROGRESS: + // dummy to cover for no post-filters + case DAV1D_TASK_TYPE_ENTROPY_PROGRESS: + // dummy to convert tile progress to frame + break; + default: abort(); + } + // if task completed [typically LR], signal picture progress as per below + const int uses_2pass = c->n_fc > 1; + const int sbh = f->sbh; + const int sbsz = f->sb_step * 4; + const enum PlaneType progress_plane_type = + t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS ? PLANE_TYPE_BLOCK : + c->n_fc > 1 ? PLANE_TYPE_Y : PLANE_TYPE_ALL; + if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS) + atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5], + 1U << (sby & 31)); + pthread_mutex_lock(&ttd->lock); + if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS) { + unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0; + if (frame_prog < FRAME_ERROR) { + int idx = frame_prog >> (f->sb_shift + 7); + int prog; + do { + atomic_uint *state = &f->frame_thread.frame_progress[idx]; + const unsigned val = ~atomic_load(state); + prog = val ? 
ctz(val) : 32; + if (prog != 32) break; + prog = 0; + } while (++idx < f->frame_thread.prog_sz); + sby = ((idx << 5) | prog) - 1; + } else sby = sbh - 1; + } + error = atomic_load(&f->task_thread.error); + const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz; + if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) { + const int idx = t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS; + atomic_store(&f->sr_cur.progress[idx], error ? FRAME_ERROR : y); + } + if (progress_plane_type == PLANE_TYPE_BLOCK) + f->frame_thread.entropy_progress = error ? TILE_ERROR : sby + 1; + if (sby + 1 == sbh) + f->task_thread.done[progress_plane_type == PLANE_TYPE_BLOCK] = 1; + if (!--f->task_thread.task_counter && + f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1])) + { + dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : + error ? DAV1D_ERR(ENOMEM) : 0); + f->n_tile_data = 0; + pthread_cond_signal(&f->task_thread.cond); + } + reset_task_cur(c, ttd, t->frame_idx); } - pthread_mutex_unlock(&pftd->lock); + pthread_mutex_unlock(&ttd->lock); return NULL; } diff -Nru dav1d-0.9.2/src/thread_task.h dav1d-1.0.0/src/thread_task.h --- dav1d-0.9.2/src/thread_task.h 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/thread_task.h 2022-03-18 14:31:56.002356000 +0000 @@ -35,33 +35,19 @@ #define FRAME_ERROR (UINT_MAX - 1) #define TILE_ERROR (INT_MAX - 1) -enum TaskStatus { - DAV1D_TASK_DEFAULT, - DAV1D_TASK_READY, - DAV1D_TASK_RUNNING, - DAV1D_TASK_DONE, -}; - -struct Dav1dTask { - enum TaskStatus status; // task status - int start; // frame thread start flag - unsigned frame_idx; // frame thread id - int frame_id; // frame ordering - int sby; // sbrow - filter_sbrow_fn fn; // task work - Dav1dTask *last_deps[2]; // dependencies - Dav1dTask *next_deps[2]; // dependant tasks - Dav1dTask *next_exec; // tasks scheduling -}; - -int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f); -void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t); - -void *dav1d_frame_task(void *data); -void *dav1d_tile_task(void *data); -void *dav1d_postfilter_task(void *data); - +// these functions assume the task scheduling lock is already taken +int dav1d_task_create_tile_sbrow(Dav1dFrameContext *f, int pass, int cond_signal); +void dav1d_task_frame_init(Dav1dFrameContext *f); + +void dav1d_task_delayed_fg(Dav1dContext *c, Dav1dPicture *out, const Dav1dPicture *in); + +void *dav1d_worker_task(void *data); + +int dav1d_decode_frame_init(Dav1dFrameContext *f); +int dav1d_decode_frame_init_cdf(Dav1dFrameContext *f); +int dav1d_decode_frame_main(Dav1dFrameContext *f); +void dav1d_decode_frame_exit(Dav1dFrameContext *f, int retval); int dav1d_decode_frame(Dav1dFrameContext *f); -int dav1d_decode_tile_sbrow(Dav1dTileContext *t); +int dav1d_decode_tile_sbrow(Dav1dTaskContext *t); #endif /* DAV1D_SRC_THREAD_TASK_H */ diff -Nru dav1d-0.9.2/src/x86/cdef16_avx2.asm dav1d-1.0.0/src/x86/cdef16_avx2.asm --- dav1d-0.9.2/src/x86/cdef16_avx2.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cdef16_avx2.asm 2022-03-18 14:31:56.006356000 +0000 @@ -59,19 +59,11 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %macro CDEF_FILTER 2 ; w, h - DEFINE_ARGS dst, stride, dir, pridmp, pri, sec, tmp - movifnidn prid, r4m - movifnidn secd, r5m - mov dird, r6m + DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp + movifnidn prid, r5m + movifnidn secd, r6m + mov dird, r7m vpbroadcastd m8, [base+pw_2048] lea 
dirq, [base+dir_table%1+dirq*2] test prid, prid @@ -86,9 +78,9 @@ %endif lzcnt pridmpd, prid rorx tmpd, prid, 2 - cmp dword r9m, 0xfff ; if (bpc == 12) + cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, tmpd ; pri >>= 2 - mov tmpd, r7m ; damping + mov tmpd, r8m ; damping and prid, 4 sub tmpd, 31 vpbroadcastd m9, [base+pri_taps+priq+8*0] @@ -137,7 +129,7 @@ .end: RET .sec_only: - mov tmpd, r7m ; damping + mov tmpd, r8m ; damping %if WIN64 vpbroadcastw m6, secm %else @@ -226,7 +218,7 @@ movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+r8 ], xm1 + movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 @@ -362,7 +354,7 @@ movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+r8 ], xm1 + movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 @@ -582,7 +574,7 @@ movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+r8 ], xm1 + movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 @@ -594,26 +586,27 @@ %endmacro INIT_YMM avx2 -cglobal cdef_filter_4x4_16bpc, 4, 9, 9, 16*10, dst, stride, left, top, pri, sec, edge +cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \ + pri, sec, edge %if WIN64 %define px rsp+16*6 - %define offq r7 + %define offq r8 %define pri_shift rsp+16*2 %define sec_shift rsp+16*3 %else %define px rsp+16*4 - %define offq r3 + %define offq r4 %define pri_shift rsp+16*0 %define sec_shift rsp+16*1 %endif - %define base r7-dir_table4 - mov edged, r8m - lea r7, [dir_table4] + %define base r8-dir_table4 + mov edged, r9m + lea r8, [dir_table4] movu xm0, [dstq+strideq*0] movu xm1, [dstq+strideq*1] - lea r8, [strideq*3] + lea r9, [strideq*3] movu xm2, [dstq+strideq*2] - movu xm3, [dstq+r8 ] + movu xm3, [dstq+r9 ] vpbroadcastd m7, [base+pw_m16384] mova [px+16*0+0], xm0 mova [px+16*1+0], xm1 @@ -640,15 +633,14 @@ .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom - lea r3, [dstq+strideq*4] - movu xm0, [r3+strideq*0] - movu xm1, [r3+strideq*1] + movu xm0, [botq+strideq*0] + movu xm1, [botq+strideq*1] mova [px+16*4+0], xm0 mova [px+16*5+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left - movd xm0, [r3+strideq*0-4] - movd xm1, [r3+strideq*1-4] + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] movd [px+16*4-4], xm0 movd [px+16*5-4], xm1 jmp .bottom_done @@ -678,19 +670,20 @@ .padding_done: CDEF_FILTER 4, 4 -cglobal cdef_filter_4x8_16bpc, 4, 9, 9, 16*14, dst, stride, left, top, pri, sec, edge - mov edged, r8m +cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \ + pri, sec, edge + mov edged, r9m movu xm0, [dstq+strideq*0] movu xm1, [dstq+strideq*1] - lea r8, [strideq*3] + lea r9, [strideq*3] movu xm2, [dstq+strideq*2] - movu xm3, [dstq+r8 ] - lea r7, [dstq+strideq*4] - movu xm4, [r7 +strideq*0] - movu xm5, [r7 +strideq*1] - movu xm6, [r7 +strideq*2] - movu xm7, [r7 +r8 ] - lea r7, [dir_table4] + movu xm3, [dstq+r9 ] + lea r6, [dstq+strideq*4] + movu xm4, [r6 +strideq*0] + movu xm5, [r6 +strideq*1] + movu xm6, [r6 +strideq*2] + movu xm7, [r6 +r9 ] + lea r8, [dir_table4] mova [px+16*0+0], xm0 mova [px+16*1+0], xm1 mova [px+16*2+0], xm2 @@ -721,15 +714,14 @@ .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom - lea r3, [dstq+strideq*8] - movu xm0, [r3+strideq*0] - movu xm1, [r3+strideq*1] + movu xm0, [botq+strideq*0] + movu xm1, [botq+strideq*1] mova [px+16*8+0], xm0 mova [px+16*9+0], xm1 test edgeb, 1 ; 
HAVE_LEFT jz .bottom_no_left - movd xm0, [r3+strideq*0-4] - movd xm1, [r3+strideq*1-4] + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] movd [px+16*8-4], xm0 movd [px+16*9-4], xm1 jmp .bottom_done @@ -767,26 +759,27 @@ .padding_done: CDEF_FILTER 4, 8 -cglobal cdef_filter_8x8_16bpc, 4, 8, 9, 32*13, dst, stride, left, top, pri, sec, edge +cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \ + pri, sec, edge %if WIN64 %define px rsp+32*4 %else %define px rsp+32*3 %endif - %define base r7-dir_table8 - mov edged, r8m + %define base r8-dir_table8 + mov edged, r9m movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] - lea r7, [dstq+strideq*2] - movu m2, [r7 +strideq*0] - movu m3, [r7 +strideq*1] - lea r7, [r7 +strideq*2] - movu m4, [r7 +strideq*0] - movu m5, [r7 +strideq*1] - lea r7, [r7 +strideq*2] - movu m6, [r7 +strideq*0] - movu m7, [r7 +strideq*1] - lea r7, [dir_table8] + lea r6, [dstq+strideq*2] + movu m2, [r6 +strideq*0] + movu m3, [r6 +strideq*1] + lea r6, [r6 +strideq*2] + movu m4, [r6 +strideq*0] + movu m5, [r6 +strideq*1] + lea r6, [r6 +strideq*2] + movu m6, [r6 +strideq*0] + movu m7, [r6 +strideq*1] + lea r8, [dir_table8] mova [px+32*0+0], m0 mova [px+32*1+0], m1 mova [px+32*2+0], m2 @@ -818,15 +811,14 @@ .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom - lea r3, [dstq+strideq*8] - movu m0, [r3+strideq*0] - movu m1, [r3+strideq*1] + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] mova [px+32*8+0], m0 mova [px+32*9+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left - movd xm0, [r3+strideq*0-4] - movd xm1, [r3+strideq*1-4] + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] movd [px+32*8-4], xm0 movd [px+32*9-4], xm1 jmp .bottom_done diff -Nru dav1d-0.9.2/src/x86/cdef16_sse.asm dav1d-1.0.0/src/x86/cdef16_sse.asm --- dav1d-0.9.2/src/x86/cdef16_sse.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cdef16_sse.asm 2022-03-18 14:31:56.006356000 +0000 @@ -64,25 +64,17 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %if ARCH_X86_32 DECLARE_REG_TMP 5, 3 %elif WIN64 -DECLARE_REG_TMP 7, 4 +DECLARE_REG_TMP 8, 4 %else -DECLARE_REG_TMP 7, 8 +DECLARE_REG_TMP 8, 6 %endif %macro CDEF_FILTER 2 ; w, h %if ARCH_X86_64 - DEFINE_ARGS dst, stride, tmp, pridmp, pri, sec, dir + DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir mova m8, [base+pw_2048] %else DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir @@ -90,20 +82,20 @@ %define m9 [rsp+16*1+gprsize] %define m10 [rsp+16*2+gprsize] %endif - movifnidn prid, r4m - movifnidn secd, r5m + movifnidn prid, r5m + movifnidn secd, r6m test prid, prid jz .sec_only - movd m6, r4m + movd m6, r5m %if ARCH_X86_32 mov [rsp+24], pridmpd %endif bsr pridmpd, prid lea tmpd, [priq*4] - cmp dword r9m, 0x3ff ; if (bpc == 10) + cmp dword r10m, 0x3ff ; if (bpc == 10) cmove prid, tmpd ; pri <<= 2 - mov tmpd, r7m ; damping - mov dird, r6m + mov tmpd, r8m ; damping + mov dird, r7m and prid, 16 pshufb m6, m7 ; splat lea dirq, [base+dir_table+dirq*2] @@ -155,10 +147,10 @@ .end: RET .sec_only: - mov tmpd, r7m ; damping - movd m6, r5m + mov tmpd, r8m ; damping + movd m6, r6m tzcnt secd, secd - mov dird, r6m + mov dird, r7m pshufb m6, m7 sub tmpd, secd lea dirq, [base+dir_table+dirq*2] @@ -172,7 +164,11 @@ %endrep jmp .end %if %1 == %2 -DEFINE_ARGS dst, stride, tmp, off, pri, _, dir + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir + %else + DEFINE_ARGS dst, stride, tmp, off, pri, _, dir + %endif ALIGN function_align .pri: movsx offq, 
byte [dirq+4] ; off_k0 @@ -649,16 +645,18 @@ INIT_XMM ssse3 %if ARCH_X86_64 -cglobal cdef_filter_4x4_16bpc, 4, 8, 9, 32*10, dst, stride, left, top, pri, sec, edge +cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \ + pri, sec, edge %define px rsp+32*4 %else cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left + %define botq topq %define px rsp+32*5 %endif %define base t0-dir_table %define pri_shift px-16*6 %define sec_shift px-16*5 - mov edged, r8m + mov edged, r9m LEA t0, dir_table movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] @@ -693,15 +691,15 @@ .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom - lea r3, [dstq+strideq*4] - movu m0, [r3+strideq*0] - movu m1, [r3+strideq*1] + movifnidn botq, r4mp + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] mova [px+32*4+0], m0 mova [px+32*5+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left - movd m0, [r3+strideq*0-4] - movd m1, [r3+strideq*1-4] + movd m0, [botq+strideq*0-4] + movd m1, [botq+strideq*1-4] movd [px+32*4-4], m0 movd [px+32*5-4], m1 jmp .bottom_done @@ -734,11 +732,12 @@ CDEF_FILTER 4, 4 %if ARCH_X86_64 -cglobal cdef_filter_4x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge +cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ + pri, sec, edge %else cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left %endif - mov edged, r8m + mov edged, r9m LEA t0, dir_table movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] @@ -783,15 +782,15 @@ .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom - lea r3, [dstq+strideq*8] - movu m0, [r3+strideq*0] - movu m1, [r3+strideq*1] + movifnidn botq, r4mp + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] mova [px+32*8+0], m0 mova [px+32*9+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left - movd m0, [r3+strideq*0-4] - movd m1, [r3+strideq*1-4] + movd m0, [botq+strideq*0-4] + movd m1, [botq+strideq*1-4] movd [px+32*8-4], m0 movd [px+32*9-4], m1 jmp .bottom_done @@ -832,11 +831,12 @@ CDEF_FILTER 4, 8 %if ARCH_X86_64 -cglobal cdef_filter_8x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge +cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ + pri, sec, edge %else cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left %endif - mov edged, r8m + mov edged, r9m LEA t0, dir_table mova m0, [dstq+strideq*0+ 0] movd m1, [dstq+strideq*0+16] @@ -903,19 +903,19 @@ .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom - lea r3, [dstq+strideq*8] - mova m0, [r3+strideq*0+ 0] - movd m1, [r3+strideq*0+16] - mova m2, [r3+strideq*1+ 0] - movd m3, [r3+strideq*1+16] + movifnidn botq, r4mp + mova m0, [botq+strideq*0+ 0] + movd m1, [botq+strideq*0+16] + mova m2, [botq+strideq*1+ 0] + movd m3, [botq+strideq*1+16] mova [px+32*8+ 0], m0 movd [px+32*8+16], m1 mova [px+32*9+ 0], m2 movd [px+32*9+16], m3 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left - movd m0, [r3+strideq*0-4] - movd m1, [r3+strideq*1-4] + movd m0, [botq+strideq*0-4] + movd m1, [botq+strideq*1-4] movd [px+32*8- 4], m0 movd [px+32*9- 4], m1 jmp .bottom_done diff -Nru dav1d-0.9.2/src/x86/cdef_avx2.asm dav1d-1.0.0/src/x86/cdef_avx2.asm --- dav1d-0.9.2/src/x86/cdef_avx2.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cdef_avx2.asm 2022-03-18 14:31:56.006356000 +0000 @@ -93,26 +93,25 @@ %macro PREP_REGS 2 ; w, h ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - mov dird, r6m + mov dird, r7m lea tableq, [cdef_filter_%1x%2_8bpc_jmptable] lea dirq, 
[tableq+dirq*2*4] %if %1 == 4 %if %2 == 4 - DEFINE_ARGS dst, stride, left, top, pri, sec, \ - table, dir, dirjmp, dst4, stride3, k + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ + table, dir, dirjmp, stride3, k %else - DEFINE_ARGS dst, stride, left, top, pri, sec, \ - table, dir, dirjmp, dst4, dst8, stride3, k - lea dst8q, [dstq+strideq*8] + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ + table, dir, dirjmp, dst4, stride3, k + lea dst4q, [dstq+strideq*4] %endif %else - DEFINE_ARGS dst, stride, h, top1, pri, sec, \ - table, dir, dirjmp, top2, dst4, stride3, k + DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \ + table, dir, dirjmp, top2, stride3, k mov hq, -8 lea top1q, [top1q+strideq*0] lea top2q, [top1q+strideq*1] %endif - lea dst4q, [dstq+strideq*4] %if %1 == 4 lea stride3q, [strideq*3] %endif @@ -280,17 +279,17 @@ %macro BORDER_PREP_REGS 2 ; w, h ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - mov dird, r6m + mov dird, r7m lea dirq, [tableq+dirq*2+14] %if %1*%2*2/mmsize > 1 %if %1 == 4 - DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k + DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off %else - DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k + DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off %endif mov hd, %1*%2*2/mmsize %else - DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k + DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off %endif lea stkq, [px] pxor m11, m11 @@ -385,10 +384,10 @@ packuswb m4, m4 vextracti128 xm5, m4, 1 %if %1 == 4 - movd [dstq+strideq*0], xm4 + movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 - movd [dstq+strideq*2], xm5 - pextrd [dstq+stride3q], xm5, 1 + movd [dstq+strideq*2], xm5 + pextrd [dstq+stride3q ], xm5, 1 %else movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 @@ -397,14 +396,13 @@ %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 -cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \ - pri, sec, dir, damping, edge +cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge %assign stack_offset_entry stack_offset mov edged, edgem cmp edged, 0xf jne .border_block - PUSH r9 PUSH r10 PUSH r11 %if %2 == 4 @@ -413,7 +411,7 @@ PUSH r%+regs_used %assign regs_used regs_used+1 %endif - ALLOC_STACK 0x60, 16 + ALLOC_STACK 0x60, 16 pmovzxbw xm0, [leftq+1] vpermq m0, m0, q0110 psrldq m1, m0, 4 @@ -439,7 +437,7 @@ PUSH r%+regs_used %assign regs_used regs_used+1 %endif - ALLOC_STACK 8*2+%1*%2*2+32, 16 + ALLOC_STACK 8*4+%1*%2*2+32, 16 lea r11, [strideq*3] movu xm4, [dstq+strideq*2] pmovzxwq m0, [leftq+0] @@ -447,15 +445,17 @@ vinserti128 m4, [dstq+r11], 1 pmovzxbd m2, [leftq+1] pmovzxbd m3, [leftq+9] - mova [rsp+0x10], m0 - mova [rsp+0x30], m1 - mova [rsp+0x50], m2 - mova [rsp+0x70], m3 - mova [rsp+0x90], m4 + mov [rsp+16], botq + mova [rsp+0x20], m0 + mova [rsp+0x40], m1 + mova [rsp+0x60], m2 + mova [rsp+0x80], m3 + mova [rsp+0xa0], m4 + lea botq, [dstq+strideq*4] %endif - DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping - mov dampingd, r7m + DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping + mov dampingd, r8m xor zerod, zerod movifnidn prid, prim sub dampingd, 31 @@ -474,13 +474,13 @@ add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift - DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp + DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m14, 
[tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength and prid, 1 @@ -505,21 +505,21 @@ pxor m9, m9 ADJUST_PIXEL %1, %2, m9, m10, 1 %if %1*%2 > mmsize - mov dstq, dst4q - lea top1q, [rsp+0x90] - lea top2q, [rsp+0xA0] - lea dst4q, [dst4q+strideq*4] + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] add hq, 4 jl .v_loop %endif RET .pri_only: - DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp + DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, left, top, pri, _, table, dir + DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir vpbroadcastb m0, xm0 ; pri_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps @@ -537,26 +537,26 @@ jge .pri_k_loop ADJUST_PIXEL %1, %2, m1, m3 %if %1*%2 > mmsize - mov dstq, dst4q - lea top1q, [rsp+0x90] - lea top2q, [rsp+0xA0] - lea dst4q, [dst4q+strideq*4] + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] add hq, 4 jl .pri_v_loop %endif RET .sec_only: - DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping + DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift - DEFINE_ARGS dst, stride, left, top, _, secdmp, table + DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table lea tableq, [tap_table] vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, left, top, _, sec, table, dir + DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir vpbroadcastb m1, xm1 ; sec_strength lea secq, [tableq+12] ; sec_taps PREP_REGS %1, %2 @@ -574,10 +574,10 @@ jge .sec_k_loop ADJUST_PIXEL %1, %2, m0, m2 %if %1*%2 > mmsize - mov dstq, dst4q - lea top1q, [rsp+0x90] - lea top2q, [rsp+0xA0] - lea dst4q, [dst4q+strideq*4] + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] add hq, 4 jl .sec_v_loop %endif @@ -593,7 +593,7 @@ psrldq m11, m6, 2 psrldq m12, m10, 2 vinserti128 m6, [dstq+stride3q -1], 1 - vinserti128 m10, [dstq+strideq*4-1], 1 + vinserti128 m10, [botq -1], 1 vpblendd m5, m11, 0x10 vpblendd m9, m12, 0x10 movu m11, [blend_4x4+16] @@ -616,7 +616,7 @@ psrldq xm10, 2 shufps xm10, xm9, q2020 ; +3 +4 +5 +6 movd xm9, [dst4q+stride3q -1] - pinsrd xm9, [dst4q+strideq*4-1], 1 + pinsrd xm9, [botq -1], 1 shufps xm11, xm9, q1020 ; +5 +6 +7 +8 pmovzxbw m9, [leftq+3] vinserti128 m6, xm11, 1 @@ -630,15 +630,15 @@ vbroadcasti128 m10, [dstq+strideq*1-1] vbroadcasti128 m11, [dstq+strideq*2-1] movhps xm5, [dstq+strideq*0+1] - vinserti128 m6, m10, [dstq+stride3q -1], 1 - vinserti128 m9, m11, [dstq+strideq*4-1], 1 + vinserti128 m6, m10, [dstq+stride3q-1], 1 + vinserti128 m9, m11, [botq -1], 1 psrldq m10, 2 psrldq m11, 2 punpcklqdq m6, m9 movu m9, [r13+hq*2*1+16*1] punpcklqdq m10, m11 vpblendd m5, m10, 0xF0 - vpblendvb m6, [rsp+gprsize+80+hq*8+64+8*1], m9 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9 %endif ret .d1k0: @@ -688,7 +688,7 @@ vinserti128 m9, [dstq+stride3q -1], 1 movu m10, [blend_8x8_0+16] punpcklqdq m6, m5, m9 - vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10 
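
; Note (hedged, inferred from this hunk, not stated in the patch): the bot/botq
; edits throughout these cdef kernels replace bottom-edge rows that were
; previously addressed relative to dst (dstq+strideq*4, dstq+strideq*8, dst8q)
; with an explicit pointer passed as a new fifth argument. Presumably this lets
; the caller hand in a saved copy of those rows when, under the reworked
; task-based threading, the sbrow below is not yet (or no longer) available in
; the frame buffer itself; the register renumbering (r4m->r5m, r7m->r8m, etc.)
; simply follows from the extra argument.
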
psrldq m5, 2 psrldq m9, 2 punpcklqdq m5, m9 @@ -707,7 +707,7 @@ vpblendd m9, m11, 0x10 movu m10, [blend_4x4] vinserti128 m5, [dstq+stride3q +1], 1 - vinserti128 m12, [dstq+strideq*4+1], 1 + vinserti128 m12, [botq +1], 1 punpckldq m6, m9 punpckldq m5, m12 vpblendvb m6, [rsp+gprsize+0x40], m10 @@ -727,7 +727,7 @@ shufps xm10, xm11, q2020 movd xm9, [dst4q+stride3q +1] vinserti128 m6, xm10, 1 - pinsrd xm9, [dst4q+strideq*4+1], 1 + pinsrd xm9, [botq +1], 1 psrldq xm11, 2 pmovzxbw m10, [leftq-1] shufps xm11, xm9, q1020 @@ -744,9 +744,9 @@ movu m11, [r13+hq*2*1+16*1] punpcklqdq m10, m5, m9 vinserti128 m5, [dstq+stride3q -1], 1 - vinserti128 m9, [dstq+strideq*4-1], 1 + vinserti128 m9, [botq -1], 1 vpblendd m6, m10, 0xF0 - vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*1], m11 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11 psrldq m5, 2 psrldq m9, 2 punpcklqdq m5, m9 @@ -763,8 +763,8 @@ vpblendd xm6, [dstq+strideq*0-4], 0x2 vpblendd m5, m9, 0x22 vpblendd m6, m5, 0x30 - vinserti128 m5, [dstq+stride3q ], 1 - vpblendd m5, [dstq+strideq*4-20], 0x20 + vinserti128 m5, [dstq+stride3q ], 1 + vpblendd m5, [botq -20], 0x20 %else movd xm6, [topq +strideq*1] movd xm5, [dstq +strideq*1] @@ -775,7 +775,7 @@ pinsrd xm5, [dstq +strideq*2], 1 pinsrd xm9, [dst4q+strideq*0], 1 pinsrd xm10, [dst4q+strideq*2], 1 - pinsrd xm11, [dst4q+strideq*4], 1 + pinsrd xm11, [botq ], 1 punpcklqdq xm6, xm5 punpcklqdq xm5, xm9 punpcklqdq xm9, xm10 @@ -789,7 +789,7 @@ movq xm9, [dstq+stride3q ] movhps xm6, [dstq+strideq*0] movhps xm5, [dstq+strideq*2] - movhps xm9, [dstq+strideq*4] + movhps xm9, [botq ] vinserti128 m6, xm5, 1 vinserti128 m5, xm9, 1 %endif @@ -797,16 +797,16 @@ .d0k1: %if %1 == 4 %if %2 == 4 - movd xm6, [dstq +strideq*2-2] - movd xm9, [dstq +stride3q -2] - movd xm5, [topq +strideq*0+2] - movd xm10, [topq +strideq*1+2] + movd xm6, [dstq+strideq*2-2] + movd xm9, [dstq+stride3q -2] + movd xm5, [topq+strideq*0+2] + movd xm10, [topq+strideq*1+2] pinsrw xm6, [leftq+4], 0 pinsrw xm9, [leftq+6], 0 - vinserti128 m5, [dstq +strideq*0+2], 1 - vinserti128 m10, [dstq +strideq*1+2], 1 - vinserti128 m6, [dst4q+strideq*0-2], 1 - vinserti128 m9, [dst4q+strideq*1-2], 1 + vinserti128 m5, [dstq+strideq*0+2], 1 + vinserti128 m10, [dstq+strideq*1+2], 1 + vinserti128 m6, [botq+strideq*0-2], 1 + vinserti128 m9, [botq+strideq*1-2], 1 punpckldq m5, m10 punpckldq m6, m9 %else @@ -818,31 +818,31 @@ pinsrw xm10, [dst4q+stride3q ], 3 pinsrd xm5, [topq +strideq*1+2], 1 movhps xm9, [dst4q+strideq*1-2] - pinsrd xm10, [dst8q+strideq*0-2], 2 + pinsrd xm10, [botq +strideq*0-2], 2 pinsrd xm5, [dstq +strideq*0+2], 2 - pinsrd xm10, [dst8q+strideq*1-2], 3 + pinsrd xm10, [botq +strideq*1-2], 3 pinsrd xm5, [dstq +strideq*1+2], 3 shufps xm11, xm6, xm9, q3131 shufps xm6, xm9, q2020 movu m9, [blend_4x8_3+8] vinserti128 m6, xm10, 1 vinserti128 m5, xm11, 1 - vpblendvb m6, [rsp+gprsize+16+8], m9 + vpblendvb m6, [rsp+gprsize+0x10+8], m9 %endif %else lea r13, [blend_8x8_1+16] - movq xm6, [dstq +strideq*2-2] - movq xm9, [dstq +stride3q -2] - movq xm5, [top1q +2] - movq xm10, [top2q +2] + movq xm6, [dstq+strideq*2-2] + movq xm9, [dstq+stride3q -2] + movq xm5, [top1q +2] + movq xm10, [top2q +2] movu m11, [r13+hq*2*2+16*2] - vinserti128 m6, [dst4q+strideq*0-2], 1 - vinserti128 m9, [dst4q+strideq*1-2], 1 - vinserti128 m5, [dstq +strideq*0+2], 1 - vinserti128 m10, [dstq +strideq*1+2], 1 + vinserti128 m6, [botq+strideq*0-2], 1 + vinserti128 m9, [botq+strideq*1-2], 1 + vinserti128 m5, [dstq+strideq*0+2], 1 + vinserti128 m10, [dstq+strideq*1+2], 1 punpcklqdq m6, m9 punpcklqdq m5, 
m10 - vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*2], m11 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11 %endif ret .d1k1: @@ -856,14 +856,14 @@ psrldq m12, m9, 4 vpblendd m5, m11, 0x10 movq xm11, [leftq+2] - vinserti128 m6, [dstq+stride3q -2], 1 + vinserti128 m6, [dstq+stride3q-2], 1 punpckldq xm11, xm11 vpblendd m10, m12, 0x10 pcmpeqd m12, m12 pmovzxwd m11, xm11 psrld m12, 16 punpckldq m6, m9 - vpbroadcastd m9, [dstq+strideq*4-2] + vpbroadcastd m9, [botq-2] vpblendvb m6, m11, m12 punpckldq m5, m10 vpblendd m6, m9, 0x20 @@ -877,7 +877,7 @@ movhps xm6, [dstq +strideq*2-2] movhps xm9, [dst4q+strideq*0-2] movhps xm10, [dst4q+strideq*2-2] - pinsrd xm11, [dst4q+strideq*4-2], 1 + pinsrd xm11, [botq -2], 1 shufps xm5, xm6, q3110 shufps xm6, xm9, q2020 shufps xm9, xm10, q3131 @@ -885,7 +885,7 @@ movu m11, [blend_4x8_2+4] vinserti128 m6, xm10, 1 vinserti128 m5, xm9, 1 - vpblendvb m6, [rsp+gprsize+16+4], m11 + vpblendvb m6, [rsp+gprsize+0x10+4], m11 %endif %else lea r13, [blend_8x8_1+16] @@ -895,11 +895,11 @@ movhps xm5, [dstq+strideq*0+2] shufps m10, m6, m9, q2121 vinserti128 m6, [dstq+stride3q -2], 1 - vinserti128 m9, [dstq+strideq*4-2], 1 + vinserti128 m9, [botq -2], 1 movu m11, [r13+hq*2*1+16*1] vpblendd m5, m10, 0xF0 punpcklqdq m6, m9 - vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*1], m11 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11 %endif ret .d2k1: @@ -936,7 +936,7 @@ pblendw m6, m11, 0x55 %endif %else - mova m11, [rsp+gprsize+16+hq*8+64] + mova m11, [rsp+gprsize+0x20+hq*8+64] movu xm5, [dstq+strideq*0-2] movu xm9, [dstq+strideq*1-2] vinserti128 m5, [dstq+strideq*2-2], 1 @@ -959,7 +959,7 @@ psrldq m5, m11, 4 psrldq m10, m12, 4 vinserti128 m5, [dstq+stride3q +2], 1 - vinserti128 m10, [dstq+strideq*4+2], 1 + vinserti128 m10, [botq +2], 1 vpblendd m6, m11, 0x10 vpblendd m9, m12, 0x10 punpckldq m6, m9 @@ -974,7 +974,7 @@ movhps xm5, [dstq +strideq*2-2] movhps xm9, [dst4q+strideq*0-2] movhps xm10, [dst4q+strideq*2-2] - pinsrd xm11, [dst4q+strideq*4+2], 1 + pinsrd xm11, [botq +2], 1 shufps xm6, xm5, q2010 shufps xm5, xm9, q3131 shufps xm9, xm10, q2020 @@ -982,7 +982,7 @@ movu m11, [blend_4x8_2] vinserti128 m6, xm9, 1 vinserti128 m5, xm10, 1 - vpblendvb m6, [rsp+gprsize+16-4], m11 + vpblendvb m6, [rsp+gprsize+0x10-4], m11 %endif %else lea r13, [blend_8x8_1+8] @@ -992,26 +992,26 @@ movhps xm6, [dstq+strideq*0-2] punpcklqdq m9, m5, m10 vinserti128 m5, [dstq+stride3q -2], 1 - vinserti128 m10, [dstq+strideq*4-2], 1 + vinserti128 m10, [botq -2], 1 movu m11, [r13+hq*2*1+16*1] vpblendd m6, m9, 0xF0 shufps m5, m10, q2121 - vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*1], m11 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11 %endif ret .d4k1: %if %1 == 4 %if %2 == 4 - vinserti128 m6, [dstq +strideq*0-2], 1 - vinserti128 m9, [dstq +strideq*1-2], 1 - movd xm5, [dstq +strideq*2+2] - movd xm10, [dstq +stride3q +2] + vinserti128 m6, [dstq+strideq*0-2], 1 + vinserti128 m9, [dstq+strideq*1-2], 1 + movd xm5, [dstq+strideq*2+2] + movd xm10, [dstq+stride3q +2] pblendw m6, [leftq-16+0], 0x01 pblendw m9, [leftq-16+2], 0x01 - vinserti128 m5, [dst4q+strideq*0+2], 1 - vinserti128 m10, [dst4q+strideq*1+2], 1 - vpblendd m6, [topq +strideq*0-2], 0x01 - vpblendd m9, [topq +strideq*1-2], 0x01 + vinserti128 m5, [botq+strideq*0+2], 1 + vinserti128 m10, [botq+strideq*1+2], 1 + vpblendd m6, [topq+strideq*0-2], 0x01 + vpblendd m9, [topq+strideq*1-2], 0x01 punpckldq m5, m10 punpckldq m6, m9 %else @@ -1024,46 +1024,46 @@ movhps xm9, [dst4q+strideq*1-2] pinsrd xm10, [dst4q+stride3q +2], 1 pinsrd xm6, [dstq +strideq*0-2], 2 - pinsrd 
xm10, [dst8q+strideq*0+2], 2 + pinsrd xm10, [botq +strideq*0+2], 2 pinsrd xm6, [dstq +strideq*1-2], 3 - pinsrd xm10, [dst8q+strideq*1+2], 3 + pinsrd xm10, [botq +strideq*1+2], 3 shufps xm11, xm5, xm9, q2020 shufps xm5, xm9, q3131 movu m9, [blend_4x8_3] vinserti128 m6, xm11, 1 vinserti128 m5, xm10, 1 - vpblendvb m6, [rsp+gprsize+16-8], m9 + vpblendvb m6, [rsp+gprsize+0x10-8], m9 %endif %else lea r13, [blend_8x8_1] movu m11, [r13+hq*2*2+16*2] - movq xm6, [top1q -2] - movq xm9, [top2q -2] - movq xm5, [dstq +strideq*2+2] - movq xm10, [dstq +stride3q +2] - vinserti128 m6, [dstq +strideq*0-2], 1 - vinserti128 m9, [dstq +strideq*1-2], 1 - vinserti128 m5, [dst4q+strideq*0+2], 1 - vinserti128 m10, [dst4q+strideq*1+2], 1 + movq xm6, [top1q -2] + movq xm9, [top2q -2] + movq xm5, [dstq+strideq*2+2] + movq xm10, [dstq+stride3q +2] + vinserti128 m6, [dstq+strideq*0-2], 1 + vinserti128 m9, [dstq+strideq*1-2], 1 + vinserti128 m5, [botq+strideq*0+2], 1 + vinserti128 m10, [botq+strideq*1+2], 1 punpcklqdq m6, m9 - vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*2], m11 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11 punpcklqdq m5, m10 %endif ret .d5k1: %if %1 == 4 %if %2 == 4 - movd xm6, [topq +strideq*0-1] - movd xm9, [topq +strideq*1-1] - movd xm5, [dstq +strideq*2+1] - movd xm10, [dstq +stride3q +1] + movd xm6, [topq+strideq*0-1] + movd xm9, [topq+strideq*1-1] + movd xm5, [dstq+strideq*2+1] + movd xm10, [dstq+stride3q +1] pcmpeqd m12, m12 pmovzxbw m11, [leftq-8+1] psrld m12, 24 - vinserti128 m6, [dstq +strideq*0-1], 1 - vinserti128 m9, [dstq +strideq*1-1], 1 - vinserti128 m5, [dst4q+strideq*0+1], 1 - vinserti128 m10, [dst4q+strideq*1+1], 1 + vinserti128 m6, [dstq+strideq*0-1], 1 + vinserti128 m9, [dstq+strideq*1-1], 1 + vinserti128 m5, [botq+strideq*0+1], 1 + vinserti128 m10, [botq+strideq*1+1], 1 punpckldq m6, m9 pxor m9, m9 vpblendd m12, m9, 0x0F @@ -1079,9 +1079,9 @@ movhps xm9, [dst4q+strideq*1-1] pinsrd xm10, [dst4q+stride3q +1], 1 pinsrd xm6, [dstq +strideq*0-1], 2 - pinsrd xm10, [dst8q+strideq*0+1], 2 + pinsrd xm10, [botq +strideq*0+1], 2 pinsrd xm6, [dstq +strideq*1-1], 3 - pinsrd xm10, [dst8q+strideq*1+1], 3 + pinsrd xm10, [botq +strideq*1+1], 3 shufps xm11, xm5, xm9, q2020 vinserti128 m6, xm11, 1 pmovzxbw m11, [leftq-3] @@ -1095,30 +1095,30 @@ %else lea r13, [blend_8x8_0] movu m11, [r13+hq*2*2+16*2] - movq xm6, [top1q -1] - movq xm9, [top2q -1] - movq xm5, [dstq +strideq*2+1] - movq xm10, [dstq +stride3q +1] - vinserti128 m6, [dstq +strideq*0-1], 1 - vinserti128 m9, [dstq +strideq*1-1], 1 - vinserti128 m5, [dst4q+strideq*0+1], 1 - vinserti128 m10, [dst4q+strideq*1+1], 1 + movq xm6, [top1q -1] + movq xm9, [top2q -1] + movq xm5, [dstq+strideq*2+1] + movq xm10, [dstq+stride3q +1] + vinserti128 m6, [dstq+strideq*0-1], 1 + vinserti128 m9, [dstq+strideq*1-1], 1 + vinserti128 m5, [botq+strideq*0+1], 1 + vinserti128 m10, [botq+strideq*1+1], 1 punpcklqdq m6, m9 punpcklqdq m5, m10 - vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*2], m11 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11 %endif ret .d6k1: %if %1 == 4 %if %2 == 4 - movd xm6, [topq +strideq*0] - movd xm9, [topq +strideq*1] - movd xm5, [dstq +strideq*2] - movd xm10, [dstq +stride3q ] - vinserti128 m6, [dstq +strideq*0], 1 - vinserti128 m9, [dstq +strideq*1], 1 - vinserti128 m5, [dst4q+strideq*0], 1 - vinserti128 m10, [dst4q+strideq*1], 1 + movd xm6, [topq+strideq*0] + movd xm9, [topq+strideq*1] + movd xm5, [dstq+strideq*2] + movd xm10, [dstq+stride3q ] + vinserti128 m6, [dstq+strideq*0], 1 + vinserti128 m9, [dstq+strideq*1], 1 + vinserti128 m5, 
[botq+strideq*0], 1 + vinserti128 m10, [botq+strideq*1], 1 punpckldq m6, m9 punpckldq m5, m10 %else @@ -1130,22 +1130,22 @@ pinsrd xm9, [dst4q+stride3q ], 1 pinsrd xm5, [dst4q+strideq*0], 2 pinsrd xm6, [dstq +strideq*0], 2 - pinsrd xm9, [dst8q+strideq*0], 2 + pinsrd xm9, [botq +strideq*0], 2 pinsrd xm5, [dst4q+strideq*1], 3 pinsrd xm6, [dstq +strideq*1], 3 - pinsrd xm9, [dst8q+strideq*1], 3 + pinsrd xm9, [botq +strideq*1], 3 vinserti128 m6, xm5, 1 vinserti128 m5, xm9, 1 %endif %else - movq xm5, [dstq +strideq*2] - movq xm9, [dst4q+strideq*0] - movq xm6, [top1q ] - movq xm10, [dstq +strideq*0] - movhps xm5, [dstq +stride3q ] - movhps xm9, [dst4q+strideq*1] - movhps xm6, [top2q ] - movhps xm10, [dstq +strideq*1] + movq xm5, [dstq+strideq*2] + movq xm9, [botq+strideq*0] + movq xm6, [top1q ] + movq xm10, [dstq+strideq*0] + movhps xm5, [dstq+stride3q ] + movhps xm9, [botq+strideq*1] + movhps xm6, [top2q ] + movhps xm10, [dstq+strideq*1] vinserti128 m5, xm9, 1 vinserti128 m6, xm10, 1 %endif @@ -1153,16 +1153,16 @@ .d7k1: %if %1 == 4 %if %2 == 4 - movd xm5, [dstq +strideq*2-1] - movd xm9, [dstq +stride3q -1] - movd xm6, [topq +strideq*0+1] - movd xm10, [topq +strideq*1+1] + movd xm5, [dstq+strideq*2-1] + movd xm9, [dstq+stride3q -1] + movd xm6, [topq+strideq*0+1] + movd xm10, [topq+strideq*1+1] pinsrb xm5, [leftq+ 5], 0 pinsrb xm9, [leftq+ 7], 0 - vinserti128 m6, [dstq +strideq*0+1], 1 - vinserti128 m10, [dstq +strideq*1+1], 1 - vinserti128 m5, [dst4q+strideq*0-1], 1 - vinserti128 m9, [dst4q+strideq*1-1], 1 + vinserti128 m6, [dstq+strideq*0+1], 1 + vinserti128 m10, [dstq+strideq*1+1], 1 + vinserti128 m5, [botq+strideq*0-1], 1 + vinserti128 m9, [botq+strideq*1-1], 1 punpckldq m6, m10 punpckldq m5, m9 %else @@ -1175,9 +1175,9 @@ movhps xm10, [dst4q+strideq*1-1] pinsrd xm11, [dst4q+stride3q -1], 1 pinsrd xm6, [dstq +strideq*0+1], 2 - pinsrd xm11, [dst8q+strideq*0-1], 2 + pinsrd xm11, [botq +strideq*0-1], 2 pinsrd xm6, [dstq +strideq*1+1], 3 - pinsrd xm11, [dst8q+strideq*1-1], 3 + pinsrd xm11, [botq +strideq*1-1], 3 shufps xm5, xm9, xm10, q2020 vinserti128 m5, xm11, 1 pmovzxbw m11, [leftq+5] @@ -1190,31 +1190,26 @@ %endif %else lea r13, [blend_8x8_0+16] - movq xm5, [dstq +strideq*2-1] - movq xm9, [dst4q+strideq*0-1] - movq xm6, [top1q +1] - movq xm10, [dstq +strideq*0+1] - movhps xm5, [dstq +stride3q -1] - movhps xm9, [dst4q+strideq*1-1] - movhps xm6, [top2q +1] - movhps xm10, [dstq +strideq*1+1] + movq xm5, [dstq+strideq*2-1] + movq xm9, [botq+strideq*0-1] + movq xm6, [top1q +1] + movq xm10, [dstq+strideq*0+1] + movhps xm5, [dstq+stride3q -1] + movhps xm9, [botq+strideq*1-1] + movhps xm6, [top2q +1] + movhps xm10, [dstq+strideq*1+1] movu m11, [r13+hq*2*2+16*2] vinserti128 m5, xm9, 1 vinserti128 m6, xm10, 1 - vpblendvb m5, [rsp+gprsize+80+hq*8+64+8*2], m11 + vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11 %endif ret .border_block: - DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge %define rstk rsp %assign stack_offset stack_offset_entry -%if %1 == 4 && %2 == 8 - PUSH r9 - %assign regs_used 10 -%else - %assign regs_used 9 -%endif +%assign regs_used 10 %if STACK_ALIGNMENT < 32 PUSH r%+regs_used %assign regs_used regs_used+1 @@ -1398,22 +1393,21 @@ .left_done: ; bottom - DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge + DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge test edgeb, 8 ; have_bottom jz .no_bottom - lea dst8q, [dstq+%2*strideq] test edgeb, 1 ; have_left jz 
.bottom_no_left test edgeb, 2 ; have_right jz .bottom_no_right - pmovzxbw m1, [dst8q-(%1/2)] - pmovzxbw m2, [dst8q+strideq-(%1/2)] + pmovzxbw m1, [botq+strideq*0-(%1/2)] + pmovzxbw m2, [botq+strideq*1-(%1/2)] movu [px+(%2+0)*32-%1], m1 movu [px+(%2+1)*32-%1], m2 jmp .bottom_done .bottom_no_right: - pmovzxbw m1, [dst8q-%1] - pmovzxbw m2, [dst8q+strideq-%1] + pmovzxbw m1, [botq+strideq*0-%1] + pmovzxbw m2, [botq+strideq*1-%1] movu [px+(%2+0)*32-%1*2], m1 movu [px+(%2+1)*32-%1*2], m2 %if %1 == 8 @@ -1425,8 +1419,8 @@ .bottom_no_left: test edgeb, 2 ; have_right jz .bottom_no_left_right - pmovzxbw m1, [dst8q] - pmovzxbw m2, [dst8q+strideq] + pmovzxbw m1, [botq+strideq*0] + pmovzxbw m2, [botq+strideq*1] mova [px+(%2+0)*32+0], m1 mova [px+(%2+1)*32+0], m2 movd [px+(%2+0)*32-4], xm14 @@ -1434,14 +1428,14 @@ jmp .bottom_done .bottom_no_left_right: %if %1 == 4 - movd xm1, [dst8q] - pinsrd xm1, [dst8q+strideq], 1 + movd xm1, [botq+strideq*0] + pinsrd xm1, [botq+strideq*1], 1 pmovzxbw xm1, xm1 movq [px+(%2+0)*32+0], xm1 movhps [px+(%2+1)*32+0], xm1 %else - pmovzxbw xm1, [dst8q] - pmovzxbw xm2, [dst8q+strideq] + pmovzxbw xm1, [botq+strideq*0] + pmovzxbw xm2, [botq+strideq*1] mova [px+(%2+0)*32+0], xm1 mova [px+(%2+1)*32+0], xm2 %endif @@ -1456,13 +1450,13 @@ .bottom_done: ; actual filter - INIT_YMM avx2 - DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero + INIT_YMM avx2 + DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero %undef edged ; register to shuffle values into after packing vbroadcasti128 m12, [shufb_lohi] - mov dampingd, r7m + mov dampingd, r8m xor zerod, zerod movifnidn prid, prim sub dampingd, 31 @@ -1481,13 +1475,13 @@ add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift - DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3 + DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3 lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3 + DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3 vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength and prid, 1 @@ -1520,10 +1514,10 @@ RET .border_pri_only: - DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3 + DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3 lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask - DEFINE_ARGS dst, stride, dir, table, pri, _, stride3 + DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3 vpbroadcastb m0, xm0 ; pri_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps @@ -1549,15 +1543,15 @@ RET .border_sec_only: - DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero + DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3 movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift - DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3 + DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3 lea tableq, [tap_table] vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask - DEFINE_ARGS dst, stride, dir, table, _, sec, stride3 + DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3 vpbroadcastb m1, xm1 ; sec_strength lea secq, [tableq+12] ; sec_taps BORDER_PREP_REGS %1, %2 diff -Nru dav1d-0.9.2/src/x86/cdef_avx512.asm dav1d-1.0.0/src/x86/cdef_avx512.asm --- dav1d-0.9.2/src/x86/cdef_avx512.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cdef_avx512.asm 2022-03-18 
14:31:56.006356000 +0000 @@ -26,7 +26,7 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%if HAVE_AVX512ICL && ARCH_X86_64 +%if ARCH_X86_64 %macro DUP4 1-* %rep %0 @@ -51,20 +51,24 @@ db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 - db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 + db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 -lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 - db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87 - db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119 -lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 +lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55 db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 - db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91 - db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123 +lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21 + db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25 + db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53 + db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57 +end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 +end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 + db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 + db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 + db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 @@ -79,13 +83,6 @@ dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 - times 16 db 0 ; realign (introduced by cdef_dirs) -end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 - db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 - db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 - db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 -end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 - db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 sec_tap: db 32, 32, 16, 16 pd_268435568: dd 268435568 @@ -93,9 +90,9 @@ SECTION .text %if WIN64 -DECLARE_REG_TMP 5, 6 +DECLARE_REG_TMP 4 %else -DECLARE_REG_TMP 8, 5 +DECLARE_REG_TMP 8 %endif ; lut: @@ -105,11 +102,11 @@ ; L2 L3 10 11 12 13 14 15 ; L4 L5 20 21 22 23 24 25 ; L6 L7 30 31 32 33 34 35 -; 4e 4f 40 41 42 43 44 45 -; 5e 5f 50 51 52 53 54 55 +; b0 b1 b2 b3 b4 b5 b6 b7 +; B0 B1 B2 B3 B4 B5 B6 B7 INIT_ZMM avx512icl -cglobal cdef_filter_4x4_8bpc, 4, 8, 13, dst, stride, left, top, \ 
+cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r7-edge_mask movq xmm0, [dstq+strideq*0] @@ -125,9 +122,8 @@ vinserti32x4 m0, [dstq+r2], 2 test r6b, 0x08 ; avoid buffer overread jz .main - lea r3, [dstq+strideq*4-4] - vinserti32x4 m1, [r3+strideq*0], 2 - vinserti32x4 m0, [r3+strideq*1], 3 + vinserti32x4 m1, [botq+strideq*0-4], 2 + vinserti32x4 m0, [botq+strideq*1-4], 3 .main: movifnidn prid, prim mov t0d, dirm @@ -152,7 +148,7 @@ vpbroadcastb m4, prid and prid, 1 vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift - movifnidn t1d, secm + movifnidn secd, secm vpbroadcastd m10, [base+pri_tap+priq*4] vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) @@ -160,7 +156,7 @@ vpdpbusd m0, m2, m10 ; sum %endmacro CDEF_FILTER_4x4_PRI - test t1d, t1d ; sec + test secd, secd jz .end_no_clip call .sec .end_clip: @@ -187,7 +183,7 @@ pminub m0, m2 jmp .end .sec_only: - movifnidn t1d, secm + movifnidn secd, secm call .sec .end_no_clip: vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) @@ -201,7 +197,7 @@ pextrd [dstq+r2 ], xm0, 3 RET .mask_edges_sec_only: - movifnidn t1d, secm + movifnidn secd, secm call .mask_edges_sec jmp .end_no_clip ALIGN function_align @@ -214,7 +210,7 @@ mova m1, m6 vpermb m1{k1}, m2, m5 CDEF_FILTER_4x4_PRI - test t1d, t1d + test secd, secd jz .end_no_clip call .mask_edges_sec jmp .end_clip @@ -238,11 +234,11 @@ vpbroadcastd m8, [base+sec_tap] vpcmpub k1, m6, m2, 6 psubb m4, m2, m6 - vpbroadcastb m12, t1d - lzcnt t1d, t1d + vpbroadcastb m12, secd + lzcnt secd, secd vpsubb m4{k1}, m6, m2 vpcmpub k2, m6, m3, 6 - vpbroadcastq m11, [r3+t1*8] + vpbroadcastq m11, [r3+secq*8] gf2p8affineqb m10, m4, m11, 0 psubb m5, m3, m6 mova m9, m8 @@ -267,10 +263,11 @@ ; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 ; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 ; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 -; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 -; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 +; L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 +; La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 -cglobal cdef_filter_4x8_8bpc, 4, 9, 22, dst, stride, left, top, pri, sec, dir, damping, edge +cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge %define base r8-edge_mask vpbroadcastd ym21, strided mov r6d, edgem @@ -284,9 +281,8 @@ movu m15, [base+lut_perm_4x8b] test r6b, 0x08 ; avoid buffer overread jz .main - lea r7, [dstq+strideq*8-2] - vinserti32x4 ym1, [r7+strideq*0], 1 - vinserti32x4 ym2, [r7+strideq*1], 1 + vinserti32x4 ym1, [botq+strideq*0-2], 1 + vinserti32x4 ym2, [botq+strideq*1-2], 1 .main: punpcklqdq ym1, ym2 vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ @@ -486,76 +482,74 @@ ret ; lut tl lut tr -; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td -; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD -; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b -; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b -; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b -; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b -; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b -; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; t0 t1 t2 t3 t4 t5 t6 t7 t4 t5 t6 t7 t8 t9 ta tb +; T0 T1 T2 T3 T4 T5 T6 T7 T4 T5 T6 T7 T8 T9 Ta Tb +; L0 L1 00 01 02 03 04 05 02 03 04 05 06 07 08 09 +; L2 L3 10 11 12 13 14 15 12 13 14 15 16 17 18 19 +; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 +; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 
38 39 +; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 +; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59 ; lut bl lut br -; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b -; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b -; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b -; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b -; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b -; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b -; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b -; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b +; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 +; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39 +; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 +; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59 +; Lc Ld 60 61 62 63 64 65 62 63 64 65 66 67 68 69 +; Le Lf 70 71 72 73 74 75 72 73 74 75 76 77 78 79 +; b0 b1 b2 b3 b4 b5 b6 b7 b4 b5 b6 b7 b8 b9 ba bb +; B0 B1 B2 B3 B4 B5 B6 B7 B4 B5 B6 B7 B8 B9 Ba Bb -cglobal cdef_filter_8x8_8bpc, 4, 11, 32, 4*64, dst, stride, left, top, \ +cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r8-edge_mask + movu xm16, [dstq+strideq*0] + pinsrd xm16, [leftq+4*0], 3 mov r6d, edgem - lea r10, [dstq+strideq*4-2] - movu xmm0, [topq+strideq*0-2] - movu xmm1, [dstq+strideq*2-2] - movu xmm2, [r10 +strideq*2 ] - lea r8, [edge_mask] + vinserti128 ym16, [dstq+strideq*1], 1 + lea r10, [dstq+strideq*4] + movu xm17, [dstq+strideq*2] + vinserti32x4 m16, [topq+strideq*0-2], 2 lea r9, [strideq*3] - pmovzxwq m10, [leftq-4] - vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 - vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 - vinserti32x4 ym2, ymm2, [r10 +r9 ], 1 - lea r7, [r10 +strideq*4 ] - pmovzxwq m11, [leftq+4] - vinserti32x4 m0, [dstq+strideq*0-2], 2 - vinserti32x4 m1, [r10 +strideq*0 ], 2 - mova m12, [base+lut_perm_8x8a] - movu m13, [base+lut_perm_8x8b] - vinserti32x4 m0, [dstq+strideq*1-2], 3 - vinserti32x4 m1, [r10 +strideq*1 ], 3 + pinsrd xm17, [leftq+4*1], 3 + vinserti32x4 m16, [topq+strideq*1-2], 3 ; 0 1 t T + lea r8, [edge_mask] + vinserti128 ym17, [dstq+r9 ], 1 + vpbroadcastd ym18, [leftq+4*2] + vpblendd ym17, ym18, 0x80 + movu xm18, [r10 +strideq*2] + vinserti32x4 m17, [r10 +strideq*0], 2 + pinsrd xm18, [leftq+4*3], 3 + vinserti32x4 m17, [r10 +strideq*1], 3 ; 2 3 4 5 + vinserti128 ym18, [r10 +r9 ], 1 test r6b, 0x08 ; avoid buffer overread jz .main - vinserti32x4 m2, [r7 +strideq*0], 2 - vinserti32x4 m2, [r7 +strideq*1], 3 + vinserti32x4 m18, [botq+strideq*0-2], 2 + vinserti32x4 m18, [botq+strideq*1-2], 3 ; 6 7 b B .main: - mov t1d, 0x11111100 - mova m14, m12 - mova m15, m13 - kmovd k1, t1d - kshiftrd k2, k1, 8 + mova m0, [base+lut_perm_8x8a] + movu m1, [base+lut_perm_8x8b] + mova m30, [base+px_idx] + vpermb m16, m0, m16 movifnidn prid, prim + vpermb m17, m1, m17 mov t0d, dirm - mova m30, [base+px_idx] + vpermb m18, m0, m18 mov r3d, dampingm - vpermi2b m12, m0, m1 ; lut tl - vpermi2b m14, m1, m2 ; lut bl - vpermi2b m13, m0, m1 ; lut tr - vpermi2b m15, m1, m2 ; lut br - vpblendmw m12{k1}, m12, m10 - vpblendmw m14{k2}, m14, m11 + vshufi32x4 m12, m16, m17, q2020 ; lut tl + vshufi32x4 m13, m16, m17, q3131 ; lut tr + vshufi32x4 m14, m17, m18, q0220 ; lut bl + vshufi32x4 m15, m17, m18, q1331 ; lut br vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) pxor m31, m31 lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 vpermb m4, m30, m12 ; pxtl - vpermb m5, m30, m13 ; pxtr - vpermb m6, m30, m14 ; pxbl - vpermb m7, m30, m15 ; pxbr mova m1, m0 + vpermb m5, m30, m13 ; pxtr mova m2, m0 + 
vpermb m6, m30, m14 ; pxbl mova m3, m0 + vpermb m7, m30, m15 ; pxbr cmp r6b, 0x0f jne .mask_edges ; mask edges only if required test prid, prid @@ -659,7 +653,7 @@ pminub m14, m15 pmaxub m16, m17 pmaxub m18, m19 - mova m8, [base+end_perm_w8clip] + mova m8, [base+end_perm_clip] vpcmpw k2, m20, m31, 1 vpcmpw k3, m22, m31, 1 vpshldw m4, m21, 8 @@ -683,7 +677,6 @@ pmaxub m0, m11 pminub m0, m15 vpermb m0, m8, m0 - add r10, 2 vextracti32x4 xm1, m0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 @@ -714,7 +707,6 @@ vpermb m1, m8, m1 vpermb m2, m8, m2 vpermb m3, m8, m3 - add r10, 2 punpckldq m4, m0, m1 punpckhdq m0, m1 punpckldq m5, m2, m3 @@ -865,4 +857,4 @@ CDEF_FILTER_8x8_SEC m12, m13, m14, m15 ret -%endif ; HAVE_AVX512ICL && ARCH_X86_64 +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/cdef_init_tmpl.c dav1d-1.0.0/src/x86/cdef_init_tmpl.c --- dav1d-0.9.2/src/x86/cdef_init_tmpl.c 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cdef_init_tmpl.c 2022-03-18 14:31:56.006356000 +0000 @@ -80,7 +80,7 @@ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; -#if HAVE_AVX512ICL && BITDEPTH == 8 +#if BITDEPTH == 8 c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl); c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl); c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl); diff -Nru dav1d-0.9.2/src/x86/cdef_sse.asm dav1d-1.0.0/src/x86/cdef_sse.asm --- dav1d-0.9.2/src/x86/cdef_sse.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cdef_sse.asm 2022-03-18 14:31:56.006356000 +0000 @@ -250,20 +250,22 @@ %macro CDEF_FILTER 2 ; w, h %if ARCH_X86_64 -cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \ - dst, stride, left, top, pri, sec, edge, stride3, dst4 +cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \ + dst, stride, left, top, bot, pri, dst4, edge, \ + stride3 %define px rsp+3*16+2*32 %define base 0 %else cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ dst, stride, left, edge, stride3 %define topq r2 + %define botq r2 %define dst4q r2 LEA r5, tap_table %define px esp+7*16+2*32 %define base r5-tap_table %endif - mov edged, r8m + mov edged, r9m %if cpuflag(sse4) %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] %else @@ -442,24 +444,19 @@ .left_done: ; bottom - %if ARCH_X86_64 - DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3 - %else - DEFINE_ARGS dst, stride, dst8, edge, stride3 - %endif + movifnidn botq, r4mp test edgeb, 8 ; have_bottom jz .no_bottom - lea dst8q, [dstq+%2*strideq] test edgeb, 1 ; have_left jz .bottom_no_left test edgeb, 2 ; have_right jz .bottom_no_right %if %1 == 4 - PMOVZXBW m0, [dst8q-(%1/2)] - PMOVZXBW m1, [dst8q+strideq-(%1/2)] + PMOVZXBW m0, [botq+strideq*0-(%1/2)] + PMOVZXBW m1, [botq+strideq*1-(%1/2)] %else - movu m0, [dst8q-4] - movu m1, [dst8q+strideq-4] + movu m0, [botq+strideq*0-4] + movu m1, [botq+strideq*1-4] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 @@ -472,13 +469,13 @@ jmp .bottom_done .bottom_no_right: %if %1 == 4 - PMOVZXBW m0, [dst8q-4] - PMOVZXBW m1, [dst8q+strideq-4] + PMOVZXBW m0, [botq+strideq*0-4] + PMOVZXBW m1, [botq+strideq*1-4] movu [px+32*(%2+0)-8], m0 movu [px+32*(%2+1)-8], m1 %else - movu m0, [dst8q-8] - movu m1, [dst8q+strideq-8] + movu m0, [botq+strideq*0-8] + movu m1, [botq+strideq*1-8] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 @@ -496,11 +493,11 @@ test edgeb, 2 ; have_right jz .bottom_no_left_right %if %1 == 4 - PMOVZXBW m0, [dst8q] - PMOVZXBW m1, [dst8q+strideq] + PMOVZXBW m0, [botq+strideq*0] + PMOVZXBW m1, [botq+strideq*1] %else - movu m0, [dst8q] - 
movu m1, [dst8q+strideq] + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 @@ -514,8 +511,8 @@ movd [px+32*(%2+1)- 4], m6 jmp .bottom_done .bottom_no_left_right: - PMOVZXBW m0, [dst8q+strideq*0], %1 == 4 - PMOVZXBW m1, [dst8q+strideq*1], %1 == 4 + PMOVZXBW m0, [botq+strideq*0], %1 == 4 + PMOVZXBW m1, [botq+strideq*1], %1 == 4 mova [px+32*(%2+0)+ 0], m0 mova [px+32*(%2+1)+ 0], m1 movd [px+32*(%2+0)+%1*2], m6 @@ -534,7 +531,7 @@ ; actual filter %if ARCH_X86_64 - DEFINE_ARGS dst, stride, pridmp, damping, pri, sec + DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec mova m13, [shufb_lohi] %if cpuflag(ssse3) mova m15, [pw_2048] @@ -555,17 +552,17 @@ %xdefine m15 [base+pw_8] %endif %endif - movifnidn prid, r4m - movifnidn secd, r5m - mov dampingd, r7m + movifnidn prid, r5m + movifnidn secd, r6m + mov dampingd, r8m movif32 [esp+0x3C], r1d test prid, prid jz .sec_only - movd m1, prim + movd m1, r5m bsr pridmpd, prid test secd, secd jz .pri_only - movd m10, r5m + movd m10, r6m tzcnt secd, secd and prid, 1 sub pridmpd, dampingd @@ -578,13 +575,13 @@ PSHUFB_0 m1, m7 PSHUFB_0 m10, m7 %if ARCH_X86_64 - DEFINE_ARGS dst, stride, pridmp, tap, pri, sec + DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec lea tapq, [tap_table] MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask mov [rsp+0x00], pridmpq ; pri_shift mov [rsp+0x10], secq ; sec_shift - DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h + DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off %else MOVDDUP m2, [tapq+pridmpq*8] MOVDDUP m3, [tapq+secq*8] @@ -601,7 +598,7 @@ mova [esp+0x20], m1 mova [esp+0x50], m10 %endif - mov dird, r6m + mov dird, r7m lea stkq, [px] lea priq, [tapq+8*8+priq*8] ; pri_taps mov hd, %1*%2/8 @@ -643,7 +640,7 @@ .pri_only: %if ARCH_X86_64 - DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero + DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap lea tapq, [tap_table] %else DEFINE_ARGS dst, pridmp, zero, damping, pri, tap %endif @@ -657,12 +654,12 @@ MOVDDUP m7, [tapq+dampingq*8] mov [rsp+0x00], dampingq %if ARCH_X86_64 - DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h + DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off %else mov [rsp+0x04], zerod DEFINE_ARGS dst, stride, dir, stk, pri, tap, h %endif - mov dird, r6m + mov dird, r7m lea stkq, [px] lea priq, [tapq+8*8+priq*8] mov hd, %1*%2/8 @@ -691,13 +688,13 @@ .sec_only: %if ARCH_X86_64 - DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero + DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec %else DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero %endif - movd m1, r5m + movd m1, r6m tzcnt secd, secd - mov dird, r6m + mov dird, r7m xor zerod, zerod sub dampingd, secd cmovs dampingd, zerod @@ -711,7 +708,7 @@ MOVDDUP m7, [tapq+dampingq*8] lea dirq, [tapq+dirq*2] %if ARCH_X86_64 - DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h + DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k %else DEFINE_ARGS dst, stride, off, stk, dir, tap, h %endif diff -Nru dav1d-0.9.2/src/x86/cpu.c dav1d-1.0.0/src/x86/cpu.c --- dav1d-0.9.2/src/x86/cpu.c 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cpu.c 2022-03-18 14:31:56.006356000 +0000 @@ -28,13 +28,14 @@ #include "config.h" #include <stdint.h> +#include <string.h> #include "common/attributes.h" #include "src/x86/cpu.h" typedef struct { - uint32_t eax, ebx, ecx, edx; + uint32_t eax, ebx, edx, ecx; } CpuidRegisters; void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf); @@ -43,13 +44,22 @@ #define
X(reg, mask) (((reg) & (mask)) == (mask)) COLD unsigned dav1d_get_cpu_flags_x86(void) { - CpuidRegisters r = { 0 }; - dav1d_cpu_cpuid(&r, 0, 0); - const unsigned max_leaf = r.eax; + union { + CpuidRegisters r; + struct { + uint32_t max_leaf; + char vendor[12]; + }; + } cpu; + dav1d_cpu_cpuid(&cpu.r, 0, 0); unsigned flags = 0; - if (max_leaf >= 1) { + if (cpu.max_leaf >= 1) { + CpuidRegisters r; dav1d_cpu_cpuid(&r, 1, 0); + const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0); + const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff); + if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ { flags |= DAV1D_X86_CPU_FLAG_SSE2; if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ { @@ -63,7 +73,7 @@ if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ { const uint64_t xcr0 = dav1d_cpu_xgetbv(0); if (X(xcr0, 0x00000006)) /* XMM/YMM */ { - if (max_leaf >= 7) { + if (cpu.max_leaf >= 7) { dav1d_cpu_cpuid(&r, 7, 0); if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ { flags |= DAV1D_X86_CPU_FLAG_AVX2; @@ -76,6 +86,14 @@ } } #endif + if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) { + if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 || + (family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60))))) + { + /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */ + flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER; + } + } } return flags; diff -Nru dav1d-0.9.2/src/x86/cpu.h dav1d-1.0.0/src/x86/cpu.h --- dav1d-0.9.2/src/x86/cpu.h 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cpu.h 2022-03-18 14:31:56.006356000 +0000 @@ -29,12 +29,14 @@ #define DAV1D_SRC_X86_CPU_H enum CpuFlags { - DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0, - DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1, - DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2, - DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3, - DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/ - * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */ + DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0, + DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1, + DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2, + DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3, + DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/ + * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */ + DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough + * to cause performance regressions. */ }; unsigned dav1d_get_cpu_flags_x86(void); diff -Nru dav1d-0.9.2/src/x86/cpuid.asm dav1d-1.0.0/src/x86/cpuid.asm --- dav1d-0.9.2/src/x86/cpuid.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/cpuid.asm 2022-03-18 14:31:56.006356000 +0000 @@ -38,8 +38,8 @@ cpuid mov [r4+4*0], eax mov [r4+4*1], ebx - mov [r4+4*2], ecx - mov [r4+4*3], edx + mov [r4+4*2], edx + mov [r4+4*3], ecx %if ARCH_X86_64 mov rbx, r5 %endif diff -Nru dav1d-0.9.2/src/x86/film_grain16_avx2.asm dav1d-1.0.0/src/x86/film_grain16_avx2.asm --- dav1d-0.9.2/src/x86/film_grain16_avx2.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/film_grain16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,2375 +0,0 @@ -; Copyright © 2021, VideoLAN and dav1d authors -; Copyright © 2021, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. 
Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "config.asm" -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 32 -pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 -rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 -pw_seed_xor: times 2 dw 0xb524 - times 2 dw 0x49d8 -pd_16: dd 16 -pd_m65536: dd ~0xffff -pb_1: times 4 db 1 -hmul_bits: dw 32768, 16384, 8192, 4096 -round: dw 2048, 1024, 512 -mul_bits: dw 256, 128, 64, 32, 16 -round_vals: dw 32, 64, 128, 256, 512, 1024 -max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 -min: dw 0, 16*4, 16*16 -pw_27_17_17_27: dw 27, 17, 17, 27 -; these two should be next to each other -pw_4: times 2 dw 4 -pw_16: times 2 dw 16 -pw_23_22: dw 23, 22, 0, 32 - -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) - %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base - %rotate 1 - %endrep -%endmacro - -JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 - -struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - -SECTION .text - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) - -INIT_YMM avx2 -cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax - lea r4, [pb_mask] -%define base r4-pb_mask - movq xm1, [base+rnd_next_upperbit_mask] - movq xm4, [base+mul_bits] - movq xm7, [base+hmul_bits] - mov r3d, [fg_dataq+FGData.grain_scale_shift] - lea r6d, [bdmaxq+1] - shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc - sub r3, r6 - vpbroadcastw xm8, [base+round+r3*2-2] - mova xm5, [base+pb_mask] - vpbroadcastw xm0, [fg_dataq+FGData.seed] - vpbroadcastd xm9, [base+pd_m65536] - mov r3, -73*82*2 - sub bufq, r3 - lea r6, [gaussian_sequence] -.loop: - pand xm2, xm0, xm1 - psrlw xm3, xm2, 10 - por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw xm2, xm4 ; bits 0x0f00 are set - pshufb 
xm2, xm5, xm2 ; set 15th bit for next 4 seeds - psllq xm6, xm2, 30 - por xm2, xm6 - psllq xm6, xm2, 15 - por xm2, xm6 ; aggregate each bit into next seed's high bit - pmulhuw xm3, xm0, xm7 - por xm2, xm3 ; 4 next output seeds - pshuflw xm0, xm2, q3333 - psrlw xm2, 5 - pmovzxwd xm3, xm2 - mova xm6, xm9 - vpgatherdd xm2, [r6+xm3*2], xm6 - pandn xm2, xm9, xm2 - packusdw xm2, xm2 - paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 - ; shifts by 0, which pmulhrsw does not support - pmulhrsw xm2, xm8 - movq [bufq+r3], xm2 - add r3, 4*2 - jl .loop - - ; auto-regression code - movsxd r3, [fg_dataq+FGData.ar_coeff_lag] - movsxd r3, [base+generate_grain_y_16bpc_avx2_table+r3*4] - lea r3, [r3+base+generate_grain_y_16bpc_avx2_table] - jmp r3 - -.ar1: - DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] - movd xm4, [fg_dataq+FGData.ar_coeffs_y] - DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 - pinsrb xm4, [pb_1], 3 - pmovsxbw xm4, xm4 - pshufd xm5, xm4, q1111 - pshufd xm4, xm4, q0000 - vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd - sub bufq, 2*(82*73-(82*3+79)) - mov hd, 70 - sar maxd, 1 - mov mind, maxd - xor mind, -1 -.y_loop_ar1: - mov xq, -76 - movsx val3d, word [bufq+xq*2-2] -.x_loop_ar1: - movu xm0, [bufq+xq*2-82*2-2] ; top/left - psrldq xm2, xm0, 2 ; top - psrldq xm1, xm0, 4 ; top/right - punpcklwd xm0, xm2 - punpcklwd xm1, xm3 - pmaddwd xm0, xm4 - pmaddwd xm1, xm5 - paddd xm0, xm1 -.x_loop_ar1_inner: - movd val0d, xm0 - psrldq xm0, 4 - imul val3d, cf3d - add val3d, val0d - sarx val3d, val3d, shiftd - movsx val0d, word [bufq+xq*2] - add val3d, val0d - cmp val3d, maxd - cmovg val3d, maxd - cmp val3d, mind - cmovl val3d, mind - mov word [bufq+xq*2], val3w - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82*2 - dec hd - jg .y_loop_ar1 -.ar0: - RET - -.ar2: - DEFINE_ARGS buf, fg_data, bdmax, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - movq xm8, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 - vinserti128 m8, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 - pxor m9, m9 - punpcklwd xm14, xm9 - pcmpgtb m9, m8 - punpcklbw m8, m9 ; cf5-11,0-4 - vpermq m9, m8, q3333 ; cf4 - psrldq xm10, xm8, 6 ; cf8-11 - vpblendw xm9, xm10, 11111110b ; cf4,9-11 - pshufd m12, m8, q0000 ; cf[5,6], cf[0-1] - pshufd m11, m8, q1111 ; cf[7,8], cf[2-3] - pshufd xm13, xm9, q1111 ; cf[10,11] - pshufd xm10, xm9, q0000 ; cf[4,9] - sar bdmaxd, 1 - movd xm15, bdmaxd - pcmpeqd xm7, xm7 - vpbroadcastd xm15, xm15 ; max_grain - pxor xm7, xm15 ; min_grain - sub bufq, 2*(82*73-(82*3+79)) - DEFINE_ARGS buf, fg_data, h, x - mov hd, 70 -.y_loop_ar2: - mov xq, -76 - -.x_loop_ar2: - movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] - vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] - psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] - psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] - psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] - - vextracti128 xm4, m0, 1 ; y=-2,x=[-2,+5] - punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] - punpckhwd xm4, xm0 ; y=-2/-1 interleaved, x=[+2,+5] - punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] - - pmaddwd m2, m11 - pmaddwd m0, m12 - pmaddwd xm4, xm10 - - paddd m0, m2 - vextracti128 xm2, m0, 1 - paddd xm4, xm0 - paddd xm2, xm14 - paddd xm2, xm4 - - movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] - pshufd xm4, xm0, q3321 - pmovsxwd xm4, 
xm4 ; in dwords, y=0,x=[0,3] -.x_loop_ar2_inner: - pmaddwd xm3, xm0, xm13 - paddd xm3, xm2 - psrldq xm2, 4 ; shift top to next pixel - psrad xm3, [fg_dataq+FGData.ar_coeff_shift] - ; skip packssdw because we only care about one value - paddd xm3, xm4 - pminsd xm3, xm15 - pmaxsd xm3, xm7 - pextrw [bufq+xq*2], xm3, 0 - psrldq xm4, 4 - pslldq xm3, 2 - psrldq xm0, 2 - vpblendw xm0, xm3, 0010b - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82*2 - dec hd - jg .y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, fg_data, bdmax, shift -%if WIN64 - mov r6, rsp - and rsp, ~31 - sub rsp, 64 - %define tmp rsp -%elif STACK_ALIGNMENT < 32 - mov r6, rsp - and r6, ~31 - %define tmp r6-64 -%else - %define tmp rsp+stack_offset-88 -%endif - sar bdmaxd, 1 - movd xm15, bdmaxd - pcmpeqd xm13, xm13 - vpbroadcastd xm15, xm15 ; max_grain - pxor xm13, xm15 ; min_grain - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - vpbroadcastw m14, [base+round_vals+shiftq*2-12] - movq xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 - movd xm1, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 - pinsrb xm0, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 - pinsrb xm1, [pb_1], 3 ; cf14-16,pb_1 - movd xm2, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 - vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 - vinserti128 m1, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 - punpcklbw m0, m0 ; sign-extension - punpcklbw m1, m1 ; sign-extension - punpcklbw xm2, xm2 - REPX {psraw x, 8}, m0, m1, xm2 - - pshufd m8, m0, q0000 ; cf[0,1] | cf[7,8] - pshufd m9, m0, q1111 ; cf[2,3] | cf[9,10] - pshufd m10, m0, q2222 ; cf[4,5] | cf[11,12] - pshufd xm11, xm0, q3333 ; cf[6,13] - - pshufd m3, m1, q0000 ; cf[14,15] | cf[17,18] - pshufd m4, m1, q1111 ; cf[16],pw_1 | cf[19,20] - mova [tmp+0*32], m3 - mova [tmp+1*32], m4 - - paddw xm5, xm14, xm14 - vpblendw xm12, xm2, xm5, 00001000b - - DEFINE_ARGS buf, fg_data, h, x - sub bufq, 2*(82*73-(82*3+79)) - mov hd, 70 -.y_loop_ar3: - mov xq, -76 - -.x_loop_ar3: - movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] - movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] - movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] - vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] - vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] - vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] - - palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] - palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] - punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] - punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] - palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] - vextracti128 xm7, m1, 1 - punpcklwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] - - psrldq m3, m2, 2 - psrldq m4, m2, 4 - psrldq m7, m2, 6 - vpblendd m7, m14, 00001111b ; rounding constant - punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] - ; x=[+0/+1,+1/+2,+2/+3,+3/+4] - punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] - ; x=[+2/+3,+3/+4,+4/+5,+5,+6] - - pmaddwd m0, m8 - pmaddwd m6, m9 - pmaddwd m5, m10 - pmaddwd xm1, xm11 - pmaddwd m2, [tmp+0*32] - pmaddwd m4, [tmp+1*32] - - paddd m0, m6 - paddd m5, m2 - paddd m0, m4 - paddd m0, m5 - vextracti128 xm4, m0, 1 - paddd xm0, xm1 - paddd xm0, xm4 - - movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmaddwd xm2, xm1, xm12 - pshufd xm3, xm2, q1111 - paddd xm2, xm3 ; left+cur - paddd xm2, xm0 ; add top - psrldq xm0, 4 - psrad xm2, [fg_dataq+FGData.ar_coeff_shift] - ; skip packssdw because we only care about one value - pminsd xm2, 
xm15 - pmaxsd xm2, xm13 - pextrw [bufq+xq*2], xm2, 0 - pslldq xm2, 4 - psrldq xm1, 2 - vpblendw xm1, xm2, 0100b - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82*2 - dec hd - jg .y_loop_ar3 -%if WIN64 - mov rsp, r6 -%endif - RET - -%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y -INIT_XMM avx2 -cglobal generate_grain_uv_%1_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax -%define base r8-pb_mask - lea r8, [pb_mask] - movifnidn bdmaxd, bdmaxm - movq xm1, [base+rnd_next_upperbit_mask] - movq xm4, [base+mul_bits] - movq xm7, [base+hmul_bits] - mov r5d, [fg_dataq+FGData.grain_scale_shift] - lea r6d, [bdmaxq+1] - shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc - sub r5, r6 - vpbroadcastw xm8, [base+round+r5*2-2] - mova xm5, [base+pb_mask] - vpbroadcastw xm0, [fg_dataq+FGData.seed] - vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] - pxor xm0, xm9 - vpbroadcastd xm9, [base+pd_m65536] - lea r6, [gaussian_sequence] -%if %2 - mov r7d, 73-35*%3 - add bufq, 44*2 -.loop_y: - mov r5, -44 -%else - mov r5, -82*73 - add bufq, 2*82*73 -%endif -.loop_x: - pand xm2, xm0, xm1 - psrlw xm3, xm2, 10 - por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw xm2, xm4 ; bits 0x0f00 are set - pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds - psllq xm6, xm2, 30 - por xm2, xm6 - psllq xm6, xm2, 15 - por xm2, xm6 ; aggregate each bit into next seed's high bit - pmulhuw xm3, xm0, xm7 - por xm2, xm3 ; 4 next output seeds - pshuflw xm0, xm2, q3333 - psrlw xm2, 5 - pmovzxwd xm3, xm2 - mova xm6, xm9 - vpgatherdd xm2, [r6+xm3*2], xm6 - pandn xm2, xm9, xm2 - packusdw xm2, xm2 - paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 - ; shifts by 0, which pmulhrsw does not support - pmulhrsw xm2, xm8 - movq [bufq+r5*2], xm2 - add r5, 4 - jl .loop_x -%if %2 - add bufq, 82*2 - dec r7d - jg .loop_y -%endif - - ; auto-regression code - movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_16bpc_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_16bpc_avx2_table] - jmp r5 - -.ar0: - INIT_YMM avx2 - DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] - vpbroadcastw m3, [base+hmul_bits+shiftq*2-10] - sar bdmaxd, 1 - movd xm14, bdmaxd - pcmpeqw m7, m7 - vpbroadcastw m14, xm14 ; max_gain - pxor m7, m14 ; min_grain - DEFINE_ARGS buf, bufy, h, x - pmovsxbw xm4, xm4 -%if %2 - vpbroadcastw m6, [hmul_bits+2+%3*2] -%endif - vpbroadcastw m4, xm4 - pxor m5, m5 -%if %2 - sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) -%else - sub bufq, 2*(82*70-3) -%endif - add bufyq, 2*(3+82*3) - mov hd, 70-35*%3 -.y_loop_ar0: -%if %2 - ; first 32 pixels - movu xm8, [bufyq] - movu xm10, [bufyq+ 16] -%if %3 - movu xm9, [bufyq+82*2] - movu xm11, [bufyq+82*2+16] -%endif - vinserti128 m8, [bufyq+ 32], 1 - vinserti128 m10, [bufyq+ 48], 1 -%if %3 - vinserti128 m9, [bufyq+82*2+32], 1 - vinserti128 m11, [bufyq+82*2+48], 1 - paddw m8, m9 - paddw m10, m11 -%endif - phaddw m8, m10 - movu xm10, [bufyq+ 64] - movu xm12, [bufyq+ 80] -%if %3 - movu xm11, [bufyq+82*2+64] - movu xm13, [bufyq+82*2+80] -%endif - vinserti128 m10, [bufyq+ 96], 1 - vinserti128 m12, [bufyq+ 112], 1 -%if %3 - vinserti128 m11, [bufyq+82*2+96], 1 - vinserti128 m13, [bufyq+82*2+112], 1 - paddw m10, m11 - paddw m12, m13 -%endif - phaddw m10, m12 - pmulhrsw m8, m6 - pmulhrsw m10, m6 -%else - xor xd, xd -.x_loop_ar0: - movu m8, [bufyq+xq*2] - movu m10, [bufyq+xq*2+32] -%endif - punpckhwd m9, m8, m5 - 
punpcklwd m8, m5 - punpckhwd m11, m10, m5 - punpcklwd m10, m5 - REPX {pmaddwd x, m4}, m8, m9, m10, m11 - REPX {psrad x, 5}, m8, m9, m10, m11 - packssdw m8, m9 - packssdw m10, m11 - REPX {pmulhrsw x, m3}, m8, m10 -%if %2 - paddw m8, [bufq+ 0] - paddw m10, [bufq+32] -%else - paddw m8, [bufq+xq*2+ 0] - paddw m10, [bufq+xq*2+32] -%endif - pminsw m8, m14 - pminsw m10, m14 - pmaxsw m8, m7 - pmaxsw m10, m7 -%if %2 - movu [bufq+ 0], m8 - movu [bufq+32], m10 - - ; last 6 pixels - movu xm8, [bufyq+32*4] - movu xm10, [bufyq+32*4+16] -%if %3 - paddw xm8, [bufyq+32*4+82*2] - paddw xm10, [bufyq+32*4+82*2+16] -%endif - phaddw xm8, xm10 - pmulhrsw xm8, xm6 - punpckhwd xm9, xm8, xm5 - punpcklwd xm8, xm5 - REPX {pmaddwd x, xm4}, xm8, xm9 - REPX {psrad x, 5}, xm8, xm9 - packssdw xm8, xm9 - pmulhrsw xm8, xm3 - movu xm0, [bufq+32*2] - paddw xm8, xm0 - pminsw xm8, xm14 - pmaxsw xm8, xm7 - vpblendw xm0, xm8, xm0, 11000000b - movu [bufq+32*2], xm0 -%else - movu [bufq+xq*2+ 0], m8 - movu [bufq+xq*2+32], m10 - add xd, 32 - cmp xd, 64 - jl .x_loop_ar0 - - ; last 12 pixels - movu m8, [bufyq+64*2] - punpckhwd m9, m8, m5 - punpcklwd m8, m5 - REPX {pmaddwd x, m4}, m8, m9 - REPX {psrad x, 5}, m8, m9 - packssdw m8, m9 - pmulhrsw m8, m3 - movu m0, [bufq+64*2] - paddw m8, m0 - pminsw m8, m14 - pmaxsw m8, m7 - vpblendd m0, m8, m0, 11000000b - movu [bufq+64*2], m0 -%endif - - add bufq, 82*2 - add bufyq, 82*2<<%3 - dec hd - jg .y_loop_ar0 - RET - -.ar1: - INIT_XMM avx2 - DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] - movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] - pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 - DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift - pmovsxbw xm4, xm4 - pshufd xm5, xm4, q1111 - pshufd xm4, xm4, q0000 - pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd - vpbroadcastw xm6, [hmul_bits+2+%3*2] - vpbroadcastd xm3, xm3 -%if %2 - sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) -%else - sub bufq, 2*(82*69+3) -%endif - add bufyq, 2*(79+82*3) - mov hd, 70-35*%3 - sar maxd, 1 - mov mind, maxd - xor mind, -1 -.y_loop_ar1: - mov xq, -(76>>%2) - movsx val3d, word [bufq+xq*2-2] -.x_loop_ar1: - movu xm0, [bufq+xq*2-82*2-2] ; top/left -%if %2 - movu xm8, [bufyq+xq*4] -%else - movq xm8, [bufyq+xq*2] -%endif - psrldq xm2, xm0, 2 ; top - psrldq xm1, xm0, 4 ; top/right -%if %2 -%if %3 - phaddw xm8, [bufyq+xq*4+82*2] - pshufd xm9, xm8, q3232 - paddw xm8, xm9 -%else - phaddw xm8, xm8 -%endif - pmulhrsw xm8, xm6 -%endif - punpcklwd xm0, xm2 - punpcklwd xm1, xm8 - pmaddwd xm0, xm4 - pmaddwd xm1, xm5 - paddd xm0, xm1 - paddd xm0, xm3 -.x_loop_ar1_inner: - movd val0d, xm0 - psrldq xm0, 4 - imul val3d, cf3d - add val3d, val0d - sarx val3d, val3d, shiftd - movsx val0d, word [bufq+xq*2] - add val3d, val0d - cmp val3d, maxd - cmovg val3d, maxd - cmp val3d, mind - cmovl val3d, mind - mov word [bufq+xq*2], val3w - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82*2 - add bufyq, 82*2<<%3 - dec hd - jg .y_loop_ar1 - RET - - INIT_YMM avx2 -.ar2: - DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - sar bdmaxd, 1 - movd xm6, bdmaxd - pcmpeqd xm5, xm5 - vpbroadcastd xm6, xm6 ; max_grain - pxor xm5, xm6 ; min_grain -%if %2 - vpbroadcastw xm7, [base+hmul_bits+2+%3*2] -%endif - vpbroadcastw xm15, 
[base+round_vals-12+shiftq*2] - - movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+5] - pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 - pinsrb xm0, [pb_1], 5 - pinsrw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 - movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] - pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+9], 13 - pmovsxbw m0, xm0 - - pshufd xm13, xm0, q3333 - pshufd m12, m0, q0000 - pshufd m11, m0, q1111 - pshufd m10, m0, q2222 - - DEFINE_ARGS buf, bufy, fg_data, h, x -%if %2 - sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) -%else - sub bufq, 2*(82*69+3) -%endif - add bufyq, 2*(79+82*3) - mov hd, 70-35*%3 -.y_loop_ar2: - mov xq, -(76>>%2) - -.x_loop_ar2: - movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] - vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] - psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] - psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] - psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] - -%if %2 - movu xm8, [bufyq+xq*4] -%if %3 - paddw xm8, [bufyq+xq*4+82*2] -%endif - phaddw xm8, xm8 -%else - movq xm8, [bufyq+xq*2] -%endif - - vinserti128 m4, xm0, 1 ; y=-1,x=[-2,+5] - punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] - punpckhwd m4, m0, m4 ; y=-2/-1 interleaved, x=[+2,+5] - punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] - -%if %2 - pmulhrsw xm1, xm8, xm7 - punpcklwd xm1, xm15 ; luma, round interleaved -%else - punpcklwd xm1, xm8, xm15 -%endif - vpblendd m1, m1, m4, 11110000b - - pmaddwd m2, m11 - pmaddwd m0, m12 - pmaddwd m1, m10 - paddd m2, m0 - paddd m2, m1 - vextracti128 xm0, m2, 1 - paddd xm2, xm0 - - movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] - pshufd xm4, xm0, q3321 - pmovsxwd xm4, xm4 ; y=0,x=[0,3] in dword -.x_loop_ar2_inner: - pmaddwd xm3, xm0, xm13 - paddd xm3, xm2 - psrldq xm2, 4 ; shift top to next pixel - psrad xm3, [fg_dataq+FGData.ar_coeff_shift] - ; we do not need to packssdw since we only care about one value - paddd xm3, xm4 - pminsd xm3, xm6 - pmaxsd xm3, xm5 - pextrw [bufq+xq*2], xm3, 0 - psrldq xm0, 2 - pslldq xm3, 2 - psrldq xm4, 4 - vpblendw xm0, xm3, 00000010b - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82*2 - add bufyq, 82*2<<%3 - dec hd - jg .y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift -%if WIN64 - mov r6, rsp - and rsp, ~31 - sub rsp, 96 - %define tmp rsp -%elif STACK_ALIGNMENT < 32 - mov r6, rsp - and r6, ~31 - %define tmp r6-96 -%else - %define tmp rsp+stack_offset-120 -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - sar bdmaxd, 1 - movd xm15, bdmaxd - pcmpeqd xm13, xm13 - vpbroadcastd xm15, xm15 ; max_grain - pxor xm13, xm15 ; min_grain -%if %2 - vpbroadcastw xm12, [base+hmul_bits+2+%3*2] -%endif - - movq xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] - pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma - movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] - pmovsxbw m0, xm0 - - pshufd m11, m0, q3333 - pshufd m10, m0, q2222 - pshufd m9, m0, q1111 - pshufd m8, m0, q0000 - - movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] - pinsrb xm0, [pb_1], 3 - pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 - pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 - pmovsxbw m0, xm0 - - pshufd m1, m0, q0000 - pshufd m2, m0, q1111 - mova [tmp+32*2], m11 - pshufd xm11, xm0, q3232 - mova [tmp+32*0], m1 - mova [tmp+32*1], m2 - pinsrw xm11, [base+round_vals-10+shiftq*2], 3 - - DEFINE_ARGS buf, bufy, fg_data, h, unused, x -%if %2 - sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) -%else - sub bufq, 
2*(82*69+3) -%endif - add bufyq, 2*(79+82*3) - mov hd, 70-35*%3 -.y_loop_ar3: - mov xq, -(76>>%2) - -.x_loop_ar3: - movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] - movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] - movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] - vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] - vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] - vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] - -%if %2 - movu xm7, [bufyq+xq*4] -%if %3 - paddw xm7, [bufyq+xq*4+82*2] -%endif - phaddw xm7, xm7 -%else - movq xm7, [bufyq+xq*2] -%endif - - palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] - palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] - punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] - punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] - palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] -%if %2 - pmulhrsw xm7, xm12 -%endif - punpcklwd m1, m7 - - psrldq m3, m2, 2 - psrldq m4, m2, 4 - psrldq m7, m2, 6 - vpblendd m7, m14, 00001111b ; rounding constant - punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] - ; x=[+0/+1,+1/+2,+2/+3,+3/+4] - punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] - ; x=[+2/+3,+3/+4,+4/+5,+5,+6] - - pmaddwd m0, m8 - pmaddwd m6, m9 - pmaddwd m5, m10 - pmaddwd m1, [tmp+32*2] - pmaddwd m2, [tmp+32*0] - pmaddwd m4, [tmp+32*1] - - paddd m0, m6 - paddd m5, m2 - paddd m4, m1 - paddd m0, m4 - paddd m0, m5 - vextracti128 xm4, m0, 1 - paddd xm0, xm4 - - movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmaddwd xm2, xm1, xm11 - pshufd xm3, xm2, q1111 - paddd xm2, xm3 ; left+cur - paddd xm2, xm0 ; add top - psrldq xm0, 4 - psrad xm2, [fg_dataq+FGData.ar_coeff_shift] - ; no need to packssdw since we only care about one value - pminsd xm2, xm15 - pmaxsd xm2, xm13 - pextrw [bufq+xq*2], xm2, 0 - pslldq xm2, 4 - psrldq xm1, 2 - vpblendw xm1, xm2, 00000100b - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82*2 - add bufyq, 82*2<<%3 - dec hd - jg .y_loop_ar3 -%if WIN64 - mov rsp, r6 -%endif - RET -%endmacro - -generate_grain_uv_fn 420, 1, 1 -generate_grain_uv_fn 422, 1, 0 -generate_grain_uv_fn 444, 0, 0 - -INIT_YMM avx2 -cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, grain_lut - mov r7d, [fg_dataq+FGData.scaling_shift] - lea r8, [pb_mask] -%define base r8-pb_mask - vpbroadcastw m11, [base+mul_bits+r7*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - mov r9d, r9m ; bdmax - sar r9d, 11 ; is_12bpc - shlx r10d, r6d, r9d - vpbroadcastw m13, [base+min+r10*2] - lea r9d, [r9d*3] - lea r9d, [r6d*2+r9d] - vpbroadcastw m12, [base+max+r9*2] - vpbroadcastw m10, r9m - pxor m2, m2 - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ - sby, see - - movifnidn sbyd, sbym - test sbyd, sbyd - setnz r7b - test r7b, byte [fg_dataq+FGData.overlap_flag] - jnz .vertical_overlap - - imul seed, sbyd, (173 << 24) | 37 - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused1, unused2, see, src_bak - - lea src_bakq, [srcq+wq*2] - neg wq - sub dstq, srcq - -.loop_x: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak - - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, 
[offyq+offxq*2+747] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y: - ; src - pminuw m0, m10, [srcq+ 0] - pminuw m1, m10, [srcq+32] ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - mova m9, m3 - vpgatherdd m8, [scalingq+m4-3], m3 - vpgatherdd m4, [scalingq+m5-3], m9 - pcmpeqw m3, m3 - mova m9, m3 - vpgatherdd m5, [scalingq+m6-3], m3 - vpgatherdd m6, [scalingq+m7-3], m9 - REPX {psrld x, 24}, m8, m4, m5, m6 - packssdw m8, m4 - packssdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m9, [grain_lutq+offxyq*2] - movu m3, [grain_lutq+offxyq*2+32] - - ; noise = round2(scaling[src] * grain, scaling_shift) - REPX {pmullw x, m11}, m8, m5 - pmulhrsw m9, m8 - pmulhrsw m3, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m9 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq+srcq+ 0], m0 - mova [dstq+srcq+32], m1 - - add srcq, strideq - add grain_lutq, 82*2 - dec hd - jg .loop_y - - add wq, 32 - jge .end - lea srcq, [src_bakq+wq*2] - cmp byte [fg_dataq+FGData.overlap_flag], 0 - je .loop_x - - ; r8m = sbym - movq xm15, [pw_27_17_17_27] - cmp dword r8m, 0 - jne .loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) - vpbroadcastd xm14, [pd_16] -.loop_x_h_overlap: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak, left_offxy - - lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak, left_offxy - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_h_overlap: - ; src - pminuw m0, m10, [srcq+ 0] - pminuw m1, m10, [srcq+32] ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - mova m9, m3 - vpgatherdd m8, [scalingq+m4-3], m3 - vpgatherdd m4, [scalingq+m5-3], m9 - pcmpeqw m3, m3 - mova m9, m3 - vpgatherdd m5, [scalingq+m6-3], m3 - vpgatherdd m6, [scalingq+m7-3], m9 - REPX {psrld x, 24}, m8, m4, m5, m6 - packssdw m8, m4 - packssdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m9, [grain_lutq+offxyq*2] - movd xm7, [grain_lutq+left_offxyq*2] - punpcklwd xm7, xm9 - pmaddwd xm7, xm15 - paddd xm7, xm14 - psrad xm7, 5 - packssdw xm7, xm7 - vpblendd m9, m7, 00000001b - pcmpeqw m3, m3 - psraw m7, m10, 1 ; max_grain - pxor m3, m7 ; min_grain - pminsw m9, m7 - pmaxsw m9, m3 - movu m3, [grain_lutq+offxyq*2+32] - - ; noise = round2(scaling[src] * grain, scaling_shift) - REPX {pmullw x, m11}, m8, m5 - pmulhrsw m9, m8 - pmulhrsw m3, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m9 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq+srcq+ 0], m0 - mova [dstq+srcq+32], m1 - - add srcq, strideq - add grain_lutq, 82*2 - dec hd - jg .loop_y_h_overlap - - add wq, 32 - jge .end - lea srcq, [src_bakq+wq*2] - - ; r8m = sbym - cmp dword r8m, 0 - jne .loop_x_hv_overlap - jmp .loop_x_h_overlap - -.end: - RET - -.vertical_overlap: - DEFINE_ARGS dst, src, 
stride, fg_data, w, scaling, grain_lut, unused1, \ - sby, see - - movzx sbyd, sbyb - imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, r7d - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused1, unused2, see, src_bak - - lea src_bakq, [srcq+wq*2] - neg wq - sub dstq, srcq - - vpbroadcastd m14, [pd_16] -.loop_x_v_overlap: - vpbroadcastd m15, [pw_27_17_17_27] - - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak, unused, top_offxy - - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak, unused, top_offxy - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_v_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq*2] - movu m7, [grain_lutq+top_offxyq*2] - punpckhwd m9, m7, m3 - punpcklwd m7, m3 - REPX {pmaddwd x, m15}, m9, m7 - REPX {paddd x, m14}, m9, m7 - REPX {psrad x, 5}, m9, m7 - packssdw m7, m9 - pcmpeqw m0, m0 - psraw m1, m10, 1 ; max_grain - pxor m0, m1 ; min_grain - pminsw m7, m1 - pmaxsw m7, m0 - movu m3, [grain_lutq+offxyq*2+32] - movu m8, [grain_lutq+top_offxyq*2+32] - punpckhwd m9, m8, m3 - punpcklwd m8, m3 - REPX {pmaddwd x, m15}, m9, m8 - REPX {paddd x, m14}, m9, m8 - REPX {psrad x, 5}, m9, m8 - packssdw m8, m9 - pminsw m8, m1 - pmaxsw m8, m0 - - ; src - pminuw m0, m10, [srcq+ 0] ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - - ; scaling[src] - pcmpeqw m3, m3 - mova m9, m3 - vpgatherdd m6, [scalingq+m4-3], m3 - vpgatherdd m4, [scalingq+m5-3], m9 - REPX {psrld x, 24}, m6, m4 - packssdw m6, m4 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m6, m11 - pmulhrsw m6, m7 - - ; same for the other half - pminuw m1, m10, [srcq+32] ; m0-1: src as word - punpckhwd m9, m1, m2 - punpcklwd m4, m1, m2 ; m4-7: src as dword - pcmpeqw m3, m3 - mova m7, m3 - vpgatherdd m5, [scalingq+m4-3], m3 - vpgatherdd m4, [scalingq+m9-3], m7 - REPX {psrld x, 24}, m5, m4 - packssdw m5, m4 - - pmullw m5, m11 - pmulhrsw m5, m8 - - ; dst = clip_pixel(src, noise) - paddw m0, m6 - paddw m1, m5 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq+srcq+ 0], m0 - mova [dstq+srcq+32], m1 - - vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line - add srcq, strideq - add grain_lutq, 82*2 - dec hw - jz .end_y_v_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_v_overlap - jmp .loop_y - -.end_y_v_overlap: - add wq, 32 - jge .end_hv - lea srcq, [src_bakq+wq*2] - - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; 
h+v overlap - - movq xm15, [pw_27_17_17_27] -.loop_x_hv_overlap: - vpbroadcastd m8, [pw_27_17_17_27] - - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy - - lea topleft_offxyq, [top_offxyq+32] - lea left_offxyq, [offyq+32] - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_hv_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq*2] - movu m0, [grain_lutq+offxyq*2+32] - movu m6, [grain_lutq+top_offxyq*2] - movu m1, [grain_lutq+top_offxyq*2+32] - movd xm4, [grain_lutq+left_offxyq*2] - movd xm7, [grain_lutq+topleft_offxyq*2] - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklwd xm4, xm3 - punpcklwd xm7, xm6 - REPX {pmaddwd x, xm15}, xm4, xm7 - REPX {paddd x, xm14}, xm4, xm7 - REPX {psrad x, 5}, xm4, xm7 - REPX {packssdw x, x}, xm4, xm7 - pcmpeqw m5, m5 - psraw m9, m10, 1 ; max_grain - pxor m5, m9 ; min_grain - REPX {pminsw x, xm9}, xm4, xm7 - REPX {pmaxsw x, xm5}, xm4, xm7 - vpblendd m3, m4, 00000001b - vpblendd m6, m7, 00000001b - ; followed by v interpolation (top | cur -> cur) - punpckhwd m7, m6, m3 - punpcklwd m6, m3 - punpckhwd m3, m1, m0 - punpcklwd m1, m0 - REPX {pmaddwd x, m8}, m7, m6, m3, m1 - REPX {paddd x, m14}, m7, m6, m3, m1 - REPX {psrad x, 5}, m7, m6, m3, m1 - packssdw m7, m6, m7 - packssdw m3, m1, m3 - REPX {pminsw x, m9}, m7, m3 - REPX {pmaxsw x, m5}, m7, m3 - - ; src - pminuw m0, m10, [srcq+ 0] - pminuw m1, m10, [srcq+32] ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - - ; scaling[src] - pcmpeqw m9, m9 - vpgatherdd m6, [scalingq+m4-3], m9 - pcmpeqw m9, m9 - vpgatherdd m4, [scalingq+m5-3], m9 - REPX {psrld x, 24}, m6, m4 - packssdw m6, m4 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m6, m11 - pmulhrsw m7, m6 - - ; other half - punpckhwd m5, m1, m2 - punpcklwd m4, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m6, m6 - vpgatherdd m9, [scalingq+m4-3], m6 - pcmpeqw m6, m6 - vpgatherdd m4, [scalingq+m5-3], m6 - REPX {psrld x, 24}, m9, m4 - packssdw m9, m4 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m9, m11 - pmulhrsw m3, m9 - - ; dst = clip_pixel(src, noise) - paddw m0, m7 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq+srcq+ 0], m0 - mova [dstq+srcq+32], m1 - - vpbroadcastd m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line - add srcq, strideq - add grain_lutq, 82*2 - dec hw - jz .end_y_hv_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_hv_overlap - jmp .loop_y_h_overlap - -.end_y_hv_overlap: - add wq, 32 - lea srcq, [src_bakq+wq*2] - jl .loop_x_hv_overlap - -.end_hv: - RET - -%macro 
FGUV_FN 3 ; name, ss_hor, ss_ver -cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, h, sby, luma, lstride, uv_pl, is_id -%define base r8-pb_mask - lea r8, [pb_mask] - mov r7d, [fg_dataq+FGData.scaling_shift] - vpbroadcastw m11, [base+mul_bits+r7*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - mov r9d, r13m ; bdmax - sar r9d, 11 ; is_12bpc - shlx r10d, r6d, r9d - vpbroadcastw m13, [base+min+r10*2] - lea r10d, [r9d*3] - mov r11d, is_idm - shlx r6d, r6d, r11d - add r10d, r6d - vpbroadcastw m12, [base+max+r10*2] - vpbroadcastw m10, r13m - pxor m2, m2 - - cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 - jne .csfl - -%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap - -%if %1 - mov r7d, r11m - vpbroadcastw m0, [fg_dataq+FGData.uv_mult+r7*4] - vpbroadcastw m1, [fg_dataq+FGData.uv_luma_mult+r7*4] - punpcklwd m14, m1, m0 - vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] - vpbroadcastd m9, [base+pw_4+r9*4] - pmullw m15, m9 -%else - vpbroadcastd m14, [pd_16] -%if %2 - vpbroadcastq m15, [pw_23_22] -%else - vpbroadcastq m15, [pw_27_17_17_27] -%endif -%endif - - movifnidn sbyd, sbym - test sbyd, sbyd - setnz r7b - test r7b, byte [fg_dataq+FGData.overlap_flag] - jnz %%vertical_overlap - - imul seed, sbyd, (173 << 24) | 37 - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused2, unused3, see, unused4, unused5, unused6, luma, lstride - - mov lumaq, r9mp - mov lstrideq, r10mp - lea r10, [srcq+wq*2] - lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*(2<<%2)] - mov r10mp, r10 - mov r11mp, r11 - mov r12mp, r12 - neg wq - -%%loop_x: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, unused1, unused2, unused3, luma, lstride - - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, unused1, unused2, unused3, luma, lstride - - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y: - ; src - mova m0, [srcq] -%if %2 - mova m1, [srcq+strideq] ; m0-1: src as word -%else - mova m1, [srcq+32] -%endif - - ; luma_src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm7, [lumaq+lstrideq*0+16] - vinserti128 m4, [lumaq+lstrideq*0+32], 1 - vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] - mova xm8, [lumaq+lstrideq*(1<<%3)+16] - vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 - vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 - phaddw m4, m7 - phaddw m6, m8 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m6, [lumaq+32] -%endif - -%if %1 - punpckhwd m3, m4, m0 - punpcklwd m4, m0 - punpckhwd m5, m6, m1 - punpcklwd m6, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m3, m4, m5, m6 - REPX {psrad x, 6}, m3, m4, m5, m6 - packssdw m4, m3 - packssdw m6, m5 - REPX {paddw x, m15}, m4, m6 - REPX {pmaxsw x, m2}, m4, m6 - REPX {pminsw x, m10}, m4, m6 ; clip_pixel() -%else - REPX {pminuw x, m10}, m4, m6 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - - ; scaling[luma_src] - pcmpeqw m3, m3 - mova 
m9, m3 - vpgatherdd m8, [scalingq+m4-3], m3 - vpgatherdd m4, [scalingq+m5-3], m9 - pcmpeqw m3, m3 - mova m9, m3 - vpgatherdd m5, [scalingq+m6-3], m3 - vpgatherdd m6, [scalingq+m7-3], m9 - REPX {psrld x, 24}, m8, m4, m5, m6 - packssdw m8, m4 - packssdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m9, [grain_lutq+offxyq*2] -%if %2 - movu m3, [grain_lutq+offxyq*2+82*2] -%else - movu m3, [grain_lutq+offxyq*2+32] -%endif - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - REPX {pmullw x, m11}, m8, m5 - pmulhrsw m9, m8 - pmulhrsw m3, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m9 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq], m0 -%if %2 - mova [dstq+strideq], m1 - - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - mova [dstq+32], m1 - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82*(2<<%2) -%if %2 - sub hb, 2 -%else - dec hb -%endif - jg %%loop_y - - add wq, 32>>%2 - jge %%end - mov srcq, r10mp - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] - cmp byte [fg_dataq+FGData.overlap_flag], 0 - je %%loop_x - - ; r8m = sbym - cmp dword r8m, 0 - jne %%loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -%%loop_x_h_overlap: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, unused1, unused2, luma, lstride - - lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, unused1, unused2, luma, lstride - - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y_h_overlap: - mova m0, [srcq] -%if %2 - mova m1, [srcq+strideq] - - ; luma_src - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm7, [lumaq+lstrideq*0+16] - vinserti128 m4, [lumaq+lstrideq*0+32], 1 - vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] - mova xm8, [lumaq+lstrideq*(1<<%3)+16] - vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 - vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 - phaddw m4, m7 - phaddw m6, m8 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m1, [srcq+32] - - ; luma_src - mova m4, [lumaq] - mova m6, [lumaq+32] -%endif - -%if %1 - punpckhwd m3, m4, m0 - punpcklwd m4, m0 - punpckhwd m5, m6, m1 - punpcklwd m6, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m3, m4, m5, m6 - REPX {psrad x, 6}, m3, m4, m5, m6 - packssdw m4, m3 - packssdw m6, m5 - REPX {paddw x, m15}, m4, m6 - REPX {pmaxsw x, m2}, m4, m6 - REPX {pminsw x, m10}, m4, m6 ; clip_pixel() -%else - REPX {pminuw x, m10}, m4, m6 -%endif - - ; grain = grain_lut[offy+y][offx+x] - movu m9, [grain_lutq+offxyq*2] -%if %2 - movu m3, [grain_lutq+offxyq*2+82*2] -%else - movu m3, [grain_lutq+offxyq*2+32] -%endif - movd xm5, [grain_lutq+left_offxyq*2+ 0] -%if %2 - pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} - punpckldq xm7, xm9, xm3 ; {cur0, cur1} - punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1} -%else - punpcklwd xm5, xm9 -%endif -%if %1 -%if %2 - vpbroadcastq xm8, [pw_23_22] -%else - movq xm8, [pw_27_17_17_27] -%endif - pmaddwd xm5, xm8 - vpbroadcastd 
xm8, [pd_16] - paddd xm5, xm8 -%else - pmaddwd xm5, xm15 - paddd xm5, xm14 -%endif - psrad xm5, 5 - packssdw xm5, xm5 - pcmpeqw xm8, xm8 - psraw xm7, xm10, 1 - pxor xm8, xm7 - pmaxsw xm5, xm8 - pminsw xm5, xm7 - vpblendd m9, m9, m5, 00000001b -%if %2 - psrldq xm5, 4 - vpblendd m3, m3, m5, 00000001b -%endif - - ; scaling[luma_src] - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - pcmpeqw m7, m7 - vpgatherdd m8, [scalingq+m4-3], m7 - pcmpeqw m7, m7 - vpgatherdd m4, [scalingq+m5-3], m7 - REPX {psrld x, 24}, m8, m4 - packssdw m8, m4 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m8, m11 - pmulhrsw m9, m8 - - ; same for the other half - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - pcmpeqw m8, m8 - mova m4, m8 - vpgatherdd m5, [scalingq+m6-3], m8 - vpgatherdd m6, [scalingq+m7-3], m4 - REPX {psrld x, 24}, m5, m6 - packssdw m5, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m5, m11 - pmulhrsw m3, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m9 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq], m0 -%if %2 - mova [dstq+strideq], m1 - - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - mova [dstq+32], m1 - - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - - add grain_lutq, 82*(2<<%2) -%if %2 - sub hb, 2 -%else - dec hb -%endif - jg %%loop_y_h_overlap - - add wq, 32>>%2 - jge %%end - mov srcq, r10mp - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] - - ; r8m = sbym - cmp dword r8m, 0 - jne %%loop_x_hv_overlap - jmp %%loop_x_h_overlap - -%%end: - RET - -%%vertical_overlap: - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ - sby, see, unused1, unused2, unused3, lstride - - movzx sbyd, sbyb - imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, r7d - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused1, unused2, see, unused3, unused4, unused5, luma, lstride - - mov lumaq, r9mp - mov lstrideq, r10mp - lea r10, [srcq+wq*2] - lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*(2<<%2)] - mov r10mp, r10 - mov r11mp, r11 - mov r12mp, r12 - neg wq - -%%loop_x_v_overlap: - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, unused1, top_offxy, unused2, luma, lstride - - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, unused1, top_offxy, unused2, luma, lstride - - movzx top_offxyd, offxyw - shr offxyd, 16 - -%if %2 == 0 - lea r10, [pw_27_17_17_27] -%endif - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y_v_overlap: - ; src - mova m0, [srcq] -%if %2 - mova 
m1, [srcq+strideq] - - ; luma_src - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm7, [lumaq+lstrideq*0+16] - vinserti128 m4, [lumaq+lstrideq*0+32], 1 - vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] - mova xm8, [lumaq+lstrideq*(1<<%3)+16] - vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 - vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 - phaddw m4, m7 - phaddw m6, m8 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m1, [srcq+32] - - ; luma_src - mova m4, [lumaq] - mova m6, [lumaq+32] -%endif - -%if %1 - punpckhwd m3, m4, m0 - punpcklwd m4, m0 - punpckhwd m5, m6, m1 - punpcklwd m6, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m3, m4, m5, m6 - REPX {psrad x, 6}, m3, m4, m5, m6 - packssdw m4, m3 - packssdw m6, m5 - REPX {paddw x, m15}, m4, m6 - REPX {pmaxsw x, m2}, m4, m6 - REPX {pminsw x, m10}, m4, m6 ; clip_pixel() -%else - REPX {pminuw x, m10}, m4, m6 -%endif - - ; grain = grain_lut[offy+y][offx+x] - movu m9, [grain_lutq+offxyq*2] - movu m5, [grain_lutq+top_offxyq*2] - punpckhwd m7, m5, m9 - punpcklwd m5, m9 ; {top/cur interleaved} -%if %3 - vpbroadcastd m3, [pw_23_22] -%elif %2 - vpbroadcastd m3, [pw_27_17_17_27] -%else - vpbroadcastd m3, [r10] -%endif - REPX {pmaddwd x, m3}, m7, m5 -%if %1 - vpbroadcastd m8, [pd_16] - REPX {paddd x, m8}, m7, m5 -%else - REPX {paddd x, m14}, m7, m5 -%endif - REPX {psrad x, 5}, m7, m5 - packssdw m9, m5, m7 -%if %2 - movu m3, [grain_lutq+offxyq*2+82*2] -%else - movu m3, [grain_lutq+offxyq*2+32] -%endif -%if %3 == 0 -%if %2 - movu m5, [grain_lutq+top_offxyq*2+82*2] -%else - movu m5, [grain_lutq+top_offxyq*2+32] -%endif - punpckhwd m7, m5, m3 - punpcklwd m5, m3 ; {top/cur interleaved} -%if %2 - vpbroadcastd m3, [pw_27_17_17_27+4] -%else - vpbroadcastd m3, [r10] -%endif - REPX {pmaddwd x, m3}, m7, m5 -%if %1 - REPX {paddd x, m8}, m7, m5 -%else - REPX {paddd x, m14}, m7, m5 -%endif - REPX {psrad x, 5}, m7, m5 - packssdw m3, m5, m7 -%endif ; %3 == 0 - pcmpeqw m7, m7 - psraw m5, m10, 1 - pxor m7, m5 -%if %3 - pmaxsw m9, m7 - pminsw m9, m5 -%else - REPX {pmaxsw x, m7}, m9, m3 - REPX {pminsw x, m5}, m9, m3 -%endif - - ; scaling[luma_src] - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - pcmpeqw m7, m7 - vpgatherdd m8, [scalingq+m4-3], m7 - pcmpeqw m7, m7 - vpgatherdd m4, [scalingq+m5-3], m7 - REPX {psrld x, 24}, m8, m4 - packssdw m8, m4 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m8, m11 - pmulhrsw m9, m8 - - ; scaling for the other half - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - pcmpeqw m8, m8 - mova m4, m8 - vpgatherdd m5, [scalingq+m6-3], m8 - vpgatherdd m6, [scalingq+m7-3], m4 - REPX {psrld x, 24}, m5, m6 - packssdw m5, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m5, m11 - pmulhrsw m3, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m9 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq], m0 -%if %2 - mova [dstq+strideq], m1 - - sub hb, 2 -%else - mova [dstq+32], m1 - dec hb -%endif - jle %%end_y_v_overlap -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82*(2<<%2) -%if %2 - jmp %%loop_y -%else - btc hd, 16 - jc %%loop_y - add r10, 4 - jmp %%loop_y_v_overlap -%endif - -%%end_y_v_overlap: - add wq, 32>>%2 - jge %%end_hv - mov srcq, r10mp - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] - - ; since 
fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap - -%%loop_x_hv_overlap: - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride - -%if %2 == 0 - lea r12, [pw_27_17_17_27] - mov r13mp, r12 -%endif - lea topleft_offxyq, [top_offxyq+(32>>%2)] - lea left_offxyq, [offyq+(32>>%2)] - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y_hv_overlap: - ; grain = grain_lut[offy+y][offx+x] - movd xm5, [grain_lutq+left_offxyq*2] -%if %2 - pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 -%if %3 - vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } -%else - ; insert both top/left lines - movd xm9, [grain_lutq+topleft_offxyq*2+82*2] - pinsrw xm9, [grain_lutq+topleft_offxyq*2], 2 - vinserti128 m5, xm9, 1 -%endif -%else - pinsrd xm5, [grain_lutq+topleft_offxyq*2], 1 -%endif - movu m9, [grain_lutq+offxyq*2] -%if %2 - movu m3, [grain_lutq+offxyq*2+82*2] -%else - movu m3, [grain_lutq+offxyq*2+32] -%endif - movu m8, [grain_lutq+top_offxyq*2] -%if %2 - punpckldq xm7, xm9, xm3 ; { cur0, cur1 } -%if %3 - vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 } -%else - ; insert both top lines - movu m1, [grain_lutq+top_offxyq*2+82*2] - punpckldq xm0, xm1, xm8 - vinserti128 m7, xm0, 1 -%endif -%else - movu m1, [grain_lutq+top_offxyq*2+32] - punpckldq xm7, xm9, xm8 -%endif - punpcklwd m5, m7 ; { cur/left } interleaved -%if %2 -%if %1 - vpbroadcastq m0, [pw_23_22] - pmaddwd m5, m0 - vpbroadcastd m0, [pd_16] - paddd m5, m0 -%else - pmaddwd m5, m15 - paddd m5, m14 -%endif - psrad m5, 5 - vextracti128 xm0, m5, 1 - packssdw xm5, xm0 -%else -%if %1 - movddup xm0, [pw_27_17_17_27] - pmaddwd xm5, xm0 - vpbroadcastd m0, [pd_16] - paddd xm5, xm0 -%else - pmaddwd xm5, xm15 - paddd xm5, xm14 -%endif - psrad xm5, 5 - packssdw xm5, xm5 -%endif - pcmpeqw m0, m0 - psraw m7, m10, 1 - pxor m0, m7 - pminsw xm5, xm7 - pmaxsw xm5, xm0 - vpblendd m9, m9, m5, 00000001b -%if %2 - psrldq xm5, 4 - vpblendd m3, m3, m5, 00000001b -%if %3 == 0 - psrldq xm5, 4 - vpblendd m1, m1, m5, 00000001b -%endif -%endif - psrldq xm5, 4 - vpblendd m5, m8, m5, 00000001b - - punpckhwd m8, m5, m9 - punpcklwd m5, m9 ; {top/cur interleaved} -%if %3 - vpbroadcastd m9, [pw_23_22] -%elif %2 - vpbroadcastd m9, [pw_27_17_17_27] -%else - xchg r12, r13mp - vpbroadcastd m9, [r12] -%endif - REPX {pmaddwd x, m9}, m8, m5 -%if %1 - vpbroadcastd m4, [pd_16] - REPX {paddd x, m4}, m8, m5 -%else - REPX {paddd x, m14}, m8, m5 -%endif - REPX {psrad x, 5}, m8, m5 - packssdw m9, m5, m8 -%if %3 - pminsw m9, m7 - pmaxsw m9, m0 -%else - punpckhwd m8, m1, m3 - punpcklwd m1, m3 ; {top/cur interleaved} -%if %2 - vpbroadcastd m3, [pw_27_17_17_27+4] -%else - 
vpbroadcastd m3, [r12] - xchg r12, r13mp -%endif - REPX {pmaddwd x, m3}, m8, m1 -%if %1 - REPX {paddd x, m4}, m8, m1 -%else - REPX {paddd x, m14}, m8, m1 -%endif - REPX {psrad x, 5}, m8, m1 - packssdw m3, m1, m8 - REPX {pminsw x, m7}, m9, m3 - REPX {pmaxsw x, m0}, m9, m3 -%endif - - ; src - mova m0, [srcq] -%if %2 - mova m1, [srcq+strideq] -%else - mova m1, [srcq+32] -%endif - - ; luma_src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm7, [lumaq+lstrideq*0+16] - vinserti128 m4, [lumaq+lstrideq*0+32], 1 - vinserti128 m7, [lumaq+lstrideq*0+48], 1 - mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] - mova xm8, [lumaq+lstrideq*(1<<%3)+16] - vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 - vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 - phaddw m4, m7 - phaddw m6, m8 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m6, [lumaq+32] -%endif - -%if %1 - punpckhwd m8, m4, m0 - punpcklwd m4, m0 - punpckhwd m5, m6, m1 - punpcklwd m6, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m8, m4, m5, m6 - REPX {psrad x, 6}, m8, m4, m5, m6 - packssdw m4, m8 - packssdw m6, m5 - REPX {paddw x, m15}, m4, m6 - REPX {pmaxsw x, m2}, m4, m6 - REPX {pminsw x, m10}, m4, m6 ; clip_pixel() -%else - REPX {pminuw x, m10}, m4, m6 -%endif - - ; scaling[luma_src] - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - pcmpeqw m7, m7 - vpgatherdd m8, [scalingq+m4-3], m7 - pcmpeqw m7, m7 - vpgatherdd m4, [scalingq+m5-3], m7 - REPX {psrld x, 24}, m8, m4 - packssdw m8, m4 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m8, m11 - pmulhrsw m9, m8 - - ; same for the other half - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - pcmpeqw m8, m8 - mova m4, m8 - vpgatherdd m5, [scalingq+m6-3], m8 - vpgatherdd m6, [scalingq+m7-3], m4 - REPX {psrld x, 24}, m5, m6 - packssdw m5, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m5, m11 - pmulhrsw m3, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m9 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - mova [dstq], m0 -%if %2 - mova [dstq+strideq], m1 - - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - mova [dstq+32], m1 - - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82*(2<<%2) -%if %2 - sub hb, 2 - jg %%loop_y_h_overlap -%else - dec hb - jle %%end_y_hv_overlap - btc hd, 16 - jc %%loop_y_h_overlap - add r13mp, 4 - jmp %%loop_y_hv_overlap -%endif - -%%end_y_hv_overlap: - add wq, 32>>%2 - jge %%end_hv - mov srcq, r10mp - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] - jmp %%loop_x_hv_overlap - -%%end_hv: - RET -%endmacro - - %%FGUV_32x32xN_LOOP 1, %2, %3 -.csfl: - %%FGUV_32x32xN_LOOP 0, %2, %3 -%endmacro - -FGUV_FN 420, 1, 1 -FGUV_FN 422, 1, 0 -FGUV_FN 444, 0, 0 -%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/filmgrain16_avx2.asm dav1d-1.0.0/src/x86/filmgrain16_avx2.asm --- dav1d-0.9.2/src/x86/filmgrain16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain16_avx2.asm 2022-03-18 14:31:56.006356000 +0000 @@ -0,0 +1,2248 @@ +; Copyright © 2021-2022, VideoLAN and dav1d authors +; Copyright © 2021-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 +pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 +gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_27_17_17_27: dw 27, 17, 17, 27 +pw_23_22: dw 23, 22, 0, 32 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +gen_ar0_shift: times 4 db 128 + times 4 db 64 + times 4 db 32 + times 4 db 16 +pd_16: dd 16 +pd_m65536: dd -65536 +pb_1: times 4 db 1 +grain_max: times 2 dw 511 + times 2 dw 2047 +grain_min: times 2 dw -512 + times 2 dw -2048 +fg_max: times 2 dw 1023 + times 2 dw 4095 + times 2 dw 960 + times 2 dw 3840 + times 2 dw 940 + times 2 dw 3760 +fg_min: times 2 dw 0 + times 2 dw 64 + times 2 dw 256 +uv_offset_mul: dd 256 + dd 1024 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16, 8 +round_vals: dw 32, 64, 128, 256, 512, 1024 +pb_8_9_0_1: db 8, 9, 0, 1 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_YMM avx2 +cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax +%define base r4-generate_grain_y_16bpc_avx2_table + lea r4, [generate_grain_y_16bpc_avx2_table] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r6d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + mov r3, -73*82*2 + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + lea r7d, [bdmaxq+1] + movq xm4, [base+mul_bits] + shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc + movq xm5, [base+hmul_bits] + sub r6, r7 + mova xm6, [base+pb_mask] + sub bufq, r3 + vpbroadcastw xm7, [base+round+r6*2-2] + lea r6, [gaussian_sequence] + movsxd r5, [r4+r5*4] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 
+ pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm3, xm2 ; 4 next output seeds + pshuflw xm0, xm3, q3333 + psrlw xm3, 5 + pand xm2, xm0, xm1 + movq r7, xm3 + psrlw xm3, xm2, 10 + por xm2, xm3 + pmullw xm2, xm4 + pmulhuw xm0, xm5 + movzx r8d, r7w + pshufb xm3, xm6, xm2 + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm0, xm2 + movd xm2, [r6+r8*2] + rorx r8, r7, 32 + por xm3, xm0 + shr r7d, 16 + pinsrw xm2, [r6+r7*2], 1 + pshuflw xm0, xm3, q3333 + movzx r7d, r8w + psrlw xm3, 5 + pinsrw xm2, [r6+r7*2], 2 + shr r8d, 16 + movq r7, xm3 + pinsrw xm2, [r6+r8*2], 3 + movzx r8d, r7w + pinsrw xm2, [r6+r8*2], 4 + rorx r8, r7, 32 + shr r7d, 16 + pinsrw xm2, [r6+r7*2], 5 + movzx r7d, r8w + pinsrw xm2, [r6+r7*2], 6 + shr r8d, 16 + pinsrw xm2, [r6+r8*2], 7 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support + mova [bufq+r3], xm2 + add r3, 8*2 + jl .loop + + ; auto-regression code + add r5, r4 + jmp r5 + +.ar1: + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_y] + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 + pinsrb xm4, [base+pb_1], 3 + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 + vpbroadcastw xm10, [base+round_vals-12+shiftq*2] + pxor m1, m1 + punpcklwd xm10, xm1 + pcmpgtb m1, m0 + punpcklbw m0, m1 ; cf5-11,0-4 + vpermq m1, m0, q3333 ; cf4 + vbroadcasti128 m11, [base+gen_shufA] + pshufd m6, m0, q0000 ; cf[5,6], cf[0-1] + vbroadcasti128 m12, [base+gen_shufB] + pshufd m7, m0, q1111 ; cf[7,8], cf[2-3] + punpckhwd xm1, xm0 + pshufhw xm9, xm0, q2121 + pshufd xm8, xm1, q0000 ; cf[4,9] + sar bdmaxd, 1 + punpckhqdq xm9, xm9 ; cf[10,11] + movd xm4, bdmaxd ; max_grain + pcmpeqd xm5, xm5 + sub bufq, 2*(82*73-(82*3+79)) + pxor xm5, xm4 ; min_grain + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 +.x_loop_ar2: + vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5] + pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + pmaddwd m0, m6 + punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5] + pshufb m1, m12 ; 
y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, m7 + pmaddwd xm2, xm8 + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd xm0, xm10 + paddd xm2, xm0 + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + paddd xm2, xm1 + pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd xm3, xm9, xm0 + psrldq xm0, 2 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + paddd xm3, xm1 + pminsd xm3, xm4 + psrldq xm1, 4 + pmaxsd xm3, xm5 + pextrw [bufq+xq*2], xm3, 0 + punpcklwd xm3, xm3 + pblendw xm0, xm3, 0010b + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + sar bdmaxd, 1 + movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 + movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 + pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1 + movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 + vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 + vpbroadcastw xm11, [base+round_vals+shiftq*2-12] + movd xm12, bdmaxd ; max_grain + punpcklbw m7, m7 ; sign-extension + punpcklbw m0, m0 ; sign-extension + punpcklbw xm1, xm1 + REPX {psraw x, 8}, m7, m0, xm1 + pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8] + pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10] + pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12] + pshufd xm7, xm7, q3333 ; cf[6,13] + pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18] + pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20] + paddw xm0, xm11, xm11 + pcmpeqd xm13, xm13 + pblendw xm10, xm1, xm0, 00001000b + pxor xm13, xm12 ; min_grain + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 +.x_loop_ar3: + movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] + punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + pmaddwd m0, m4 + pmaddwd m2, m6 + pmaddwd m3, m5 + paddd m0, m2 + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] + paddd m0, m3 + psrldq m3, m2, 2 + punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + paddd m0, m3 + psrldq m3, m2, 4 + psrldq m2, 6 + vpblendd m2, m11, 0x0f ; rounding constant + punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + vextracti128 xm2, m1, 1 + punpcklwd xm1, xm2 + pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] + paddd m0, m3 + vextracti128 xm2, m0, 1 + paddd xm0, xm1 + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] + paddd xm0, xm2 +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm10 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + pminsd xm2, xm12 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + 
psrldq xm1, 2 + pblendw xm1, xm2, 0100b + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 + RET + +%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax +%define base r8-generate_grain_uv_%1_16bpc_avx2_table + lea r8, [generate_grain_uv_%1_16bpc_avx2_table] + movifnidn bdmaxd, bdmaxm + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + lea r6d, [bdmaxq+1] + movq xm4, [base+mul_bits] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + movq xm5, [base+hmul_bits] + sub r5, r6 + mova xm6, [base+pb_mask] + vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] + vpbroadcastw xm7, [base+round+r5*2-2] + pxor xm0, xm2 + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov r5, -44*2 +%else + mov r5, -82*73*2 + sub bufq, r5 +%endif +.loop_x: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + movq r10, xm2 + movzx r9d, r10w + movd xm2, [r6+r9*2] + rorx r9, r10, 32 + shr r10d, 16 + pinsrw xm2, [r6+r10*2], 1 + movzx r10d, r9w + pinsrw xm2, [r6+r10*2], 2 + shr r9d, 16 + pinsrw xm2, [r6+r9*2], 3 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support + movq [bufq+r5], xm2 + add r5, 8 + jl .loop_x +%if %2 + add bufq, 82*2 + dec r7d + jg .loop_y +%endif + + ; auto-regression code + movsxd r6, [fg_dataq+FGData.ar_coeff_lag] + movsxd r6, [r8+r6*4] + add r6, r8 + jmp r6 + +INIT_YMM avx2 +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq] + sar bdmaxd, 1 + vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4] + movd xm6, bdmaxd + pcmpeqw m7, m7 + pmaddubsw m4, m0 ; ar_coeff << (14 - shift) + vpbroadcastw m6, xm6 ; max_gain + pxor m7, m6 ; min_grain + DEFINE_ARGS buf, bufy, h, x +%if %2 + vpbroadcastw m5, [base+hmul_bits+2+%3*2] + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm0, [bufyq+16*0] + vinserti128 m0, [bufyq+16*2], 1 + movu xm1, [bufyq+16*1] + vinserti128 m1, [bufyq+16*3], 1 +%if %3 + movu xm2, [bufyq+82*2+16*0] + vinserti128 m2, [bufyq+82*2+16*2], 1 + movu xm3, [bufyq+82*2+16*1] + vinserti128 m3, [bufyq+82*2+16*3], 1 + paddw m0, m2 + paddw m1, m3 +%endif + phaddw m0, m1 + movu xm1, [bufyq+16*4] + vinserti128 m1, [bufyq+16*6], 1 + movu xm2, [bufyq+16*5] + vinserti128 m2, [bufyq+16*7], 1 +%if %3 + movu xm3, [bufyq+82*2+16*4] + vinserti128 m3, [bufyq+82*2+16*6], 1 + paddw m1, m3 + movu xm3, [bufyq+82*2+16*5] + vinserti128 m3, [bufyq+82*2+16*7], 1 + paddw m2, m3 +%endif + phaddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 +%else + xor xd, xd +.x_loop_ar0: + movu m0, [bufyq+xq*2] + movu m1, [bufyq+xq*2+32] +%endif + paddw m0, m0 + paddw m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 +%if %2 + paddw m0, [bufq+ 0] + paddw m1, [bufq+32] +%else + paddw m0, [bufq+xq*2+ 0] + paddw m1, 
[bufq+xq*2+32] +%endif + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 +%if %2 + movu [bufq+ 0], m0 + movu [bufq+32], m1 + + ; last 6 pixels + movu xm0, [bufyq+32*4] + movu xm1, [bufyq+32*4+16] +%if %3 + paddw xm0, [bufyq+32*4+82*2] + paddw xm1, [bufyq+32*4+82*2+16] +%endif + phaddw xm0, xm1 + movu xm1, [bufq+32*2] + pmulhrsw xm0, xm5 + paddw xm0, xm0 + pmulhrsw xm0, xm4 + paddw xm0, xm1 + pminsw xm0, xm6 + pmaxsw xm0, xm7 + vpblendd xm0, xm1, 0x08 + movu [bufq+32*2], xm0 +%else + movu [bufq+xq*2+ 0], m0 + movu [bufq+xq*2+32], m1 + add xd, 32 + cmp xd, 64 + jl .x_loop_ar0 + + ; last 12 pixels + movu m0, [bufyq+64*2] + movu m1, [bufq+64*2] + paddw m0, m0 + pmulhrsw m0, m4 + paddw m0, m1 + pminsw m0, m6 + pmaxsw m0, m7 + vpblendd m0, m1, 0xc0 + movu [bufq+64*2], m0 +%endif + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar0 + RET + +INIT_XMM avx2 +.ar1: + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd + vpbroadcastw xm6, [base+hmul_bits+2+%3*2] + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu xm2, [bufyq+xq*4] +%else + movq xm2, [bufyq+xq*2] +%endif +%if %2 +%if %3 + phaddw xm2, [bufyq+xq*4+82*2] + punpckhqdq xm1, xm2, xm2 + paddw xm2, xm1 +%else + phaddw xm2, xm2 +%endif + pmulhrsw xm2, xm6 +%endif + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm1, xm2 + psrldq xm2, xm0, 2 ; top + punpcklwd xm0, xm2 + pmaddwd xm1, xm5 + pmaddwd xm0, xm4 + paddd xm1, xm3 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 + RET + +INIT_YMM avx2 +.ar2: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign xmm_regs_used 13 + %2 + %assign stack_size_padded 136 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 +%if %2 + movaps [rsp+16*7], xmm13 +%endif +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vbroadcasti128 m10, [base+gen_shufA] + sar bdmaxd, 1 + vbroadcasti128 m11, [base+gen_shufB] + movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 + pinsrb xm7, [base+pb_1], 5 + pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 + movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13 + pmovsxbw m7, xm7 + movd xm8, bdmaxd ; max_grain + pshufd m4, 
m7, q0000 + vpbroadcastw xm12, [base+round_vals-12+shiftq*2] + pshufd m5, m7, q1111 + pcmpeqd xm9, xm9 + pshufd m6, m7, q2222 + pxor xm9, xm8 ; min_grain + pshufd xm7, xm7, q3333 + DEFINE_ARGS buf, bufy, fg_data, h, x +%if %2 + vpbroadcastw xm13, [base+hmul_bits+2+%3*2] + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) +.x_loop_ar2: + vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] + pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + pmaddwd m0, m4 + pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, m5 + punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5] +%if %2 + movu xm3, [bufyq+xq*4] +%if %3 + paddw xm3, [bufyq+xq*4+82*2] +%endif + phaddw xm3, xm3 + pmulhrsw xm3, xm13 +%else + movq xm3, [bufyq+xq*2] +%endif + punpcklwd xm3, xm12 ; luma, round interleaved + vpblendd m2, m3, 0x0f + pmaddwd m2, m6 + paddd m1, m0 + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + paddd m2, m1 + vextracti128 xm1, m2, 1 + paddd xm2, xm1 + pshufd xm1, xm0, q3321 + pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd xm3, xm7, xm0 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd xm3, xm1 + psrldq xm1, 4 + pminsd xm3, xm8 + pmaxsd xm3, xm9 + pextrw [bufq+xq*2], xm3, 0 + psrldq xm0, 2 + pslldq xm3, 2 + pblendw xm0, xm3, 00000010b + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign stack_offset 32 + %assign xmm_regs_used 14 + %2 + %assign stack_size_padded 152 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 + movaps [rsp+16*7], xmm13 +%if %2 + movaps [rsp+16*8], xmm14 +%endif +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm11, [base+round_vals-12+shiftq*2] + sar bdmaxd, 1 + movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma + movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] + pmovsxbw m7, xm7 +%if %2 + vpbroadcastw xm14, [base+hmul_bits+2+%3*2] +%endif + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] + pinsrb xm0, [base+pb_1], 3 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 + pmovsxbw m0, xm0 + movd xm12, bdmaxd ; max_grain + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pcmpeqd xm13, xm13 + punpckhqdq xm10, xm0, xm0 + pxor xm13, xm12 ; min_grain + pinsrw xm10, [base+round_vals-10+shiftq*2], 3 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) +.x_loop_ar3: + movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + palignr m3, m1, m2, 2 ; 
y=-3/-2,x=[-2,+5] + palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6] + punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + pmaddwd m0, m4 + pmaddwd m2, m6 + pmaddwd m3, m5 + paddd m0, m2 + paddd m0, m3 + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] +%if %2 + movu xm3, [bufyq+xq*4] +%if %3 + paddw xm3, [bufyq+xq*4+82*2] +%endif + phaddw xm3, xm3 + pmulhrsw xm3, xm14 +%else + movq xm3, [bufyq+xq*2] +%endif + punpcklwd m1, m3 + pmaddwd m1, m7 + paddd m0, m1 + psrldq m1, m2, 4 + psrldq m3, m2, 6 + vpblendd m3, m11, 0x0f ; rounding constant + punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + psrldq m3, m2, 2 + punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + paddd m0, m1 + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] + paddd m0, m2 + vextracti128 xm2, m0, 1 + paddd xm0, xm2 +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm10 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + psrldq xm1, 2 + ; no need to packssdw since we only care about one value + pminsd xm2, xm12 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + pblendw xm1, xm2, 00000100b + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, unused, sby, see +%define base r11-grain_min + lea r11, [grain_min] + mov r6d, r9m ; bdmax + mov r9d, [fg_dataq+FGData.clip_to_restricted_range] + mov r7d, [fg_dataq+FGData.scaling_shift] + mov sbyd, sbym + vpbroadcastd m8, r9m + shr r6d, 11 ; is_12bpc + vpbroadcastd m9, [base+grain_min+r6*4] + shlx r10d, r9d, r6d + vpbroadcastd m10, [base+grain_max+r6*4] + lea r9d, [r6+r9*4] + vpbroadcastw m11, [base+mul_bits+r7*2-12] + vpbroadcastd m12, [base+fg_min+r10*4] + vpbroadcastd m13, [base+fg_max+r9*4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m14, [base+pd_16] + test r7b, [fg_dataq+FGData.overlap_flag] + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m4, m2, 0x55 + psrld m2, m1, 16 + mova m9, m6 + pand m2, m8 + vpgatherdd m5, [scalingq+m2-2], m6 + pblendw m5, m3, 0x55 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw 
m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, [grain_lutq+offxyq*2] + pmulhrsw m5, [grain_lutq+offxyq*2+32] + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + movq xm7, [pw_27_17_17_27] + cmp dword r8m, 0 ; sby + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m4, m2, 0x55 + psrld m2, m1, 16 + mova m9, m6 + pand m2, m8 + vpgatherdd m5, [scalingq+m2-2], m6 + pblendw m5, m3, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movd xm6, [grain_lutq+left_offxyq*2] + punpcklwd xm6, xm3 + pmaddwd xm6, xm7 + paddd xm6, xm14 + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm9 + pminsw xm6, xm10 + vpblendd m3, m6, 0x01 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, m3 + pmulhrsw m5, [grain_lutq+offxyq*2+32] + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp dword r8m, 0 ; sby + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see, src_bak + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x_v_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul 
offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_v_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m9, m6 + pand m4, m8 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + pmaddwd m4, m15 + pmaddwd m5, m15 + movu m7, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2+32] + paddd m4, m14 + paddd m5, m14 + psrad m4, 5 + psrad m5, 5 + packssdw m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + pmaddwd m5, m15 + pmaddwd m6, m15 + paddd m5, m14 + paddd m6, m14 + psrad m5, 5 + psrad m6, 5 + packssdw m5, m6 + pmaxsw m4, m9 + pmaxsw m5, m9 + pminsw m4, m10 + pminsw m5, m10 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m11 + pmaddubsw m3, m11 + paddw m2, m2 + paddw m3, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hb + jz .end_y_v_overlap + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_v_overlap + jmp .loop_y +.end_y_v_overlap: + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +.loop_x_hv_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyd, [top_offxyq+32] + lea left_offxyd, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_hv_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m2, m4, 0xaa + 
psrld m4, m1, 16 + mova m9, m6 + pand m4, m8 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] + movd xm6, [grain_lutq+left_offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + movd xm4, [grain_lutq+topleft_offxyq*2] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd xm6, xm7 + punpcklwd xm4, xm5 + punpcklqdq xm6, xm4 + movddup xm4, [pw_27_17_17_27] + pmaddwd xm6, xm4 + paddd xm6, xm14 + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm9 + pminsw xm6, xm10 + pshuflw xm4, xm6, q1032 + vpblendd m6, m7, 0xfe + vpblendd m4, m5, 0xfe + ; followed by v interpolation (top | cur -> cur) + punpckhwd m5, m7 + pmaddwd m5, m15 + punpcklwd m4, m6 + pmaddwd m4, m15 + movu m7, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2+32] + paddd m5, m14 + paddd m4, m14 + psrad m5, 5 + psrad m4, 5 + packssdw m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + pmaddwd m5, m15 + pmaddwd m6, m15 + paddd m5, m14 + paddd m6, m14 + psrad m5, 5 + psrad m6, 5 + packssdw m5, m6 + pmaxsw m4, m9 + pmaxsw m5, m9 + pminsw m4, m10 + pminsw m5, m10 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m11 + pmaddubsw m3, m11 + paddw m2, m2 + paddw m3, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hb + jz .end_y_hv_overlap + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_hv_overlap + movq xm7, [pw_27_17_17_27] + jmp .loop_y_h_overlap +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .loop_x_hv_overlap +.end: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r12-grain_min + lea r12, [grain_min] + mov r9d, r13m ; bdmax + mov r7d, [fg_dataq+FGData.scaling_shift] + mov r11d, is_idm + mov sbyd, sbym + vpbroadcastw m11, [base+mul_bits+r7*2-12] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + shr r9d, 11 ; is_12bpc + vpbroadcastd m8, [base+grain_min+r9*4] + shlx r10d, r6d, r9d + vpbroadcastd m9, [base+grain_max+r9*4] + vpbroadcastw m10, r13m + shlx r6d, r6d, r11d + vpbroadcastd m12, [base+fg_min+r10*4] + lea r6d, [r9+r6*2] + vpbroadcastd m13, [base+fg_max+r6*4] + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused, sby, see, overlap + +%if %1 + mov r6d, r11m + vpbroadcastd m0, [base+pb_8_9_0_1] + vpbroadcastd m1, [base+uv_offset_mul+r9*4] + vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m0 ; { uv_luma_mult, uv_mult } + pmaddwd m15, m1 +%else +%if %2 + vpbroadcastq m15, [base+pw_23_22] +%else + vpbroadcastq m15, [base+pw_27_17_17_27] +%endif + vpbroadcastd m14, [base+pd_16] +%endif + test r7b, [fg_dataq+FGData.overlap_flag] + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + 
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r9mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq+ 0] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 + pminuw m3, m10 ; clip_pixel() +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, [grain_lutq+offxyq*2] +%if %2 + pmulhrsw m5, [grain_lutq+offxyq*2+82*2] +%else + pmulhrsw m5, [grain_lutq+offxyq*2+32] +%endif + + ; dst = clip_pixel(src, noise) +%if %1 + paddw m0, m4 + paddw m1, m5 +%else + paddw m0, m4, [srcq] +%if %2 + paddw m1, m5, [srcq+strideq] +%else + paddw m1, m5, [srcq+32] +%endif +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + cmp dword r8m, 0 ; sby + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, 
[r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m2, [grain_lutq+offxyq*2] +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif + movd xm6, [grain_lutq+left_offxyq*2] +%if %2 + pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} + punpckldq xm7, xm2, xm3 ; {cur0, cur1} + punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} +%else + punpcklwd xm6, xm2 +%endif +%if %1 +%if %2 + vpbroadcastq xm7, [pw_23_22] +%else + movq xm7, [pw_27_17_17_27] +%endif + pmaddwd xm6, xm7 + vpbroadcastd xm7, [pd_16] + paddd xm6, xm7 +%else + pmaddwd xm6, xm15 + paddd xm6, xm14 +%endif + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm8 + pminsw xm6, xm9 + vpblendd m2, m6, 0x01 +%if %2 + pshuflw xm6, xm6, q1032 + vpblendd m3, m6, 0x01 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) +%if %1 + paddw m0, m2 + paddw m1, m3 +%else + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, r10mp +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp 
+ mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp dword r8m, 0 ; sby + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r9mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%if %2 == 0 + lea r10, [pw_27_17_17_27] +%endif +%%loop_y_v_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+top_offxyq*2] + punpcklwd m2, m3, m6 + punpckhwd m3, m6 ; { top, cur } +%if %3 + vpbroadcastd m0, [pw_23_22] +%elif %2 + vpbroadcastd m0, [pw_27_17_17_27] +%else + vpbroadcastd m0, [r10] +%endif + REPX {pmaddwd x, m0}, m2, m3 +%if %1 + vpbroadcastd m1, [pd_16] + 
REPX {paddd x, m1}, m2, m3 +%else + REPX {paddd x, m14}, m2, m3 +%endif + REPX {psrad x, 5}, m2, m3 + packssdw m2, m3 +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif +%if %3 + pmaxsw m2, m8 + pminsw m2, m9 +%else +%if %2 + movu m7, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m6, m3, m7 ; { cur, top } + punpcklwd m3, m7 +%else + movu m7, [grain_lutq+top_offxyq*2+32] + punpckhwd m6, m7, m3 + punpcklwd m3, m7, m3 ; { top, cur } +%endif + pmaddwd m6, m0 + pmaddwd m3, m0 +%if %1 + paddd m6, m1 + paddd m3, m1 +%else + paddd m6, m14 + paddd m3, m14 +%endif + psrad m6, 5 + psrad m3, 5 + packssdw m3, m6 + pmaxsw m2, m8 + pmaxsw m3, m8 + pminsw m2, m9 + pminsw m3, m9 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + sub hb, 2 +%else + mova [dstq+32], m1 + dec hb +%endif + jle %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + jmp %%loop_y +%else + add hd, 0x80000000 + jc %%loop_y + add r10, 4 + jmp %%loop_y_v_overlap +%endif +%%end_y_v_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + +%if %2 == 0 + lea r14, [pw_27_17_17_27] +%endif + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%%loop_y_hv_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] 
+%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m0, [grain_lutq+offxyq*2] + movd xm2, [grain_lutq+left_offxyq*2] + movu m6, [grain_lutq+top_offxyq*2] +%if %2 + pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 + movu m3, [grain_lutq+offxyq*2+82*2] + punpckldq xm1, xm0, xm3 ; { cur0, cur1 } +%if %3 + vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } + vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } +%else + vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 + vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] + vpblendd m2, m7, 0x20 + movd xm7, [grain_lutq+top_offxyq*2+82*2] + punpckldq xm7, xm6 + vinserti128 m1, xm7, 1 + movu m7, [grain_lutq+top_offxyq*2+82*2] +%endif + punpcklwd m2, m1 ; { cur, left } +%if %1 + vpbroadcastq m1, [pw_23_22] + pmaddwd m2, m1 + vpbroadcastd m1, [pd_16] + paddd m2, m1 + psrad m2, 5 + packssdw m2, m2 + vpermq m2, m2, q3120 +%else + pmaddwd m2, m15 + paddd m2, m14 + psrad m2, 5 + vextracti128 xm1, m2, 1 + packssdw xm2, xm1 +%endif +%else + pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 + movu m3, [grain_lutq+offxyq*2+32] + movu m7, [grain_lutq+top_offxyq*2+32] + punpckldq xm1, xm0, xm6 + punpcklwd xm2, xm1 ; { cur, left } +%if %1 + movddup xm1, [pw_27_17_17_27] + pmaddwd xm2, xm1 + vpbroadcastd m1, [pd_16] + paddd xm2, xm1 +%else + pmaddwd xm2, xm15 + paddd xm2, xm14 +%endif + psrad xm2, 5 + packssdw xm2, xm2 +%endif + pmaxsw xm2, xm8 + pminsw xm2, xm9 + vpblendd m0, m2, 0x01 +%if %2 + pshufd xm2, xm2, q0321 + vpblendd m3, m2, 0x01 +%if %3 == 0 + pshufd xm2, xm2, q0321 + vpblendd m7, m2, 0x01 +%endif +%endif + pshuflw xm2, xm2, q1032 + vpblendd m2, m6, 0xfe + punpckhwd m6, m0 ; { top, cur } + punpcklwd m2, m0 +%if %3 + vpbroadcastd m0, [pw_23_22] +%elif %2 + vpbroadcastd m0, [pw_27_17_17_27] +%else + vpbroadcastd m0, [r14] +%endif + pmaddwd m6, m0 + pmaddwd m2, m0 +%if %1 + paddd m6, m1 + paddd m2, m1 +%else + paddd m6, m14 + paddd m2, m14 +%endif + psrad m6, 5 + psrad m2, 5 + packssdw m2, m6 + +%if %3 + pmaxsw m2, m8 + pminsw m2, m9 +%else +%if %2 + punpckhwd m6, m3, m7 + punpcklwd m3, m7 ; { cur, top } +%else + punpckhwd m6, m7, m3 + punpcklwd m3, m7, m3 ; { top, cur } +%endif + REPX {pmaddwd x, m0}, m6, m3 +%if %1 + REPX {paddd x, m1}, m6, m3 +%else + REPX {paddd x, m14}, m6, m3 +%endif + REPX {psrad x, 5}, m6, m3 + packssdw m3, m6 + pmaxsw m2, m8 + pmaxsw m3, m8 + pminsw m2, m9 + pminsw m3, m9 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif + pmaxsw m0, m12 + 
pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, r10mp +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 + jg %%loop_y_h_overlap +%else + dec hb + jle %%end_y_hv_overlap + add hd, 0x80000000 + jc %%loop_y_h_overlap + add r14, 4 + jmp %%loop_y_hv_overlap +%endif +%%end_y_hv_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + jmp %%loop_x_hv_overlap +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +GEN_GRAIN_UV_FN 420, 1, 1 +FGUV_FN 420, 1, 1 +GEN_GRAIN_UV_FN 422, 1, 0 +FGUV_FN 422, 1, 0 +GEN_GRAIN_UV_FN 444, 0, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/filmgrain16_avx512.asm dav1d-1.0.0/src/x86/filmgrain16_avx512.asm --- dav1d-0.9.2/src/x86/filmgrain16_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain16_avx512.asm 2022-03-18 14:31:56.006356000 +0000 @@ -0,0 +1,932 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 +pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 + db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 +scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1 +scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4 +pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27 +pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32 +fg_min: times 2 dw 0 + times 2 dw 64 + times 2 dw 256 +fg_max: times 2 dw 1023 + times 2 dw 4095 + times 2 dw 960 + times 2 dw 3840 + times 2 dw 940 + times 2 dw 3760 +scale_rnd: dd 64 + dd 16 +uv_offset_mul: dd 256 + dd 1024 +pb_8_9_0_1: db 8, 9, 0, 1 + +SECTION .text + +INIT_ZMM avx512icl +cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \ + grain_lut, offx, sby, see, offy, src_bak +%define base r11-fg_min + lea r11, [fg_min] + mov r6d, r9m ; bdmax + mov r9d, [fg_dataq+FGData.clip_to_restricted_range] + mov r7d, [fg_dataq+FGData.scaling_shift] + mov sbyd, sbym + vpbroadcastd m6, r9m + shr r6d, 11 ; is_12bpc + vbroadcasti32x4 m7, [base+scale_mask] + shlx r10d, r9d, r6d + vpbroadcastd m10, [base+scale_shift+r7*4-32] + lea r9d, [r6+r9*4] + vpbroadcastd m8, [base+fg_min+r10*4] + kxnorw k1, k1, k1 ; 0xffff + vpbroadcastd m9, [base+fg_max+r9*4] + mov r12, 0xeeeeeeeeeeeeeeee + vpbroadcastd m19, [base+scale_rnd+r6*4] + kshiftrb k2, k1, 4 ; 0xf + vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8] + kmovq k3, r12 + vpbroadcastd m11, [base+scale_shift+r6*8+4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0] + vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4] + test r7b, [fg_dataq+FGData.overlap_flag] + jnz .v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + movu m4, [grain_lutq+offxyq*2+82*0] + movu m5, [grain_lutq+offxyq*2+82*2] + call .add_noise + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + test sbyd, sbyd + jnz .hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, left_offxy + + lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + movu m4, [grain_lutq+offxyq*2+82*0] + movu m5, [grain_lutq+offxyq*2+82*2] + movd xm17, 
[grain_lutq+left_offxyq*2-82*1] + pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1 + punpckldq xm16, xm4, xm5 + punpcklwd xm17, xm16 + mova xm16, xm19 + vpdpwssd xm16, xm20, xm17 + psrad xm16, 1 + packssdw xm16, xm16 + vpsravw xm16, xm11 + vmovdqu8 m4{k2}, m16 + vpalignr m5{k2}, m16, m16, 4 + call .add_noise + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + test sbyd, sbyd + jnz .hv_overlap + jmp .loop_x_h_overlap + +.v_overlap: + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, _, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, _, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + movu m16, [grain_lutq+offxyq*2+82*0] + movu m0, [grain_lutq+top_offxyq*2+82*0] + movu m17, [grain_lutq+offxyq*2+82*2] + movu m1, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m4, m0, m16 + punpcklwd m0, m16 + punpckhwd m5, m1, m17 + punpcklwd m1, m17 + call .add_noise_v + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to .v_overlap, and instead always fall-through to .hv_overlap +.hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyd, [top_offxyq+73] + lea left_offxyd, [offyq+73] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + movu m5, [grain_lutq+offxyq*2+82*0] + movu m0, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+left_offxyq*2-82*1] + pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1 + movu m2, [grain_lutq+offxyq*2+82*2] + movu m1, [grain_lutq+top_offxyq*2+82*2] + movd xm18, [grain_lutq+left_offxyq*2+82*1] + pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1 + punpckldq xm16, xm5, xm0 + 
punpcklwd xm17, xm16 + mova xm16, xm19 + vpdpwssd xm16, xm20, xm17 + punpckldq xm17, xm2, xm1 + punpcklwd xm18, xm17 + mova xm17, xm19 + vpdpwssd xm17, xm20, xm18 + punpckhwd m4, m0, m5 + punpcklwd m0, m5 + punpckhwd m5, m1, m2 + punpcklwd m1, m2 + psrad xm16, 1 + psrad xm17, 1 + packssdw xm16, xm17 + vpsravw xm16, xm11 + vpshuflw m0{k2}, m16, q1302 + punpckhqdq xm16, xm16 + vpshuflw m1{k2}, m16, q1302 + call .add_noise_v + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .hv_overlap +.end: + RET +ALIGN function_align +.add_noise_v: + mova m2, m19 + vpdpwssd m2, m12, m4 + mova m3, m19 + vpdpwssd m3, m13, m5 + mova m4, m19 + vpdpwssd m4, m12, m0 + mova m5, m19 + vpdpwssd m5, m13, m1 + REPX {psrad x, 1}, m2, m3, m4, m5 + packssdw m4, m2 + packssdw m5, m3 + vpsravw m4, m11 + vpsravw m5, m11 +.add_noise: + mova m0, [srcq+strideq*0] + mova m1, [srcq+strideq*1] + kmovw k4, k1 + pand m16, m6, m0 + psrld m3, m0, 16 + vpgatherdd m2{k4}, [scalingq+m16] + vpcmpud k4, m3, m6, 2 ; px <= bdmax + vpgatherdd m16{k4}, [scalingq+m3] + kmovw k4, k1 + pand m17, m6, m1 + vpgatherdd m3{k4}, [scalingq+m17] + vpshufb m2{k3}, m16, m7 + psrld m16, m1, 16 + vpcmpud k4, m16, m6, 2 + vpgatherdd m17{k4}, [scalingq+m16] + vpshufb m3{k3}, m17, m7 + vpsllvw m2, m10 + vpsllvw m3, m10 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + add grain_lutq, 82*4 + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m8 + pmaxsw m1, m8 + pminsw m0, m9 + pminsw m1, m9 + mova [dstq+srcq], m0 + add srcq, strideq + mova [dstq+srcq], m1 + add srcq, strideq + ret + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r12-fg_min + lea r12, [fg_min] + mov r9d, r13m ; bdmax + mov r7d, [fg_dataq+FGData.scaling_shift] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r11d, is_idm + kxnorw k1, k1, k1 ; 0xffff + vpbroadcastd m5, r13m + mov r13, 0xeeeeeeeeeeeeeeee + vbroadcasti32x4 m6, [base+scale_mask] + shr r9d, 11 ; is_12bpc + vpbroadcastd m7, [base+scale_shift+r7*4-32] + shlx r10d, r6d, r9d + mov sbyd, sbym + shlx r6d, r6d, r11d + vpbroadcastd m8, [base+fg_min+r10*4] + lea r6d, [r9+r6*2] + vpbroadcastd m9, [base+fg_max+r6*4] + kmovq k2, r13 + vpbroadcastd m20, [base+scale_rnd+r9*4] + packssdw m4, m5, m5 + vpbroadcastd m21, [base+scale_shift+r9*8+4] +%if %2 + mova m12, [base+pb_0to63] ; pw_even + mov r13d, 0x0101 + vpbroadcastq m10, [base+pw_23_22+r9*8] + kmovw k3, r13d +%if %3 + pshufd m11, m10, q0000 +%else + vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0] + vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4] + vmovdqu16 m11{k1}, m16 +%endif + psrlw m13, m12, 8 ; pw_odd +%else + vpbroadcastq m10, [base+pw_27_17_17_27+r9*8] + kshiftrb k3, k1, 7 ; 0x01 + kshiftrb k4, k1, 4 ; 0x0f + pshufd m11, m10, q0000 +%endif + mov lstrideq, r10mp + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + _, sby, see, lstride + +%if %1 + mov r6d, r11m + vpbroadcastd m0, [base+uv_offset_mul+r9*4] + vpbroadcastd m1, [base+pb_8_9_0_1] + vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4] + vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4] + pmaddwd m14, m0 + pshufb m15, m1 ; { uv_luma_mult, uv_mult } +%endif + test r7b, [fg_dataq+FGData.overlap_flag] + jnz %%v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, 
seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma + + mov lumaq, r9mp + lea r12, [srcq+wq*2] + lea r13, [dstq+wq*2] + lea r14, [lumaq+wq*(2<<%2)] + mov r9mp, r12 + mov r10mp, r13 + mov r11mp, r14 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: +%if %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+offxyq*2+82*2] +%endif + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + cmp dword r8m, 0 ; sby + jne %%hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, left_offxy + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: +%if %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + movd xm16, [grain_lutq+left_offxyq*2+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 + movd xm17, [grain_lutq+left_offxyq*2+82*4] + vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 + punpckldq m16, m17 + punpckldq m17, m18, m19 + punpcklwd m16, m17 + mova m17, m20 + vpdpwssd m17, m16, m10 + psrad m17, 1 + packssdw m17, m17 + vpsravw m17, m21 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+offxyq*2+82*2] + movd xm16, [grain_lutq+left_offxyq*2+82*0] + pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1 + punpckldq xm17, xm18, xm19 + punpcklwd xm16, xm17 + mova xm17, xm20 + vpdpwssd xm17, xm16, xm10 + psrad xm17, 1 + packssdw xm17, xm17 + vpsravw xm17, xm21 +%endif + vmovdqa32 m18{k3}, m17 + vpshufd m19{k3}, m17, q0321 + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp dword r8m, 0 ; sby + jne %%hv_overlap + jmp %%loop_x_h_overlap + +%%v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + _, sby, see, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add 
r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, _, top_offxy + + mov lumaq, r9mp + lea r12, [srcq+wq*2] + lea r13, [dstq+wq*2] + lea r14, [lumaq+wq*(2<<%2)] + mov r9mp, r12 + mov r10mp, r13 + mov r11mp, r14 + neg wq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, _, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %3 + movu ym16, [grain_lutq+offxyq*2+82*0] + movu ym1, [grain_lutq+top_offxyq*2+82*0] + vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2] + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpcklwd ym17, ym1, ym16 + punpckhwd ym1, ym16 +%elif %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym17, [grain_lutq+top_offxyq*2+82*0] + vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpcklwd m16, m17, m18 + punpckhwd m17, m18 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+top_offxyq*2+82*0] + movu m2, [grain_lutq+offxyq*2+82*2] + movu m16, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m1, m19, m18 + punpcklwd m19, m18 + punpckhwd m18, m2, m16 + punpcklwd m2, m16 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to %%v_overlap, and instead always fall-through to %%hv_overlap +%%hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + 
movd xm16, [grain_lutq+left_offxyq*2+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 + movd xm17, [grain_lutq+left_offxyq*2+82*4] + vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpckldq m16, m17 + punpckldq m17, m18, m19 + punpcklwd m16, m17 + movu ym1, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+topleft_offxyq*2+82*0] + mova m0, m20 + vpdpwssd m0, m16, m10 +%if %3 + punpcklwd xm17, xm1 + mova xm16, xm20 + vpdpwssd xm16, xm17, xm10 + psrad xm16, 1 +%else + vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1 + vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2 + punpcklwd m17, m1 + mova m16, m20 + vpdpwssd m16, m17, m10 + psrad m16, 1 +%endif + psrad m0, 1 + packssdw m0, m16 + vpsravw m0, m21 + vmovdqa32 m18{k3}, m0 + vpshufd m19{k3}, m0, q0321 +%if %3 + vpunpckhdq ym1{k3}, ym0, ym0 + punpcklwd ym17, ym1, ym18 + punpckhwd ym1, ym18 +%else + vpunpckhdq m1{k3}, m0, m0 + punpcklwd m16, m1, m18 + punpckhwd m17, m1, m18 +%endif +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+left_offxyq*2+82*0] + pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1 + punpckldq xm16, xm18, xm19 + punpcklwd xm17, xm16 + movu m2, [grain_lutq+offxyq*2+82*2] + movu m0, [grain_lutq+top_offxyq*2+82*2] + movd xm16, [grain_lutq+left_offxyq*2+82*2] + pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1 + punpckldq xm1, xm2, xm0 + punpcklwd xm1, xm16, xm1 + mova xm16, xm20 + vpdpwssd xm16, xm17, xm10 + mova xm17, xm20 + vpdpwssd xm17, xm1, xm10 + punpckhwd m1, m19, m18 + punpcklwd m19, m18 + punpckhwd m18, m2, m0 + punpcklwd m2, m0 + psrad xm16, 1 + psrad xm17, 1 + packssdw xm16, xm17 + vpsravw xm16, xm21 + vpshuflw m19{k4}, m16, q1302 + punpckhqdq xm16, xm16 + vpshuflw m2{k4}, m16, q3120 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + jmp %%hv_overlap + +ALIGN function_align +%%add_noise_v: +%if %3 + mova ym16, ym20 + vpdpwssd ym16, ym17, ym11 + mova ym17, ym20 + vpdpwssd ym17, ym1, ym11 + psrad ym16, 1 + psrad ym17, 1 + packssdw ym16, ym17 + vpsravw m18{k1}, m16, m21 +%elif %2 + mova m18, m20 + vpdpwssd m18, m16, m11 + mova m16, m20 + vpdpwssd m16, m17, m11 + psrad m18, 1 + psrad m16, 1 + packssdw m18, m16 + vpsravw m18, m21 +%else + mova m16, m20 + vpdpwssd m16, m1, m11 + mova m17, m20 + vpdpwssd m17, m18, m11 + mova m18, m20 + vpdpwssd m18, m19, m11 + mova m19, m20 + vpdpwssd m19, m2, m11 + REPX {psrad x, 1}, m16, m17, m18, m19 + packssdw m18, m16 + packssdw m19, m17 + vpsravw m18, m21 + vpsravw m19, m21 +%endif +%%add_noise: +%if %2 + mova m2, [lumaq+lstrideq*(0<<%3)] + mova m0, [lumaq+lstrideq*(1<<%3)] + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova m3, [lumaq+lstrideq*(0<<%3)] + mova m1, [lumaq+lstrideq*(1<<%3)] + mova m16, m12 + vpermi2w m16, m2, m0 + vpermt2w m2, m13, m0 + mova m17, m12 + vpermi2w m17, m3, m1 + vpermt2w m3, m13, m1 + pavgw m2, m16 + pavgw m3, m17 +%elif %1 + mova m2, [lumaq+lstrideq*0] + mova m3, [lumaq+lstrideq*1] +%endif +%if %2 + mova ym16, [srcq+strideq*0] + vinserti32x8 m16, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] +%else + mova m16, [srcq+strideq*0] +%endif +%if %1 + punpckhwd m17, m2, m16 + mova m0, m14 + vpdpwssd m0, m17, m15 
+ punpcklwd m17, m2, m16 + mova m2, m14 + vpdpwssd m2, m17, m15 +%endif +%if %2 + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 +%else + mova m17, [srcq+strideq*1] +%endif +%if %1 + psrad m0, 6 + psrad m2, 6 + packusdw m2, m0 + punpckhwd m0, m3, m17 + mova m1, m14 + vpdpwssd m1, m15, m0 + punpcklwd m0, m3, m17 + mova m3, m14 + vpdpwssd m3, m15, m0 + psrad m1, 6 + psrad m3, 6 + packusdw m3, m1 + pminuw m2, m4 + pminuw m3, m4 + +.add_noise_main: + ; scaling[luma_src] + kmovw k5, k1 + pand m1, m5, m2 + vpgatherdd m0{k5}, [scalingq+m1] + kmovw k5, k1 + psrld m2, 16 + vpgatherdd m1{k5}, [scalingq+m2] + vpshufb m0{k2}, m1, m6 + kmovw k5, k1 + psrld m1, m3, 16 + vpgatherdd m2{k5}, [scalingq+m1] + kmovw k5, k1 + pand m3, m5 + vpgatherdd m1{k5}, [scalingq+m3] + vpshufb m1{k2}, m2, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + vpsllvw m0, m7 + vpsllvw m1, m7 + pmulhrsw m18, m0 + pmulhrsw m19, m1 + add grain_lutq, 82*(4<<%2) + lea lumaq, [lumaq+lstrideq*(2<<%3)] + lea srcq, [srcq+strideq*2] + paddw m16, m18 + paddw m17, m19 + pmaxsw m16, m8 + pmaxsw m17, m8 + pminsw m16, m9 + pminsw m17, m9 +%if %2 + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 + lea dstq, [dstq+strideq*2] + mova [dstq+strideq*0], ym17 + vextracti32x8 [dstq+strideq*1], m17, 1 +%else + mova [dstq+strideq*0], m16 + mova [dstq+strideq*1], m17 +%endif + lea dstq, [dstq+strideq*2] + ret +%else +%if %2 + pand m2, m4 + pand m3, m4 +%else + pand m2, m4, [lumaq+lstrideq*0] + pand m3, m4, [lumaq+lstrideq*1] +%endif + jmp .add_noise_main +%endif +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif diff -Nru dav1d-0.9.2/src/x86/film_grain16_sse.asm dav1d-1.0.0/src/x86/film_grain16_sse.asm --- dav1d-0.9.2/src/x86/film_grain16_sse.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/film_grain16_sse.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,3450 +0,0 @@ -; Copyright © 2021, VideoLAN and dav1d authors -; Copyright © 2021, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -%include "config.asm" -%include "ext/x86/x86inc.asm" - -SECTION_RODATA 16 -pd_16: times 4 dd 16 -pw_1: times 8 dw 1 -pw_16384: times 8 dw 16384 -pw_8192: times 8 dw 8192 -pw_23_22: dw 23, 22 - times 3 dw 0, 32 -pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 -pw_27_17_17_27: dw 27, 17, 17, 27 - times 2 dw 0, 32 -rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 -pw_seed_xor: times 2 dw 0xb524 - times 2 dw 0x49d8 -pb_1: times 4 db 1 -hmul_bits: dw 32768, 16384, 8192, 4096 -round: dw 2048, 1024, 512 -mul_bits: dw 256, 128, 64, 32, 16 -round_vals: dw 32, 64, 128, 256, 512, 1024 -max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 -min: dw 0, 16*4, 16*16 -; these two should be next to each other -pw_4: times 2 dw 4 -pw_16: times 2 dw 16 - -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) - %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base - %rotate 1 - %endrep -%endmacro - -JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 - -struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - -SECTION .text - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -%if ARCH_X86_32 -%undef base -%define PIC_ptr(a) base+a -%else -%define PIC_ptr(a) a -%endif - -%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) - -%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg -%assign %%idx 0 -%define %%tmp %2 -%if %0 == 8 -%define %%tmp %8 -%endif -%rep (%6/2) -%if %%idx == 0 - movd %5 %+ d, %2 - pshuflw %%tmp, %2, q3232 -%else - movd %5 %+ d, %%tmp -%if %6 == 8 -%if %%idx == 2 - punpckhqdq %%tmp, %%tmp -%elif %%idx == 4 - psrlq %%tmp, 32 -%endif -%endif -%endif - movzx %4 %+ d, %5 %+ w - shr %5 %+ d, 16 - -%if %%idx == 0 - movd %1, [%3+%4*%7] -%else - pinsrw %1, [%3+%4*%7], %%idx + 0 -%endif - pinsrw %1, [%3+%5*%7], %%idx + 1 -%assign %%idx %%idx+2 -%endrep -%endmacro - -%macro SPLATD 2 ; dst, src -%ifnidn %1, %2 - movd %1, %2 -%endif - pshufd %1, %1, q0000 -%endmacro - -%macro SPLATW 2 ; dst, src -%ifnidn %1, %2 - movd %1, %2 -%endif - pshuflw %1, %1, q0000 - punpcklqdq %1, %1 -%endmacro - - -INIT_XMM ssse3 -%if ARCH_X86_64 -cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax - lea r4, [pb_mask] -%define base r4-pb_mask -%else -cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax - LEA r4, $$ -%define base r4-$$ -%endif - movq m1, [base+rnd_next_upperbit_mask] - movq m4, [base+mul_bits] - movq m7, [base+hmul_bits] - mov r3d, [fg_dataq+FGData.grain_scale_shift] - lea r5d, [bdmaxq+1] - shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc - sub r3, r5 - SPLATW m6, [base+round+r3*2-2] - mova m5, [base+pb_mask] - SPLATW m0, [fg_dataq+FGData.seed] - mov r3, -73*82*2 - sub bufq, r3 -%if ARCH_X86_64 - lea r6, [gaussian_sequence] -%endif -.loop: - pand m2, m0, m1 - psrlw m3, m2, 10 - por m2, m3 ; bits 
0xf, 0x1e, 0x3c and 0x78 are set - pmullw m2, m4 ; bits 0x0f00 are set - pshufb m3, m5, m2 ; set 15th bit for next 4 seeds - psllq m2, m3, 30 - por m2, m3 - psllq m3, m2, 15 - por m2, m3 ; aggregate each bit into next seed's high bit - pmulhuw m3, m0, m7 - por m2, m3 ; 4 next output seeds - pshuflw m0, m2, q3333 - psrlw m2, 5 -%if ARCH_X86_64 - vpgatherdw m3, m2, r6, r5, r7, 4, 2 -%else - vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 -%endif - paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 - ; shifts by 0, which pmulhrsw does not support - pmulhrsw m3, m6 - movq [bufq+r3], m3 - add r3, 4*2 - jl .loop - - ; auto-regression code - movsxd r3, [fg_dataq+FGData.ar_coeff_lag] - movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] - lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table] - jmp r3 - -.ar1: -%if WIN64 - DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 - lea bufq, [r0-2*(82*73-(82*3+79))] - PUSH r8 -%else -%if ARCH_X86_64 - DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 -%else ; x86-32 - DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 - PUSH r6 -%define shiftd r1d -%endif - sub bufq, 2*(82*73-(82*3+79)) -%endif - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] - movd m4, [fg_dataq+FGData.ar_coeffs_y] - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] -%if WIN64 - DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 -%elif ARCH_X86_64 - DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 -%else ; x86-32 -%undef shiftd - DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 -%define hd dword r0m -%define maxd dword minm -%endif -%if cpuflag(sse4) - pmovsxbw m4, m4 -%else - pxor m3, m3 - pcmpgtb m3, m4 - punpcklbw m4, m3 -%endif - pinsrw m4, [base+pw_1], 3 - pshufd m5, m4, q1111 - pshufd m4, m4, q0000 - SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd - mov hd, 70 - sar maxd, 1 - mov mind, maxd - xor mind, -1 -.y_loop_ar1: - mov xq, -76 - movsx val3d, word [bufq+xq*2-2] -.x_loop_ar1: - movu m0, [bufq+xq*2-82*2-2] ; top/left - psrldq m2, m0, 2 ; top - psrldq m1, m0, 4 ; top/right - punpcklwd m0, m2 - punpcklwd m1, m3 - pmaddwd m0, m4 - pmaddwd m1, m5 - paddd m0, m1 -.x_loop_ar1_inner: - movd val0d, m0 - psrldq m0, 4 - imul val3d, cf3d - add val3d, val0d - sar val3d, shiftb - movsx val0d, word [bufq+xq*2] - add val3d, val0d - cmp val3d, maxd - cmovg val3d, maxd - cmp val3d, mind - cmovl val3d, mind - mov word [bufq+xq*2], val3w - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82*2 - dec hd - jg .y_loop_ar1 -%if WIN64 - POP r8 -%elif ARCH_X86_32 - POP r6 -%undef maxd -%undef hd -%endif -.ar0: - RET - -.ar2: -%if ARCH_X86_32 -%assign stack_offset_old stack_offset - ALLOC_STACK -16*8 -%endif - DEFINE_ARGS buf, fg_data, bdmax, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m0, [base+round_vals-12+shiftq*2] - pshuflw m0, m0, q0000 - movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 - pxor m2, m2 - punpcklwd m0, m2 - pcmpgtb m2, m6 - punpckhbw m3, m6, m2 - punpcklbw m6, m2 - pshufd m2, m6, q3333 - pshufd m1, m6, q2222 - pshufd m7, m6, q1111 - pshufd m6, m6, q0000 - pshufd m4, m3, q1111 - pshufd m3, m3, q0000 -%if ARCH_X86_64 - SWAP 0, 12 - SWAP 1, 8 - SWAP 2, 9 - SWAP 3, 10 - SWAP 4, 11 -%else -%define m12 [rsp+0*16] -%define m8 [rsp+1*16] -%define m9 [rsp+2*16] -%define m10 [rsp+3*16] -%define m11 [rsp+4*16] - mova m12, m0 - mova m8, m1 - mova m9, m2 - mova m10, m3 - mova m11, m4 - mov bdmaxd, bdmaxm -%endif - sar 
bdmaxd, 1 - SPLATW m0, bdmaxd ; max_grain - pcmpeqw m1, m1 -%if !cpuflag(sse4) - pcmpeqw m2, m2 - psrldq m2, 14 - pslldq m2, 2 - pxor m2, m1 -%endif - pxor m1, m0 ; min_grain -%if ARCH_X86_64 - SWAP 0, 13 - SWAP 1, 14 - SWAP 2, 15 -%else -%define m13 [rsp+5*16] -%define m14 [rsp+6*16] - mova m13, m0 - mova m14, m1 -%if !cpuflag(sse4) -%define m15 [rsp+7*16] - mova m15, m2 -%endif -%endif - sub bufq, 2*(82*73-(82*3+79)) - DEFINE_ARGS buf, fg_data, h, x - mov hd, 70 -.y_loop_ar2: - mov xq, -76 - -.x_loop_ar2: - movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] - movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] - psrldq m2, m0, 2 - psrldq m3, m0, 4 - psrldq m4, m0, 6 - psrldq m5, m0, 8 - punpcklwd m0, m2 - punpcklwd m3, m4 - punpcklwd m5, m1 - psrldq m2, m1, 2 - psrldq m4, m1, 4 - punpcklwd m2, m4 - psrldq m4, m1, 6 - psrldq m1, 8 - punpcklwd m4, m1 - pmaddwd m0, m6 - pmaddwd m3, m7 - pmaddwd m5, m8 - pmaddwd m2, m9 - pmaddwd m4, m10 - paddd m0, m3 - paddd m5, m2 - paddd m0, m4 - paddd m0, m5 ; accumulated top 2 rows - paddd m0, m12 - - movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] - pshufd m4, m1, q3321 - pxor m2, m2 - pcmpgtw m2, m4 - punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] -.x_loop_ar2_inner: - pmaddwd m2, m1, m11 - paddd m2, m0 - psrldq m0, 4 ; shift top to next pixel - psrad m2, [fg_dataq+FGData.ar_coeff_shift] - paddd m2, m4 - packssdw m2, m2 - pminsw m2, m13 - pmaxsw m2, m14 - psrldq m4, 4 - pslldq m2, 2 - psrldq m1, 2 -%if cpuflag(sse4) - pblendw m1, m2, 00000010b -%else - pand m1, m15 - pandn m3, m15, m2 - por m1, m3 -%endif - ; overwrite previous pixel, this should be ok - movd [bufq+xq*2-2], m1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82*2 - dec hd - jg .y_loop_ar2 -%if ARCH_X86_32 -%undef m8 -%undef m9 -%undef m10 -%undef m11 -%undef m12 -%undef m13 -%undef m14 -%undef m15 -%endif - RET - -.ar3: - DEFINE_ARGS buf, fg_data, bdmax, shift -%if WIN64 - mov r6, rsp - and rsp, ~15 - sub rsp, 64 - %define tmp rsp -%elif ARCH_X86_64 - %define tmp rsp+stack_offset-72 -%else -%assign stack_offset stack_offset_old - ALLOC_STACK -16*12 - %define tmp rsp - mov bdmaxd, bdmaxm -%endif - sar bdmaxd, 1 - SPLATW m7, bdmaxd ; max_grain - pcmpeqw m6, m6 -%if !cpuflag(sse4) - pcmpeqw m4, m4 - psrldq m4, 14 - pslldq m4, 4 - pxor m4, m6 -%endif - pxor m6, m7 ; min_grain - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - -%if ARCH_X86_64 - SWAP 6, 14 - SWAP 7, 15 -%else -%define m14 [rsp+10*16] -%define m15 [esp+11*16] - mova m14, m6 - mova m15, m7 -%endif - - ; build cf0-1 until 18-19 in m5-12 and r0/1 - pxor m1, m1 - movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 - pcmpgtb m1, m0 - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - -%if cpuflag(sse4) - pshufd m4, m2, q3333 -%else - pshufd m5, m2, q3333 - mova [tmp+48], m5 -%endif - pshufd m3, m2, q2222 - pshufd m1, m2, q0000 - pshufd m2, m2, q1111 - pshufd m7, m0, q2222 - pshufd m6, m0, q1111 - pshufd m5, m0, q0000 - pshufd m0, m0, q3333 - -%if ARCH_X86_64 - SWAP 0, 8 - SWAP 1, 9 - SWAP 2, 10 - SWAP 3, 11 - SWAP 4, 12 -%else -%define m8 [rsp+4*16] -%define m9 [esp+5*16] -%define m10 [rsp+6*16] -%define m11 [esp+7*16] -%define m12 [rsp+8*16] - mova m8, m0 - mova m9, m1 - mova m10, m2 - mova m11, m3 - mova m12, m4 -%endif - - ; build cf20,round in r2 - ; build cf21-23,round*2 in m13 - pxor m1, m1 - movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 - pcmpgtb m1, m0 - punpcklbw m0, m1 - pshufd m1, m0, q0000 - pshufd m2, m0, q1111 - mova [tmp+ 0], m1 - mova [tmp+16], m2 - psrldq m3, m0, 10 - pinsrw 
m3, [base+round_vals+shiftq*2-10], 3 - -%if ARCH_X86_64 - SWAP 3, 13 -%else -%define m13 [esp+9*16] - mova m13, m3 -%endif - - pinsrw m0, [base+round_vals+shiftq*2-12], 5 - pshufd m3, m0, q2222 - mova [tmp+32], m3 - - DEFINE_ARGS buf, fg_data, h, x - sub bufq, 2*(82*73-(82*3+79)) - mov hd, 70 -.y_loop_ar3: - mov xq, -76 - -.x_loop_ar3: - movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] - movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] - palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] - palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] - punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] - punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] - shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] - - pmaddwd m0, m5 - pmaddwd m2, m6 - pmaddwd m3, m7 - paddd m0, m2 - paddd m0, m3 - ; m0 = top line first 6 multiplied by cf, m1 = top line last entry - - movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] - movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] - punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] - palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] - palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] - punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] - punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] - shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] - - pmaddwd m1, m8 - pmaddwd m4, m9 - pmaddwd m3, m10 - pmaddwd m2, m11 - paddd m1, m4 - paddd m3, m2 - paddd m0, m1 - paddd m0, m3 - ; m0 = top 2 lines multiplied by cf - - movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] - movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] - palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] - palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] - punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] - punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] - shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] - punpcklwd m2, [base+pw_1] - -%if cpuflag(sse4) - pmaddwd m1, m12 -%else - pmaddwd m1, [tmp+48] -%endif - pmaddwd m3, [tmp+ 0] - pmaddwd m4, [tmp+16] - pmaddwd m2, [tmp+32] - paddd m1, m3 - paddd m4, m2 - paddd m0, m1 - paddd m0, m4 - ; m0 = top 3 lines multiplied by cf plus rounding for downshift - - movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmaddwd m2, m1, m13 - pshufd m3, m2, q1111 - paddd m2, m3 ; left+cur - paddd m2, m0 ; add top - psrldq m0, 4 - psrad m2, [fg_dataq+FGData.ar_coeff_shift] - packssdw m2, m2 - pminsw m2, m15 - pmaxsw m2, m14 - pslldq m2, 4 - psrldq m1, 2 -%if cpuflag(sse4) - pblendw m1, m2, 00000100b -%else - pand m1, m12 - pandn m3, m12, m2 - por m1, m3 -%endif - ; overwrite a couple of pixels, should be ok - movq [bufq+xq*2-4], m1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82*2 - dec hd - jg .y_loop_ar3 -%if WIN64 - mov rsp, r6 -%elif ARCH_X86_32 -%undef m8 -%undef m9 -%undef m10 -%undef m11 -%undef m12 -%undef m13 -%undef m14 -%undef m15 -%endif - RET - -%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y -INIT_XMM ssse3 -%if ARCH_X86_64 -cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg -%define base r8-pb_mask - lea r8, [pb_mask] - movifnidn bdmaxd, bdmaxm - lea r6d, [bdmaxq+1] -%else -cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h -%define base r2-$$ - LEA r2, $$ - mov fg_dataq, r2m - mov r6d, r4m - inc r6d -%endif - movq m1, [base+rnd_next_upperbit_mask] - movq m4, [base+mul_bits] - movq m7, [base+hmul_bits] - mov r5d, [fg_dataq+FGData.grain_scale_shift] - shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc - sub r5, r6 - SPLATW m6, 
[base+round+r5*2-2] - mova m5, [base+pb_mask] - SPLATW m0, [fg_dataq+FGData.seed] -%if ARCH_X86_64 - SPLATW m2, [base+pw_seed_xor+uvq*4] -%else - mov r5d, r3m - SPLATW m2, [base+pw_seed_xor+r5*4] -%endif - pxor m0, m2 -%if ARCH_X86_64 - lea r6, [gaussian_sequence] -%endif -%if %2 - mov hd, 73-35*%3 - add bufq, 44*2 -.loop_y: - mov xq, -44 -%else - mov xq, -82*73 - add bufq, 82*73*2 -%endif -.loop_x: - pand m2, m0, m1 - psrlw m3, m2, 10 - por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw m2, m4 ; bits 0x0f00 are set - pshufb m3, m5, m2 ; set 15th bit for next 4 seeds - psllq m2, m3, 30 - por m2, m3 - psllq m3, m2, 15 - por m2, m3 ; aggregate each bit into next seed's high bit - pmulhuw m3, m0, m7 - por m2, m3 ; 4 next output seeds - pshuflw m0, m2, q3333 - psrlw m2, 5 -%if ARCH_X86_64 - vpgatherdw m3, m2, r6, r9, r10, 4, 2 -%else - vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 -%endif - paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 - ; shifts by 0, which pmulhrsw does not support - pmulhrsw m3, m6 - movq [bufq+xq*2], m3 - add xq, 4 - jl .loop_x -%if %2 - add bufq, 82*2 - dec hd - jg .loop_y -%endif - - ; auto-regression code - movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] - jmp r5 - -.ar0: -%if ARCH_X86_64 - DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift -%else - DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift -%assign stack_offset_old stack_offset - ALLOC_STACK -16*2 - mov bufyq, r1m - mov uvd, r3m -%endif - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] - SPLATW m3, [base+hmul_bits+shiftq*2-10] -%if ARCH_X86_64 - sar bdmaxd, 1 - SPLATW m1, bdmaxd ; max_gain -%else - SPLATW m1, r4m - psraw m1, 1 -%endif - pcmpeqw m7, m7 - pxor m7, m1 ; min_grain -%if ARCH_X86_64 - SWAP 1, 14 - DEFINE_ARGS buf, bufy, h, x -%else -%define m14 [rsp+0*16] - mova m14, m1 - DEFINE_ARGS buf, bufy, pic_reg, h, x -%endif - pxor m5, m5 - pcmpgtb m5, m4 - punpcklbw m4, m5 -%if %2 - SPLATW m6, [base+hmul_bits+2+%3*2] -%endif - SPLATW m4, m4 - pxor m5, m5 -%if %2 -%if !cpuflag(sse4) - pcmpeqw m2, m2 - pslldq m2, 12 -%if ARCH_X86_64 - SWAP 2, 12 -%else -%define m12 [rsp+1*16] - mova m12, m2 -%endif -%endif -%endif -%if %2 - sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) -%else - sub bufq, 2*(82*70-3) -%endif - add bufyq, 2*(3+82*3) - mov hd, 70-35*%3 -.y_loop_ar0: - ; first 32 pixels - xor xd, xd -.x_loop_ar0: - movu m0, [bufyq+xq*(2<<%2)] -%if %2 -%if %3 - movu m2, [bufyq+xq*4+82*2] - paddw m0, m2 -%endif - movu m1, [bufyq+xq*4 +16] -%if %3 - movu m2, [bufyq+xq*4+82*2+16] - paddw m1, m2 -%endif - phaddw m0, m1 - pmulhrsw m0, m6 -%endif - punpckhwd m1, m0, m5 - punpcklwd m0, m5 - REPX {pmaddwd x, m4}, m0, m1 - REPX {psrad x, 5}, m0, m1 - packssdw m0, m1 - pmulhrsw m0, m3 - movu m1, [bufq+xq*2] - paddw m0, m1 - pminsw m0, m14 - pmaxsw m0, m7 - cmp xd, 72-40*%2 - je .end - movu [bufq+xq*2], m0 - add xd, 8 - jmp .x_loop_ar0 - - ; last 6/4 pixels -.end: -%if %2 -%if cpuflag(sse4) - pblendw m0, m1, 11000000b -%else - pand m1, m12 - pandn m2, m12, m0 - por m0, m1, m2 -%endif - movu [bufq+xq*2], m0 -%else - movq [bufq+xq*2], m0 -%endif - - add bufq, 82*2 - add bufyq, 82*(2<<%3) - dec hd - jg .y_loop_ar0 -%if ARCH_X86_32 -%undef m12 -%undef m14 -%endif - RET - -.ar1: -%if ARCH_X86_64 - DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x -%else -%assign stack_offset stack_offset_old -%xdefine rstk rsp -%assign 
stack_size_padded 0 - DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 - mov bufyq, r1m - mov uvd, r3m -%endif - imul uvd, 28 - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] - movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] -%if WIN64 - DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 -%if %2 - lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] -%else - lea bufq, [r0-2*(82*69+3)] -%endif -%else -%if ARCH_X86_64 - DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 -%else - DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 -%define hd dword r1m -%define mind dword r3m -%define maxd dword r4m -%endif -%if %2 - sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) -%else - sub bufq, 2*(82*69+3) -%endif -%endif -%if ARCH_X86_64 - mov shiftd, [r2+FGData.ar_coeff_shift] -%else - mov shiftd, [r3+FGData.ar_coeff_shift] -%endif - pxor m5, m5 - pcmpgtb m5, m4 - punpcklbw m4, m5 ; cf0-4 in words - pshuflw m4, m4, q2100 - psrldq m4, 2 ; cf0-3,4 in words - pshufd m5, m4, q1111 - pshufd m4, m4, q0000 - movd m3, [base+round_vals+shiftq*2-12] ; rnd - pxor m6, m6 - punpcklwd m3, m6 -%if %2 - SPLATW m6, [base+hmul_bits+2+%3*2] -%endif - SPLATD m3, m3 - add bufyq, 2*(79+82*3) - mov hd, 70-35*%3 - sar maxd, 1 -%if ARCH_X86_64 - mov mind, maxd - xor mind, -1 -%else - DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 - mov r2, maxd - xor r2, -1 - mov mind, r2 -%endif -.y_loop_ar1: - mov xq, -(76>>%2) - movsx val3d, word [bufq+xq*2-2] -.x_loop_ar1: - movu m0, [bufq+xq*2-82*2-2] ; top/left -%if %2 - movu m7, [bufyq+xq*4] -%if %3 - movu m1, [bufyq+xq*4+82*2] - phaddw m7, m1 -%else - phaddw m7, m7 -%endif -%else - movq m7, [bufyq+xq*2] -%endif - psrldq m2, m0, 2 ; top - psrldq m1, m0, 4 ; top/right - punpcklwd m0, m2 -%if %2 -%if %3 - pshufd m2, m7, q3232 - paddw m7, m2 -%endif - pmulhrsw m7, m6 -%endif - punpcklwd m1, m7 - pmaddwd m0, m4 - pmaddwd m1, m5 - paddd m0, m1 - paddd m0, m3 -.x_loop_ar1_inner: - movd val0d, m0 - psrldq m0, 4 - imul val3d, cf3d - add val3d, val0d - sar val3d, shiftb - movsx val0d, word [bufq+xq*2] - add val3d, val0d - cmp val3d, maxd - cmovg val3d, maxd - cmp val3d, mind - cmovl val3d, mind - mov word [bufq+xq*2], val3w - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82*2 - add bufyq, 82*2<<%3 - dec hd - jg .y_loop_ar1 -%if ARCH_X86_32 -%undef maxd -%undef mind -%undef hd -%endif - RET - -.ar2: -%if ARCH_X86_64 - DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift -%else - DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift - ALLOC_STACK -16*8 - mov bufyq, r1m - mov uvd, r3m -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 -%if ARCH_X86_64 - sar bdmaxd, 1 - SPLATW m5, bdmaxd ; max_grain -%else - SPLATW m5, r4m - psraw m5, 1 -%endif - pcmpeqw m6, m6 -%if !cpuflag(sse4) - pcmpeqw m7, m7 - psrldq m7, 14 - pslldq m7, 2 - pxor m7, m6 -%endif - pxor m6, m5 ; min_grain -%if %2 && cpuflag(sse4) - SPLATW m7, [base+hmul_bits+2+%3*2] -%endif - -%if ARCH_X86_64 - SWAP 5, 13 - SWAP 6, 14 - SWAP 7, 15 -%else -%define m13 [rsp+5*16] -%define m14 [rsp+6*16] -%define m15 [rsp+7*16] - mova m13, m5 - mova m14, m6 - mova m15, m7 -%endif - - ; coef values - movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] - pxor m1, m1 - pcmpgtb m1, m0 - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pinsrw m2, [base+round_vals-12+shiftq*2], 5 - - pshufd m6, m0, q0000 - pshufd m7, m0, q1111 - pshufd m1, m0, q3333 - pshufd m0, m0, q2222 - pshufd m3, m2, q1111 - pshufd m4, m2, 
q2222 - pshufd m2, m2, q0000 - -%if ARCH_X86_64 - SWAP 0, 8 - SWAP 1, 9 - SWAP 2, 10 - SWAP 3, 11 - SWAP 4, 12 -%else -%define m8 [rsp+0*16] -%define m9 [rsp+1*16] -%define m10 [rsp+2*16] -%define m11 [rsp+3*16] -%define m12 [rsp+4*16] - mova m8, m0 - mova m9, m1 - mova m10, m2 - mova m11, m3 - mova m12, m4 -%endif - -%if ARCH_X86_64 - DEFINE_ARGS buf, bufy, fg_data, h, x -%else - DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x -%endif -%if %2 - sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) -%else - sub bufq, 2*(82*69+3) -%endif - add bufyq, 2*(79+82*3) - mov hd, 70-35*%3 -.y_loop_ar2: - mov xq, -(76>>%2) - -.x_loop_ar2: - movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] - movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] - psrldq m4, m0, 2 ; y=-2,x=[-1,+5] - psrldq m1, m0, 4 ; y=-2,x=[-0,+5] - psrldq m3, m0, 6 ; y=-2,x=[+1,+5] - psrldq m2, m0, 8 ; y=-2,x=[+2,+5] - punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] - punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] - punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] - pmaddwd m0, m6 - pmaddwd m1, m7 - pmaddwd m2, m8 - paddd m0, m1 - paddd m0, m2 - psrldq m3, m5, 2 ; y=-1,x=[-1,+5] - psrldq m1, m5, 4 ; y=-1,x=[-0,+5] - psrldq m4, m5, 6 ; y=-1,x=[+1,+5] - psrldq m2, m5, 8 ; y=-1,x=[+2,+5] - punpcklwd m3, m1 - punpcklwd m4, m2 - pmaddwd m3, m9 - pmaddwd m4, m10 - paddd m3, m4 - paddd m0, m3 - - ; luma component & rounding -%if %2 - movu m1, [bufyq+xq*4] -%if %3 - movu m2, [bufyq+xq*4+82*2] - phaddw m1, m2 - pshufd m2, m1, q3232 - paddw m1, m2 -%else - phaddw m1, m1 -%endif -%if cpuflag(sse4) - pmulhrsw m1, m15 -%elif %3 - pmulhrsw m1, [base+pw_8192] -%else - pmulhrsw m1, [base+pw_16384] -%endif -%else - movq m1, [bufyq+xq*2] -%endif - punpcklwd m1, [base+pw_1] - pmaddwd m1, m12 - paddd m0, m1 - - movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] - pshufd m2, m1, q3321 - pxor m3, m3 - pcmpgtw m3, m2 - punpcklwd m2, m3 ; y=0,x=[0,3] in dword -.x_loop_ar2_inner: - pmaddwd m3, m1, m11 - paddd m3, m0 - psrldq m0, 4 ; shift top to next pixel - psrad m3, [fg_dataq+FGData.ar_coeff_shift] - ; we do not need to packssdw since we only care about one value - paddd m3, m2 - packssdw m3, m3 - pminsw m3, m13 - pmaxsw m3, m14 - psrldq m1, 2 - pslldq m3, 2 - psrldq m2, 4 -%if cpuflag(sse4) - pblendw m1, m3, 00000010b -%else - pand m1, m15 - pandn m4, m15, m3 - por m1, m4 -%endif - ; overwrite previous pixel, should be ok - movd [bufq+xq*2-2], m1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82*2 - add bufyq, 82*2<<%3 - dec hd - jg .y_loop_ar2 -%if ARCH_X86_32 -%undef m13 -%undef m14 -%undef m15 -%endif - RET - -.ar3: -%if ARCH_X86_64 - DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift -%if WIN64 - mov r6, rsp - and rsp, ~15 - sub rsp, 96 - %define tmp rsp -%else - %define tmp rsp+stack_offset-120 -%endif -%else - DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift -%assign stack_offset stack_offset_old - ALLOC_STACK -16*14 - mov bufyq, r1m - mov uvd, r3m - %define tmp rsp -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - SPLATW m4, [base+round_vals-12+shiftq*2] - pxor m5, m5 - pcmpgtw m5, m4 - punpcklwd m4, m5 -%if ARCH_X86_64 - sar bdmaxd, 1 - SPLATW m6, bdmaxd ; max_grain -%else - SPLATW m6, r4m - psraw m6, 1 -%endif - pcmpeqw m7, m7 -%if !cpuflag(sse4) - pcmpeqw m3, m3 - psrldq m3, 14 - pslldq m3, 4 - pxor m3, m7 -%endif - pxor m7, m6 ; min_grain -%if %2 && cpuflag(sse4) - SPLATW m3, [base+hmul_bits+2+%3*2] -%endif - -%if ARCH_X86_64 - SWAP 3, 11 - SWAP 4, 12 - SWAP 6, 14 - SWAP 7, 15 
-%else -%define m11 [rsp+ 9*16] -%define m12 [rsp+10*16] -%define m14 [rsp+12*16] -%define m15 [rsp+13*16] - mova m11, m3 - mova m12, m4 - mova m14, m6 - mova m15, m7 -%endif - - ; cf from y=-3,x=-3 until y=-3,x=-2 - movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] - pxor m1, m1 - pcmpgtb m1, m0 - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pshufd m1, m0, q0000 - pshufd m3, m0, q1111 - pshufd m4, m0, q2222 - pshufd m0, m0, q3333 - pshufd m5, m2, q0000 - pshufd m6, m2, q1111 - mova [tmp+16*0], m1 - mova [tmp+16*1], m3 - mova [tmp+16*2], m4 - mova [tmp+16*3], m0 - mova [tmp+16*4], m5 - mova [tmp+16*5], m6 - pshufd m6, m2, q2222 - pshufd m7, m2, q3333 - - ; cf from y=-1,x=-1 to y=0,x=-1 + luma component - movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] - pxor m1, m1 - pcmpgtb m1, m0 - punpckhbw m2, m0, m1 ; luma - punpcklbw m0, m1 - pshufd m3, m0, q3232 - psrldq m5, m0, 10 - ; y=0,x=[-3 to -1] + "1.0" for current pixel - pinsrw m5, [base+round_vals-10+shiftq*2], 3 - ; y=-1,x=[-1 to +2] - pshufd m1, m0, q0000 - pshufd m0, m0, q1111 - ; y=-1,x=+3 + luma - punpcklwd m3, m2 - pshufd m3, m3, q0000 - -%if ARCH_X86_64 - SWAP 1, 8 - SWAP 0, 9 - SWAP 3, 10 - SWAP 5, 13 - DEFINE_ARGS buf, bufy, fg_data, h, x -%else -%define m8 [rsp+ 6*16] -%define m9 [rsp+ 7*16] -%define m10 [rsp+ 8*16] -%define m13 [rsp+11*16] - mova m8, m1 - mova m9, m0 - mova m10, m3 - mova m13, m5 - DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x -%endif -%if %2 - sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) -%else - sub bufq, 2*(82*69+3) -%endif - add bufyq, 2*(79+82*3) - mov hd, 70-35*%3 -.y_loop_ar3: - mov xq, -(76>>%2) - -.x_loop_ar3: - ; first line - movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] - movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] - palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] - palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] - punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] - punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] - shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] - - pmaddwd m0, [tmp+0*16] - pmaddwd m2, [tmp+1*16] - pmaddwd m3, [tmp+2*16] - paddd m0, m2 - paddd m0, m3 ; first 6 x of top y - - ; second line [m0/1 are busy] - movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] - movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] - punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] - palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] - palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] - punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] - punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] - shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] - pmaddwd m1, [tmp+3*16] - pmaddwd m4, [tmp+4*16] - pmaddwd m3, [tmp+5*16] - pmaddwd m5, m6 - paddd m1, m4 - paddd m3, m5 - paddd m0, m1 - paddd m0, m3 ; top 2 lines - - ; third line [m0 is busy] & luma + round - movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] - movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] -%if %2 - movu m5, [bufyq+xq*4] -%if %3 - movu m4, [bufyq+xq*4+82*2] - phaddw m5, m4 -%else - phaddw m5, m5 -%endif -%else - movq m5, [bufyq+xq*2] -%endif - palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] - palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] -%if %3 - pshufd m4, m5, q3232 - paddw m5, m4 -%endif -%if %2 -%if cpuflag(sse4) - pmulhrsw m5, m11 -%elif %3 - pmulhrsw m5, [base+pw_8192] -%else - pmulhrsw m5, [base+pw_16384] -%endif -%endif - punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] - punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] - shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] - punpcklwd m2, m5 - pmaddwd m1, m7 - pmaddwd m3, m8 - pmaddwd m4, m9 - 
pmaddwd m2, m10 - paddd m1, m3 - paddd m4, m2 - paddd m0, m12 ; += round - paddd m1, m4 - paddd m0, m1 - - movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmaddwd m2, m1, m13 - pshufd m3, m2, q1111 - paddd m2, m3 ; left+cur - paddd m2, m0 ; add top - psrldq m0, 4 - psrad m2, [fg_dataq+FGData.ar_coeff_shift] - packssdw m2, m2 - pminsw m2, m14 - pmaxsw m2, m15 - pslldq m2, 4 - psrldq m1, 2 -%if cpuflag(sse4) - pblendw m1, m2, 00000100b -%else - pand m1, m11 - pandn m3, m11, m2 - por m1, m3 -%endif - ; overwrite previous pixels, should be ok - movq [bufq+xq*2-4], m1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82*2 - add bufyq, 82*2<<%3 - dec hd - jg .y_loop_ar3 -%if WIN64 - mov rsp, r6 -%elif ARCH_X86_32 -%undef m8 -%undef m9 -%undef m10 -%undef m11 -%undef m12 -%undef m13 -%undef m14 -%undef m15 -%endif - RET -%endmacro - -generate_grain_uv_fn 420, 1, 1 -generate_grain_uv_fn 422, 1, 0 -generate_grain_uv_fn 444, 0, 0 - -%macro SCRATCH 3 -%if ARCH_X86_32 - mova [rsp+%3*mmsize], m%1 -%define m%2 [rsp+%3*mmsize] -%else - SWAP %1, %2 -%endif -%endmacro - -INIT_XMM ssse3 -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize -cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ - dst, src, scaling, unused1, fg_data, picptr, unused2 - ; copy stack arguments to new position post-alignment, so that we - ; don't have to keep the old stack location in a separate register - mov r0, r0m - mov r1, r2m - mov r2, r4m - mov r3, r6m - mov r4, r7m - mov r5, r8m - -%define r0m [rsp+8*mmsize+ 3*gprsize] -%define r2m [rsp+8*mmsize+ 5*gprsize] -%define r4m [rsp+8*mmsize+ 7*gprsize] -%define r6m [rsp+8*mmsize+ 9*gprsize] -%define r7m [rsp+8*mmsize+10*gprsize] -%define r8m [rsp+8*mmsize+11*gprsize] - - mov r0m, r0 - mov r2m, r1 - mov r4m, r2 - mov r6m, r3 - mov r7m, r4 - mov r8m, r5 -%else -cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ - dst, src, scaling, unused1, fg_data, picptr, unused2 -%endif - mov srcq, srcm - mov scalingq, r5m - mov fg_dataq, r3m -%if STACK_ALIGNMENT < mmsize - mov r6, r9m - -%define r9m [rsp+8*mmsize+ 4*gprsize] -%define r3m [rsp+8*mmsize+ 6*gprsize] -%define r5m [rsp+8*mmsize+ 8*gprsize] - - mov r9m, r6 -%endif - LEA r5, $$ -%define base r5-$$ - mov r5m, picptrq -%else -cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut - lea r8, [pb_mask] -%define base r8-pb_mask -%endif - mov r6d, [fg_dataq+FGData.scaling_shift] - SPLATW m3, [base+mul_bits+r6*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] -%if ARCH_X86_32 - DECLARE_REG_TMP 0, 3 -%else - DECLARE_REG_TMP 9, 10 -%endif - mov t0d, r9m ; bdmax - sar t0d, 11 ; is_12bpc - inc t0d - mov t1d, r6d - imul t1d, t0d - dec t0d - SPLATW m5, [base+min+t1*2] - lea t0d, [t0d*3] - lea t0d, [r6d*2+t0d] - SPLATW m4, [base+max+t0*2] - SPLATW m2, r9m - - pcmpeqw m1, m1 - psraw m7, m2, 1 ; max_grain - pxor m1, m7 ; min_grain - SPLATD m6, [base+pd_16] - - SCRATCH 1, 9, 0 - SCRATCH 2, 10, 1 - SCRATCH 3, 11, 2 - SCRATCH 4, 12, 3 - SCRATCH 5, 13, 4 - SCRATCH 6, 14, 5 - SCRATCH 7, 15, 6 - - mova m6, [base+pw_27_17_17_27] ; for horizontal filter - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 - DECLARE_REG_TMP 0 -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ - sby, see - DECLARE_REG_TMP 7 -%endif - - mov sbyd, r8m - movzx t0d, byte [fg_dataq+FGData.overlap_flag] - test t0d, t0d - jz .no_vertical_overlap - test sbyd, sbyd - jnz 
.vertical_overlap -.no_vertical_overlap: - mov dword r8m, t0d - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused - imul seed, (173 << 24) | 37 -%else - imul seed, sbyd, (173 << 24) | 37 -%endif - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak - - mov r3m, seed - mov wq, r4m -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused1, unused2, see, src_bak -%endif - - lea src_bakq, [srcq+wq*2] - mov r9mp, src_bakq - neg wq - sub dstmp, srcq -%if ARCH_X86_32 - mov r4m, wq -%endif - -.loop_x: -%if ARCH_X86_32 - mov seed, r3m -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak -%endif - -.loop_x_odd: - movzx hd, word r7m - mov grain_lutq, grain_lutmp -.loop_y: - ; src - pand m0, m10, [srcq+ 0] - pand m1, m10, [srcq+16] ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 - vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 -%else - vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 - vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 -%endif - REPX {psrlw x, 8}, m2, m3 - - ; grain = grain_lut[offy+y][offx+x] - movu m4, [grain_lutq+offxyq*2] - movu m5, [grain_lutq+offxyq*2+16] - - ; noise = round2(scaling[src] * grain, scaling_shift) - REPX {pmullw x, m11}, m2, m3 - pmulhrsw m4, m2 - pmulhrsw m5, m3 - - ; dst = clip_pixel(src, noise) - paddw m0, m4 - paddw m1, m5 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+srcq+ 0], m0 - mova [dstq+srcq+16], m1 - - add srcq, r2mp ; src += stride - add grain_lutq, 82*2 - dec hd - jg .loop_y - -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end -%if ARCH_X86_32 - mov srcq, r9mp - add srcq, r4mp - add srcq, r4mp -%else - mov src_bakq, r9mp - lea srcq, [src_bakq+wq*2] -%endif - btc dword r8m, 2 - jc .next_blk - add offxyd, 16 - test dword r8m, 2 - jz .loop_x_odd -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r12d, 16 ; top_offxy += 16 -%endif - jmp .loop_x_odd_v_overlap - -.next_blk: - test dword r8m, 1 - jz .loop_x - - ; r8m = sbym - test dword r8m, 2 - jnz .loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -.loop_x_h_overlap: -%if ARCH_X86_32 - add offxyd, 16 - mov [rsp+8*mmsize+0*gprsize], offxyd - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak - mov seed, r3m -%endif - - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak, left_offxy - - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx - - 
mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak, left_offxy -%endif - - mov hd, dword r7m - mov grain_lutq, grain_lutmp -.loop_y_h_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m5, [grain_lutq+offxyq*2] -%if ARCH_X86_32 - mov r5, [rsp+8*mmsize+0*gprsize] - movd m4, [grain_lutq+r5*2] -%else - movd m4, [grain_lutq+left_offxyq*2] -%endif - punpcklwd m4, m5 - pmaddwd m4, m6 - paddd m4, m14 - psrad m4, 5 - packssdw m4, m4 - pminsw m4, m15 - pmaxsw m4, m9 - shufps m4, m5, q3210 - - ; src - pand m0, m10, [srcq+ 0] - pand m1, m10, [srcq+16] ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 - vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 -%else - vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 - vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 -%endif - REPX {psrlw x, 8}, m2, m3 - - ; noise = round2(scaling[src] * grain, scaling_shift) - movu m5, [grain_lutq+offxyq*2+16] - REPX {pmullw x, m11}, m2, m3 - pmulhrsw m4, m2 - pmulhrsw m5, m3 - - ; dst = clip_pixel(src, noise) - paddw m0, m4 - paddw m1, m5 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+srcq+ 0], m0 - mova [dstq+srcq+16], m1 - - add srcq, r2mp - add grain_lutq, 82*2 - dec hd - jg .loop_y_h_overlap - -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end -%if ARCH_X86_32 - mov srcq, r9mp - add srcq, r4mp - add srcq, r4mp -%else - mov src_bakq, r9mp - lea srcq, [src_bakq+wq*2] -%endif - or dword r8m, 4 - add offxyd, 16 - - ; r8m = sbym - test dword r8m, 2 - jz .loop_x_odd -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r12d, 16 ; top_offxy += 16 -%endif - jmp .loop_x_odd_v_overlap - -.end: - RET - -.vertical_overlap: - or t0d, 2 - mov r8m, t0d - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ - sby, see -%endif - - movzx sbyd, sbyb -%if ARCH_X86_32 - imul r4, [fg_dataq+FGData.seed], 0x00010001 - DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused -%else - imul seed, [fg_dataq+FGData.seed], 0x00010001 -%endif - imul t0d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add t0d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and t0d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, t0d -%if ARCH_X86_32 - xor sbyd, seed - - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak - - mov r3m, seed - mov wq, r4m -%else - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused1, unused2, see, src_bak -%endif - - lea src_bakq, [srcq+wq*2] - mov r9mp, src_bakq - neg wq - sub dstmp, srcq -%if ARCH_X86_32 - mov r4m, wq -%endif - -.loop_x_v_overlap: -%if ARCH_X86_32 - mov r5, r5m - SPLATD m7, [base+pw_27_17_17_27] - mov seed, r3m -%else - SPLATD m7, [pw_27_17_17_27] -%endif - - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp t0b ; parity of top_seed - shr seed, 16 - shl t0d, 16 - test seeb, seeh - setp t0b ; parity of cur_seed - or r6d, 0x00010001 - xor t0d, r6d - mov seed, t0d - ror seed, 1 ; updated (cur_seed << 16) | top_seed - 
-%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak, unused, top_offxy - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - -%if ARCH_X86_32 - DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak, unused, top_offxy -%endif - - movzx top_offxyd, offxyw -%if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd - - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - shr offxyd, 16 - -.loop_x_odd_v_overlap: -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m7, [PIC_ptr(pw_27_17_17_27)] - mov hd, dword r7m - mov grain_lutq, grain_lutmp -.loop_y_v_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq*2] -%if ARCH_X86_32 - mov r5, [rsp+8*mmsize+1*gprsize] - movu m2, [grain_lutq+r5*2] -%else - movu m2, [grain_lutq+top_offxyq*2] -%endif - punpckhwd m4, m2, m3 - punpcklwd m2, m3 - REPX {pmaddwd x, m7}, m4, m2 - REPX {paddd x, m14}, m4, m2 - REPX {psrad x, 5}, m4, m2 - packssdw m2, m4 - pminsw m2, m15 - pmaxsw m2, m9 - movu m4, [grain_lutq+offxyq*2+16] -%if ARCH_X86_32 - movu m3, [grain_lutq+r5*2+16] -%else - movu m3, [grain_lutq+top_offxyq*2+16] -%endif - punpckhwd m5, m3, m4 - punpcklwd m3, m4 - REPX {pmaddwd x, m7}, m5, m3 - REPX {paddd x, m14}, m5, m3 - REPX {psrad x, 5}, m5, m3 - packssdw m3, m5 - pminsw m3, m15 - pmaxsw m3, m9 - - ; src - pand m0, m10, [srcq+ 0] ; m0-1: src as word - pand m1, m10, [srcq+16] ; m0-1: src as word - - ; scaling[src] - ; noise = round2(scaling[src] * grain, scaling_shift) -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 -%else - vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 -%endif - psrlw m4, 8 - pmullw m4, m11 - pmulhrsw m4, m2 -%if ARCH_X86_32 - vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 -%else - vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 -%endif - psrlw m5, 8 - pmullw m5, m11 - pmulhrsw m5, m3 - - ; dst = clip_pixel(src, noise) - paddw m0, m4 - paddw m1, m5 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+srcq+ 0], m0 - mova [dstq+srcq+16], m1 - - add srcq, r2mp - add grain_lutq, 82*2 - dec hw - jz .end_y_v_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_v_overlap - jmp .loop_y - -.end_y_v_overlap: -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end_hv -%if ARCH_X86_32 - mov srcq, r9mp - add srcq, r4mp - add srcq, r4mp -%else - mov src_bakq, r9mp - lea srcq, [src_bakq+wq*2] -%endif - btc dword r8m, 2 - jc .next_blk_v -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - add offxyd, 16 - jmp .loop_x_odd_v_overlap - -.next_blk_v: - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap - -.loop_x_hv_overlap: -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak - - mov r0, [rsp+8*mmsize+1*gprsize] - add r3, 16 - add 
r0, 16 - mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy - mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy - - mov seed, r3m - xor r0, r0 -%else - ; we assume from the block above that bits 8-15 of r7d are zero'ed -%endif - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp t0b ; parity of top_seed - shr seed, 16 - shl t0d, 16 - test seeb, seeh - setp t0b ; parity of cur_seed - or r6d, 0x00010001 - xor t0d, r6d - mov seed, t0d - ror seed, 1 ; updated (cur_seed << 16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy - - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offyq+16] - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - -%if ARCH_X86_32 - DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy -%endif - - movzx top_offxyd, offxyw -%if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd - - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - shr offxyd, 16 - -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m7, [PIC_ptr(pw_27_17_17_27)] - - movzx hd, word r7m - mov grain_lutq, grain_lutmp -.loop_y_hv_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m2, [grain_lutq+offxyq*2] -%if ARCH_X86_32 - mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy - mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy - movu m4, [grain_lutq+r0*2] - movd m5, [grain_lutq+r5*2] - mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy - movd m3, [grain_lutq+r5*2] -%else - movu m4, [grain_lutq+top_offxyq*2] - movd m5, [grain_lutq+left_offxyq*2] - movd m3, [grain_lutq+topleft_offxyq*2] -%endif - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklwd m5, m2 - punpcklwd m3, m4 - REPX {pmaddwd x, m6}, m5, m3 - REPX {paddd x, m14}, m5, m3 - REPX {psrad x, 5}, m5, m3 - packssdw m5, m3 - pminsw m5, m15 - pmaxsw m5, m9 - shufps m3, m5, m2, q3210 - shufps m5, m4, q3232 - ; followed by v interpolation (top | cur -> cur) - movu m0, [grain_lutq+offxyq*2+16] -%if ARCH_X86_32 - movu m1, [grain_lutq+r0*2+16] -%else - movu m1, [grain_lutq+top_offxyq*2+16] -%endif - punpcklwd m2, m5, m3 - punpckhwd m5, m3 - punpcklwd m3, m1, m0 - punpckhwd m1, m0 - REPX {pmaddwd x, m7}, m2, m5, m3, m1 - REPX {paddd x, m14}, m2, m5, m3, m1 - REPX {psrad x, 5}, m2, m5, m3, m1 - packssdw m2, m5 - packssdw m3, m1 - REPX {pminsw x, m15}, m2, m3 - REPX {pmaxsw x, m9}, m2, m3 - - ; src - pand m0, m10, [srcq+ 0] - pand m1, m10, [srcq+16] ; m0-1: src as word - - ; scaling[src] - ; noise = round2(scaling[src] * grain, scaling_shift) -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 -%else - vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 -%endif - psrlw m4, 8 - pmullw m4, m11 - pmulhrsw m2, m4 -%if ARCH_X86_32 - vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 -%else - vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 -%endif - psrlw m5, 8 - pmullw m5, m11 - pmulhrsw m3, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+srcq+ 
0], m0 - mova [dstq+srcq+16], m1 - - add srcq, r2mp - add grain_lutq, 82*2 - dec hw - jz .end_y_hv_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_hv_overlap - jmp .loop_y_h_overlap - -.end_y_hv_overlap: - or dword r8m, 4 -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end_hv -%if ARCH_X86_32 - mov r5, r5m - add offxyd, 16 - add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 - mov srcq, r9mp - add srcq, r4mp - add srcq, r4mp -%else - add offxyd, 16 - add top_offxyd, 16 - mov src_bakq, r9mp - lea srcq, [src_bakq+wq*2] -%endif - jmp .loop_x_odd_v_overlap - -.end_hv: - RET -%if ARCH_X86_32 - DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 -%endif - -%macro FGUV_FN 3 ; name, ss_hor, ss_ver -INIT_XMM ssse3 -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize -cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ - tmp, src, scaling, h, fg_data, picptr, unused - mov r0, r0m - mov r1, r1m - mov r2, r2m - mov r4, r3m - mov r3, r4m - mov r5, r5m -%define r0m [rsp+8*mmsize+ 3*gprsize] -%define r1m [rsp+8*mmsize+ 4*gprsize] -%define r2m [rsp+8*mmsize+ 5*gprsize] -%define r3m [rsp+8*mmsize+ 6*gprsize] -%define r4m [rsp+8*mmsize+ 7*gprsize] -%define r5m [rsp+8*mmsize+ 8*gprsize] - mov r0m, r0 - mov r2m, r2 - mov r4m, r3 - mov r5m, r5 - - mov r0, r6m - mov r2, r7m - mov r3, r8m - mov r5, r9m -%define r6m [rsp+8*mmsize+ 9*gprsize] -%define r7m [rsp+8*mmsize+10*gprsize] -%define r8m [rsp+8*mmsize+11*gprsize] -%define r9m [rsp+8*mmsize+12*gprsize] - mov r6m, r0 - mov r7m, r2 - mov r8m, r3 - mov r9m, r5 - - mov r2, r10m - mov r3, r11m - mov r5, r12m - mov r0, r13m -%define r10m [rsp+8*mmsize+13*gprsize] -%define r11m [rsp+8*mmsize+14*gprsize] -%define r12m [rsp+8*mmsize+15*gprsize] - mov r10m, r2 - mov r11m, r3 - mov r12m, r5 - - SPLATW m2, r13m -%else -cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ - tmp, src, scaling, h, fg_data, picptr, unused - mov srcq, srcm - mov fg_dataq, r3m -%endif - LEA r5, $$ -%define base r5-$$ - - DECLARE_REG_TMP 0, 2, 3 -%else -cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, h, sby, luma, lstride, uv_pl, is_id -%define base r8-pb_mask - lea r8, [pb_mask] - - DECLARE_REG_TMP 9, 10, 11 -%endif - mov r6d, [fg_dataq+FGData.scaling_shift] - SPLATW m3, [base+mul_bits+r6*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] -%if STACK_ALIGNMENT >= mmsize - mov t0d, r13m ; bdmax -%endif - sar t0d, 11 ; is_12bpc - inc t0d - mov t1d, r6d - imul t1d, t0d - dec t0d - SPLATW m5, [base+min+t1*2] - lea t1d, [t0d*3] - mov t2d, r12m - inc t2d - imul r6d, t2d - add t1d, r6d - SPLATW m4, [base+max+t1*2] -%if STACK_ALIGNMENT >= mmsize - SPLATW m2, r13m -%endif - - SCRATCH 2, 10, 2 - SCRATCH 3, 11, 3 - SCRATCH 4, 12, 4 - SCRATCH 5, 13, 5 - -%define mzero m7 - -%if %3 - SPLATD m2, [base+pw_23_22] -%endif - -%if ARCH_X86_32 - mov scalingq, r5m - mov r5m, r5 -%else - mov r13mp, strideq -%endif - - pcmpeqw m0, m0 - psraw m1, m10, 1 - pxor m0, m1 - - SCRATCH 0, 8, 0 - SCRATCH 1, 9, 1 - - cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 - jne .csfl - -%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap - - DECLARE_REG_TMP 0 -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap 
- - DECLARE_REG_TMP 9 -%endif - -%if %1 - mov r6d, r11m - SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] - SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] - punpcklwd m6, m1, m0 - SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] - SPLATD m7, [base+pw_4+t0*4] - pmullw m5, m7 -%else - SPLATD m6, [base+pd_16] -%if %2 - mova m5, [base+pw_23_22] -%else - mova m5, [base+pw_27_17_17_27] -%endif -%endif - - SCRATCH 6, 14, 6 - SCRATCH 5, 15, 7 - -%if ARCH_X86_32 - DECLARE_REG_TMP 0 -%else - DECLARE_REG_TMP 7 -%endif - - mov sbyd, r8m - mov t0d, [fg_dataq+FGData.overlap_flag] - test t0d, t0d - jz %%no_vertical_overlap - test sbyd, sbyd - jnz %%vertical_overlap - -%%no_vertical_overlap: - mov r8m, t0d -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap - imul seed, (173 << 24) | 37 -%else - imul seed, sbyd, (173 << 24) | 37 -%endif - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, see, w, picptr, luma - - mov dstq, r0mp - mov lumaq, r9mp - mov wq, r4m - lea r3, [srcq+wq*2] - mov r1mp, r3 - lea r3, [dstq+wq*2] - mov r11mp, r3 - lea r3, [lumaq+wq*(2<<%2)] - mov r12mp, r3 -%if %3 - shl r10mp, 1 -%endif -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused2, unused3, see, unused4, unused5, unused6, luma, lstride - - mov lstrideq, r10mp -%if %3 - add lstrideq, lstrideq -%endif - mov lumaq, r9mp - lea r10, [srcq+wq*2] - lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*(2<<%2)] - mov r10mp, r10 - mov r11mp, r11 - mov r12mp, r12 -%endif - neg wq -%if ARCH_X86_32 - mov r4mp, wq -%endif - -%%loop_x: -%if ARCH_X86_32 - mov seed, r3m -%endif - - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, unused1, unused2, unused3, luma, lstride - - mov offxd, seed - mov offyd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, unused1, unused2, unused3, luma, lstride -%endif - -%if %2 == 0 -%%loop_x_odd: -%endif - mov hd, r7m - mov grain_lutq, grain_lutmp -%%loop_y: - ; src - mova m0, [srcq] - mova m1, [srcq+16] ; m0-1: src as word - - ; luma_src - pxor mzero, mzero -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut - - mov lumaq, r9m -%endif - mova m4, [lumaq+ 0] - mova m6, [lumaq+(16<<%2)] -%if %2 - phaddw m4, [lumaq+16] - phaddw m6, [lumaq+48] -%endif -%if ARCH_X86_32 - add lumaq, r10mp - mov r9m, lumaq -%endif -%if %2 - pavgw m4, mzero - pavgw m6, mzero -%endif - -%if %1 - punpckhwd m3, m4, m0 - punpcklwd m4, m0 - punpckhwd m5, m6, m1 - punpcklwd m6, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m3, m4, m5, m6 - REPX {psrad x, 6}, m3, m4, m5, m6 - packssdw m4, m3 - packssdw m6, m5 - REPX {paddw x, m15}, m4, m6 - REPX {pmaxsw x, mzero}, m4, m6 - REPX {pminsw x, m10}, m4, m6 ; clip_pixel() -%else - REPX {pand x, m10}, m4, m6 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 - vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 -%else - vpgatherdw m3, m4, 
scalingq-1, r10, r12, 8, 1 - vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 -%endif - REPX {psrlw x, 8}, m3, m5 - - ; grain = grain_lut[offy+y][offx+x] - movu m4, [grain_lutq+offxyq*2] - movu m6, [grain_lutq+offxyq*2+16] - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - REPX {pmullw x, m11}, m3, m5 - pmulhrsw m4, m3 - pmulhrsw m6, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m4 - paddw m1, m6 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+ 0], m0 - mova [dstq+16], m1 - -%if ARCH_X86_32 - add srcq, r2mp - add dstq, r2mp - mov dstmp, dstq -%else - add srcq, r13mp - add dstq, r13mp - add lumaq, lstrideq -%endif - add grain_lutq, 82*2 - dec hd - jg %%loop_y - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma - - mov wq, r4mp -%endif - add wq, 16 - jge %%end -%if ARCH_X86_32 - mov srcq, r1mp -%else - mov srcq, r10mp -%endif - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] -%if ARCH_X86_32 - mov r0m, dstq - mov r9m, lumaq - mov r4m, wq -%endif -%if %2 == 0 - btc dword r8m, 2 - jc %%next_blk - add offxyd, 16 - test dword r8m, 2 - jz %%loop_x_odd -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r11d, 16 -%endif - jmp %%loop_x_odd_v_overlap -%%next_blk: -%endif - test dword r8m, 1 - je %%loop_x - - ; r8m = sbym - test dword r8m, 2 - jnz %%loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -%%loop_x_h_overlap: -%if ARCH_X86_32 - add offxyd, 16 - mov [rsp+8*mmsize+0*gprsize], offxyd - - DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut - - mov seed, r3m -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, unused1, unused2, luma, lstride - - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx - mov offxd, seed - mov offyd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, unused1, unused2, luma, lstride -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -%%loop_y_h_overlap: - mova m0, [srcq] - mova m1, [srcq+16] - - ; luma_src - pxor mzero, mzero -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut - mov lumaq, r9m -%endif - mova m4, [lumaq+ 0] - mova m6, [lumaq+(16<<%2)] -%if %2 - phaddw m4, [lumaq+16] - phaddw m6, [lumaq+48] -%endif -%if ARCH_X86_32 - add lumaq, r10mp - mov r9m, lumaq -%endif -%if %2 - pavgw m4, mzero - pavgw m6, mzero -%endif - -%if %1 - punpckhwd m3, m4, m0 - punpcklwd m4, m0 - punpckhwd m5, m6, m1 - punpcklwd m6, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m3, m4, m5, m6 - REPX {psrad x, 6}, m3, m4, m5, m6 - packssdw m4, m3 - packssdw m6, m5 - REPX {paddw x, m15}, m4, m6 - REPX {pmaxsw x, mzero}, m4, m6 - REPX {pminsw x, m10}, m4, m6 ; clip_pixel() -%else - REPX {pand x, m10}, m4, m6 -%endif - - ; grain = grain_lut[offy+y][offx+x] - movu m7, [grain_lutq+offxyq*2] -%if ARCH_X86_32 - mov r5, [rsp+8*mmsize+0*gprsize] - movd m5, [grain_lutq+r5*2] -%else - 
movd m5, [grain_lutq+left_offxyq*2+ 0] -%endif - punpcklwd m5, m7 ; {left0, cur0} -%if %1 -%if ARCH_X86_32 - mov r5, r5m -%endif -%if %2 - pmaddwd m5, [PIC_ptr(pw_23_22)] -%else - pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] -%endif - paddd m5, [PIC_ptr(pd_16)] -%else - pmaddwd m5, m15 - paddd m5, m14 -%endif - psrad m5, 5 - packssdw m5, m5 - pmaxsw m5, m8 - pminsw m5, m9 - shufps m5, m7, q3210 - movu m3, [grain_lutq+offxyq*2+16] - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 - vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 -%else - vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 - vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 -%endif - REPX {psrlw x, 8}, m7, m4 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - REPX {pmullw x, m11}, m7, m4 - pmulhrsw m5, m7 - pmulhrsw m3, m4 - - ; dst = clip_pixel(src, noise) - paddw m0, m5 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+ 0], m0 - mova [dstq+16], m1 - -%if ARCH_X86_32 - add srcq, r2mp - add dstq, r2mp - mov dstmp, dstq -%else - add srcq, r13mp - add dstq, r13mp - add lumaq, lstrideq -%endif - add grain_lutq, 82*2 - dec hd - jg %%loop_y_h_overlap - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut - mov wq, r4mp -%endif - add wq, 16 - jge %%end -%if ARCH_X86_32 - mov srcq, r1mp -%else - mov srcq, r10mp -%endif - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] -%if ARCH_X86_32 - mov r0mp, dstq - mov r9mp, lumaq - mov r4m, wq -%endif - -%if %2 - ; r8m = sbym - test dword r8m, 2 - jne %%loop_x_hv_overlap - jmp %%loop_x_h_overlap -%else - or dword r8m, 4 - add offxyd, 16 - - ; r8m = sbym - test dword r8m, 2 - jz %%loop_x_odd -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r11d, 16 ; top_offxy += 16 -%endif - jmp %%loop_x_odd_v_overlap -%endif - -%%end: - RET - -%%vertical_overlap: - or t0d, 2 - mov r8m, t0d - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ - sby, see, unused1, unused2, unused3, lstride -%endif - - movzx sbyd, sbyb -%if ARCH_X86_32 - imul r4, [fg_dataq+FGData.seed], 0x00010001 - - DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused -%else - imul seed, [fg_dataq+FGData.seed], 0x00010001 -%endif - imul t0d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add t0d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and t0d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, t0d -%if ARCH_X86_32 - xor sbyd, seed - - DEFINE_ARGS dst, src, scaling, see, w, picptr, luma - - mov r3m, seed - mov dstq, r0mp - mov lumaq, r9mp - mov wq, r4m - lea r3, [srcq+wq*2] - mov r1mp, r3 - lea r3, [dstq+wq*2] - mov r11mp, r3 - lea r3, [lumaq+wq*(2<<%2)] - mov r12mp, r3 -%if %3 - shl r10mp, 1 -%endif -%else - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - unused1, unused2, see, unused3, unused4, unused5, luma, lstride - - mov lstrideq, r10mp -%if %3 - add lstrideq, lstrideq -%endif - mov lumaq, r9mp - lea r10, [srcq+wq*2] - lea r11, [dstq+wq*2] - lea r12, [lumaq+wq*(2<<%2)] - mov r10mp, r10 - mov r11mp, r11 - mov r12mp, r12 -%endif - neg wq -%if ARCH_X86_32 - mov r4m, wq -%endif - -%%loop_x_v_overlap: -%if ARCH_X86_32 - mov seed, r3m - xor t0d, t0d -%else - ; we assume from the block above that bits 8-15 of r7d are zero'ed -%endif - mov 
r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp t0b ; parity of top_seed - shr seed, 16 - shl t0d, 16 - test seeb, seeh - setp t0b ; parity of cur_seed - or r6d, 0x00010001 - xor t0d, r6d - mov seed, t0d - ror seed, 1 ; updated (cur_seed << 16) | top_seed -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, unused1, top_offxy, unused2, luma, lstride - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - -%if ARCH_X86_32 - DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, unused1, top_offxy, unused2, luma, lstride -%endif - movzx top_offxyd, offxyw -%if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - shr offxyd, 16 - -%if %2 == 0 -%%loop_x_odd_v_overlap: -%endif -%if %3 == 0 -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m2, [PIC_ptr(pw_27_17_17_27)] -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -%%loop_y_v_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq*2] -%if ARCH_X86_32 - mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy - movu m5, [grain_lutq+r0*2] -%else - movu m5, [grain_lutq+top_offxyq*2] -%endif - punpckhwd m7, m5, m3 - punpcklwd m5, m3 ; {top/cur interleaved} - REPX {pmaddwd x, m2}, m7, m5 -%if %1 -%if ARCH_X86_32 - mov r5, r5m -%endif - REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 -%else - REPX {paddd x, m14}, m7, m5 -%endif - REPX {psrad x, 5}, m7, m5 - packssdw m3, m5, m7 - pmaxsw m3, m8 - pminsw m3, m9 - - ; grain = grain_lut[offy+y][offx+x] - movu m4, [grain_lutq+offxyq*2+16] -%if ARCH_X86_32 - movu m5, [grain_lutq+r0*2+16] -%else - movu m5, [grain_lutq+top_offxyq*2+16] -%endif - punpckhwd m7, m5, m4 - punpcklwd m5, m4 ; {top/cur interleaved} - REPX {pmaddwd x, m2}, m7, m5 -%if %1 - REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 -%else - REPX {paddd x, m14}, m7, m5 -%endif - REPX {psrad x, 5}, m7, m5 - packssdw m4, m5, m7 - pmaxsw m4, m8 - pminsw m4, m9 - - ; src - mova m0, [srcq] - mova m1, [srcq+16] - - ; luma_src - pxor mzero, mzero -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut - - mov lumaq, r9mp -%endif - mova m5, [lumaq+ 0] - mova m6, [lumaq+(16<<%2)] -%if %2 - phaddw m5, [lumaq+16] - phaddw m6, [lumaq+48] -%endif -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif -%if %2 - pavgw m5, mzero - pavgw m6, mzero -%endif - -%if %1 - punpckhwd m7, m5, m0 - punpcklwd m5, m0 - REPX {pmaddwd x, m14}, m7, m5 - REPX {psrad x, 6}, m7, m5 - packssdw m5, m7 - punpckhwd m7, m6, m1 - punpcklwd m6, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m7, m6 - REPX {psrad x, 6}, m7, m6 - packssdw m6, m7 - pxor mzero, mzero - REPX {paddw x, m15}, m5, m6 - REPX {pmaxsw x, mzero}, m5, m6 - REPX {pminsw x, m10}, m5, m6 ; clip_pixel() -%else - REPX {pand x, m10}, m5, m6 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 - vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 -%else - vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 - vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 -%endif - REPX {psrlw x, 8}, m7, m5 - - ; noise = 
round2(scaling[luma_src] * grain, scaling_shift) - REPX {pmullw x, m11}, m7, m5 - pmulhrsw m3, m7 - pmulhrsw m4, m5 - - ; dst = clip_pixel(src, noise) - paddw m0, m3 - paddw m1, m4 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+ 0], m0 - mova [dstq+16], m1 - - dec hw - jle %%end_y_v_overlap -%if ARCH_X86_32 - add srcq, r2mp - add dstq, r2mp - mov dstmp, dstq -%else - add srcq, r13mp - add dstq, r13mp - add lumaq, lstrideq -%endif - add grain_lutq, 82*2 -%if %3 - jmp %%loop_y -%else - btc hd, 16 - jc %%loop_y -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] - jmp %%loop_y_v_overlap -%endif - -%%end_y_v_overlap: -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end_hv -%if ARCH_X86_32 - mov srcq, r1mp -%else - mov srcq, r10mp -%endif - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] -%if ARCH_X86_32 - mov r0mp, dstq - mov r9mp, lumaq - mov r4m, wq -%endif - -%if %2 - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap -%else - btc dword r8m, 2 - jc %%loop_x_hv_overlap - add offxyd, 16 -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r11d, 16 -%endif - jmp %%loop_x_odd_v_overlap -%endif - -%%loop_x_hv_overlap: -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut - - mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy - add offxyd, 16 - add t0d, 16 - mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd - mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd - - DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut - - mov seed, r3m - xor t0d, t0d -%else - ; we assume from the block above that bits 8-15 of r7d are zero'ed -%endif - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp t0b ; parity of top_seed - shr seed, 16 - shl t0d, 16 - test seeb, seeh - setp t0b ; parity of cur_seed - or r6d, 0x00010001 - xor t0d, r6d - mov seed, t0d - ror seed, 1 ; updated (cur_seed << 16) | top_seed -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride - - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offyq+16] - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride -%endif - movzx top_offxyd, offxyw -%if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd - - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - shr offxyd, 16 - -%if %3 == 0 -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m2, [PIC_ptr(pw_27_17_17_27)] -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -%%loop_y_hv_overlap: - ; grain = grain_lut[offy+y][offx+x] -%if ARCH_X86_32 - mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy - mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy - movd m5, [grain_lutq+r5*2] -%else - movd 
m5, [grain_lutq+left_offxyq*2] -%endif - movu m7, [grain_lutq+offxyq*2] -%if ARCH_X86_32 - mov r5, [rsp+8*mmsize+2*gprsize] - movu m4, [grain_lutq+r0*2] -%if %2 - pinsrw m5, [grain_lutq+r5*2], 2 -%else - movd m3, [grain_lutq+r5*2] -%endif -%else - movu m4, [grain_lutq+top_offxyq*2] -%if %2 - pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } -%else - movd m3, [grain_lutq+topleft_offxyq*2] -%endif -%endif -%if %2 == 0 - punpckldq m5, m3 -%endif - punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } - punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } -%if %1 -%if ARCH_X86_32 - mov r5, r5m -%endif -%if %2 - movddup m0, [PIC_ptr(pw_23_22)] -%else - movddup m0, [PIC_ptr(pw_27_17_17_27)] -%endif -%else - pshufd m0, m15, q1010 -%endif - pmaddwd m5, m0 -%if %1 - paddd m5, [PIC_ptr(pd_16)] -%else - paddd m5, m14 -%endif - psrad m5, 5 - packssdw m5, m5 - pmaxsw m5, m8 - pminsw m5, m9 - shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 - shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter - shufps m5, m4, q3231 ; top0-7 post-h_filter - - punpckhwd m7, m5, m3 - punpcklwd m5, m3 ; {top/cur interleaved} - REPX {pmaddwd x, m2}, m7, m5 -%if %1 - REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 -%else - REPX {paddd x, m14}, m5, m7 -%endif - REPX {psrad x, 5}, m5, m7 - packssdw m3, m5, m7 - pmaxsw m3, m8 - pminsw m3, m9 - - ; right half - movu m4, [grain_lutq+offxyq*2+16] -%if ARCH_X86_32 - movu m0, [grain_lutq+r0*2+16] -%else - movu m0, [grain_lutq+top_offxyq*2+16] -%endif - punpckhwd m1, m0, m4 - punpcklwd m0, m4 ; {top/cur interleaved} - REPX {pmaddwd x, m2}, m1, m0 -%if %1 - REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 -%else - REPX {paddd x, m14}, m1, m0 -%endif - REPX {psrad x, 5}, m1, m0 - packssdw m4, m0, m1 - pmaxsw m4, m8 - pminsw m4, m9 - - ; src - mova m0, [srcq] - mova m1, [srcq+16] - - ; luma_src - pxor mzero, mzero -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut - - mov lumaq, r9mp -%endif - mova m6, [lumaq+ 0] - mova m5, [lumaq+(16<<%2)] -%if %2 - phaddw m6, [lumaq+16] - phaddw m5, [lumaq+48] -%endif -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif -%if %2 - pavgw m6, mzero - pavgw m5, mzero -%endif - -%if %1 - punpckhwd m7, m6, m0 - punpcklwd m6, m0 - REPX {pmaddwd x, m14}, m7, m6 - REPX {psrad x, 6}, m7, m6 - packssdw m6, m7 - punpckhwd m7, m5, m1 - punpcklwd m5, m1 ; { luma, chroma } - REPX {pmaddwd x, m14}, m7, m5 - REPX {psrad x, 6}, m7, m5 - packssdw m5, m7 - pxor mzero, mzero - REPX {paddw x, m15}, m6, m5 - REPX {pmaxsw x, mzero}, m6, m5 - REPX {pminsw x, m10}, m6, m5 ; clip_pixel() -%else - REPX {pand x, m10}, m6, m5 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 - vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 -%else -%if %3 == 0 - ; register shortage :) - push r12 -%endif - vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 - vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 -%if %3 == 0 - pop r12 -%endif -%endif - REPX {psrlw x, 8}, m7, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - REPX {pmullw x, m11}, m7, m6 - pmulhrsw m3, m7 - pmulhrsw m4, m6 - - ; dst = clip_pixel(src, noise) - paddw m0, m3 - paddw m1, m4 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - movifnidn dstq, dstmp - mova [dstq+ 0], m0 - mova [dstq+16], m1 - -%if ARCH_X86_32 - add srcq, r2mp - add dstq, r2mp - mov dstmp, dstq -%else - add srcq, r13mp - add dstq, r13mp - add lumaq, lstrideq -%endif - add grain_lutq, 82*2 - dec hw -%if %3 - jg %%loop_y_h_overlap -%else - jle 
%%end_y_hv_overlap - btc hd, 16 - jc %%loop_y_h_overlap -%if ARCH_X86_32 - mov r5, r5m -%endif - SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] - jmp %%loop_y_hv_overlap -%%end_y_hv_overlap: -%endif -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end_hv -%if ARCH_X86_32 - mov srcq, r1mp -%else - mov srcq, r10mp -%endif - mov dstq, r11mp - mov lumaq, r12mp - lea srcq, [srcq+wq*2] - lea dstq, [dstq+wq*2] - lea lumaq, [lumaq+wq*(2<<%2)] -%if ARCH_X86_32 - mov dstmp, dstq - mov r9mp, lumaq - mov r4m, wq -%endif -%if %2 - jmp %%loop_x_hv_overlap -%else - or dword r8m, 4 - add offxyd, 16 -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r11d, 16 ; top_offxy += 16 -%endif - jmp %%loop_x_odd_v_overlap -%endif - -%%end_hv: - RET -%endmacro - - %%FGUV_32x32xN_LOOP 1, %2, %3 -.csfl: - %%FGUV_32x32xN_LOOP 0, %2, %3 - -%if STACK_ALIGNMENT < mmsize -DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 -%endif -%endmacro - -FGUV_FN 420, 1, 1 -FGUV_FN 422, 1, 0 -FGUV_FN 444, 0, 0 diff -Nru dav1d-0.9.2/src/x86/filmgrain16_sse.asm dav1d-1.0.0/src/x86/filmgrain16_sse.asm --- dav1d-0.9.2/src/x86/filmgrain16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain16_sse.asm 2022-03-18 14:31:56.006356000 +0000 @@ -0,0 +1,3421 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +SECTION_RODATA 16 +pd_16: times 4 dd 16 +pw_1: times 8 dw 1 +pw_16384: times 8 dw 16384 +pw_8192: times 8 dw 8192 +pw_23_22: dw 23, 22 + times 3 dw 0, 32 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +pw_27_17_17_27: dw 27, 17, 17, 27 + times 2 dw 0, 32 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512, 1024 +max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 +min: dw 0, 16*4, 16*16 +; these two should be next to each other +pw_4: times 2 dw 4 +pw_16: times 2 dw 16 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 + +SECTION .text + +%if ARCH_X86_32 +%undef base +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 8 +%define %%tmp %8 +%endif +%rep (%6/2) +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %6 == 8 +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4*%7] +%else + pinsrw %1, [%3+%4*%7], %%idx + 0 +%endif + pinsrw %1, [%3+%5*%7], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +%macro SPLATD 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax + lea r4, [pb_mask] +%define base r4-pb_mask +%else +cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax + LEA r4, $$ +%define base r4-$$ +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r3d, [fg_dataq+FGData.grain_scale_shift] + lea r5d, [bdmaxq+1] + shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r5 + SPLATW m6, [base+round+r3*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] + mov r3, -73*82*2 + sub bufq, r3 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +.loop: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r5, r7, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + 
pmulhrsw m3, m6 + movq [bufq+r3], m3 + add r3, 4*2 + jl .loop + + ; auto-regression code + movsxd r3, [fg_dataq+FGData.ar_coeff_lag] + movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] + lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table] + jmp r3 + +.ar1: +%if WIN64 + DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 + lea bufq, [r0-2*(82*73-(82*3+79))] + PUSH r8 +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 + DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 + PUSH r6 +%define shiftd r1d +%endif + sub bufq, 2*(82*73-(82*3+79)) +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] +%if WIN64 + DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 +%elif ARCH_X86_64 + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 +%undef shiftd + DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 +%define hd dword r0m +%define maxd dword minm +%endif +%if cpuflag(sse4) + pmovsxbw m4, m4 +%else + pxor m3, m3 + pcmpgtb m3, m4 + punpcklbw m4, m3 +%endif + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +%if WIN64 + POP r8 +%elif ARCH_X86_32 + POP r6 +%undef maxd +%undef hd +%endif +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m0, [base+round_vals-12+shiftq*2] + pshuflw m0, m0, q0000 + movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 + pxor m2, m2 + punpcklwd m0, m2 + pcmpgtb m2, m6 + punpckhbw m3, m6, m2 + punpcklbw m6, m2 + pshufd m2, m6, q3333 + pshufd m1, m6, q2222 + pshufd m7, m6, q1111 + pshufd m6, m6, q0000 + pshufd m4, m3, q1111 + pshufd m3, m3, q0000 +%if ARCH_X86_64 + SWAP 0, 12 + SWAP 1, 8 + SWAP 2, 9 + SWAP 3, 10 + SWAP 4, 11 +%else +%define m12 [rsp+0*16] +%define m8 [rsp+1*16] +%define m9 [rsp+2*16] +%define m10 [rsp+3*16] +%define m11 [rsp+4*16] + mova m12, m0 + mova m8, m1 + mova m9, m2 + mova m10, m3 + mova m11, m4 + mov bdmaxd, bdmaxm +%endif + sar bdmaxd, 1 + SPLATW m0, bdmaxd ; max_grain + pcmpeqw m1, m1 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + psrldq m2, 14 + pslldq m2, 2 + pxor m2, m1 +%endif + pxor m1, m0 ; min_grain +%if ARCH_X86_64 + SWAP 0, 13 + SWAP 1, 14 + SWAP 2, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] + mova m13, m0 + mova m14, m1 +%if !cpuflag(sse4) +%define m15 [rsp+7*16] + mova m15, m2 +%endif +%endif + sub bufq, 2*(82*73-(82*3+79)) + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + 
movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m2, m0, 2 + psrldq m3, m0, 4 + psrldq m4, m0, 6 + psrldq m5, m0, 8 + punpcklwd m0, m2 + punpcklwd m3, m4 + punpcklwd m5, m1 + psrldq m2, m1, 2 + psrldq m4, m1, 4 + punpcklwd m2, m4 + psrldq m4, m1, 6 + psrldq m1, 8 + punpcklwd m4, m1 + pmaddwd m0, m6 + pmaddwd m3, m7 + pmaddwd m5, m8 + pmaddwd m2, m9 + pmaddwd m4, m10 + paddd m0, m3 + paddd m5, m2 + paddd m0, m4 + paddd m0, m5 ; accumulated top 2 rows + paddd m0, m12 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m4, m1, q3321 + pxor m2, m2 + pcmpgtw m2, m4 + punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd m2, m1, m11 + paddd m2, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + paddd m2, m4 + packssdw m2, m2 + pminsw m2, m13 + pmaxsw m2, m14 + psrldq m4, 4 + pslldq m2, 2 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000010b +%else + pand m1, m15 + pandn m3, m15, m2 + por m1, m3 +%endif + ; overwrite previous pixel, this should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 64 + %define tmp rsp +%elif ARCH_X86_64 + %define tmp rsp+stack_offset-72 +%else +%assign stack_offset stack_offset_old + ALLOC_STACK -16*12 + %define tmp rsp + mov bdmaxd, bdmaxm +%endif + sar bdmaxd, 1 + SPLATW m7, bdmaxd ; max_grain + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m4, m4 + psrldq m4, 14 + pslldq m4, 4 + pxor m4, m6 +%endif + pxor m6, m7 ; min_grain + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + +%if ARCH_X86_64 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m14 [rsp+10*16] +%define m15 [esp+11*16] + mova m14, m6 + mova m15, m7 +%endif + + ; build cf0-1 until 18-19 in m5-12 and r0/1 + pxor m1, m1 + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + +%if cpuflag(sse4) + pshufd m4, m2, q3333 +%else + pshufd m5, m2, q3333 + mova [tmp+48], m5 +%endif + pshufd m3, m2, q2222 + pshufd m1, m2, q0000 + pshufd m2, m2, q1111 + pshufd m7, m0, q2222 + pshufd m6, m0, q1111 + pshufd m5, m0, q0000 + pshufd m0, m0, q3333 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+4*16] +%define m9 [esp+5*16] +%define m10 [rsp+6*16] +%define m11 [esp+7*16] +%define m12 [rsp+8*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + + ; build cf20,round in r2 + ; build cf21-23,round*2 in m13 + pxor m1, m1 + movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pcmpgtb m1, m0 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + mova [tmp+ 0], m1 + mova [tmp+16], m2 + psrldq m3, m0, 10 + pinsrw m3, [base+round_vals+shiftq*2-10], 3 + +%if ARCH_X86_64 + SWAP 3, 13 +%else +%define m13 [esp+9*16] + mova m13, m3 +%endif + + pinsrw m0, [base+round_vals+shiftq*2-12], 5 + pshufd m3, m0, q2222 + mova [tmp+32], m3 + + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; 
y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + paddd m0, m2 + paddd m0, m3 + ; m0 = top line first 6 multiplied by cf, m1 = top line last entry + + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] + punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + + pmaddwd m1, m8 + pmaddwd m4, m9 + pmaddwd m3, m10 + pmaddwd m2, m11 + paddd m1, m4 + paddd m3, m2 + paddd m0, m1 + paddd m0, m3 + ; m0 = top 2 lines multiplied by cf + + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, [base+pw_1] + +%if cpuflag(sse4) + pmaddwd m1, m12 +%else + pmaddwd m1, [tmp+48] +%endif + pmaddwd m3, [tmp+ 0] + pmaddwd m4, [tmp+16] + pmaddwd m2, [tmp+32] + paddd m1, m3 + paddd m4, m2 + paddd m0, m1 + paddd m0, m4 + ; m0 = top 3 lines multiplied by cf plus rounding for downshift + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m15 + pmaxsw m2, m14 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m12 + pandn m3, m12, m2 + por m1, m3 +%endif + ; overwrite a couple of pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg +%define base r8-pb_mask + lea r8, [pb_mask] + movifnidn bdmaxd, bdmaxm + lea r6d, [bdmaxq+1] +%else +cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h +%define base r2-$$ + LEA r2, $$ + mov fg_dataq, r2m + mov r6d, r4m + inc r6d +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r5, r6 + SPLATW m6, [base+round+r5*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] +%if ARCH_X86_64 + SPLATW m2, [base+pw_seed_xor+uvq*4] +%else + mov r5d, r3m + SPLATW m2, [base+pw_seed_xor+r5*4] +%endif + pxor m0, m2 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +%if %2 + mov hd, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov xq, -44 +%else + mov xq, -82*73 + add bufq, 82*73*2 +%endif +.loop_x: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 
15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r9, r10, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m6 + movq [bufq+xq*2], m3 + add xq, 4 + jl .loop_x +%if %2 + add bufq, 82*2 + dec hd + jg .loop_y +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] + jmp r5 + +.ar0: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset_old stack_offset + ALLOC_STACK -16*2 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + SPLATW m3, [base+hmul_bits+shiftq*2-10] +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m1, bdmaxd ; max_gain +%else + SPLATW m1, r4m + psraw m1, 1 +%endif + pcmpeqw m7, m7 + pxor m7, m1 ; min_grain +%if ARCH_X86_64 + SWAP 1, 14 + DEFINE_ARGS buf, bufy, h, x +%else +%define m14 [rsp+0*16] + mova m14, m1 + DEFINE_ARGS buf, bufy, pic_reg, h, x +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATW m4, m4 + pxor m5, m5 +%if %2 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + pslldq m2, 12 +%if ARCH_X86_64 + SWAP 2, 12 +%else +%define m12 [rsp+1*16] + mova m12, m2 +%endif +%endif +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: + ; first 32 pixels + xor xd, xd +.x_loop_ar0: + movu m0, [bufyq+xq*(2<<%2)] +%if %2 +%if %3 + movu m2, [bufyq+xq*4+82*2] + paddw m0, m2 +%endif + movu m1, [bufyq+xq*4 +16] +%if %3 + movu m2, [bufyq+xq*4+82*2+16] + paddw m1, m2 +%endif + phaddw m0, m1 + pmulhrsw m0, m6 +%endif + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + REPX {pmaddwd x, m4}, m0, m1 + REPX {psrad x, 5}, m0, m1 + packssdw m0, m1 + pmulhrsw m0, m3 + movu m1, [bufq+xq*2] + paddw m0, m1 + pminsw m0, m14 + pmaxsw m0, m7 + cmp xd, 72-40*%2 + je .end + movu [bufq+xq*2], m0 + add xd, 8 + jmp .x_loop_ar0 + + ; last 6/4 pixels +.end: +%if %2 +%if cpuflag(sse4) + pblendw m0, m1, 11000000b +%else + pand m1, m12 + pandn m2, m12, m0 + por m0, m1, m2 +%endif + movu [bufq+xq*2], m0 +%else + movq [bufq+xq*2], m0 +%endif + + add bufq, 82*2 + add bufyq, 82*(2<<%3) + dec hd + jg .y_loop_ar0 +%if ARCH_X86_32 +%undef m12 +%undef m14 +%endif + RET + +.ar1: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x +%else +%assign stack_offset stack_offset_old +%xdefine rstk rsp +%assign stack_size_padded 0 + DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] +%if WIN64 + DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 +%if %2 + lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] +%else + lea bufq, [r0-2*(82*69+3)] +%endif +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 +%else + DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 
+%define hd dword r1m +%define mind dword r3m +%define maxd dword r4m +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif +%endif +%if ARCH_X86_64 + mov shiftd, [r2+FGData.ar_coeff_shift] +%else + mov shiftd, [r3+FGData.ar_coeff_shift] +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 ; cf0-4 in words + pshuflw m4, m4, q2100 + psrldq m4, 2 ; cf0-3,4 in words + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pxor m6, m6 + punpcklwd m3, m6 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATD m3, m3 + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 +%if ARCH_X86_64 + mov mind, maxd + xor mind, -1 +%else + DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 + mov r2, maxd + xor r2, -1 + mov mind, r2 +%endif +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu m7, [bufyq+xq*4] +%if %3 + movu m1, [bufyq+xq*4+82*2] + phaddw m7, m1 +%else + phaddw m7, m7 +%endif +%else + movq m7, [bufyq+xq*2] +%endif + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 +%if %2 +%if %3 + pshufd m2, m7, q3232 + paddw m7, m2 +%endif + pmulhrsw m7, m6 +%endif + punpcklwd m1, m7 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 +%if ARCH_X86_32 +%undef maxd +%undef mind +%undef hd +%endif + RET + +.ar2: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift + ALLOC_STACK -16*8 + mov bufyq, r1m + mov uvd, r3m +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m5, bdmaxd ; max_grain +%else + SPLATW m5, r4m + psraw m5, 1 +%endif + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m7, m7 + psrldq m7, 14 + pslldq m7, 2 + pxor m7, m6 +%endif + pxor m6, m5 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m7, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 5, 13 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] +%define m15 [rsp+7*16] + mova m13, m5 + mova m14, m6 + mova m15, m7 +%endif + + ; coef values + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pinsrw m2, [base+round_vals-12+shiftq*2], 5 + + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m1, m0, q3333 + pshufd m0, m0, q2222 + pshufd m3, m2, q1111 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+0*16] +%define m9 [rsp+1*16] +%define m10 [rsp+2*16] +%define m11 [rsp+3*16] +%define m12 [rsp+4*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 
70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m4, m0, 2 ; y=-2,x=[-1,+5] + psrldq m1, m0, 4 ; y=-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-2,x=[+1,+5] + psrldq m2, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] + pmaddwd m0, m6 + pmaddwd m1, m7 + pmaddwd m2, m8 + paddd m0, m1 + paddd m0, m2 + psrldq m3, m5, 2 ; y=-1,x=[-1,+5] + psrldq m1, m5, 4 ; y=-1,x=[-0,+5] + psrldq m4, m5, 6 ; y=-1,x=[+1,+5] + psrldq m2, m5, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + punpcklwd m4, m2 + pmaddwd m3, m9 + pmaddwd m4, m10 + paddd m3, m4 + paddd m0, m3 + + ; luma component & rounding +%if %2 + movu m1, [bufyq+xq*4] +%if %3 + movu m2, [bufyq+xq*4+82*2] + phaddw m1, m2 + pshufd m2, m1, q3232 + paddw m1, m2 +%else + phaddw m1, m1 +%endif +%if cpuflag(sse4) + pmulhrsw m1, m15 +%elif %3 + pmulhrsw m1, [base+pw_8192] +%else + pmulhrsw m1, [base+pw_16384] +%endif +%else + movq m1, [bufyq+xq*2] +%endif + punpcklwd m1, [base+pw_1] + pmaddwd m1, m12 + paddd m0, m1 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m2, m1, q3321 + pxor m3, m3 + pcmpgtw m3, m2 + punpcklwd m2, m3 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd m3, m1, m11 + paddd m3, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd m3, m2 + packssdw m3, m3 + pminsw m3, m13 + pmaxsw m3, m14 + psrldq m1, 2 + pslldq m3, 2 + psrldq m2, 4 +%if cpuflag(sse4) + pblendw m1, m3, 00000010b +%else + pand m1, m15 + pandn m4, m15, m3 + por m1, m4 +%endif + ; overwrite previous pixel, should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 96 + %define tmp rsp +%else + %define tmp rsp+stack_offset-120 +%endif +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 + mov bufyq, r1m + mov uvd, r3m + %define tmp rsp +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + SPLATW m4, [base+round_vals-12+shiftq*2] + pxor m5, m5 + pcmpgtw m5, m4 + punpcklwd m4, m5 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m6, bdmaxd ; max_grain +%else + SPLATW m6, r4m + psraw m6, 1 +%endif + pcmpeqw m7, m7 +%if !cpuflag(sse4) + pcmpeqw m3, m3 + psrldq m3, 14 + pslldq m3, 4 + pxor m3, m7 +%endif + pxor m7, m6 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m3, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 3, 11 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m11 [rsp+ 9*16] +%define m12 [rsp+10*16] +%define m14 [rsp+12*16] +%define m15 [rsp+13*16] + mova m11, m3 + mova m12, m4 + mova m14, m6 + mova m15, m7 +%endif + + ; cf from y=-3,x=-3 until y=-3,x=-2 + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m0, m0, q3333 + pshufd m5, m2, q0000 + pshufd m6, m2, q1111 + mova [tmp+16*0], m1 + mova [tmp+16*1], m3 + mova [tmp+16*2], m4 + mova [tmp+16*3], m0 
+ mova [tmp+16*4], m5 + mova [tmp+16*5], m6 + pshufd m6, m2, q2222 + pshufd m7, m2, q3333 + + ; cf from y=-1,x=-1 to y=0,x=-1 + luma component + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 ; luma + punpcklbw m0, m1 + pshufd m3, m0, q3232 + psrldq m5, m0, 10 + ; y=0,x=[-3 to -1] + "1.0" for current pixel + pinsrw m5, [base+round_vals-10+shiftq*2], 3 + ; y=-1,x=[-1 to +2] + pshufd m1, m0, q0000 + pshufd m0, m0, q1111 + ; y=-1,x=+3 + luma + punpcklwd m3, m2 + pshufd m3, m3, q0000 + +%if ARCH_X86_64 + SWAP 1, 8 + SWAP 0, 9 + SWAP 3, 10 + SWAP 5, 13 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else +%define m8 [rsp+ 6*16] +%define m9 [rsp+ 7*16] +%define m10 [rsp+ 8*16] +%define m13 [rsp+11*16] + mova m8, m1 + mova m9, m0 + mova m10, m3 + mova m13, m5 + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + ; first line + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, [tmp+0*16] + pmaddwd m2, [tmp+1*16] + pmaddwd m3, [tmp+2*16] + paddd m0, m2 + paddd m0, m3 ; first 6 x of top y + + ; second line [m0/1 are busy] + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] + punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, [tmp+3*16] + pmaddwd m4, [tmp+4*16] + pmaddwd m3, [tmp+5*16] + pmaddwd m5, m6 + paddd m1, m4 + paddd m3, m5 + paddd m0, m1 + paddd m0, m3 ; top 2 lines + + ; third line [m0 is busy] & luma + round + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] +%if %2 + movu m5, [bufyq+xq*4] +%if %3 + movu m4, [bufyq+xq*4+82*2] + phaddw m5, m4 +%else + phaddw m5, m5 +%endif +%else + movq m5, [bufyq+xq*2] +%endif + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] +%if %3 + pshufd m4, m5, q3232 + paddw m5, m4 +%endif +%if %2 +%if cpuflag(sse4) + pmulhrsw m5, m11 +%elif %3 + pmulhrsw m5, [base+pw_8192] +%else + pmulhrsw m5, [base+pw_16384] +%endif +%endif + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, m5 + pmaddwd m1, m7 + pmaddwd m3, m8 + pmaddwd m4, m9 + pmaddwd m2, m10 + paddd m1, m3 + paddd m4, m2 + paddd m0, m12 ; += round + paddd m1, m4 + paddd m0, m1 + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m14 + pmaxsw m2, m15 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m11 + pandn m3, m11, m2 + por m1, m3 +%endif + ; overwrite previous pixels, should be ok + movq 
[bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] + + mov r0m, r0 + mov r2m, r1 + mov r4m, r2 + mov r6m, r3 + mov r7m, r4 + mov r8m, r5 +%else +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov scalingq, r5m + mov fg_dataq, r3m +%if STACK_ALIGNMENT < mmsize + mov r6, r9m + +%define r9m [rsp+8*mmsize+ 4*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + + mov r9m, r6 +%endif + LEA r5, $$ +%define base r5-$$ + mov r5m, picptrq +%else +cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if ARCH_X86_32 + DECLARE_REG_TMP 0, 3 +%else + DECLARE_REG_TMP 9, 10 +%endif + mov t0d, r9m ; bdmax + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t0d, [t0d*3] + lea t0d, [r6d*2+t0d] + SPLATW m4, [base+max+t0*2] + SPLATW m2, r9m + + pcmpeqw m1, m1 + psraw m7, m2, 1 ; max_grain + pxor m1, m7 ; min_grain + SPLATD m6, [base+pd_16] + + SCRATCH 1, 9, 0 + SCRATCH 2, 10, 1 + SCRATCH 3, 11, 2 + SCRATCH 4, 12, 3 + SCRATCH 5, 13, 4 + SCRATCH 6, 14, 5 + SCRATCH 7, 15, 6 + + mova m6, [base+pw_27_17_17_27] ; for horizontal filter + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + movzx t0d, byte [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz .no_vertical_overlap + test sbyd, sbyd + jnz .vertical_overlap +.no_vertical_overlap: + mov dword r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov 
r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak +%endif + +.loop_x_odd: + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[src] * grain, scaling_shift) + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp ; src += stride + add grain_lutq, 82*2 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk + add offxyd, 16 + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy +%endif + + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m4, [grain_lutq+r5*2] 
+%else + movd m4, [grain_lutq+left_offxyq*2] +%endif + punpcklwd m4, m5 + pmaddwd m4, m6 + paddd m4, m14 + psrad m4, 5 + packssdw m4, m4 + pminsw m4, m15 + pmaxsw m4, m9 + shufps m4, m5, q3210 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; noise = round2(scaling[src] * grain, scaling_shift) + movu m5, [grain_lutq+offxyq*2+16] + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + mov seed, r3m +%else + SPLATD m7, [pw_27_17_17_27] +%endif + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + 
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+1*gprsize] + movu m2, [grain_lutq+r5*2] +%else + movu m2, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + REPX {pmaddwd x, m7}, m4, m2 + REPX {paddd x, m14}, m4, m2 + REPX {psrad x, 5}, m4, m2 + packssdw m2, m4 + pminsw m2, m15 + pmaxsw m2, m9 + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m3, [grain_lutq+r5*2+16] +%else + movu m3, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + REPX {pmaddwd x, m7}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m3, m5 + pminsw m3, m15 + pmaxsw m3, m9 + + ; src + pand m0, m10, [srcq+ 0] ; m0-1: src as word + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m4, m2 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 +%else + vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk_v +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + jmp .loop_x_odd_v_overlap + +.next_blk_v: + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r0, [rsp+8*mmsize+1*gprsize] + add r3, 16 + add r0, 16 + mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy + mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy + + mov seed, r3m + xor r0, r0 +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, 
picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m2, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + movu m4, [grain_lutq+r0*2] + movd m5, [grain_lutq+r5*2] + mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy + movd m3, [grain_lutq+r5*2] +%else + movu m4, [grain_lutq+top_offxyq*2] + movd m5, [grain_lutq+left_offxyq*2] + movd m3, [grain_lutq+topleft_offxyq*2] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd m5, m2 + punpcklwd m3, m4 + REPX {pmaddwd x, m6}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m5, m3 + pminsw m5, m15 + pmaxsw m5, m9 + shufps m3, m5, m2, q3210 + shufps m5, m4, q3232 + ; followed by v interpolation (top | cur -> cur) + movu m0, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m1, [grain_lutq+r0*2+16] +%else + movu m1, [grain_lutq+top_offxyq*2+16] +%endif + punpcklwd m2, m5, m3 + punpckhwd m5, m3 + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + REPX {pmaddwd x, m7}, m2, m5, m3, m1 + REPX {paddd x, m14}, m2, m5, m3, m1 + REPX {psrad x, 5}, m2, m5, m3, m1 + packssdw m2, m5 + packssdw m3, m1 + REPX {pminsw x, m15}, m2, m3 + REPX {pmaxsw x, m9}, m2, m3 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m2, m4 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + or dword r8m, 4 +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov r5, r5m + add offxyd, 16 + add dword 
[rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + add offxyd, 16 + add top_offxyd, 16 + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET +%if ARCH_X86_32 + DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +%endif + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r1m + mov r2, r2m + mov r4, r3m + mov r3, r4m + mov r5, r5m +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r1m [rsp+8*mmsize+ 4*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + mov r0m, r0 + mov r2m, r2 + mov r4m, r3 + mov r5m, r5 + + mov r0, r6m + mov r2, r7m + mov r3, r8m + mov r5, r9m +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] +%define r9m [rsp+8*mmsize+12*gprsize] + mov r6m, r0 + mov r7m, r2 + mov r8m, r3 + mov r9m, r5 + + mov r2, r10m + mov r3, r11m + mov r5, r12m + mov r0, r13m +%define r10m [rsp+8*mmsize+13*gprsize] +%define r11m [rsp+8*mmsize+14*gprsize] +%define r12m [rsp+8*mmsize+15*gprsize] + mov r10m, r2 + mov r11m, r3 + mov r12m, r5 + + SPLATW m2, r13m +%else +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused + mov srcq, srcm + mov fg_dataq, r3m +%endif + LEA r5, $$ +%define base r5-$$ + + DECLARE_REG_TMP 0, 2, 3 +%else +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r8-pb_mask + lea r8, [pb_mask] + + DECLARE_REG_TMP 9, 10, 11 +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if STACK_ALIGNMENT >= mmsize + mov t0d, r13m ; bdmax +%endif + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t1d, [t0d*3] + mov t2d, r12m + inc t2d + imul r6d, t2d + add t1d, r6d + SPLATW m4, [base+max+t1*2] +%if STACK_ALIGNMENT >= mmsize + SPLATW m2, r13m +%endif + + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + +%define mzero m7 + +%if %3 + SPLATD m2, [base+pw_23_22] +%endif + +%if ARCH_X86_32 + mov scalingq, r5m + mov r5m, r5 +%else + mov r13mp, strideq +%endif + + pcmpeqw m0, m0 + psraw m1, m10, 1 + pxor m0, m1 + + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap + + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + DECLARE_REG_TMP 9 +%endif + +%if %1 + mov r6d, r11m + SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] + SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklwd m6, m1, m0 + SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] + SPLATD m7, [base+pw_4+t0*4] + pmullw m5, m7 +%else + SPLATD m6, [base+pd_16] +%if %2 + mova m5, [base+pw_23_22] +%else + mova m5, [base+pw_27_17_17_27] +%endif +%endif + + SCRATCH 6, 14, 6 + SCRATCH 5, 15, 7 + +%if ARCH_X86_32 + DECLARE_REG_TMP 0 +%else + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + mov t0d, 
[fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz %%no_vertical_overlap + test sbyd, sbyd + jnz %%vertical_overlap + +%%no_vertical_overlap: + mov r8m, t0d +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4mp, wq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride +%endif + +%if %2 == 0 +%%loop_x_odd: +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src + mova m0, [srcq] + mova m1, [srcq+16] ; m0-1: src as word + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m3, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m6, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m3, m5 + pmulhrsw m4, m3 + pmulhrsw m6, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova 
[dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma + + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0m, dstq + mov r9m, lumaq + mov r4m, wq +%endif +%if %2 == 0 + btc dword r8m, 2 + jc %%next_blk + add offxyd, 16 + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%%next_blk: +%endif + test dword r8m, 1 + je %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m5, [grain_lutq+r5*2] +%else + movd m5, [grain_lutq+left_offxyq*2+ 0] +%endif + punpcklwd m5, m7 ; {left0, cur0} +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + pmaddwd m5, [PIC_ptr(pw_23_22)] +%else + pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] +%endif + paddd m5, [PIC_ptr(pd_16)] +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m7, q3210 + movu m3, [grain_lutq+offxyq*2+16] + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m4, m6, 
scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 + vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m4 + pmulhrsw m5, m7 + pmulhrsw m3, m4 + + ; dst = clip_pixel(src, noise) + paddw m0, m5 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end: + RET + +%%vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov r3m, seed + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, unused3, unused4, unused5, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4m, wq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov offyd, seed + mov 
offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %2 == 0 +%%loop_x_odd_v_overlap: +%endif +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy + movu m5, [grain_lutq+r0*2] +%else + movu m5, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m5, [grain_lutq+r0*2+16] +%else + movu m5, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m7, m5, m4 + punpcklwd m5, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m4, m5, m7 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m5, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m5, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m5, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m7, m5, m0 + punpcklwd m5, m0 + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + punpckhwd m7, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m5, m6 + REPX {pmaxsw x, mzero}, m5, m6 + REPX {pminsw x, m10}, m5, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m5, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m5 + pmulhrsw m3, m7 + pmulhrsw m4, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + dec hw + jle %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 +%if %3 + jmp %%loop_y +%else + btc hd, 16 
+ jc %%loop_y +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_v_overlap +%endif + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else + btc dword r8m, 2 + jc %%loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut + + mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy + add offxyd, 16 + add t0d, 16 + mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd + mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + movd m5, [grain_lutq+r5*2] +%else + movd m5, [grain_lutq+left_offxyq*2] +%endif + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+2*gprsize] + movu m4, [grain_lutq+r0*2] +%if %2 + pinsrw m5, [grain_lutq+r5*2], 2 +%else + movd m3, [grain_lutq+r5*2] +%endif +%else + movu m4, [grain_lutq+top_offxyq*2] +%if %2 + pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } +%else + movd m3, [grain_lutq+topleft_offxyq*2] +%endif +%endif +%if %2 == 0 + punpckldq m5, m3 +%endif + punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } + punpcklwd m5, m3 ; { 
left/cur0,_/cur1,topleft/top0,_/top1 } +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + movddup m0, [PIC_ptr(pw_23_22)] +%else + movddup m0, [PIC_ptr(pw_27_17_17_27)] +%endif +%else + pshufd m0, m15, q1010 +%endif + pmaddwd m5, m0 +%if %1 + paddd m5, [PIC_ptr(pd_16)] +%else + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 + shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter + shufps m5, m4, q3231 ; top0-7 post-h_filter + + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 +%else + REPX {paddd x, m14}, m5, m7 +%endif + REPX {psrad x, 5}, m5, m7 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; right half + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m0, [grain_lutq+r0*2+16] +%else + movu m0, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m1, m0, m4 + punpcklwd m0, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m1, m0 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 +%else + REPX {paddd x, m14}, m1, m0 +%endif + REPX {psrad x, 5}, m1, m0 + packssdw m4, m0, m1 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m6, [lumaq+ 0] + mova m5, [lumaq+(16<<%2)] +%if %2 + phaddw m6, [lumaq+16] + phaddw m5, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m6, mzero + pavgw m5, mzero +%endif + +%if %1 + punpckhwd m7, m6, m0 + punpcklwd m6, m0 + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + punpckhwd m7, m5, m1 + punpcklwd m5, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m6, m5 + REPX {pmaxsw x, mzero}, m6, m5 + REPX {pminsw x, m10}, m6, m5 ; clip_pixel() +%else + REPX {pand x, m10}, m6, m5 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 + vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 +%else +%if %3 == 0 + ; register shortage :) + push r12 +%endif + vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 + vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 +%if %3 == 0 + pop r12 +%endif +%endif + REPX {psrlw x, 8}, m7, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m6 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_hv_overlap +%%end_y_hv_overlap: +%endif +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov dstmp, dstq + mov r9mp, lumaq + 
mov r4m, wq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else + or dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 diff -Nru dav1d-0.9.2/src/x86/film_grain_avx2.asm dav1d-1.0.0/src/x86/film_grain_avx2.asm --- dav1d-0.9.2/src/x86/film_grain_avx2.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/film_grain_avx2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,2378 +0,0 @@ -; Copyright © 2019-2021, VideoLAN and dav1d authors -; Copyright © 2019, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
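Both the 16 bpc chroma loops above and the 8 bpc AVX2 file whose removal starts here annotate the same per-pixel model in their comments: noise = round2(scaling[src] * grain, scaling_shift) and dst = clip_pixel(src, noise). A minimal scalar sketch of that model at 8 bpc; round2(), iclip() and apply_grain_pixel() are illustrative helper names, not names from dav1d:

#include <stdint.h>

static inline int round2(const int x, const int shift) {
    return (x + ((1 << shift) >> 1)) >> shift;
}

static inline int iclip(const int v, const int lo, const int hi) {
    return v < lo ? lo : v > hi ? hi : v;
}

/* One pixel: look the source value up in the 256-entry scaling LUT (8 bpc),
 * scale the grain sample with it, then add the noise to the source and clamp
 * to the full or restricted pixel range (clip_to_restricted_range). */
static inline int apply_grain_pixel(const int src, const int grain,
                                    const uint8_t scaling[256],
                                    const int scaling_shift,
                                    const int mn, const int mx)
{
    const int noise = round2(scaling[src] * grain, scaling_shift);
    return iclip(src + noise, mn, mx);
}

The pmullw/pmulhrsw pairs against m11 in the SIMD loops implement this round2(), and the trailing pmaxsw m13 / pminsw m12 is the final clamp.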
- -%include "config.asm" -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 32 -pb_8x_27_17_8x_17_27: times 8 db 27, 17 - times 8 db 17, 27 -pw_1024: times 16 dw 1024 -pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 -rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 -byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 -pw_seed_xor: times 2 dw 0xb524 - times 2 dw 0x49d8 -pd_m65536: dd ~0xffff -pb_23_22: db 23, 22 - times 3 db 0, 32 -pb_1: times 4 db 1 -hmul_bits: dw 32768, 16384, 8192, 4096 -round: dw 2048, 1024, 512 -mul_bits: dw 256, 128, 64, 32, 16 -round_vals: dw 32, 64, 128, 256, 512 -max: dw 255, 240, 235 -min: dw 0, 16 -pb_27_17_17_27: db 27, 17, 17, 27 - times 2 db 0, 32 -pw_1: dw 1 - -%macro JMP_TABLE 2-* - %xdefine %1_8bpc_%2_table %%table - %xdefine %%base %1_8bpc_%2_table - %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) - %%table: - %rep %0 - 2 - dd %%prefix %+ .ar%3 - %%base - %rotate 1 - %endrep -%endmacro - -ALIGN 4 -JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 - -struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - -SECTION .text - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -INIT_XMM avx2 -cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data - lea r4, [pb_mask] -%define base r4-pb_mask - movq xm1, [base+rnd_next_upperbit_mask] - movq xm4, [base+mul_bits] - movq xm7, [base+hmul_bits] - mov r2d, [fg_dataq+FGData.grain_scale_shift] - vpbroadcastw xm8, [base+round+r2*2] - mova xm5, [base+pb_mask] - vpbroadcastw xm0, [fg_dataq+FGData.seed] - vpbroadcastd xm9, [base+pd_m65536] - mov r2, -73*82 - sub bufq, r2 - lea r3, [gaussian_sequence] -.loop: - pand xm2, xm0, xm1 - psrlw xm3, xm2, 10 - por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw xm2, xm4 ; bits 0x0f00 are set - pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds - psllq xm6, xm2, 30 - por xm2, xm6 - psllq xm6, xm2, 15 - por xm2, xm6 ; aggregate each bit into next seed's high bit - pmulhuw xm3, xm0, xm7 - por xm2, xm3 ; 4 next output seeds - pshuflw xm0, xm2, q3333 - psrlw xm2, 5 - pmovzxwd xm3, xm2 - mova xm6, xm9 - vpgatherdd xm2, [r3+xm3*2], xm6 - pandn xm2, xm9, xm2 - packusdw xm2, xm2 - pmulhrsw xm2, xm8 - packsswb xm2, xm2 - movd [bufq+r2], xm2 - add r2, 4 - jl .loop - - ; auto-regression code - movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_8bpc_avx2_table+r2*4] - lea r2, [r2+base+generate_grain_y_8bpc_avx2_table] - jmp r2 - -.ar1: - DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] - movd xm4, [fg_dataq+FGData.ar_coeffs_y] - DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 - pinsrb xm4, [pb_1], 3 - pmovsxbw xm4, xm4 - pshufd xm5, xm4, q1111 - pshufd xm4, xm4, q0000 - vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd - sub bufq, 82*73-(82*3+79) - 
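The bit twiddling in the removed generate_grain_y .loop above ("aggregate each bit into next seed's high bit", "4 next output seeds") runs the film-grain pseudo-random generator four steps at a time and gathers the results from gaussian_sequence[]; the .ar1/.ar2/.ar3 tails then refine the 82x73 grain template with an autoregressive filter. A scalar sketch, assuming the generator matches the 16-bit LFSR in the AV1 specification (round2()/iclip() as in the earlier sketch, other names illustrative):

/* One LFSR step: feedback is the XOR of bits 0, 1, 3 and 12. */
static inline unsigned get_random_number(const int bits, unsigned *const state)
{
    const unsigned r = *state;
    const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
    *state = ((r >> 1) | (bit << 15)) & 0xffff;
    return (*state >> (16 - bits)) & ((1 << bits) - 1);
}

/* Base grain sample at 8 bpc: an 11-bit index into the 2048-entry
 * gaussian_sequence table, rounded by 4 + grain_scale_shift
 * (the pmulhrsw against the `round` constant above). */
static inline int grain_sample(unsigned *const seed, const int16_t *gauss,
                               const int grain_scale_shift)
{
    return round2(gauss[get_random_number(11, seed)], 4 + grain_scale_shift);
}

For ar_coeff_lag == 1, the .ar1 pass then updates each sample in raster order from its already-filtered neighbours, roughly (cf = ar_coeffs_y, signed bytes):

    buf[y][x] = iclip(buf[y][x] +
                      ((cf[0] * buf[y-1][x-1] +   /* top/left  */
                        cf[1] * buf[y-1][x  ] +   /* top       */
                        cf[2] * buf[y-1][x+1] +   /* top/right */
                        cf[3] * buf[y  ][x-1] +   /* left      */
                        (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift),
                      -128, 127);

with .ar2/.ar3 extending the same recurrence to 12 and 24 coefficients.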
mov hd, 70 - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -76 - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: - pmovsxbw xm0, [bufq+xq-82-1] ; top/left - pmovsxbw xm2, [bufq+xq-82+0] ; top - pmovsxbw xm1, [bufq+xq-82+1] ; top/right - punpcklwd xm0, xm2 - punpcklwd xm1, xm3 - pmaddwd xm0, xm4 - pmaddwd xm1, xm5 - paddd xm0, xm1 -.x_loop_ar1_inner: - movd val0d, xm0 - psrldq xm0, 4 - imul val3d, cf3d - add val3d, val0d -%if WIN64 - sarx val3d, val3d, shiftd -%else - sar val3d, shiftb -%endif - movsx val0d, byte [bufq+xq] - add val3d, val0d - cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 - dec hd - jg .y_loop_ar1 -.ar0: - RET - -.ar2: - DEFINE_ARGS buf, fg_data, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - movq xm15, [base+byte_blend+1] - pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 - movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 - pmovsxbw xm9, xm9 - DEFINE_ARGS buf, fg_data, h, x - pshufd xm12, xm9, q0000 - pshufd xm13, xm9, q1111 - pshufd xm11, xm8, q3333 - pshufd xm10, xm8, q2222 - pshufd xm9, xm8, q1111 - pshufd xm8, xm8, q0000 - pmovzxwd xm14, xm14 - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar2: - mov xq, -76 - -.x_loop_ar2: - pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] - psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] - psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] - punpcklwd xm2, xm0, xm2 - punpcklwd xm3, xm4 - pmaddwd xm2, xm8 - pmaddwd xm3, xm11 - paddd xm2, xm3 - - psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] - psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] - psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] - punpcklwd xm4, xm5 - punpcklwd xm6, xm1 - psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] - psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] - punpcklwd xm7, xm1 - pmaddwd xm4, xm9 - pmaddwd xm6, xm10 - pmaddwd xm7, xm12 - paddd xm4, xm6 - paddd xm2, xm7 - paddd xm2, xm4 - paddd xm2, xm14 - - movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] -.x_loop_ar2_inner: - pmovsxbw xm1, xm0 - pmaddwd xm3, xm1, xm13 - paddd xm3, xm2 - psrldq xm1, 4 ; y=0,x=0 - psrldq xm2, 4 ; shift top to next pixel - psrad xm3, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - paddw xm3, xm1 - packsswb xm3, xm3 - pextrb [bufq+xq], xm3, 0 - pslldq xm3, 2 - pand xm3, xm15 - pandn xm0, xm15, xm0 - por xm0, xm3 - psrldq xm0, 1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - dec hd - jg .y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, fg_data, shift -%if WIN64 - SUB rsp, 16*12 -%assign stack_size_padded (stack_size_padded+16*12) -%assign stack_size (stack_size+16*12) -%else - ALLOC_STACK 16*12 -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - movq xm15, [base+byte_blend] - pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7 - pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15 - pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 - pshufd xm9, xm0, q1111 - pshufd xm10, xm0, q2222 - pshufd xm11, xm0, q3333 - pshufd xm0, xm0, q0000 - pshufd xm6, xm1, q1111 - pshufd xm7, xm1, q2222 - pshufd xm8, xm1, q3333 - pshufd xm1, xm1, q0000 - pshufd xm3, xm2, q1111 - psrldq xm13, xm2, 10 - pinsrw xm2, [pw_1], 5 - pshufd xm4, xm2, 
q2222 - pshufd xm2, xm2, q0000 - pinsrw xm13, [base+round_vals+shiftq*2-10], 3 - mova [rsp+ 0*16], xm0 - mova [rsp+ 1*16], xm9 - mova [rsp+ 2*16], xm10 - mova [rsp+ 3*16], xm11 - mova [rsp+ 4*16], xm1 - mova [rsp+ 5*16], xm6 - mova [rsp+ 6*16], xm7 - mova [rsp+ 7*16], xm8 - mova [rsp+ 8*16], xm2 - mova [rsp+ 9*16], xm3 - mova [rsp+10*16], xm4 - DEFINE_ARGS buf, fg_data, h, x - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar3: - mov xq, -76 - -.x_loop_ar3: - movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor xm3, xm3 - pcmpgtb xm6, xm3, xm2 - pcmpgtb xm5, xm3, xm1 - pcmpgtb xm4, xm3, xm0 - punpckhbw xm3, xm0, xm4 - punpcklbw xm0, xm4 - punpckhbw xm4, xm1, xm5 - punpcklbw xm1, xm5 - punpckhbw xm5, xm2, xm6 - punpcklbw xm2, xm6 - - psrldq xm6, xm0, 2 - psrldq xm7, xm0, 4 - psrldq xm8, xm0, 6 - psrldq xm9, xm0, 8 - palignr xm10, xm3, xm0, 10 - palignr xm11, xm3, xm0, 12 - - punpcklwd xm0, xm6 - punpcklwd xm7, xm8 - punpcklwd xm9, xm10 - punpcklwd xm11, xm1 - pmaddwd xm0, [rsp+ 0*16] - pmaddwd xm7, [rsp+ 1*16] - pmaddwd xm9, [rsp+ 2*16] - pmaddwd xm11, [rsp+ 3*16] - paddd xm0, xm7 - paddd xm9, xm11 - paddd xm0, xm9 - - psrldq xm6, xm1, 2 - psrldq xm7, xm1, 4 - psrldq xm8, xm1, 6 - psrldq xm9, xm1, 8 - palignr xm10, xm4, xm1, 10 - palignr xm11, xm4, xm1, 12 - psrldq xm12, xm2, 2 - - punpcklwd xm6, xm7 - punpcklwd xm8, xm9 - punpcklwd xm10, xm11 - punpcklwd xm12, xm2, xm12 - pmaddwd xm6, [rsp+ 4*16] - pmaddwd xm8, [rsp+ 5*16] - pmaddwd xm10, [rsp+ 6*16] - pmaddwd xm12, [rsp+ 7*16] - paddd xm6, xm8 - paddd xm10, xm12 - paddd xm6, xm10 - paddd xm0, xm6 - - psrldq xm6, xm2, 4 - psrldq xm7, xm2, 6 - psrldq xm8, xm2, 8 - palignr xm9, xm5, xm2, 10 - palignr xm5, xm5, xm2, 12 - - punpcklwd xm6, xm7 - punpcklwd xm8, xm9 - punpcklwd xm5, xm14 - pmaddwd xm6, [rsp+ 8*16] - pmaddwd xm8, [rsp+ 9*16] - pmaddwd xm5, [rsp+10*16] - paddd xm0, xm6 - paddd xm8, xm5 - paddd xm0, xm8 - - movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmovsxbw xm2, xm1 - pmaddwd xm2, xm13 - pshufd xm3, xm2, q1111 - paddd xm2, xm3 ; left+cur - paddd xm2, xm0 ; add top - psrldq xm0, 4 - psrad xm2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - packsswb xm2, xm2 - pextrb [bufq+xq], xm2, 0 - pslldq xm2, 3 - pand xm2, xm15 - pandn xm1, xm15, xm1 - por xm1, xm2 - psrldq xm1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - dec hd - jg .y_loop_ar3 - RET - -%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y -INIT_XMM avx2 -cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv - lea r4, [pb_mask] -%define base r4-pb_mask - movq xm1, [base+rnd_next_upperbit_mask] - movq xm4, [base+mul_bits] - movq xm7, [base+hmul_bits] - mov r5d, [fg_dataq+FGData.grain_scale_shift] - vpbroadcastw xm8, [base+round+r5*2] - mova xm5, [base+pb_mask] - vpbroadcastw xm0, [fg_dataq+FGData.seed] - vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] - pxor xm0, xm9 - vpbroadcastd xm9, [base+pd_m65536] - lea r6, [gaussian_sequence] -%if %2 - mov r7d, 73-35*%3 - add bufq, 44 -.loop_y: - mov r5, -44 -.loop_x: -%else - mov r5, -73*82 - sub bufq, r5 -.loop: -%endif - pand xm2, xm0, xm1 - psrlw xm3, xm2, 10 - por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw xm2, xm4 ; bits 0x0f00 are set - pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds - psllq xm6, xm2, 30 - por xm2, xm6 - psllq xm6, xm2, 15 - por xm2, xm6 ; aggregate each bit 
into next seed's high bit - pmulhuw xm3, xm0, xm7 - por xm2, xm3 ; 4 next output seeds - pshuflw xm0, xm2, q3333 - psrlw xm2, 5 - pmovzxwd xm3, xm2 - mova xm6, xm9 - vpgatherdd xm2, [r6+xm3*2], xm6 - pandn xm2, xm9, xm2 - packusdw xm2, xm2 - pmulhrsw xm2, xm8 - packsswb xm2, xm2 - movd [bufq+r5], xm2 - add r5, 4 -%if %2 - jl .loop_x - add bufq, 82 - dec r7d - jg .loop_y -%else - jl .loop -%endif - - ; auto-regression code - movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_8bpc_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_8bpc_avx2_table] - jmp r5 - -.ar0: - INIT_YMM avx2 - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] - movd xm3, [base+hmul_bits+shiftq*2] - DEFINE_ARGS buf, bufy, h - pmovsxbw xm4, xm4 -%if %2 - vpbroadcastd m7, [pb_1] - vpbroadcastw m6, [hmul_bits+2+%3*2] -%endif - vpbroadcastw m4, xm4 - vpbroadcastw m3, xm3 - pxor m12, m12 -%if %2 - sub bufq, 82*(73-35*%3)+82-(82*3+41) -%else - sub bufq, 82*70-3 -%endif - add bufyq, 3+82*3 - mov hd, 70-35*%3 -.y_loop_ar0: -%if %2 - ; first 32 pixels - movu xm8, [bufyq] -%if %3 - movu xm9, [bufyq+82] -%endif - movu xm10, [bufyq+16] -%if %3 - movu xm11, [bufyq+82+16] -%endif - vinserti128 m8, [bufyq+32], 1 -%if %3 - vinserti128 m9, [bufyq+82+32], 1 -%endif - vinserti128 m10, [bufyq+48], 1 -%if %3 - vinserti128 m11, [bufyq+82+48], 1 -%endif - pmaddubsw m8, m7, m8 -%if %3 - pmaddubsw m9, m7, m9 -%endif - pmaddubsw m10, m7, m10 -%if %3 - pmaddubsw m11, m7, m11 - paddw m8, m9 - paddw m10, m11 -%endif - pmulhrsw m8, m6 - pmulhrsw m10, m6 -%else - xor r3d, r3d - ; first 32x2 pixels -.x_loop_ar0: - movu m8, [bufyq+r3] - pcmpgtb m9, m12, m8 - punpckhbw m10, m8, m9 - punpcklbw m8, m9 -%endif - pmullw m8, m4 - pmullw m10, m4 - pmulhrsw m8, m3 - pmulhrsw m10, m3 -%if %2 - movu m0, [bufq] -%else - movu m0, [bufq+r3] -%endif - pcmpgtb m1, m12, m0 - punpckhbw m9, m0, m1 - punpcklbw m0, m1 - paddw m0, m8 - paddw m9, m10 - packsswb m0, m9 -%if %2 - movu [bufq], m0 -%else - movu [bufq+r3], m0 - add r3d, 32 - cmp r3d, 64 - jl .x_loop_ar0 -%endif - - ; last 6/12 pixels - movu xm8, [bufyq+32*2] -%if %2 -%if %3 - movu xm9, [bufyq+32*2+82] -%endif - pmaddubsw xm8, xm7, xm8 -%if %3 - pmaddubsw xm9, xm7, xm9 - paddw xm8, xm9 -%endif - pmulhrsw xm8, xm6 - pmullw xm8, xm4 - pmulhrsw xm8, xm3 - movq xm0, [bufq+32] - pcmpgtb xm9, xm12, xm0 - punpcklbw xm9, xm0, xm9 - paddw xm8, xm9 - packsswb xm8, xm8 - vpblendw xm0, xm8, xm0, 1000b - movq [bufq+32], xm0 -%else - pcmpgtb xm9, xm12, xm8 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - pmullw xm10, xm4 - pmullw xm8, xm4 - pmulhrsw xm10, xm3 - pmulhrsw xm8, xm3 - movu xm0, [bufq+64] - pcmpgtb xm9, xm12, xm0 - punpcklbw xm1, xm0, xm9 - punpckhbw xm9, xm0, xm9 - paddw xm1, xm8 - paddw xm9, xm10 - packsswb xm1, xm9 - vpblendw xm0, xm1, xm0, 11000000b - movu [bufq+64], xm0 -%endif - - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar0 - RET - -.ar1: - INIT_XMM avx2 - DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] - movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] - pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 - DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift - pmovsxbw xm4, xm4 - pshufd xm5, xm4, q1111 - pshufd xm4, xm4, q0000 - pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd -%if %2 - vpbroadcastd xm7, 
[pb_1] - vpbroadcastw xm6, [hmul_bits+2+%3*2] -%endif - vpbroadcastd xm3, xm3 -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*70-(82-3) -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -(76>>%2) - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: - pmovsxbw xm0, [bufq+xq-82-1] ; top/left -%if %2 - movq xm8, [bufyq+xq*2] -%if %3 - movq xm9, [bufyq+xq*2+82] -%endif -%endif - psrldq xm2, xm0, 2 ; top - psrldq xm1, xm0, 4 ; top/right -%if %2 - pmaddubsw xm8, xm7, xm8 -%if %3 - pmaddubsw xm9, xm7, xm9 - paddw xm8, xm9 -%endif - pmulhrsw xm8, xm6 -%else - pmovsxbw xm8, [bufyq+xq] -%endif - punpcklwd xm0, xm2 - punpcklwd xm1, xm8 - pmaddwd xm0, xm4 - pmaddwd xm1, xm5 - paddd xm0, xm1 - paddd xm0, xm3 -.x_loop_ar1_inner: - movd val0d, xm0 - psrldq xm0, 4 - imul val3d, cf3d - add val3d, val0d - sarx val3d, val3d, shiftd - movsx val0d, byte [bufq+xq] - add val3d, val0d - cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar1 - RET - -.ar2: - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - vpbroadcastw xm15, [base+round_vals-12+shiftq*2] - pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 - pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 - pinsrw xm9, [base+pw_1], 5 -%if %2 - vpbroadcastw xm7, [base+hmul_bits+2+%3*2] - vpbroadcastd xm6, [base+pb_1] -%endif - DEFINE_ARGS buf, bufy, fg_data, h, unused, x - pshufd xm12, xm9, q0000 - pshufd xm13, xm9, q1111 - pshufd xm14, xm9, q2222 - pshufd xm11, xm8, q3333 - pshufd xm10, xm8, q2222 - pshufd xm9, xm8, q1111 - pshufd xm8, xm8, q0000 -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*70-(82-3) -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar2: - mov xq, -(76>>%2) - -.x_loop_ar2: - pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] - psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] - psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] - punpcklwd xm2, xm0, xm2 - punpcklwd xm3, xm4 - pmaddwd xm2, xm8 - pmaddwd xm3, xm11 - paddd xm2, xm3 - - psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] - psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] - psrldq xm0, 8 ; y=-2,x=[+2,+5] - punpcklwd xm4, xm5 - punpcklwd xm0, xm1 - psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5] - psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] - punpcklwd xm3, xm1 - pmaddwd xm4, xm9 - pmaddwd xm0, xm10 - pmaddwd xm3, xm12 - paddd xm4, xm0 - paddd xm2, xm3 - paddd xm2, xm4 - -%if %2 - movq xm0, [bufyq+xq*2] -%if %3 - movq xm3, [bufyq+xq*2+82] -%endif - pmaddubsw xm0, xm6, xm0 -%if %3 - pmaddubsw xm3, xm6, xm3 - paddw xm0, xm3 -%endif - pmulhrsw xm0, xm7 -%else - pmovsxbw xm0, [bufyq+xq] -%endif - punpcklwd xm0, xm15 - pmaddwd xm0, xm14 - paddd xm2, xm0 - - movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] -.x_loop_ar2_inner: - pmovsxbw xm0, xm0 - pmaddwd xm3, xm0, xm13 - paddd xm3, xm2 - psrldq xm2, 4 ; shift top to next pixel - psrad xm3, [fg_dataq+FGData.ar_coeff_shift] - pslldq xm3, 2 - psrldq xm0, 2 - paddw xm3, xm0 - vpblendw xm0, xm3, 00000010b - packsswb xm0, xm0 - pextrb [bufq+xq], xm0, 1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg 
.y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - SUB rsp, 16*12 -%assign stack_size_padded (stack_size_padded+16*12) -%assign stack_size (stack_size+16*12) - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7 - pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15 - pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 - pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] - pshufd xm9, xm0, q1111 - pshufd xm10, xm0, q2222 - pshufd xm11, xm0, q3333 - pshufd xm0, xm0, q0000 - pshufd xm6, xm1, q1111 - pshufd xm7, xm1, q2222 - pshufd xm8, xm1, q3333 - pshufd xm1, xm1, q0000 - pshufd xm3, xm2, q1111 - pshufd xm4, xm2, q2222 - vpbroadcastw xm5, xm5 - vpblendw xm4, xm5, 10101010b ; interleave luma cf - psrldq xm5, xm2, 10 - pshufd xm2, xm2, q0000 - pinsrw xm5, [base+round_vals+shiftq*2-10], 3 - pmovzxwd xm14, xm14 - mova [rsp+ 0*16], xm0 - mova [rsp+ 1*16], xm9 - mova [rsp+ 2*16], xm10 - mova [rsp+ 3*16], xm11 - mova [rsp+ 4*16], xm1 - mova [rsp+ 5*16], xm6 - mova [rsp+ 6*16], xm7 - mova [rsp+ 7*16], xm8 - mova [rsp+ 8*16], xm2 - mova [rsp+ 9*16], xm3 - mova [rsp+10*16], xm4 - mova [rsp+11*16], xm5 -%if %2 - vpbroadcastd xm13, [base+pb_1] - vpbroadcastw xm15, [base+hmul_bits+2+%3*2] -%endif - DEFINE_ARGS buf, bufy, fg_data, h, unused, x -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*70-(82-3) -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar3: - mov xq, -(76>>%2) - -.x_loop_ar3: - movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor xm3, xm3 - pcmpgtb xm6, xm3, xm2 - pcmpgtb xm5, xm3, xm1 - pcmpgtb xm4, xm3, xm0 - punpckhbw xm3, xm0, xm4 - punpcklbw xm0, xm4 - punpckhbw xm4, xm1, xm5 - punpcklbw xm1, xm5 - punpckhbw xm5, xm2, xm6 - punpcklbw xm2, xm6 - - psrldq xm6, xm0, 2 - psrldq xm7, xm0, 4 - psrldq xm8, xm0, 6 - psrldq xm9, xm0, 8 - palignr xm10, xm3, xm0, 10 - palignr xm11, xm3, xm0, 12 - - punpcklwd xm0, xm6 - punpcklwd xm7, xm8 - punpcklwd xm9, xm10 - punpcklwd xm11, xm1 - pmaddwd xm0, [rsp+ 0*16] - pmaddwd xm7, [rsp+ 1*16] - pmaddwd xm9, [rsp+ 2*16] - pmaddwd xm11, [rsp+ 3*16] - paddd xm0, xm7 - paddd xm9, xm11 - paddd xm0, xm9 - - psrldq xm6, xm1, 2 - psrldq xm7, xm1, 4 - psrldq xm8, xm1, 6 - psrldq xm9, xm1, 8 - palignr xm10, xm4, xm1, 10 - palignr xm11, xm4, xm1, 12 - psrldq xm12, xm2, 2 - - punpcklwd xm6, xm7 - punpcklwd xm8, xm9 - punpcklwd xm10, xm11 - punpcklwd xm12, xm2, xm12 - pmaddwd xm6, [rsp+ 4*16] - pmaddwd xm8, [rsp+ 5*16] - pmaddwd xm10, [rsp+ 6*16] - pmaddwd xm12, [rsp+ 7*16] - paddd xm6, xm8 - paddd xm10, xm12 - paddd xm6, xm10 - paddd xm0, xm6 - - psrldq xm6, xm2, 4 - psrldq xm7, xm2, 6 - psrldq xm8, xm2, 8 - palignr xm9, xm5, xm2, 10 - palignr xm5, xm5, xm2, 12 - -%if %2 - movq xm1, [bufyq+xq*2] -%if %3 - movq xm2, [bufyq+xq*2+82] -%endif - pmaddubsw xm1, xm13, xm1 -%if %3 - pmaddubsw xm2, xm13, xm2 - paddw xm1, xm2 -%endif - pmulhrsw xm1, xm15 -%else - pmovsxbw xm1, [bufyq+xq] -%endif - - punpcklwd xm6, xm7 - punpcklwd xm8, xm9 - punpcklwd xm5, xm1 - pmaddwd xm6, [rsp+ 8*16] - pmaddwd xm8, [rsp+ 9*16] - pmaddwd xm5, [rsp+10*16] - paddd xm0, xm6 - paddd xm8, xm5 - paddd xm0, xm8 - paddd xm0, xm14 - - movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmovsxbw xm1, xm1 - pmaddwd xm2, xm1, [rsp+16*11] - pshufd xm3, xm2, q1111 - paddd xm2, xm3 ; left+cur - paddd 
xm2, xm0 ; add top - psrldq xm0, 4 - psrad xm2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw, we only care about one value - pslldq xm2, 6 - vpblendw xm1, xm2, 1000b - packsswb xm1, xm1 - pextrb [bufq+xq], xm1, 3 - psrldq xm1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar3 - RET -%endmacro - -generate_grain_uv_fn 420, 1, 1 -generate_grain_uv_fn 422, 1, 0 -generate_grain_uv_fn 444, 0, 0 - -INIT_YMM avx2 -cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut - pcmpeqw m10, m10 - psrld m10, 24 - mov r7d, [fg_dataq+FGData.scaling_shift] - lea r8, [pb_mask] -%define base r8-pb_mask - vpbroadcastw m11, [base+mul_bits+r7*2-14] - mov r7d, [fg_dataq+FGData.clip_to_restricted_range] - vpbroadcastw m12, [base+max+r7*4] - vpbroadcastw m13, [base+min+r7*2] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap - - mov overlapd, [fg_dataq+FGData.overlap_flag] - movifnidn sbyd, sbym - test sbyd, sbyd - setnz r7b - test r7b, overlapb - jnz .vertical_overlap - - imul seed, sbyd, (173 << 24) | 37 - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - unused1, unused2, see, overlap - - lea src_bakq, [srcq+wq] - neg wq - sub dstq, srcq - -.loop_x: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, overlap - - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, overlap - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - add srcq, strideq - add grain_lutq, 82 - dec hd - jg .loop_y - - add wq, 32 - jge .end - lea srcq, [src_bakq+wq] - test overlapd, overlapd - jz .loop_x - - ; r8m = sbym - movq xm15, [pb_27_17_17_27] - cmp dword r8m, 0 - jne .loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) - movq xm14, [pw_1024] -.loop_x_h_overlap: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy - - lea left_offxyd, [offyd+32] ; previous column's 
offy*stride+offx - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_h_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - movd xm4, [grain_lutq+left_offxyq] - punpcklbw xm4, xm3 - pmaddubsw xm4, xm15, xm4 - pmulhrsw xm4, xm14 - packsswb xm4, xm4 - vpblendd m3, m3, m4, 00000001b - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - add srcq, strideq - add grain_lutq, 82 - dec hd - jg .loop_y_h_overlap - - add wq, 32 - jge .end - lea srcq, [src_bakq+wq] - - ; r8m = sbym - cmp dword r8m, 0 - jne .loop_x_hv_overlap - jmp .loop_x_h_overlap - -.end: - RET - -.vertical_overlap: - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap - - movzx sbyd, sbyb - imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, r7d - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - unused1, unused2, see, overlap - - lea src_bakq, [srcq+wq] - neg wq - sub dstq, srcq - - vpbroadcastd m14, [pw_1024] -.loop_x_v_overlap: - vpbroadcastw m15, [pb_27_17_17_27] - - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, overlap, top_offxy - - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, overlap, top_offxy - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_v_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - 
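The seed arithmetic threaded through the removed fgy_32x32xn code above is terse; its constants (37/178 and 173/105 in the row-seed setup, the parity test on seed|0xEFF4, and the 164/747 address terms) line up with the scalar process in the AV1 specification, so roughly it computes the following (get_random_number() from the earlier sketch, names illustrative):

/* Seed for superblock row `sby`, derived from the frame-level grain seed. */
unsigned seed = fg_seed;
seed ^= ((sby * 37  + 178) & 0xff) << 8;
seed ^= ((sby * 173 + 105) & 0xff);

/* Per 32x32 block: one LFSR step (the or/test/cmovp sequence), then the top
 * byte of the seed positions the block inside the 82x73 grain template.
 * 164 = 2*82 and 747 = 9*82 + 9, so at 8 bpc the block reads grain starting
 * at row 9 + 2*offy, column 9 + 2*offx. */
const unsigned rnd = get_random_number(8, &seed);
const int offx = rnd >> 4;
const int offy = rnd & 0xf;
const int8_t *grain = &grain_lut[(9 + 2 * offy) * 82 + (9 + 2 * offx)];

The overlapped variants (.loop_x_v_overlap and friends) keep two such seeds packed as (cur_seed << 16) | top_seed, one for the current superblock row and one for the row above, so both rows derive consistent grain offsets at the seam.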
pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - movu m4, [grain_lutq+top_offxyq] - punpckhbw m6, m4, m3 - punpcklbw m4, m3 - pmaddubsw m6, m15, m6 - pmaddubsw m4, m15, m4 - pmulhrsw m6, m14 - pmulhrsw m4, m14 - packsswb m3, m4, m6 - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line - add srcq, strideq - add grain_lutq, 82 - dec hw - jz .end_y_v_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_v_overlap - jmp .loop_y - -.end_y_v_overlap: - add wq, 32 - jge .end_hv - lea srcq, [src_bakq+wq] - - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap - - movq xm15, [pb_27_17_17_27] -.loop_x_hv_overlap: - vpbroadcastw m8, [pb_27_17_17_27] - - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy - - lea topleft_offxyq, [top_offxyq+32] - lea left_offxyq, [offyq+32] - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_hv_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel - vpgatherdd m9, [scalingq+m4], m3 - pcmpeqw m3, m3 - vpgatherdd m4, [scalingq+m5], m3 - pcmpeqw m3, m3 - vpgatherdd m5, [scalingq+m6], m3 - pcmpeqw m3, m3 - vpgatherdd m6, [scalingq+m7], m3 - pand m9, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m9, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - movu m6, [grain_lutq+top_offxyq] - movd xm4, [grain_lutq+left_offxyq] - movd xm7, [grain_lutq+topleft_offxyq] - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw xm4, xm3 - punpcklbw xm7, xm6 - pmaddubsw xm4, xm15, xm4 - pmaddubsw xm7, xm15, xm7 - pmulhrsw xm4, xm14 - pmulhrsw xm7, xm14 - packsswb xm4, xm4 - packsswb xm7, xm7 - vpblendd m3, m4, 00000001b - vpblendd m6, m7, 00000001b - ; followed by v 
interpolation (top | cur -> cur) - punpckhbw m7, m6, m3 - punpcklbw m6, m3 - pmaddubsw m7, m8, m7 - pmaddubsw m6, m8, m6 - pmulhrsw m7, m14 - pmulhrsw m6, m14 - packsswb m3, m6, m7 - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m9 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line - add srcq, strideq - add grain_lutq, 82 - dec hw - jz .end_y_hv_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_hv_overlap - jmp .loop_y_h_overlap - -.end_y_hv_overlap: - add wq, 32 - lea srcq, [src_bakq+wq] - jl .loop_x_hv_overlap - -.end_hv: - RET - -%macro FGUV_FN 3 ; name, ss_hor, ss_ver -cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, h, sby, luma, lstride, uv_pl, is_id - mov r7d, [fg_dataq+FGData.scaling_shift] - lea r8, [pb_mask] -%define base r8-pb_mask - vpbroadcastw m11, [base+mul_bits+r7*2-14] - mov r7d, [fg_dataq+FGData.clip_to_restricted_range] - mov r9d, dword is_idm - vpbroadcastw m13, [base+min+r7*2] - shlx r7d, r7d, r9d - vpbroadcastw m12, [base+max+r7*2] - - cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 - jne .csfl - -%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap - -%if %1 - mov r7d, dword r11m - vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4] - vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4] - punpcklbw m14, m1, m0 - vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] -%else - vpbroadcastd m14, [pw_1024] -%if %2 - vpbroadcastq m15, [pb_23_22] -%else - vpbroadcastq xm15, [pb_27_17_17_27] -%endif -%endif -%if %3 - vpbroadcastw m10, [pb_23_22] -%elif %2 - mova m10, [pb_8x_27_17_8x_17_27] -%endif - - mov overlapd, [fg_dataq+FGData.overlap_flag] - movifnidn sbyd, sbym - test sbyd, sbyd - setnz r7b - test r7b, overlapb - jnz %%vertical_overlap - - imul seed, sbyd, (173 << 24) | 37 - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - unused2, unused3, see, overlap, unused4, unused5, lstride - - mov lumaq, r9mp - lea r12, [srcq+wq] - lea r13, [dstq+wq] - lea r14, [lumaq+wq*(1+%2)] - mov r11mp, r12 - mov r12mp, r13 - mov lstrideq, r10mp - neg wq - -%%loop_x: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, unused1, unused2, lstride - - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, overlap, unused1, unused2, lstride - - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - 
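The overlap handling above follows the comment "do h interpolation first (so top | top/left -> top, left | cur -> cur)": grain from the neighbouring block is first blended horizontally into the seam columns, then the result is blended vertically with the row(s) above, using the fixed weights in pb_27_17_17_27 (two-sample seams, weights 27/17 then 17/27) and pb_23_22 (one-sample seams on subsampled chroma). The pmaddubsw + pmulhrsw pw_1024 pairs here, like the pmaddwd + paddd pd_16 + psrad 5 sequence in the 16 bpc path further up, amount to a round2-by-5 blend; a scalar sketch (iclip() as before):

/* Blend one grain sample across a block seam and clamp it back to the grain
 * range (int8 at 8 bpc). Weight pairs: (27,17)/(17,27) for 2-sample seams,
 * (23,22) for 1-sample seams on subsampled chroma. */
static inline int blend_grain(const int old_g, const int new_g,
                              const int w_old, const int w_new)
{
    return iclip((old_g * w_old + new_g * w_new + 16) >> 5, -128, 127);
}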
pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - pxor m2, m2 - mova m4, [lumaq] - mova m0, [srcq] -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - - ; scaling[luma_src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq-3+m4], m3 - vpgatherdd m4, [scalingq-3+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq-3+m6], m3 - vpgatherdd m6, [scalingq-3+m7], m9 - REPX {psrld x, 24}, m8, m4, m5, m6 - packusdw m8, m4 - packusdw m5, m6 - - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] -%if %2 - movu xm3, [grain_lutq+offxyq+ 0] - vinserti128 m3, [grain_lutq+offxyq+82], 1 -%else - movu m3, [grain_lutq+offxyq] -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 -%if %2 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - mova [dstq], m0 -%endif - -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82<<%2 - sub hb, 1+%2 - jg %%loop_y - - add wq, 32>>%2 - jge %%end - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - test overlapd, overlapd - jz %%loop_x - - ; r8m = sbym - cmp dword r8m, 0 - jne %%loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -%%loop_x_h_overlap: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, unused1, unused2, lstride - - lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, unused1, unused2, lstride - - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y_h_overlap: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 
- punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - - ; scaling[luma_src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq-3+m4], m3 - vpgatherdd m4, [scalingq-3+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq-3+m6], m3 - vpgatherdd m6, [scalingq-3+m7], m9 - REPX {psrld x, 24}, m8, m4, m5, m6 - packusdw m8, m4 - packusdw m5, m6 - - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] -%if %2 -%if %1 - vpbroadcastq m6, [pb_23_22] -%endif - movu xm3, [grain_lutq+offxyq+ 0] - movd xm4, [grain_lutq+left_offxyq+ 0] - vinserti128 m3, [grain_lutq+offxyq+82], 1 - vinserti128 m4, [grain_lutq+left_offxyq+82], 1 - punpcklbw m4, m3 -%if %1 - pmaddubsw m4, m6, m4 - pmulhrsw m4, [pw_1024] -%else - pmaddubsw m4, m15, m4 - pmulhrsw m4, m14 -%endif - packsswb m4, m4 - vpblendd m3, m3, m4, 00010001b -%else -%if %1 - movq xm6, [pb_27_17_17_27] -%endif - movu m3, [grain_lutq+offxyq] - movd xm4, [grain_lutq+left_offxyq] - punpcklbw xm4, xm3 -%if %1 - pmaddubsw xm4, xm6, xm4 - pmulhrsw xm4, [pw_1024] -%else - pmaddubsw xm4, xm15, xm4 - pmulhrsw xm4, xm14 -%endif - packsswb xm4, xm4 - vpblendd m3, m3, m4, 00000001b -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 -%if %2 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - mova [dstq], m0 -%endif - -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82*(1+%2) - sub hb, 1+%2 - jg %%loop_y_h_overlap - - add wq, 32>>%2 - jge %%end - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - - ; r8m = sbym - cmp dword r8m, 0 - jne %%loop_x_hv_overlap - jmp %%loop_x_h_overlap - -%%end: - RET - -%%vertical_overlap: - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ - sby, see, overlap, unused1, unused2, lstride - - movzx sbyd, sbyb - imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, r7d - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - unused1, unused2, see, overlap, unused3, unused4, lstride - - mov lumaq, r9mp - lea r12, [srcq+wq] - lea r13, [dstq+wq] - lea r14, [lumaq+wq*(1+%2)] - mov r11mp, r12 - mov r12mp, r13 - mov lstrideq, r10mp - neg wq - -%%loop_x_v_overlap: - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, top_offxy, unused, lstride - - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - 
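When chroma_scaling_from_luma is not set (the %1 == 1 instantiations of this macro, per the "not-csfl" parameter comment), the scaling LUT is not indexed by the chroma sample directly: the collocated luma (horizontally averaged for the subsampled layouts) is combined with the chroma sample through uv_luma_mult/uv_mult, offset by uv_offset and clamped to 8 bits first. That is what the punpcklbw { luma, chroma } / pmaddubsw m14 / psraw 6 / paddw m15 / packuswb ("pack+unpack = clip") sequence computes; roughly, per sample (iclip() as before, names illustrative):

static inline int uv_scaling_index(const int luma_avg, const int chroma,
                                   const int uv_luma_mult, const int uv_mult,
                                   const int uv_offset)
{
    const int combined = luma_avg * uv_luma_mult + chroma * uv_mult;
    return iclip((combined >> 6) + uv_offset, 0, 255);   /* 8 bpc */
}

With %1 == 0 (the .csfl instantiation, i.e. chroma_scaling_from_luma), the averaged luma value itself indexes the LUT.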
imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, overlap, top_offxy, unused, lstride - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -%if %2 == 0 - vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] -%endif -%%loop_y_v_overlap: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - - ; scaling[luma_src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq-3+m4], m3 - vpgatherdd m4, [scalingq-3+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq-3+m6], m3 - vpgatherdd m6, [scalingq-3+m7], m9 - REPX {psrld x, 24}, m8, m4, m5, m6 - packusdw m8, m4 - packusdw m5, m6 - -%if %2 - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word -%endif - - ; grain = grain_lut[offy+y][offx+x] -%if %3 == 0 -%if %2 - movu xm3, [grain_lutq+offxyq] - movu xm4, [grain_lutq+top_offxyq] - vinserti128 m3, [grain_lutq+offxyq+82], 1 - vinserti128 m4, [grain_lutq+top_offxyq+82], 1 -%else - movu m3, [grain_lutq+offxyq] - movu m4, [grain_lutq+top_offxyq] -%endif - punpckhbw m9, m4, m3 - punpcklbw m4, m3 - pmaddubsw m9, m10, m9 - pmaddubsw m4, m10, m4 -%if %1 - pmulhrsw m9, [pw_1024] - pmulhrsw m4, [pw_1024] -%else - pmulhrsw m9, m14 - pmulhrsw m4, m14 -%endif - packsswb m3, m4, m9 -%else - movq xm3, [grain_lutq+offxyq] - movq xm4, [grain_lutq+top_offxyq] - vinserti128 m3, [grain_lutq+offxyq+8], 1 - vinserti128 m4, [grain_lutq+top_offxyq+8], 1 - punpcklbw m4, m3 - pmaddubsw m4, m10, m4 -%if %1 - pmulhrsw m4, [pw_1024] -%else - pmulhrsw m4, m14 -%endif - packsswb m4, m4 - vpermq m4, m4, q3120 - ; only interpolate first line, insert second line unmodified - vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) -%if %2 - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - pxor m6, m6 - punpckhbw m9, m0, m6 - punpcklbw m0, m6 ; m0-1: src as word - - paddw m0, m2 - paddw m9, m3 - pmaxsw m0, m13 - pmaxsw m9, m13 - pminsw m0, m12 - pminsw m9, m12 - packuswb m0, m9 - mova [dstq], m0 -%endif - - sub hb, 1+%2 - jle %%end_y_v_overlap -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, 
lstrideq -%endif - add grain_lutq, 82<<%2 -%if %2 == 0 - vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] - btc hd, 16 - jnc %%loop_y_v_overlap -%endif - jmp %%loop_y - -%%end_y_v_overlap: - add wq, 32>>%2 - jge %%end_hv - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap - -%%loop_x_hv_overlap: - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride - - lea topleft_offxyq, [top_offxyq+(32>>%2)] - lea left_offxyq, [offyq+(32>>%2)] - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -%if %2 == 0 - vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] -%endif -%%loop_y_hv_overlap: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m9, m9 - pcmpeqw m3, m3 - vpgatherdd m8, [scalingq-3+m4], m9 - vpgatherdd m4, [scalingq-3+m5], m3 - pcmpeqw m9, m9 - pcmpeqw m3, m3 - vpgatherdd m5, [scalingq-3+m6], m9 - vpgatherdd m6, [scalingq-3+m7], m3 - REPX {psrld x, 24}, m8, m4, m5, m6 - packusdw m8, m4 - packusdw m5, m6 - -%if %2 - ; unpack chroma source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word -%endif - - ; grain = grain_lut[offy+y][offx+x] -%if %1 -%if %2 - vpbroadcastq m9, [pb_23_22] -%else - vpbroadcastq xm9, [pb_27_17_17_27] -%endif -%endif - -%if %2 - movu xm3, [grain_lutq+offxyq] -%if %3 - movq xm6, [grain_lutq+top_offxyq] -%else - movu xm6, [grain_lutq+top_offxyq] -%endif - vinserti128 m3, [grain_lutq+offxyq+82], 1 -%if %3 - vinserti128 m6, [grain_lutq+top_offxyq+8], 1 -%else - vinserti128 m6, [grain_lutq+top_offxyq+82], 1 -%endif -%else - movu m3, [grain_lutq+offxyq] - movu m6, [grain_lutq+top_offxyq] -%endif - movd xm4, [grain_lutq+left_offxyq] - movd xm7, [grain_lutq+topleft_offxyq] -%if %2 - vinserti128 m4, [grain_lutq+left_offxyq+82], 1 -%if %3 == 0 - 
vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1 -%endif -%endif - - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) -%if %2 - punpcklbw m4, m3 -%if %3 - punpcklbw xm7, xm6 -%else - punpcklbw m7, m6 -%endif - punpcklqdq m4, m7 -%if %1 - pmaddubsw m4, m9, m4 - pmulhrsw m4, [pw_1024] -%else - pmaddubsw m4, m15, m4 - pmulhrsw m4, m14 -%endif - packsswb m4, m4 - vpblendd m3, m4, 00010001b - psrldq m4, 4 -%if %3 - vpblendd m6, m6, m4, 00000001b -%else - vpblendd m6, m6, m4, 00010001b -%endif -%else - punpcklbw xm4, xm3 - punpcklbw xm7, xm6 - punpcklqdq xm4, xm7 -%if %1 - pmaddubsw xm4, xm9, xm4 - pmulhrsw xm4, [pw_1024] -%else - pmaddubsw xm4, xm15, xm4 - pmulhrsw xm4, xm14 -%endif - packsswb xm4, xm4 - vpblendd m3, m3, m4, 00000001b - psrldq xm4, 4 - vpblendd m6, m6, m4, 00000001b -%endif - - ; followed by v interpolation (top | cur -> cur) -%if %3 - vpermq m9, m3, q3120 - punpcklbw m6, m9 - pmaddubsw m6, m10, m6 -%if %1 - pmulhrsw m6, [pw_1024] -%else - pmulhrsw m6, m14 -%endif - packsswb m6, m6 - vpermq m6, m6, q3120 - vpblendd m3, m3, m6, 00001111b -%else - punpckhbw m9, m6, m3 - punpcklbw m6, m3 - pmaddubsw m9, m10, m9 - pmaddubsw m6, m10, m6 -%if %1 - pmulhrsw m9, [pw_1024] - pmulhrsw m6, [pw_1024] -%else - pmulhrsw m9, m14 - pmulhrsw m6, m14 -%endif - packsswb m3, m6, m9 -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) -%if %2 - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - pxor m6, m6 - punpckhbw m9, m0, m6 - punpcklbw m0, m6 ; m0-1: src as word - paddw m0, m2 - paddw m9, m3 - pmaxsw m0, m13 - pmaxsw m9, m13 - pminsw m0, m12 - pminsw m9, m12 - packuswb m0, m9 - mova [dstq], m0 -%endif - -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82<<%2 - sub hb, 1+%2 -%if %2 - jg %%loop_y_h_overlap -%else - je %%end_y_hv_overlap - vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] - btc hd, 16 - jnc %%loop_y_hv_overlap - jmp %%loop_y_h_overlap -%endif - -%%end_y_hv_overlap: - add wq, 32>>%2 - jge %%end_hv - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - jmp %%loop_x_hv_overlap - -%%end_hv: - RET -%endmacro - - %%FGUV_32x32xN_LOOP 1, %2, %3 -.csfl: - %%FGUV_32x32xN_LOOP 0, %2, %3 -%endmacro - -FGUV_FN 420, 1, 1 -FGUV_FN 422, 1, 0 -FGUV_FN 444, 0, 0 - -%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/filmgrain_avx2.asm dav1d-1.0.0/src/x86/filmgrain_avx2.asm --- dav1d-0.9.2/src/x86/filmgrain_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain_avx2.asm 2022-03-18 14:31:56.010356000 +0000 @@ -0,0 +1,2107 @@ +; Copyright © 2019-2022, VideoLAN and dav1d authors +; Copyright © 2019-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 +gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +; note: the order of (some of) the following constants matter +pb_27_17: times 2 db 27, 17 +byte_blend: db 0, 0, 0, -1 +pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 +pb_17_27: times 2 db 17, 27 +pb_1: times 4 db 1 +pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32 +next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +fg_min: times 4 db 0 + times 4 db 16 +fg_max: times 4 db 255 + times 4 db 240 + times 4 db 235 +pd_m65536: dd -65536 +pw_8: times 2 dw 8 +pw_1024: times 2 dw 1024 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +pw_1: dw 1 + +%macro JMP_TABLE 2-* + %1_8bpc_%2_table: + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 + +SECTION .text + +INIT_YMM avx2 +cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data +%define base r4-generate_grain_y_8bpc_avx2_table + lea r4, [generate_grain_y_8bpc_avx2_table] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r6d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movq xm4, [base+mul_bits] + movq xm5, [base+hmul_bits] + mov r7, -73*82 + mova xm6, [base+pb_mask] + sub bufq, r7 + vpbroadcastw xm7, [base+round+r6*2] + lea r6, [gaussian_sequence] + movsxd r5, [r4+r5*4] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm3, xm2 ; 4 
next output seeds + pshuflw xm0, xm3, q3333 + psrlw xm3, 5 + pand xm2, xm0, xm1 + movq r2, xm3 + psrlw xm3, xm2, 10 + por xm2, xm3 + pmullw xm2, xm4 + pmulhuw xm0, xm5 + movzx r3d, r2w + pshufb xm3, xm6, xm2 + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm0, xm2 + movd xm2, [r6+r3*2] + rorx r3, r2, 32 + por xm3, xm0 + shr r2d, 16 + pinsrw xm2, [r6+r2*2], 1 + pshuflw xm0, xm3, q3333 + movzx r2d, r3w + psrlw xm3, 5 + pinsrw xm2, [r6+r2*2], 2 + shr r3d, 16 + movq r2, xm3 + pinsrw xm2, [r6+r3*2], 3 + movzx r3d, r2w + pinsrw xm2, [r6+r3*2], 4 + rorx r3, r2, 32 + shr r2d, 16 + pinsrw xm2, [r6+r2*2], 5 + movzx r2d, r3w + pinsrw xm2, [r6+r2*2], 6 + shr r3d, 16 + pinsrw xm2, [r6+r3*2], 7 + pmulhrsw xm2, xm7 + packsswb xm2, xm2 + movq [bufq+r7], xm2 + add r7, 8 + jl .loop + + ; auto-regression code + add r5, r4 + jmp r5 + +.ar1: + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm5, [fg_dataq+FGData.ar_coeffs_y] + mova xm2, [base+gen_shufC] + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 + pinsrb xm5, [base+pb_1], 3 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + pmovsxbw xm5, xm5 + pshufd xm4, xm5, q0000 + pshufd xm5, xm5, q1111 + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm1, [bufq+xq-82-3] + pshufb xm0, xm1, xm2 + punpckhwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + movsx val0d, byte [bufq+xq] + sarx val3d, val3d, shiftd + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign xmm_regs_used 16 + %assign stack_size_padded 168 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 + movaps [rsp+16*7], xmm13 + movaps [rsp+16*8], xmm14 + movaps [rsp+16*9], xmm15 +%endif + DEFINE_ARGS buf, fg_data, h, x + mov r6d, [fg_dataq+FGData.ar_coeff_shift] + pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + vpbroadcastd xm10, [base+round_vals-14+r6*2] + movd xm11, [base+byte_blend+1] + pmovsxbw xm9, xm9 + pshufd xm4, xm7, q0000 + mova xm12, [base+gen_shufA] + pshufd xm5, xm7, q3333 + mova xm13, [base+gen_shufB] + pshufd xm6, xm7, q1111 + mova xm14, [base+gen_shufC] + pshufd xm7, xm7, q2222 + mova xm15, [base+gen_shufD] + pshufd xm8, xm9, q0000 + psrld xm10, 16 + pshufd xm9, xm9, q1111 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pshufb xm2, xm0, xm12 + pmaddwd xm2, xm4 + pshufb xm3, xm1, xm13 + pmaddwd xm3, xm5 + paddd xm2, xm3 + pshufb xm3, xm0, xm14 + pmaddwd xm3, xm6 + punpckhqdq xm0, xm0 + punpcklwd xm0, xm1 + pmaddwd xm0, xm7 + pshufb xm1, xm15 + pmaddwd xm1, xm8 + paddd xm2, xm10 + paddd xm2, xm3 + paddd xm0, xm1 + paddd xm2, xm0 + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw 
xm1, xm0 + pmaddwd xm3, xm9, xm1 + psrldq xm1, 4 ; y=0,x=0 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw xm3, xm1 + packsswb xm3, xm3 + pextrb [bufq+xq], xm3, 0 + pslldq xm3, 2 + vpblendvb xm0, xm3, xm11 + psrldq xm0, 1 + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +INIT_YMM avx2 +.ar3: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign stack_offset 16 + ALLOC_STACK 16*14 + %assign stack_size stack_size - 16*4 + %assign xmm_regs_used 12 + movaps [rsp+16*12], xmm8 + movaps [rsp+16*13], xmm9 + movaps [rsp+16*14], xmm10 + movaps [rsp+16*15], xmm11 +%else + ALLOC_STACK 16*12 +%endif + mov r6d, [fg_dataq+FGData.ar_coeff_shift] + movd xm11, [base+byte_blend] + pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pshufd m0, m1, q0000 + mova [rsp+16* 0], m0 + pshufd m0, m1, q1111 + mova [rsp+16* 2], m0 + pshufd m0, m1, q2222 + mova [rsp+16* 4], m0 + pshufd m1, m1, q3333 + mova [rsp+16* 6], m1 + pshufd xm0, xm2, q0000 + mova [rsp+16* 8], xm0 + pshufd xm0, xm2, q1111 + mova [rsp+16* 9], xm0 + psrldq xm7, xm2, 10 + mova m8, [base+gen_shufA] + pinsrw xm2, [base+pw_1], 5 + mova m9, [base+gen_shufC] + pshufd xm2, xm2, q2222 + movu m10, [base+gen_shufE] + vpbroadcastw xm6, [base+round_vals-12+r6*2] + pinsrw xm7, [base+round_vals+r6*2-10], 3 + mova [rsp+16*10], xm2 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 +.x_loop_ar3: + movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12] + movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + punpcklbw m3, m5, m5 + punpckhwd m5, m4 + psraw m3, 8 + punpcklbw m5, m5 + psraw m5, 8 + punpcklbw xm4, xm4 + psraw xm4, 8 + pshufb m0, m3, m8 + pmaddwd m0, [rsp+16*0] + pshufb m1, m3, m9 + pmaddwd m1, [rsp+16*2] + shufps m2, m3, m5, q1032 + paddd m0, m1 + pshufb m1, m2, m8 + vperm2i128 m3, m4, 0x21 + pmaddwd m1, [rsp+16*4] + shufps xm2, xm3, q1021 + vpblendd m2, m3, 0xf0 + pshufb m2, m10 + paddd m0, m1 + pmaddwd m2, [rsp+16*6] + pshufb xm1, xm4, xm9 + pmaddwd xm1, [rsp+16*8] + shufps xm4, xm5, q1132 + paddd m0, m2 + pshufb xm2, xm4, xm8 + pshufd xm4, xm4, q2121 + pmaddwd xm2, [rsp+16*9] + punpcklwd xm4, xm6 + pmaddwd xm4, [rsp+16*10] + vextracti128 xm3, m0, 1 + paddd xm0, xm1 + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] + paddd xm2, xm4 + paddd xm0, xm2 + paddd xm0, xm3 +.x_loop_ar3_inner: + pmovsxbw xm2, xm1 + pmaddwd xm2, xm7 + pshufd xm3, xm2, q1111 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb xm2, xm2 + pextrb [bufq+xq], xm2, 0 + pslldq xm2, 3 + vpblendvb xm1, xm2, xm11 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv +%define base r4-generate_grain_uv_%1_8bpc_avx2_table + lea r4, [generate_grain_uv_%1_8bpc_avx2_table] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r6d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm5, [base+hmul_bits] + mova xm6, 
[base+pb_mask] + vpbroadcastw xm7, [base+round+r6*2] + vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] + pxor xm0, xm2 + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44 +.loop_y: + mov r5, -44 +%else + mov r5, -73*82 + sub bufq, r5 +%endif +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + movq r8, xm2 + movzx r9d, r8w + movd xm2, [r6+r9*2] + rorx r9, r8, 32 + shr r8d, 16 + pinsrw xm2, [r6+r8*2], 1 + movzx r8d, r9w + pinsrw xm2, [r6+r8*2], 2 + shr r9d, 16 + pinsrw xm2, [r6+r9*2], 3 + pmulhrsw xm2, xm7 + packsswb xm2, xm2 + movd [bufq+r5], xm2 + add r5, 4 + jl .loop +%if %2 + add bufq, 82 + dec r7d + jg .loop_y +%endif + + ; auto-regression code + movsxd r6, [fg_dataq+FGData.ar_coeff_lag] + movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4] + add r6, r4 + jmp r6 + +INIT_YMM avx2 +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd xm3, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h + pmovsxbw xm2, xm2 +%if %2 + vpbroadcastd m7, [base+pb_1] + vpbroadcastw m6, [base+hmul_bits+2+%3*2] +%endif + vpbroadcastw m2, xm2 + vpbroadcastw m3, xm3 + pxor m12, m12 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm4, [bufyq] + vinserti128 m4, [bufyq+32], 1 +%if %3 + movu xm0, [bufyq+82] + vinserti128 m0, [bufyq+82+32], 1 +%endif + movu xm5, [bufyq+16] + vinserti128 m5, [bufyq+48], 1 +%if %3 + movu xm1, [bufyq+82+16] + vinserti128 m1, [bufyq+82+48], 1 +%endif + pmaddubsw m4, m7, m4 +%if %3 + pmaddubsw m0, m7, m0 +%endif + pmaddubsw m5, m7, m5 +%if %3 + pmaddubsw m1, m7, m1 + paddw m4, m0 + paddw m5, m1 +%endif + pmulhrsw m4, m6 + pmulhrsw m5, m6 +%else + xor r3d, r3d + ; first 32x2 pixels +.x_loop_ar0: + movu m4, [bufyq+r3] + pcmpgtb m0, m12, m4 + punpckhbw m5, m4, m0 + punpcklbw m4, m0 +%endif + pmullw m4, m2 + pmullw m5, m2 + pmulhrsw m4, m3 + pmulhrsw m5, m3 +%if %2 + movu m1, [bufq] +%else + movu m1, [bufq+r3] +%endif + pcmpgtb m8, m12, m1 + punpcklbw m0, m1, m8 + punpckhbw m1, m8 + paddw m0, m4 + paddw m1, m5 + packsswb m0, m1 +%if %2 + movu [bufq], m0 +%else + movu [bufq+r3], m0 + add r3d, 32 + cmp r3d, 64 + jl .x_loop_ar0 +%endif + + ; last 6/12 pixels + movu xm4, [bufyq+32*2] +%if %2 +%if %3 + movu xm5, [bufyq+32*2+82] +%endif + pmaddubsw xm4, xm7, xm4 +%if %3 + pmaddubsw xm5, xm7, xm5 + paddw xm4, xm5 +%endif + movq xm0, [bufq+32] + pmulhrsw xm4, xm6 + pmullw xm4, xm2 + pmulhrsw xm4, xm3 + pcmpgtb xm5, xm12, xm0 + punpcklbw xm5, xm0, xm5 + paddw xm4, xm5 + packsswb xm4, xm4 + pblendw xm0, xm4, xm0, 1000b + movq [bufq+32], xm0 +%else + movu xm0, [bufq+64] + pcmpgtb xm1, xm12, xm4 + punpckhbw xm5, xm4, xm1 + punpcklbw xm4, xm1 + pmullw xm5, xm2 + pmullw xm4, xm2 + vpblendd xm1, xm3, xm12, 0x0c + pmulhrsw xm5, xm1 + pmulhrsw xm4, xm3 + pcmpgtb xm1, xm12, xm0 + punpckhbw xm8, xm0, xm1 + punpcklbw xm0, xm1 + paddw xm5, xm8 + paddw xm0, xm4 + packsswb xm0, xm5 + movu [bufq+64], xm0 +%endif + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +INIT_XMM avx2 
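The .ar1 path below is the lag-1 auto-regressive filter of the AV1 film grain model applied to the chroma grain template: each sample becomes its pre-generated Gaussian value plus a rounded, right-shifted weighted sum of the three neighbours in the row above, the neighbour to its left and the collocated (already-downsampled) luma grain sample. A minimal scalar sketch of that recurrence follows; the names (ar1_row, round2) and the exact parameter layout are illustrative assumptions, not dav1d's reference code.

#include <stdint.h>

/* Right shift with rounding to nearest (Round2 in the AV1 spec). */
static inline int round2(const int x, const int shift) {
    return (x + (1 << shift >> 1)) >> shift;
}

/* Illustrative lag-1 AR pass over one row of a chroma grain template.
 * row and above are assumed to point past the template's left border;
 * coeff[0..2] weight the row above at x-1, x, x+1, coeff[3] weights the
 * sample to the left, and luma_coeff weights the collocated, subsampled
 * luma grain.  shift is the frame header's ar_coeff_shift. */
static void ar1_row(int8_t *row, const int8_t *above, const int8_t *luma,
                    const int8_t coeff[4], const int luma_coeff,
                    const int shift, const int w)
{
    for (int x = 0; x < w; x++) {
        const int sum = coeff[0] * above[x - 1] + coeff[1] * above[x] +
                        coeff[2] * above[x + 1] + coeff[3] * row[x - 1] +
                        luma_coeff * luma[x];
        const int v = row[x] + round2(sum, shift);
        /* clamp to the 8-bit grain range, as the 8bpc code does */
        row[x] = (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
    }
}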
+.ar1: + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + vpbroadcastd xm7, [base+pb_1] + vpbroadcastw xm6, [base+hmul_bits+2+%3*2] +%endif + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left +%if %2 + movq xm8, [bufyq+xq*2] +%if %3 + movq xm9, [bufyq+xq*2+82] +%endif +%endif + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right +%if %2 + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 +%else + pmovsxbw xm8, [bufyq+xq] +%endif + punpcklwd xm0, xm2 + punpcklwd xm1, xm8 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 + paddd xm0, xm3 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar1 + RET + +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm13, [base+round_vals-12+shiftq*2] + pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 + pinsrw xm0, [base+pw_1], 5 +%if %2 + vpbroadcastw xm12, [base+hmul_bits+2+%3*2] + vpbroadcastd xm11, [base+pb_1] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd xm4, xm7, q0000 + pshufd xm5, xm7, q3333 + pshufd xm6, xm7, q1111 + pshufd xm7, xm7, q2222 + pshufd xm8, xm0, q0000 + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pshufb xm2, xm0, [base+gen_shufA] + pmaddwd xm2, xm4 + pshufb xm3, xm1, [base+gen_shufB] + pmaddwd xm3, xm5 + paddd xm2, xm3 + pshufb xm3, xm0, [base+gen_shufC] + pmaddwd xm3, xm6 + punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5] + punpcklwd xm0, xm1 + pmaddwd xm0, xm7 + pshufb xm1, [gen_shufD] + pmaddwd xm1, xm8 + paddd xm2, xm3 + paddd xm0, xm1 + paddd xm2, xm0 + +%if %2 + movq xm0, [bufyq+xq*2] +%if %3 + movq xm3, [bufyq+xq*2+82] +%endif + pmaddubsw xm0, xm11, xm0 +%if %3 + pmaddubsw xm3, xm11, xm3 + paddw xm0, xm3 +%endif + pmulhrsw xm0, xm12 +%else + pmovsxbw xm0, [bufyq+xq] +%endif + punpcklwd xm0, xm13 + pmaddwd xm0, xm10 + paddd xm2, xm0 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm0, xm0 + pmaddwd xm3, xm0, xm9 + psrldq xm0, 2 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, 
[fg_dataq+FGData.ar_coeff_shift] + pslldq xm3, 2 + paddw xm3, xm0 + pblendw xm0, xm3, 00000010b + packsswb xm0, xm0 + pextrb [bufq+xq], xm0, 1 + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +INIT_YMM avx2 +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 + pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 + vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] + movd xm13, [base+round_vals-10+shiftq*2] + vpbroadcastd xm14, [base+round_vals-14+shiftq*2] + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m8, m0, q2222 + pshufd m9, m0, q3333 + pshufd xm10, xm1, q0000 + pshufd xm11, xm1, q1111 + pshufhw xm12, xm1, q0000 + psraw xm2, 8 + palignr xm13, xm1, 10 + punpckhwd xm12, xm2 ; interleave luma cf + psrld xm14, 16 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + vpbroadcastw xm15, [base+hmul_bits+2+%3*2] + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) +.x_loop_ar3: + vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12 + palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12] + vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + vpblendd m3, m1, 0x0f + pxor m0, m0 + pcmpgtb m2, m0, m3 + pcmpgtb m0, m4 + punpcklbw m1, m3, m2 + punpckhbw m3, m2 + punpcklbw m2, m4, m0 + punpckhbw xm4, xm0 + pshufb m0, m1, [base+gen_shufA] + pmaddwd m0, m6 + pshufb m5, m1, [base+gen_shufC] + pmaddwd m5, m7 + shufps m1, m3, q1032 + paddd m0, m5 + pshufb m5, m1, [base+gen_shufA] + pmaddwd m5, m8 + shufps xm1, xm3, q2121 + vpblendd m1, m2, 0xf0 + pshufb m1, [base+gen_shufE] + pmaddwd m1, m9 + paddd m0, m5 + pshufb xm3, xm2, [base+gen_shufC] + paddd m0, m1 + pmaddwd xm3, xm10 + palignr xm1, xm4, xm2, 2 + punpckhwd xm1, xm2, xm1 + pmaddwd xm1, xm11 + palignr xm4, xm2, 12 + paddd xm3, xm1 +%if %2 + vpbroadcastd xm5, [base+pb_1] + movq xm1, [bufyq+xq*2] + pmaddubsw xm1, xm5, xm1 +%if %3 + movq xm2, [bufyq+xq*2+82] + pmaddubsw xm5, xm2 + paddw xm1, xm5 +%endif + pmulhrsw xm1, xm15 +%else + pmovsxbw xm1, [bufyq+xq] +%endif + punpcklwd xm4, xm1 + pmaddwd xm4, xm12 + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] + vextracti128 xm2, m0, 1 + paddd xm0, xm14 + paddd xm3, xm4 + paddd xm0, xm3 + paddd xm0, xm2 +.x_loop_ar3_inner: + pmovsxbw xm1, xm1 + pmaddwd xm2, xm13, xm1 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + psrldq xm1, 2 + ; don't packssdw, we only care about one value + punpckldq xm2, xm2 + pblendw xm1, xm2, 0100b + packsswb xm1, xm1 + pextrb [bufq+xq], xm1, 2 + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +INIT_YMM avx2 +cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, see, overlap +%define base r9-pd_m65536 + lea r9, [pd_m65536] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] + vpbroadcastd m8, [base+pd_m65536] + vpbroadcastw m9, [base+mul_bits+r6*2-14] + vpbroadcastd m10, [base+fg_min+r7*4] + vpbroadcastd m11, 
[base+fg_max+r7*8] + vpbroadcastd m12, [base+pw_1024] + movq xm13, [base+pb_27_17_17_27] + test sbyd, sbyd + setnz r7b + pxor m7, m7 + test r7b, overlapb + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq] + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm5 + pmaddubsw xm4, xm13, xm4 + pmulhrsw xm4, xm12 + packsswb xm4, xm4 + vpblendd m4, m5, 0xfe + punpckhbw m5, m7 + punpcklbw m4, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg 
.loop_y_h_overlap + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused, sby, see, overlap + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + +.loop_x_v_overlap: + vpbroadcastd m14, [pb_27_17] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_v_overlap: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] + punpcklbw m5, m4, m6 + punpckhbw m4, m6 + pmaddubsw m5, m14, m5 + pmaddubsw m4, m14, m4 + pmulhrsw m5, m12 + pmulhrsw m4, m12 + packsswb m5, m4 + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hb + jz .end_y_v_overlap + vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_v_overlap + jmp .loop_y +.end_y_v_overlap: + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +.loop_x_hv_overlap: + vpbroadcastd m14, [pb_27_17] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, 
topleft_offxy + + lea topleft_offxyd, [top_offxyq+32] + lea left_offxyd, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_hv_overlap: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq] + movd xm7, [grain_lutq+left_offxyq] + movu m4, [grain_lutq+top_offxyq] + movd xm5, [grain_lutq+topleft_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw xm7, xm6 + punpcklbw xm5, xm4 + pmaddubsw xm7, xm13, xm7 + pmaddubsw xm5, xm13, xm5 + pmulhrsw xm7, xm12 + pmulhrsw xm5, xm12 + packsswb xm7, xm7 + packsswb xm5, xm5 + vpblendd m7, m6, 0xfe + vpblendd m5, m4, 0xfe + ; followed by v interpolation (top | cur -> cur) + punpckhbw m4, m6 + punpcklbw m5, m7 + pmaddubsw m4, m14, m4 + pmaddubsw m5, m14, m5 + pmulhrsw m4, m12 + pmulhrsw m5, m12 + pxor m7, m7 + packsswb m5, m4 + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hb + jz .end_y_hv_overlap + vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq] + jl .loop_x_hv_overlap +.end: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, overlap, uv_pl, is_id +%define base r11-pd_m65536 + lea r11, [pd_m65536] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, is_idm + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] + vpbroadcastd m8, [base+pd_m65536] + vpbroadcastw m9, [base+mul_bits+r6*2-14] + vpbroadcastd m10, [base+fg_min+r7*4] + shlx r7d, r7d, r9d + vpbroadcastd m11, [base+fg_max+r7*4] + vpbroadcastd m12, [base+pw_1024] + pxor m7, m7 + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, sby, see, overlap, uv_pl +%if %1 + mov r6d, uv_plm + vpbroadcastd m0, [base+pw_8] + vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m0 ; uv_luma_mult, uv_mult +%elif %2 + vpbroadcastq m15, [base+pb_23_22] +%else + vpbroadcastq xm15, [base+pb_27_17_17_27] 
+%endif +%if %3 + vpbroadcastw m13, [base+pb_23_22] +%elif %2 + pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27 +%endif + test r7b, overlapb + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, unused5, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; { luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm5, [grain_lutq+offxyq+ 0] + vinserti128 m5, [grain_lutq+offxyq+82], 1 +%else + movu m5, [grain_lutq+offxyq] +%endif + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma_source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 + jg %%loop_y + + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, 
luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; { luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm5, [grain_lutq+offxyq+ 0] + vinserti128 m5, [grain_lutq+offxyq+82], 1 + movd xm4, [grain_lutq+left_offxyq+ 0] + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 + punpcklbw m4, m5 +%if %1 + vpbroadcastq m0, [pb_23_22] + pmaddubsw m4, m0, m4 +%else + pmaddubsw m4, m15, m4 +%endif + pmulhrsw m4, m12 + packsswb m4, m4 + vpblendd m4, m5, 0xee +%else + movu m5, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm5 +%if %1 + movq xm0, [pb_27_17_17_27] + pmaddubsw xm4, xm0, xm4 +%else + pmaddubsw xm4, xm15, xm4 +%endif + pmulhrsw xm4, xm12 + packsswb xm4, xm4 + vpblendd m4, m5, 0xfe +%endif + punpckhbw m5, m7 + punpcklbw m4, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma_source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(1+%2) + sub hb, 1+%2 + jg %%loop_y_h_overlap + + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, overlap, unused1, unused2, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor 
seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused1, unused2, see, overlap, unused3, unused4, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%if %2 == 0 + vpbroadcastd m13, [pb_27_17] +%endif +%%loop_y_v_overlap: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; { luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %3 == 0 +%if %2 + movu xm0, [grain_lutq+offxyq] + vinserti128 m0, [grain_lutq+offxyq+82], 1 + movu xm4, [grain_lutq+top_offxyq] + vinserti128 m4, [grain_lutq+top_offxyq+82], 1 +%else + movu m0, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] +%endif + punpcklbw m5, m4, m0 + punpckhbw m4, m0 + pmaddubsw m5, m13, m5 + pmaddubsw m4, m13, m4 + pmulhrsw m5, m12 + pmulhrsw m4, m12 + packsswb m5, m4 +%else + movq xm4, [grain_lutq+offxyq] + vinserti128 m4, [grain_lutq+offxyq+8], 1 + movq xm5, [grain_lutq+top_offxyq] + vinserti128 m5, [grain_lutq+top_offxyq+8], 1 + punpcklbw m5, m4 + pmaddubsw m5, m13, m5 + pmulhrsw m5, m12 + vextracti128 xm4, m5, 1 + packsswb xm5, xm4 + ; only interpolate first line, insert second line unmodified + vinserti128 m5, [grain_lutq+offxyq+82], 1 +%endif + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma_source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + 
packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + + sub hb, 1+%2 + jle %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 +%if %2 == 0 + vpbroadcastd m13, [pb_17_27] + add hd, 0x80000000 + jnc %%loop_y_v_overlap +%endif + jmp %%loop_y + +%%end_y_v_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + lea topleft_offxyd, [top_offxyq+(32>>%2)] + lea left_offxyd, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%if %2 == 0 + vpbroadcastd m13, [pb_27_17] +%endif +%%loop_y_hv_overlap: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; { luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm4, [grain_lutq+offxyq] + vinserti128 m4, [grain_lutq+offxyq+82], 1 + movd xm0, [grain_lutq+left_offxyq] + vinserti128 m0, [grain_lutq+left_offxyq+82], 1 + movd xm6, [grain_lutq+topleft_offxyq] +%if %3 + movq xm5, [grain_lutq+top_offxyq] + vinserti128 m5, [grain_lutq+top_offxyq+8], 1 +%else + vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1 + movu xm5, [grain_lutq+top_offxyq] + vinserti128 m5, [grain_lutq+top_offxyq+82], 1 +%endif + + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m0, m4 
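+    ; (left|cur and topleft|top are first blended column-wise, using the
+    ;  1-px (pb_23_22) or 2-px (pb_27_17_17_27) overlap weights selected by
+    ;  the subsampling mode; the blended top rows are then folded into the
+    ;  current rows by the vertical pass that follows)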
+%if %3 + punpcklbw xm6, xm5 +%else + punpcklbw m6, m5 +%endif + punpcklqdq m0, m6 +%if %1 + vpbroadcastq m6, [pb_23_22] + pmaddubsw m0, m6, m0 +%else + pmaddubsw m0, m15, m0 +%endif + pmulhrsw m0, m12 + packsswb m0, m0 + vpblendd m4, m0, 0x11 +%if %3 + pshuflw xm0, xm0, q1032 + vpblendd m5, m0, 0x01 +%else + pshuflw m0, m0, q1032 + vpblendd m5, m0, 0x11 +%endif +%else + movu m4, [grain_lutq+offxyq] + movd xm0, [grain_lutq+left_offxyq] + movu m5, [grain_lutq+top_offxyq] + movd xm6, [grain_lutq+topleft_offxyq] + punpcklbw xm0, xm4 + punpcklbw xm6, xm5 + punpcklqdq xm0, xm6 +%if %1 + vpbroadcastq xm6, [pb_27_17_17_27] + pmaddubsw xm0, xm6, xm0 +%else + pmaddubsw xm0, xm15, xm0 +%endif + pmulhrsw xm0, xm12 + packsswb xm0, xm0 + vpblendd m4, m0, 0x01 + pshuflw xm0, xm0, q1032 + vpblendd m5, m0, 0x01 +%endif + + ; followed by v interpolation (top | cur -> cur) +%if %3 + vpermq m0, m4, q3120 + punpcklbw m5, m0 + pmaddubsw m5, m13, m5 + pmulhrsw m5, m12 + vextracti128 xm0, m5, 1 + packsswb xm5, xm0 + vpblendd m5, m4, 0xf0 +%else + punpckhbw m0, m5, m4 + punpcklbw m5, m4 + pmaddubsw m4, m13, m0 + pmaddubsw m5, m13, m5 + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packsswb m5, m4 +%endif + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 +%if %2 + jg %%loop_y_h_overlap +%else + je %%end_y_hv_overlap + vpbroadcastd m13, [pb_17_27] + add hd, 0x80000000 + jnc %%loop_y_hv_overlap + jmp %%loop_y_h_overlap +%endif + +%%end_y_hv_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + jmp %%loop_x_hv_overlap +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +GEN_GRAIN_UV_FN 420, 1, 1 +FGUV_FN 420, 1, 1 +GEN_GRAIN_UV_FN 422, 1, 0 +FGUV_FN 422, 1, 0 +GEN_GRAIN_UV_FN 444, 0, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/filmgrain_avx512.asm dav1d-1.0.0/src/x86/filmgrain_avx512.asm --- dav1d-0.9.2/src/x86/filmgrain_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain_avx512.asm 2022-03-18 14:31:56.010356000 +0000 @@ -0,0 +1,813 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 + db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 +pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 + db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 +interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7 +pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 +pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32 +pb_27_17: times 2 db 27, 17 +pb_23_22: times 2 db 23, 22 +pw_8: times 2 dw 8 +pw_1024: times 2 dw 1024 +pb_17_27: times 2 db 17, 27 +fg_max: times 4 db 255 + times 4 db 240 + times 4 db 235 +fg_min: times 4 db 0 + times 4 db 16 +noise_rnd: times 2 dw 128 + times 2 dw 64 + times 2 dw 32 + times 2 dw 16 + +SECTION .text + +INIT_ZMM avx512icl +cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, see, overlap +%define base r11-fg_min + lea r11, [fg_min] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] + mov r12, 0x0000000f0000000f ; h_overlap mask + mova m0, [scalingq+64*0] + mova m1, [scalingq+64*1] + mova m2, [scalingq+64*2] + mova m3, [scalingq+64*3] + kmovq k1, r12 + vbroadcasti32x4 m4, [base+interleave_hl] + vpbroadcastd ym16, [base+pb_27_17] + vpbroadcastd m12, [base+pb_17_27] + vpbroadcastd m6, [base+noise_rnd+r6*4-32] + test sbyd, sbyd + setnz r6b + vpbroadcastd m7, [base+fg_min+r7*4] + vpbroadcastd m8, [base+fg_max+r7*8] + pxor m5, m5 + vpbroadcastd m9, [base+pw_1024] + vpbroadcastq m10, [base+pb_27_17_17_27] + vmovdqa64 m12{k1}, m16 + test r6b, overlapb + jnz .v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq +.loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offxd, [offyq+offxq*2+829] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, overlap + + mov grain_lutq, grain_lutmp + mov 
hd, hm +.loop_y: + movu ym21, [grain_lutq+offxyq-82] + vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1 + call .add_noise + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + test sbyd, sbyd + jnz .hv_overlap + +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, left_offxy + + rorx offyd, seed, 8 + mov left_offxyd, offxd ; previous column's offy*stride + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offxd, [offyq+offxq*2+829] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + movu ym20, [grain_lutq+offxyq-82] + vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1 + movd xm19, [grain_lutq+left_offxyq-50] + vinserti32x4 m19, [grain_lutq+left_offxyq+32], 2 + punpcklbw m19, m20 + pmaddubsw m19, m10, m19 + pmulhrsw m19, m9 + punpckhbw m21, m20, m5 + packsswb m20{k1}, m19, m19 + punpcklbw m20, m5, m20 + call .add_noise_h + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test sbyd, sbyd + jnz .hv_overlap + jmp .loop_x_h_overlap + +.v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \ + h, sby, see, overlap + + movzx r6d, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, r6d, 173 * 0x00010001 + imul r6d, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add r6d, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and r6d, 0xff00ff00 + xor seed, r7d + xor seed, r6d ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offxd, [offyq+offxq*2+0x10001*829+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, overlap, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + movu ym19, [grain_lutq+offxyq-82] + vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1 + movu ym21, [grain_lutq+top_offxyq-82] + vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 + punpckhbw m20, m21, m19 + punpcklbw m21, m19 + call .add_noise_v + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to .v_overlap, and instead always fall-through to h+v overlap +.hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, left_offxy, top_offxy, topleft_offxy + + mov topleft_offxyd, top_offxyd + rorx offyd, seed, 8 + 
mov left_offxyd, offxd + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offxd, [offyq+offxq*2+0x10001*829+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + movu ym19, [grain_lutq+offxyq-82] + vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1 + movd xm16, [grain_lutq+left_offxyq-50] + vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2 + movu ym21, [grain_lutq+top_offxyq-82] + vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 + movd xm17, [grain_lutq+topleft_offxyq-50] + vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2 + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m16, m19 + pmaddubsw m16, m10, m16 + punpcklbw m17, m21 + pmaddubsw m17, m10, m17 + punpckhbw m20, m21, m19 + pmulhrsw m16, m9 + pmulhrsw m17, m9 + packsswb m19{k1}, m16, m16 + packsswb m21{k1}, m17, m17 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m21, m19 + call .add_noise_v + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + lea srcq, [src_bakq+wq] + jl .hv_overlap +.end: + RET +ALIGN function_align +.add_noise_v: + pmaddubsw m20, m12, m20 + pmaddubsw m21, m12, m21 + pmulhrsw m20, m9 + pmulhrsw m21, m9 + packsswb m21, m20 +.add_noise: + punpcklbw m20, m5, m21 + punpckhbw m21, m5 +.add_noise_h: + mova ym18, [srcq+strideq*0] + vinserti32x8 m18, [srcq+strideq*1], 1 + mova m19, m0 + punpcklbw m16, m18, m5 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpmovb2m k2, m18 + punpckhbw m17, m18, m5 + vpermi2b m18, m2, m3 ; scaling[128..255] + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 + mova [dstq+srcq], ym16 + add srcq, strideq + vextracti32x8 [dstq+srcq], m16, 1 + add srcq, strideq + ret + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \ + scaling, grain_lut, h, sby, luma, \ + overlap, uv_pl, is_id, _, stride3 + lea r11, [fg_min] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, is_idm + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] +%if %2 + mov r12, 0x000f000f000f000f ; h_overlap mask + vpbroadcastq m10, [base+pb_23_22_0_32] + lea stride3q, [strideq*3] +%else + mov r12, 0x0000000f0000000f + vpbroadcastq m10, [base+pb_27_17_17_27] +%endif + mova m0, [scalingq+64*0] + mova m1, [scalingq+64*1] + mova m2, [scalingq+64*2] + mova m3, [scalingq+64*3] + kmovq k1, r12 + vbroadcasti32x4 m4, [base+interleave_hl] + vpbroadcastd m6, [base+noise_rnd+r6*4-32] + vpbroadcastd m7, [base+fg_min+r7*4] + shlx r7d, r7d, r9d + vpbroadcastd m8, [base+fg_max+r7*4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m9, [base+pw_1024] + mova m11, [base+pb_even] + mova m12, [base+pb_odd] + pxor m5, m5 + mov r5, r10mp ; lstride + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ + h, sby, see, overlap, uv_pl, _, _, stride3 +%if %1 + mov r6d, uv_plm + vpbroadcastd m16, [base+pw_8] + vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastw m15, 
[fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m16 ; uv_luma_mult, uv_mult +%endif + test r7b, overlapb + jnz %%v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, overlap, _, _, _, stride3 + + mov lumaq, r9mp + lea r11, [srcq+wq] + lea r12, [dstq+wq] + lea r13, [lumaq+wq*(1+%2)] + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, overlap, _, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: +%if %2 + movu xm21, [grain_lutq+offxyq+82*0] + vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 +%else + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 +%endif + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + cmp dword r8m, 0 ; sby + jne %%hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, left_offxy, _, _, _, stride3 + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, left_offxy, _, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: +%if %2 + movu xm20, [grain_lutq+offxyq +82*0] + movd xm19, [grain_lutq+left_offxyq+82*0] + vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1 + vinserti32x4 ym19, [grain_lutq+left_offxyq+82*1], 1 + vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2 + vinserti32x4 m19, [grain_lutq+left_offxyq+82*2], 2 + vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3 + vinserti32x4 m19, [grain_lutq+left_offxyq+82*3], 3 +%else + movu ym20, [grain_lutq+offxyq + 0] + movd xm19, [grain_lutq+left_offxyq+ 0] + vinserti32x8 m20, [grain_lutq+offxyq +82], 1 + vinserti32x4 m19, [grain_lutq+left_offxyq+82], 2 +%endif + punpcklbw m19, m20 + pmaddubsw m19, m10, m19 + punpckhbw m21, m20, m5 + pmulhrsw m19, m9 + vpacksswb m20{k1}, m19, m19 + punpcklbw m20, m5, m20 + call %%add_noise_h + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + cmp dword r8m, 0 ; sby + jne %%hv_overlap + jmp %%loop_x_h_overlap + +%%v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ + _, sby, see, overlap, _, _, _, stride3 + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff 
+ and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + +%if %3 + vpbroadcastd m13, [base+pb_23_22] + kxnorw k3, k3, k3 ; v_overlap mask +%elif %2 + vbroadcasti32x8 m13, [base+pb_27_17] + kxnord k3, k3, k3 + pshufd m13, m13, q0000 ; 8x27_17, 8x17_27 +%else + vpbroadcastd ym16, [base+pb_27_17] + vpbroadcastd m13, [base+pb_17_27] + vmovdqa64 m13{k1}, m16 +%endif + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, overlap, top_offxy, _, _, stride3 + + mov lumaq, r9mp + lea r11, [srcq+wq] + lea r12, [dstq+wq] + lea r13, [lumaq+wq*(1<<%2)] + mov r11mp, r11 + mov r12mp, r12 + neg wq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0x000f000f + and offxd, 0x000f000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, overlap, top_offxy, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %3 + movu xm18, [grain_lutq+offxyq+82*0] + movu xm20, [grain_lutq+top_offxyq+82*0] + ; only interpolate first line, insert remaining line unmodified + vbroadcasti128 ym21, [grain_lutq+offxyq+82*1] + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + punpcklbw xm19, xm20, xm18 + punpckhbw xm20, xm18 +%elif %2 + movu xm18, [grain_lutq+offxyq+82*0] + vinserti128 ym18, [grain_lutq+offxyq+82*1], 1 + movu xm20, [grain_lutq+top_offxyq+82*0] + vinserti32x4 ym20, [grain_lutq+top_offxyq+82*1], 1 + vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2] + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + punpcklbw ym19, ym20, ym18 + punpckhbw ym20, ym18 +%else + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 + movu ym20, [grain_lutq+top_offxyq+82*0] + vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + +%%hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 + + lea topleft_offxyd, [top_offxyq+(32>>%2)] + lea left_offxyd, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0x000f000f + and offxd, 0x000f000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %2 + movu 
xm21, [grain_lutq+offxyq+82*0] + movd xm16, [grain_lutq+left_offxyq+82*0] + vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 + vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1 + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3 + movd xm18, [grain_lutq+topleft_offxyq+82*0] + movu xm20, [grain_lutq+top_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m16, m21 +%if %3 + punpcklbw xm18, xm20 +%else + vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1 + vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1 + punpcklbw ym18, ym20 +%endif + punpcklqdq m16, m18 + pmaddubsw m16, m10, m16 + pmulhrsw m16, m9 + packsswb m16, m16 + vmovdqu8 m21{k1}, m16 +%if %3 + vpalignr xm20{k1}, xm16, xm16, 4 + punpcklbw xm19, xm20, xm21 + punpckhbw xm20, xm21 +%else + vpalignr ym20{k1}, ym16, ym16, 4 + punpcklbw ym19, ym20, ym21 + punpckhbw ym20, ym21 +%endif +%else + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 + movd xm16, [grain_lutq+left_offxyq+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2 + movu ym20, [grain_lutq+top_offxyq+82*0] + vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 + movd xm18, [grain_lutq+topleft_offxyq+82*0] + vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2 + punpcklbw m16, m21 + punpcklbw m18, m20 + punpcklqdq m16, m18 + pmaddubsw m16, m10, m16 + pmulhrsw m16, m9 + packsswb m16, m16 + vpalignr m20{k1}, m16, m16, 4 + vmovdqu8 m21{k1}, m16 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + jmp %%hv_overlap +ALIGN function_align +%%add_noise_v: +%if %3 + pmaddubsw xm19, xm13, xm19 + pmaddubsw xm20, xm13, xm20 + pmulhrsw xm19, xm9 + pmulhrsw xm20, xm9 + vpacksswb m21{k3}, m19, m20 +%elif %2 + pmaddubsw ym19, ym13, ym19 + pmaddubsw ym20, ym13, ym20 + pmulhrsw ym19, ym9 + pmulhrsw ym20, ym9 + vpacksswb m21{k3}, m19, m20 +%else + punpcklbw m19, m20, m21 + punpckhbw m20, m21 + pmaddubsw m19, m13, m19 + pmaddubsw m20, m13, m20 + pmulhrsw m19, m9 + pmulhrsw m20, m9 + packsswb m21, m19, m20 +%endif +%%add_noise: + punpcklbw m20, m5, m21 + punpckhbw m21, m5 +%%add_noise_h: + mova ym18, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 +%if %2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova ym16, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1 + mova xm17, [srcq+strideq*0] + mova m19, m11 + vpermi2b m19, m18, m16 + vinserti128 ym17, [srcq+strideq*1], 1 + vpermt2b m18, m12, m16 + vinserti32x4 m17, [srcq+strideq*2], 2 + pavgb m18, m19 + vinserti32x4 m17, [srcq+stride3q ], 3 +%else + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 +%endif +%if %1 + punpckhbw m19, m18, m17 + punpcklbw m18, m17 ; { luma, chroma } + pmaddubsw m19, m14 + pmaddubsw m18, m14 + psraw m19, 6 + psraw m18, 6 + paddw m19, m15 + paddw m18, m15 + packuswb m18, m19 +.add_noise_main: + mova m19, m0 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpmovb2m k2, m18 + vpermi2b m18, m2, m3 ; scaling[128..255] + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2<<%2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + lea srcq, [srcq+strideq*(2<<%2)] + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + punpcklbw m16, m17, m5 ; chroma + 
punpckhbw m17, m5 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 +%if %2 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+stride3q ], m16, 3 +%else + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 +%endif + lea dstq, [dstq+strideq*(2<<%2)] + ret +%else + jmp .add_noise_main +%endif +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/filmgrain_common.asm dav1d-1.0.0/src/x86/filmgrain_common.asm --- dav1d-0.9.2/src/x86/filmgrain_common.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain_common.asm 2022-03-18 14:31:56.010356000 +0000 @@ -0,0 +1,46 @@ +; Copyright © 2019-2022, VideoLAN and dav1d authors +; Copyright © 2019-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence diff -Nru dav1d-0.9.2/src/x86/film_grain_init_tmpl.c dav1d-1.0.0/src/x86/film_grain_init_tmpl.c --- dav1d-0.9.2/src/x86/film_grain_init_tmpl.c 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/film_grain_init_tmpl.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,75 +0,0 @@ -/* - * Copyright © 2018-2021, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "src/cpu.h" -#include "src/film_grain.h" - -decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3)); -decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3)); - -decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2)); -decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2)); -decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2)); -decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2)); - -COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) { - const unsigned flags = dav1d_get_cpu_flags(); - - if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; - - c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3); - c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3); - -#if ARCH_X86_64 - if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; - - c->generate_grain_y = BF(dav1d_generate_grain_y, avx2); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2); - c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2); - c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); - c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 
- 1] = BF(dav1d_fguv_32x32xn_i444, avx2); -#endif -} diff -Nru dav1d-0.9.2/src/x86/filmgrain_init_tmpl.c dav1d-1.0.0/src/x86/filmgrain_init_tmpl.c --- dav1d-0.9.2/src/x86/filmgrain_init_tmpl.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain_init_tmpl.c 2022-03-18 14:31:56.010356000 +0000 @@ -0,0 +1,81 @@ +/* + * Copyright © 2018-2022, VideoLAN and dav1d authors + * Copyright © 2018-2022, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/filmgrain.h" + +#define decl_fg_fns(ext) \ +decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \ +decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext)) + +decl_fg_fns(ssse3); +decl_fg_fns(avx2); +decl_fg_fns(avx512icl); + +COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3); + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = 
BF(dav1d_generate_grain_uv_422, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); + } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); +#endif +} diff -Nru dav1d-0.9.2/src/x86/film_grain_sse.asm dav1d-1.0.0/src/x86/film_grain_sse.asm --- dav1d-0.9.2/src/x86/film_grain_sse.asm 2021-09-03 15:51:24.413037000 +0000 +++ dav1d-1.0.0/src/x86/film_grain_sse.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,3262 +0,0 @@ -; Copyright © 2019-2021, VideoLAN and dav1d authors -; Copyright © 2019, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -%include "config.asm" -%include "ext/x86/x86inc.asm" - -SECTION_RODATA - -pw_1024: times 8 dw 1024 -pb_27_17_17_27: db 27, 17, 17, 27 - times 6 db 0, 32 -pb_23_22_h: db 23, 22 - times 7 db 0, 32 -pb_27_17: times 8 db 27, 17 -pb_17_27: times 8 db 17, 27 -pb_23_22: times 8 db 23, 22 -pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 -rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 -byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 -pw_seed_xor: times 2 dw 0xb524 - times 2 dw 0x49d8 -pb_1: times 4 db 1 -hmul_bits: dw 32768, 16384, 8192, 4096 -round: dw 2048, 1024, 512 -mul_bits: dw 256, 128, 64, 32, 16 -round_vals: dw 32, 64, 128, 256, 512 -max: dw 255, 240, 235 -min: dw 0, 16 -pw_1: dw 1 - -%macro JMP_TABLE 2-* - %xdefine %1_8bpc_%2_table %%table - %xdefine %%base %1_8bpc_%2_table - %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) - %%table: - %rep %0 - 2 - dd %%prefix %+ .ar%3 - %%base - %rotate 1 - %endrep -%endmacro - -JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 - -struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - -SECTION .text - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -%if ARCH_X86_32 -%define PIC_ptr(a) base+a -%else -%define PIC_ptr(a) a -%endif - -%macro SCRATCH 3 -%if ARCH_X86_32 - mova [rsp+%3*mmsize], m%1 -%define m%2 [rsp+%3*mmsize] -%else - SWAP %1, %2 -%endif -%endmacro - -INIT_XMM ssse3 -cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data - LEA r4, $$ -%define base r4-$$ - movq m1, [base+rnd_next_upperbit_mask] - movq m4, [base+mul_bits] - movq m7, [base+hmul_bits] - mov r2d, [fg_dataq+FGData.grain_scale_shift] - movd m2, [base+round+r2*2] - movd m0, [fg_dataq+FGData.seed] - mova m5, [base+pb_mask] - pshuflw m2, m2, q0000 - pshuflw m0, m0, q0000 - mov r2, -73*82 - sub bufq, r2 - lea r3, [base+gaussian_sequence] -.loop: - pand m6, m0, m1 - psrlw m3, m6, 10 - por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw m6, m4 ; bits 0x0f00 are set - pshufb m3, m5, m6 ; set 15th bit for next 4 seeds - psllq m6, m3, 30 - por m3, m6 - psllq m6, m3, 15 - por m3, m6 ; aggregate each bit into next seed's high bit - pmulhuw m6, m0, m7 - por m3, m6 ; 4 next output seeds - pshuflw m0, m3, q3333 - psrlw m3, 5 -%if ARCH_X86_64 - movq r6, m3 - mov r8, r6 - movzx r5d, r6w - shr r6d, 16 - shr r8, 32 - movzx r7, r8w - shr r8, 16 - - movd m6, [r3+r5*2] - pinsrw m6, [r3+r6*2], 1 - pinsrw m6, [r3+r7*2], 2 - pinsrw m6, [r3+r8*2], 3 -%else - movd r6, m3 - pshuflw m3, m3, q3232 - movzx r5, r6w - shr r6, 16 - - movd m6, [r3+r5*2] - pinsrw m6, [r3+r6*2], 1 - - movd r6, m3 - movzx r5, r6w - shr r6, 16 - - pinsrw m6, [r3+r5*2], 2 - pinsrw m6, [r3+r6*2], 3 -%endif - pmulhrsw m6, m2 - packsswb m6, m6 - movd [bufq+r2], m6 - add r2, 4 - jl .loop - - ; auto-regression code - movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] - lea r2, 
[r2+base+generate_grain_y_8bpc_ssse3_table] - jmp r2 - -.ar1: -%if ARCH_X86_32 - DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max -%elif WIN64 - DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 - mov bufq, r0 -%else - DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 -%endif - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] - movd m4, [fg_dataq+FGData.ar_coeffs_y] - mov ecx, [fg_dataq+FGData.ar_coeff_shift] -%if ARCH_X86_32 - mov r1m, cf3d - DEFINE_ARGS buf, shift, val3, min, max, x, val0 -%define hd r0mp -%define cf3d r1mp -%elif WIN64 - DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 -%else - DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 -%endif - pxor m6, m6 - pcmpgtb m7, m6, m4 - punpcklbw m4, m7 - pinsrw m4, [base+pw_1], 3 - pshufd m5, m4, q1111 - pshufd m4, m4, q0000 - movd m3, [base+round_vals+shiftq*2-12] ; rnd - pshuflw m3, m3, q0000 - sub bufq, 82*73-(82*3+79) - mov hd, 70 - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -76 - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: - movq m0, [bufq+xq-82-1] ; top/left - pcmpgtb m7, m6, m0 - punpcklbw m0, m7 - psrldq m2, m0, 2 ; top - psrldq m1, m0, 4 ; top/right - punpcklwd m0, m2 - punpcklwd m1, m3 - pmaddwd m0, m4 - pmaddwd m1, m5 - paddd m0, m1 -.x_loop_ar1_inner: - movd val0d, m0 - psrldq m0, 4 - imul val3d, cf3d - add val3d, val0d - sar val3d, shiftb - movsx val0d, byte [bufq+xq] - add val3d, val0d - cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 - dec hd - jg .y_loop_ar1 -.ar0: - RET - -.ar2: -%if ARCH_X86_32 -%assign stack_offset_old stack_offset - ALLOC_STACK -16*8 -%endif - DEFINE_ARGS buf, fg_data, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m6, [base+round_vals-12+shiftq*2] - movd m7, [base+byte_blend+1] - SCRATCH 7, 15, 7 - movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 - movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 - pxor m7, m7 - pshuflw m6, m6, q0000 - punpcklwd m6, m7 - pcmpgtb m4, m7, m0 - pcmpgtb m5, m7, m1 - punpcklbw m0, m4 - punpcklbw m1, m5 - DEFINE_ARGS buf, fg_data, h, x - pshufd m4, m1, q0000 - pshufd m5, m1, q1111 - pshufd m3, m0, q3333 - pshufd m2, m0, q2222 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - SCRATCH 0, 8, 0 - SCRATCH 1, 9, 1 - SCRATCH 2, 10, 2 - SCRATCH 3, 11, 3 - SCRATCH 4, 12, 4 - SCRATCH 5, 13, 5 - SCRATCH 6, 14, 6 - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar2: - mov xq, -76 - -.x_loop_ar2: - movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - pcmpgtb m2, m7, m0 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 - psrldq m5, m0, 2 ; y=-2,x=[-1,+5] - psrldq m3, m1, 2 ; y=-1,x=[-1,+5] - psrldq m4, m1, 4 ; y=-1,x=[+0,+5] - punpcklwd m2, m0, m5 - punpcklwd m3, m4 - pmaddwd m2, m8 - pmaddwd m3, m11 - paddd m2, m3 - - psrldq m4, m0, 4 ; y=-2,x=[+0,+5] - psrldq m5, m0, 6 ; y=-2,x=[+1,+5] - psrldq m6, m0, 8 ; y=-2,x=[+2,+5] - punpcklwd m4, m5 - punpcklwd m6, m1 - psrldq m5, m1, 6 ; y=-1,x=[+1,+5] - psrldq m1, m1, 8 ; y=-1,x=[+2,+5] - punpcklwd m5, m1 - pmaddwd m4, m9 - pmaddwd m6, m10 - pmaddwd m5, m12 - paddd m4, m6 - paddd m2, m5 - paddd m2, m4 - paddd m2, m14 - - movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] -.x_loop_ar2_inner: - pcmpgtb m4, m7, m0 - punpcklbw m1, m0, m4 - pmaddwd m3, m1, m13 - paddd m3, m2 - psrldq m1, 4 ; y=0,x=0 - psrldq m2, 4 ; shift top to next 
pixel - psrad m3, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - paddw m3, m1 - packsswb m3, m3 - pslldq m3, 2 - pand m3, m15 - pandn m1, m15, m0 - por m0, m1, m3 - psrldq m0, 1 - ; overwrite 2 pixels, but that's ok - movd [bufq+xq-1], m0 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - dec hd - jg .y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, fg_data, shift -%if ARCH_X86_32 -%assign stack_offset stack_offset_old - ALLOC_STACK -16*14 -%elif WIN64 - SUB rsp, 16*6 -%assign stack_size_padded (stack_size_padded+16*6) -%assign stack_size (stack_size+16*6) -%else - ALLOC_STACK -16*6 -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m6, [base+round_vals-12+shiftq*2] - movd m7, [base+byte_blend] - movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 - movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 - pxor m3, m3 - pcmpgtb m4, m3, m0 - pcmpgtb m3, m2 - pshuflw m6, m6, q0000 - SCRATCH 6, 14, 12 - SCRATCH 7, 15, 13 - punpckhbw m1, m0, m4 - punpcklbw m0, m4 - punpcklbw m2, m3 - pshufd m3, m0, q1111 - pshufd m4, m0, q2222 - pshufd m5, m0, q3333 - pshufd m0, m0, q0000 - mova [rsp+ 0*16], m0 - mova [rsp+ 1*16], m3 - mova [rsp+ 2*16], m4 - mova [rsp+ 3*16], m5 - pshufd m6, m1, q1111 - pshufd m7, m1, q2222 - pshufd m5, m1, q3333 - pshufd m1, m1, q0000 - pshufd m3, m2, q1111 - psrldq m0, m2, 10 - pinsrw m2, [base+pw_1], 5 - pshufd m4, m2, q2222 - pshufd m2, m2, q0000 - pinsrw m0, [base+round_vals+shiftq*2-10], 3 - mova [rsp+ 4*16], m1 - mova [rsp+ 5*16], m6 - SCRATCH 7, 8, 6 - SCRATCH 5, 9, 7 - SCRATCH 2, 10, 8 - SCRATCH 3, 11, 9 - SCRATCH 4, 12, 10 - SCRATCH 0, 13, 11 - DEFINE_ARGS buf, fg_data, h, x - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar3: - mov xq, -76 - -.x_loop_ar3: - movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - pxor m3, m3 - pcmpgtb m3, m0 - punpckhbw m2, m0, m3 - punpcklbw m0, m3 - - psrldq m5, m0, 2 - psrldq m6, m0, 4 - psrldq m7, m0, 6 - punpcklwd m4, m0, m5 - punpcklwd m6, m7 - pmaddwd m4, [rsp+ 0*16] - pmaddwd m6, [rsp+ 1*16] - paddd m4, m6 - - movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - pxor m5, m5 - pcmpgtb m5, m1 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - palignr m6, m2, m0, 10 - palignr m7, m2, m0, 12 - psrldq m0, 8 - punpcklwd m0, m6 - punpcklwd m7, m1 - pmaddwd m0, [rsp+ 2*16] - pmaddwd m7, [rsp+ 3*16] - paddd m0, m7 - paddd m0, m4 - - psrldq m4, m1, 2 - psrldq m5, m1, 4 - psrldq m6, m1, 6 - psrldq m7, m1, 8 - punpcklwd m4, m5 - punpcklwd m6, m7 - pmaddwd m4, [rsp+ 4*16] - pmaddwd m6, [rsp+ 5*16] - paddd m4, m6 - paddd m0, m4 - - movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor m7, m7 - pcmpgtb m7, m2 - punpckhbw m5, m2, m7 - punpcklbw m2, m7 - palignr m7, m3, m1, 10 - palignr m3, m1, 12 - psrldq m1, m2, 2 - punpcklwd m7, m3 - punpcklwd m3, m2, m1 - pmaddwd m7, m8 - pmaddwd m3, m9 - paddd m7, m3 - paddd m0, m7 - - psrldq m6, m2, 4 - psrldq m1, m2, 6 - psrldq m3, m2, 8 - palignr m4, m5, m2, 10 - palignr m5, m5, m2, 12 - - punpcklwd m6, m1 - punpcklwd m3, m4 - punpcklwd m5, m14 - pmaddwd m6, m10 - pmaddwd m3, m11 - pmaddwd m5, m12 - paddd m0, m6 - paddd m3, m5 - paddd m0, m3 - - movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pxor m5, m5 - pcmpgtb m5, m1 - punpcklbw m2, m1, m5 - pmaddwd m2, m13 - pshufd m3, m2, q1111 - paddd m2, m3 ; left+cur - paddd m2, m0 ; add top - psrldq m0, 4 - psrad m2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - packsswb m2, m2 - pslldq m2, 3 - pand m2, m15 - pandn 
m3, m15, m1 - por m1, m2, m3 - movd [bufq+xq-3], m1 - psrldq m1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - dec hd - jg .y_loop_ar3 - RET - -%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y -INIT_XMM ssse3 -cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv - movifnidn r2, r2mp - movifnidn r3, r3mp - LEA r4, $$ -%define base r4-$$ - movq m1, [base+rnd_next_upperbit_mask] - movq m4, [base+mul_bits] - movq m7, [base+hmul_bits] - mov r5d, [fg_dataq+FGData.grain_scale_shift] - movd m6, [base+round+r5*2] - mova m5, [base+pb_mask] - movd m0, [fg_dataq+FGData.seed] - movd m2, [base+pw_seed_xor+uvq*4] - pxor m0, m2 - pshuflw m6, m6, q0000 - pshuflw m0, m0, q0000 - lea r6, [base+gaussian_sequence] -%if %2 -%if ARCH_X86_64 - mov r7d, 73-35*%3 -%else - mov r3mp, 73-35*%3 -%endif - add bufq, 44 -.loop_y: - mov r5, -44 -.loop_x: -%else - mov r5, -82*73 - sub bufq, r5 -.loop: -%endif - pand m2, m0, m1 - psrlw m3, m2, 10 - por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw m2, m4 ; bits 0x0f00 are set - pshufb m3, m5, m2 ; set 15th bit for next 4 seeds - psllq m2, m3, 30 - por m3, m2 - psllq m2, m3, 15 - por m3, m2 ; aggregate each bit into next seed's high bit - pmulhuw m2, m0, m7 - por m2, m3 ; 4 next output seeds - pshuflw m0, m2, q3333 - psrlw m2, 5 -%if ARCH_X86_64 - movd r9d, m2 - pshuflw m2, m2, q3232 - movzx r8, r9w - shr r9, 16 - - movd m3, [r6+r8*2] - pinsrw m3, [r6+r9*2], 1 - - movd r9d, m2 - movzx r8, r9w - shr r9, 16 - - pinsrw m3, [r6+r8*2], 2 - pinsrw m3, [r6+r9*2], 3 -%else - movd r2, m2 - pshuflw m2, m2, q3232 - movzx r1, r2w - shr r2, 16 - - movd m3, [r6+r1*2] - pinsrw m3, [r6+r2*2], 1 - - movd r2, m2 - movzx r1, r2w - shr r2, 16 - - pinsrw m3, [r6+r1*2], 2 - pinsrw m3, [r6+r2*2], 3 -%endif - pmulhrsw m3, m6 - packsswb m3, m3 - movd [bufq+r5], m3 - add r5, 4 -%if %2 - jl .loop_x - add bufq, 82 -%if ARCH_X86_64 - dec r7d -%else - dec r3mp -%endif - jg .loop_y -%else - jl .loop -%endif - -%if ARCH_X86_32 - mov r2, r2mp -%endif - - ; auto-regression code - movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] - jmp r5 - -.ar0: - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - movifnidn bufyq, bufymp -%if ARCH_X86_32 -%assign stack_offset_old stack_offset - ALLOC_STACK -2*16 -%endif - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] - movd m4, [base+hmul_bits+shiftq*2] - DEFINE_ARGS buf, bufy, h, x - pxor m0, m0 - pcmpgtb m0, m5 - punpcklbw m5, m0 - movd m7, [base+pb_1] -%if %2 - movd m6, [base+hmul_bits+2+%3*2] -%endif - pshuflw m5, m5, q0000 - pshuflw m4, m4, q0000 - pshufd m7, m7, q0000 -%if %2 - pshuflw m6, m6, q0000 -%endif - punpcklqdq m5, m5 - punpcklqdq m4, m4 -%if %2 - punpcklqdq m6, m6 -%endif - pcmpeqw m1, m1 - pslldq m1, 12>>%2 - SCRATCH 1, 8, 0 - SCRATCH 4, 9, 1 -%if %2 - sub bufq, 82*(73-35*%3)+82-(82*3+41) -%else - sub bufq, 82*70-3 -%endif - add bufyq, 3+82*3 - mov hd, 70-35*%3 -.y_loop_ar0: - xor xd, xd -.x_loop_ar0: - ; first 32 pixels -%if %2 - movu m1, [bufyq+xq*2] -%if %3 - movu m2, [bufyq+xq*2+82] -%endif - movu m3, [bufyq+xq*2+16] -%if %3 - movu m4, [bufyq+xq*2+82+16] -%endif - pmaddubsw m0, m7, m1 -%if %3 - pmaddubsw m1, m7, m2 -%endif - pmaddubsw m2, m7, m3 -%if %3 - pmaddubsw m3, m7, m4 - paddw m0, m1 - paddw m2, m3 -%endif - pmulhrsw m0, m6 - pmulhrsw m2, m6 -%else - movu 
m0, [bufyq+xq] - pxor m6, m6 - pcmpgtb m6, m0 - punpckhbw m2, m0, m6 - punpcklbw m0, m6 -%endif - pmullw m0, m5 - pmullw m2, m5 - pmulhrsw m0, m9 - pmulhrsw m2, m9 - movu m1, [bufq+xq] - pxor m4, m4 - pcmpgtb m4, m1 - punpckhbw m3, m1, m4 -%if %2 - punpcklbw m1, m4 - paddw m2, m3 - paddw m0, m1 -%else - punpcklbw m6, m1, m4 - paddw m2, m3 - paddw m0, m6 -%endif - packsswb m0, m2 -%if %2 - movu [bufq+xq], m0 - add xd, 16 - cmp xd, 32 - jl .x_loop_ar0 - - ; last 6/12 pixels - movu m1, [bufyq+xq*(1+%2)] -%if %3 - movu m2, [bufyq+xq*2+82] -%endif - pmaddubsw m0, m7, m1 -%if %3 - pmaddubsw m1, m7, m2 - paddw m0, m1 -%endif - pmulhrsw m0, m6 - pmullw m0, m5 - pmulhrsw m0, m9 - movq m1, [bufq+xq] - pxor m4, m4 - pcmpgtb m4, m1 - punpcklbw m2, m1, m4 - paddw m0, m2 - packsswb m0, m0 - pandn m2, m8, m0 - pand m1, m8 - por m2, m1 - movq [bufq+xq], m2 -%else - add xd, 16 - cmp xd, 80 - je .y_loop_final_ar0 - movu [bufq+xq-16], m0 - jmp .x_loop_ar0 -.y_loop_final_ar0: - pandn m2, m8, m0 - pand m1, m8 - por m2, m1 - movu [bufq+xq-16], m2 -%endif - - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar0 - RET - -.ar1: -%if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp -%endif - DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x - imul uvd, 28 - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] - movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] - pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 -%if ARCH_X86_32 - mov r3mp, cf3d - DEFINE_ARGS buf, shift, fg_data, val3, min, max, x -%elif WIN64 - DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x - mov bufq, r0 -%else - DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m3, [base+round_vals+shiftq*2-12] ; rnd -%if %2 - movd m7, [base+pb_1] - movd m6, [base+hmul_bits+2+%3*2] -%endif - psrldq m4, 1 -%if ARCH_X86_32 - DEFINE_ARGS buf, shift, val0, val3, min, max, x -%elif WIN64 - DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 -%else - DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 -%endif - pxor m5, m5 - punpcklwd m3, m5 -%if %2 - punpcklwd m6, m6 -%endif - pcmpgtb m5, m4 - punpcklbw m4, m5 - pshufd m5, m4, q1111 - pshufd m4, m4, q0000 - pshufd m3, m3, q0000 -%if %2 - pshufd m7, m7, q0000 - pshufd m6, m6, q0000 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*69+3 -%endif -%if ARCH_X86_32 - add r1mp, 79+82*3 - mov r0mp, 70-35*%3 -%else - add bufyq, 79+82*3 - mov hd, 70-35*%3 -%endif - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -(76>>%2) - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: -%if %2 -%if ARCH_X86_32 - mov r2, r1mp - movq m0, [r2+xq*2] -%if %3 - movq m1, [r2+xq*2+82] -%endif -%else - movq m0, [bufyq+xq*2] -%if %3 - movq m1, [bufyq+xq*2+82] -%endif -%endif - pmaddubsw m2, m7, m0 -%if %3 - pmaddubsw m0, m7, m1 - paddw m2, m0 -%endif - pmulhrsw m2, m6 -%else -%if ARCH_X86_32 - mov r2, r1mp - movd m2, [r2+xq] -%else - movd m2, [bufyq+xq] -%endif - pxor m0, m0 - pcmpgtb m0, m2 - punpcklbw m2, m0 -%endif - - movq m0, [bufq+xq-82-1] ; top/left - pxor m1, m1 - pcmpgtb m1, m0 - punpcklbw m0, m1 - psrldq m1, m0, 4 ; top/right - punpcklwd m1, m2 - psrldq m2, m0, 2 ; top - punpcklwd m0, m2 - pmaddwd m0, m4 - pmaddwd m1, m5 - paddd m0, m1 - paddd m0, m3 -.x_loop_ar1_inner: - movd val0d, m0 - psrldq m0, 4 -%if ARCH_X86_32 - imul val3d, r3mp -%else - imul val3d, cf3d -%endif - add val3d, val0d - sar val3d, shiftb - movsx val0d, byte [bufq+xq] - add val3d, val0d - 
cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 -%if ARCH_X86_32 - add r1mp, 82<<%3 - dec r0mp -%else - add bufyq, 82<<%3 - dec hd -%endif - jg .y_loop_ar1 - RET - -.ar2: -%if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp - ALLOC_STACK -8*16 -%endif - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - movifnidn bufyq, bufymp - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - movd m7, [base+round_vals-12+shiftq*2] - movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 - pxor m2, m2 - pcmpgtb m2, m0 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 - pinsrw m1, [base+pw_1], 5 - punpcklwd m7, m7 - pshufd m7, m7, q0000 - DEFINE_ARGS buf, bufy, fg_data, h, unused, x - pshufd m4, m1, q0000 - pshufd m5, m1, q1111 - pshufd m6, m1, q2222 - pshufd m3, m0, q3333 - pshufd m2, m0, q2222 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - SCRATCH 0, 8, 0 - SCRATCH 1, 9, 1 - SCRATCH 2, 10, 2 - SCRATCH 3, 11, 3 - SCRATCH 4, 12, 4 - SCRATCH 5, 13, 5 - SCRATCH 6, 14, 6 - SCRATCH 7, 15, 7 -%if %2 - movd m7, [base+hmul_bits+2+%3*2] - movd m6, [base+pb_1] - punpcklwd m7, m7 - pshufd m6, m6, q0000 - pshufd m7, m7, q0000 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*69+3 -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar2: - mov xq, -(76>>%2) - -.x_loop_ar2: - pxor m2, m2 - movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - pcmpgtb m2, m0 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 - psrldq m5, m0, 2 ; y=-2,x=[-1,+5] - psrldq m3, m1, 2 ; y=-1,x=[-1,+5] - psrldq m4, m1, 4 ; y=-1,x=[+0,+5] - punpcklwd m2, m0, m5 - punpcklwd m3, m4 - pmaddwd m2, m8 - pmaddwd m3, m11 - paddd m2, m3 - - psrldq m4, m0, 4 ; y=-2,x=[+0,+5] - psrldq m5, m0, 6 ; y=-2,x=[+1,+5] - psrldq m0, 8 ; y=-2,x=[+2,+5] - punpcklwd m4, m5 - punpcklwd m0, m1 - psrldq m3, m1, 6 ; y=-1,x=[+1,+5] - psrldq m1, m1, 8 ; y=-1,x=[+2,+5] - punpcklwd m3, m1 - pmaddwd m4, m9 - pmaddwd m0, m10 - pmaddwd m3, m12 - paddd m4, m0 - paddd m2, m3 - paddd m2, m4 - -%if %2 - movq m1, [bufyq+xq*2] -%if %3 - movq m3, [bufyq+xq*2+82] -%endif - pmaddubsw m0, m6, m1 -%if %3 - pmaddubsw m1, m6, m3 - paddw m0, m1 -%endif - pmulhrsw m0, m7 -%else - movd m0, [bufyq+xq] - pxor m1, m1 - pcmpgtb m1, m0 - punpcklbw m0, m1 -%endif - punpcklwd m0, m15 - pmaddwd m0, m14 - paddd m2, m0 - - movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] - pxor m4, m4 - movd m5, [base+byte_blend+1] - punpcklbw m5, m5 -.x_loop_ar2_inner: - pcmpgtb m1, m4, m0 - punpcklbw m0, m1 - pmaddwd m3, m0, m13 - paddd m3, m2 - psrldq m2, 4 ; shift top to next pixel - psrad m3, [fg_dataq+FGData.ar_coeff_shift] - pslldq m3, 4 - pand m3, m5 - paddw m0, m3 - packsswb m0, m0 - movd [bufq+xq-2], m0 - psrldq m0, 1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar2 - RET - -.ar3: -%if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp -%endif - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - movifnidn bufyq, bufymp -%if ARCH_X86_32 - ALLOC_STACK -15*16 -%else - SUB rsp, 16*7 -%assign stack_size_padded (stack_size_padded+16*7) -%assign stack_size (stack_size+16*7) -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 
- - movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 - pxor m3, m3 - pcmpgtb m3, m0 - punpckhbw m1, m0, m3 - punpcklbw m0, m3 - pshufd m2, m0, q1111 - pshufd m3, m0, q2222 - pshufd m4, m0, q3333 - pshufd m0, m0, q0000 - pshufd m5, m1, q1111 - pshufd m6, m1, q2222 - pshufd m7, m1, q3333 - pshufd m1, m1, q0000 - mova [rsp+ 0*16], m0 - mova [rsp+ 1*16], m2 - mova [rsp+ 2*16], m3 - mova [rsp+ 3*16], m4 - mova [rsp+ 4*16], m1 - mova [rsp+ 5*16], m5 - mova [rsp+ 6*16], m6 - SCRATCH 7, 8, 7 - - movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] - pxor m4, m4 - pcmpgtb m4, m2 - punpckhbw m5, m2, m4 - punpcklbw m2, m4 - pshufd m4, m2, q3232 - punpcklwd m3, m4, m5 - pshuflw m5, m4, q3321 - pshufd m4, m3, q0000 - pshufd m3, m2, q1111 - pshufd m2, m2, q0000 - pinsrw m5, [base+round_vals+shiftq*2-10], 3 - SCRATCH 2, 9, 8 - SCRATCH 3, 10, 9 - SCRATCH 4, 11, 10 - SCRATCH 5, 12, 11 - - movd m2, [base+round_vals-12+shiftq*2] -%if %2 - movd m1, [base+pb_1] - movd m3, [base+hmul_bits+2+%3*2] -%endif - pxor m0, m0 - punpcklwd m2, m0 -%if %2 - punpcklwd m3, m3 -%endif - pshufd m2, m2, q0000 -%if %2 - pshufd m1, m1, q0000 - pshufd m3, m3, q0000 - SCRATCH 1, 13, 12 -%endif - SCRATCH 2, 14, 13 -%if %2 - SCRATCH 3, 15, 14 -%endif - - DEFINE_ARGS buf, bufy, fg_data, h, unused, x -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*69+3 -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar3: - mov xq, -(76>>%2) - -.x_loop_ar3: - movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - pxor m4, m4 - pcmpgtb m4, m0 - punpckhbw m3, m0, m4 - punpcklbw m0, m4 - - psrldq m5, m0, 2 - psrldq m6, m0, 4 - psrldq m7, m0, 6 - punpcklwd m4, m0, m5 - punpcklwd m6, m7 - pmaddwd m4, [rsp+ 0*16] - pmaddwd m6, [rsp+ 1*16] - paddd m4, m6 - - palignr m2, m3, m0, 10 - palignr m3, m0, 12 - psrldq m0, 8 - - movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - pxor m6, m6 - pcmpgtb m6, m1 - punpckhbw m5, m1, m6 - punpcklbw m1, m6 - - punpcklwd m0, m2 - punpcklwd m3, m1 - pmaddwd m0, [rsp+ 2*16] - pmaddwd m3, [rsp+ 3*16] - paddd m0, m3 - paddd m0, m4 - - movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor m7, m7 - pcmpgtb m7, m2 - punpckhbw m6, m2, m7 - punpcklbw m2, m7 - - palignr m3, m5, m1, 10 - palignr m5, m1, 12 - psrldq m4, m2, 2 - - punpcklwd m3, m5 - punpcklwd m5, m2, m4 - pmaddwd m3, [rsp+ 6*16] - pmaddwd m5, m8 - paddd m3, m5 - paddd m0, m3 - - psrldq m3, m1, 2 - psrldq m4, m1, 4 - psrldq m5, m1, 6 - psrldq m1, 8 - - punpcklwd m3, m4 - punpcklwd m5, m1 - pmaddwd m3, [rsp+ 4*16] - pmaddwd m5, [rsp+ 5*16] - paddd m3, m5 - paddd m0, m3 - -%if %2 - movq m1, [bufyq+xq*2] -%if %3 - movq m3, [bufyq+xq*2+82] -%endif - pmaddubsw m7, m13, m1 -%if %3 - pmaddubsw m5, m13, m3 - paddw m7, m5 -%endif - pmulhrsw m7, m15 -%else - movd m7, [bufyq+xq] - pxor m1, m1 - pcmpgtb m1, m7 - punpcklbw m7, m1 -%endif - - psrldq m1, m2, 4 - psrldq m3, m2, 6 - palignr m4, m6, m2, 10 - palignr m6, m2, 12 - psrldq m2, 8 - - punpcklwd m1, m3 - punpcklwd m2, m4 - punpcklwd m6, m7 - pmaddwd m1, m9 - pmaddwd m2, m10 - pmaddwd m6, m11 - paddd m1, m2 - paddd m0, m6 - paddd m0, m1 - paddd m0, m14 - - movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] - pxor m4, m4 - movd m5, [base+byte_blend] -.x_loop_ar3_inner: - pcmpgtb m2, m4, m1 - punpcklbw m3, m1, m2 - pmaddwd m2, m3, m12 - pshufd m3, m2, q1111 - paddd m2, m3 ; left+cur - paddd m2, m0 ; add top - psrldq m0, 4 - psrad m2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw, we only care about one value - packsswb m2, m2 - pandn m3, m5, m1 - pslld m2, 24 - pand m2, m5 - por m1, m2, m3 - movd [bufq+xq-3], 
m1 - psrldq m1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar3 - RET -%endmacro - -generate_grain_uv_fn 420, 1, 1 -generate_grain_uv_fn 422, 1, 0 -generate_grain_uv_fn 444, 0, 0 - -%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg -%assign %%idx 0 -%define %%tmp %2 -%if %0 == 6 -%define %%tmp %6 -%endif -%rep 4 -%if %%idx == 0 - movd %5 %+ d, %2 - pshuflw %%tmp, %2, q3232 -%else - movd %5 %+ d, %%tmp -%if %%idx == 2 - punpckhqdq %%tmp, %%tmp -%elif %%idx == 4 - psrlq %%tmp, 32 -%endif -%endif - movzx %4 %+ d, %5 %+ w - shr %5 %+ d, 16 - -%if %%idx == 0 - movd %1, [%3+%4] -%else - pinsrw %1, [%3+%4], %%idx + 0 -%endif - pinsrw %1, [%3+%5], %%idx + 1 -%assign %%idx %%idx+2 -%endrep -%endmacro - -INIT_XMM ssse3 -; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize -cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ - dst, src, scaling, unused1, fg_data, picptr, unused2 - ; copy stack arguments to new position post-alignment, so that we - ; don't have to keep the old stack location in a separate register - mov r0, r0m - mov r1, r2m - mov r2, r4m - mov r3, r6m - mov r4, r7m - mov r5, r8m - - mov [rsp+5*mmsize+ 4*gprsize], r0 - mov [rsp+5*mmsize+ 6*gprsize], r1 - mov [rsp+5*mmsize+ 8*gprsize], r2 - mov [rsp+5*mmsize+10*gprsize], r3 - mov [rsp+5*mmsize+11*gprsize], r4 - mov [rsp+5*mmsize+12*gprsize], r5 -%else -cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ - dst, src, scaling, unused1, fg_data, picptr, unused2 -%endif - mov srcq, srcm - mov fg_dataq, r3m - mov scalingq, r5m -%if STACK_ALIGNMENT < mmsize -%define r0m [rsp+5*mmsize+ 4*gprsize] -%define r1m [rsp+5*mmsize+ 5*gprsize] -%define r2m [rsp+5*mmsize+ 6*gprsize] -%define r3m [rsp+5*mmsize+ 7*gprsize] -%define r4m [rsp+5*mmsize+ 8*gprsize] -%define r5m [rsp+5*mmsize+ 9*gprsize] -%define r6m [rsp+5*mmsize+10*gprsize] -%define r7m [rsp+5*mmsize+11*gprsize] -%define r8m [rsp+5*mmsize+12*gprsize] -%endif - LEA r5, pb_mask -%define base r5-pb_mask - mov r5m, picptrq -%else -cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut - lea r7, [pb_mask] -%define base r7-pb_mask -%endif - mov r6d, [fg_dataq+FGData.scaling_shift] - movd m3, [base+mul_bits+r6*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - movd m4, [base+max+r6*4] - movd m5, [base+min+r6*2] - punpcklwd m3, m3 - punpcklwd m4, m4 - punpcklwd m5, m5 - pshufd m3, m3, q0000 - pshufd m4, m4, q0000 - pshufd m5, m5, q0000 - SCRATCH 3, 11, 0 - SCRATCH 4, 12, 1 - SCRATCH 5, 13, 2 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap -%endif - - mov sbyd, r8m - mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 - test overlapd, overlapd - jz .no_vertical_overlap - mova m6, [base+pw_1024] - mova m7, [base+pb_27_17_17_27] - SCRATCH 6, 14, 3 - SCRATCH 7, 15, 4 - test sbyd, sbyd - jnz .vertical_overlap - ; fall-through - -.no_vertical_overlap: - mov r8m, overlapd -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused - imul seed, (173 << 24) | 37 -%else - imul seed, sbyd, (173 << 24) | 37 -%endif - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, w, 
picptr, src_bak - - mov r3m, seed - mov wq, r4m -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - unused1, unused2, see, unused3 -%endif - - lea src_bakq, [srcq+wq] - neg wq - sub dstmp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r4m, wq - DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 -%endif - -.loop_x: -%if ARCH_X86_32 - mov seed, r3m -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, unused - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - -%if ARCH_X86_32 - ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, - ; r6m=grain_lut, r7m=h, r8m=overlap_v|h - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, unused -%endif - -.loop_x_odd: - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq-1, r0, r5, m3 - vpgatherdw m5, m1, scalingq-1, r0, r5, m3 -%else - vpgatherdw m4, m0, scalingq-1, r12, r13, m3 - vpgatherdw m5, m1, scalingq-1, r12, r13, m3 -%endif - REPX {psrlw x, 8}, m4, m5 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m4 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - - add srcq, r2mp - add grain_lutq, 82 - dec hd - jg .loop_y - -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end -%if ARCH_X86_32 - mov srcq, r1mp - add srcq, r4mp -%else - lea srcq, [src_bakq+wq] -%endif - btc dword r8m, 2 - jc .next_blk - - add offxyd, 16 - test dword r8m, 2 ; r8m & 2 = have_top_overlap - jz .loop_x_odd - -%if ARCH_X86_32 - add dword [rsp+5*mmsize+1*gprsize], 16 -%else - add r11d, 16 ; top_offxyd -%endif - jnz .loop_x_odd_v_overlap - -.next_blk: - test dword r8m, 1 - jz .loop_x - - test dword r8m, 2 - jnz .loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -.loop_x_h_overlap: -%if ARCH_X86_32 - ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, - ; r6m=grain_lut, r7m=h, r8m=overlap_v|h - DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 - - add offxyd, 16 ; left_offxyd - mov [rsp+5*mmsize+0*gprsize], offxyd - - DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 - - mov seed, r3m -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy - - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx -%endif - - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - mov offyd, seed - mov offxd, seed -%endif - ror 
offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y_h_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq-1, r0, r5, m3 - vpgatherdw m5, m1, scalingq-1, r0, r5, m3 -%else - vpgatherdw m4, m0, scalingq-1, r12, r13, m3 - vpgatherdw m5, m1, scalingq-1, r12, r13, m3 -%endif - REPX {psrlw x, 8}, m4, m5 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r5, [rsp+5*mmsize+0*gprsize] - movd m7, [grain_lutq+r5] -%else - movd m7, [grain_lutq+left_offxyq] -%endif - punpcklbw m7, m3 - pmaddubsw m6, m15, m7 - pmulhrsw m6, m14 - packsswb m6, m6 - shufps m6, m3, q3210 - pcmpgtb m2, m6 - punpcklbw m7, m6, m2 - punpckhbw m6, m2 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m7, m4 - pmullw m6, m5 - pmulhrsw m7, m11 - pmulhrsw m6, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m7 - paddw m1, m6 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - - add srcq, r2mp - add grain_lutq, 82 - dec hd - jg .loop_y_h_overlap - -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end -%if ARCH_X86_32 - mov srcq, r1m - add srcq, r4m -%else - lea srcq, [src_bakq+wq] -%endif - xor dword r8m, 4 - add offxyd, 16 - - ; since this half-block had left-overlap, the next does not - test dword r8m, 2 ; have_top_overlap - jz .loop_x_odd -%if ARCH_X86_32 - add dword [rsp+5*mmsize+1*gprsize], 16 -%else - add r11d, 16 ; top_offxyd -%endif - jmp .loop_x_odd_v_overlap - -.end: - RET - -.vertical_overlap: -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap -%endif - - or overlapd, 2 ; top_overlap: overlap & 2 - mov r8m, overlapd - movzx sbyd, sbyb -%if ARCH_X86_32 - imul r4, [fg_dataq+FGData.seed], 0x00010001 - DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused -%else - imul seed, [fg_dataq+FGData.seed], 0x00010001 -%endif - imul tmpd, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add tmpd, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and tmpd, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, tmpd -%if ARCH_X86_32 - xor sbyd, seed ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak - - mov r3m, seed - mov wq, r4m -%else - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - tmp, unused2, see, unused3 -%endif - - lea src_bakq, [srcq+wq] - neg wq - sub dstmp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r4m, wq - DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 -%endif - -.loop_x_v_overlap: -%if ARCH_X86_32 - mov seed, r3m -%endif - ; we assume from the block above that bits 8-15 of tmpd are zero'ed, - ; because of the 'and tmpd, 0x00ff00ff' above - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; updated (cur_seed << 
16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, unused, top_offxy - - mov offyd, seed - mov offxd, seed -%endif - - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - -%if ARCH_X86_32 - DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, unused, top_offxy -%endif - - movzx top_offxyd, offxyw -%if ARCH_X86_32 - mov [rsp+5*mmsize+1*gprsize], top_offxyd - - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - shr offxyd, 16 - -.loop_x_odd_v_overlap: -%if ARCH_X86_32 - mov r5, r5m - lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+12], r5 -%else - mova m8, [pb_27_17] -%endif - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y_v_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq-1, r0, r5, m3 - vpgatherdw m5, m1, scalingq-1, r0, r5, m3 -%else - vpgatherdw m4, m0, scalingq-1, r12, r13, m3 - vpgatherdw m5, m1, scalingq-1, r12, r13, m3 -%endif - REPX {psrlw x, 8}, m4, m5 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r5, [rsp+5*mmsize+1*gprsize] - movu m7, [grain_lutq+r5] -%else - movu m7, [grain_lutq+top_offxyq] -%endif - punpckhbw m6, m7, m3 - punpcklbw m7, m3 -%if ARCH_X86_32 - mov r5, [rsp+5*mmsize+12] - pmaddubsw m3, [r5], m6 - pmaddubsw m6, [r5], m7 -%else - pmaddubsw m3, m8, m6 - pmaddubsw m6, m8, m7 -%endif - pmulhrsw m3, m14 - pmulhrsw m6, m14 - packsswb m6, m3 - pcmpgtb m7, m2, m6 - punpcklbw m2, m6, m7 - punpckhbw m6, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m4 - pmullw m6, m5 - pmulhrsw m2, m11 - pmulhrsw m6, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m6 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add dword [rsp+5*mmsize+12], mmsize -%else - mova m8, [pb_17_27] -%endif - add srcq, r2mp - add grain_lutq, 82 - dec hw - jz .end_y_v_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_v_overlap - jmp .loop_y - -.end_y_v_overlap: -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end_hv -%if ARCH_X86_32 - mov srcq, r1mp - add srcq, r4mp -%else - lea srcq, [src_bakq+wq] -%endif - btc dword r8m, 2 - jc .loop_x_hv_overlap - add offxyd, 16 -%if ARCH_X86_32 - add dword [rsp+5*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - jmp .loop_x_odd_v_overlap - -.loop_x_hv_overlap: -%if ARCH_X86_32 - mov r5, r5m - lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+12], r5 - - DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak - - mov r5, [rsp+5*mmsize+1*gprsize] - mov r4, offxyd - add r5, 16 - add r4, 16 - mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy - mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy - - DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak - - xor tmpd, tmpd - mov seed, r3m -%else - mova m8, [pb_27_17] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - tmp, unused2, see, unused3 - - ; we assume 
from the block above that bits 8-15 of tmpd are zero'ed -%endif - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; updated (cur_seed << 16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy - - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offyq+16] - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut - - movzx r5, offxyw ; top_offxy - mov [rsp+5*mmsize+1*gprsize], r5 -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy - - movzx top_offxyd, offxyw -%endif - shr offxyd, 16 - - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y_hv_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy - mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy - movu m6, [grain_lutq+r5] - mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy - movd m4, [grain_lutq+r0] - movd m7, [grain_lutq+r5] -%else - movu m6, [grain_lutq+top_offxyq] - movd m4, [grain_lutq+left_offxyq] - movd m7, [grain_lutq+topleft_offxyq] -%endif - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw m4, m3 - punpcklbw m7, m6 - pmaddubsw m2, m15, m4 - pmaddubsw m4, m15, m7 - pmulhrsw m2, m14 - pmulhrsw m4, m14 - packsswb m2, m2 - packsswb m4, m4 - shufps m2, m3, q3210 - shufps m4, m6, q3210 - ; followed by v interpolation (top | cur -> cur) - punpcklbw m3, m4, m2 - punpckhbw m4, m2 -%if ARCH_X86_32 - mov r5, [rsp+5*mmsize+12] - pmaddubsw m7, [r5], m4 - pmaddubsw m4, [r5], m3 -%else - pmaddubsw m7, m8, m4 - pmaddubsw m4, m8, m3 -%endif - pmulhrsw m7, m14 - pmulhrsw m4, m14 - packsswb m4, m7 - pxor m2, m2 - pcmpgtb m7, m2, m4 - punpcklbw m3, m4, m7 - punpckhbw m4, m7 - - ; src - mova m0, [srcq] - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m5, m0, scalingq-1, r0, r5, m7 - vpgatherdw m6, m1, scalingq-1, r0, r5, m7 -%else - vpgatherdw m5, m0, scalingq-1, r13, r14, m7 - vpgatherdw m6, m1, scalingq-1, r13, r14, m7 -%endif - REPX {psrlw x, 8}, m5, m6 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m3, m5 - pmullw m4, m6 - pmulhrsw m3, m11 - pmulhrsw m4, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m3 - paddw m1, m4 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add dword [rsp+5*mmsize+12], mmsize -%else - mova m8, [pb_17_27] -%endif - add srcq, r2mp - add grain_lutq, 82 - dec hw - jz .end_y_hv_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_hv_overlap - jmp .loop_y_h_overlap - -.end_y_hv_overlap: -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end_hv -%if ARCH_X86_32 - mov srcq, r1m - add srcq, r4m -%else - 
lea srcq, [src_bakq+wq] -%endif - xor dword r8m, 4 - add offxyd, 16 -%if ARCH_X86_32 - add dword [rsp+5*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - jmp .loop_x_odd_v_overlap - -.end_hv: - RET - -%macro FGUV_FN 3 ; name, ss_hor, ss_ver -INIT_XMM ssse3 -%if ARCH_X86_32 -; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, -; sby, luma, lstride, uv_pl, is_id) -%if STACK_ALIGNMENT < mmsize -DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 -cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ - tmp, src, scaling, h, fg_data, picptr, unused - mov r0, r0m - mov r1, r2m - mov r2, r4m - mov r3, r6m - mov r4, r7m - mov [rsp+7*mmsize+3*gprsize], r0 - mov [rsp+7*mmsize+5*gprsize], r1 - mov [rsp+7*mmsize+7*gprsize], r2 - mov [rsp+7*mmsize+9*gprsize], r3 - mov [rsp+7*mmsize+10*gprsize], r4 - - mov r0, r8m - mov r1, r9m - mov r2, r10m - mov r4, r11m - mov r3, r12m - mov [rsp+7*mmsize+11*gprsize], r0 - mov [rsp+7*mmsize+12*gprsize], r1 - mov [rsp+7*mmsize+13*gprsize], r2 - mov [rsp+7*mmsize+14*gprsize], r4 -%else -cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ - tmp, src, scaling, h, fg_data, picptr, unused -%endif - mov srcq, srcm - mov fg_dataq, r3m - mov scalingq, r5m -%if STACK_ALIGNMENT < mmsize -%define r0m [rsp+7*mmsize+ 3*gprsize] -%define r1m [rsp+7*mmsize+ 4*gprsize] -%define r2m [rsp+7*mmsize+ 5*gprsize] -%define r3m [rsp+7*mmsize+ 6*gprsize] -%define r4m [rsp+7*mmsize+ 7*gprsize] -%define r5m [rsp+7*mmsize+ 8*gprsize] -%define r6m [rsp+7*mmsize+ 9*gprsize] -%define r7m [rsp+7*mmsize+10*gprsize] -%define r8m [rsp+7*mmsize+11*gprsize] -%define r9m [rsp+7*mmsize+12*gprsize] -%define r10m [rsp+7*mmsize+13*gprsize] -%define r11m [rsp+7*mmsize+14*gprsize] -%define r12m [rsp+7*mmsize+15*gprsize] -%endif - LEA r5, pb_mask -%define base r5-pb_mask - mov r5m, r5 -%else -cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, tmp, sby, luma, lstride, uv_pl, is_id - lea r8, [pb_mask] -%define base r8-pb_mask -%endif - mov r6d, [fg_dataq+FGData.scaling_shift] - movd m3, [base+mul_bits+r6*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - lea tmpd, [r6d*2] -%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize - test r3, r3 -%else - cmp dword r12m, 0 ; is_idm -%endif - movd m5, [base+min+r6*2] - cmovne r6d, tmpd - movd m4, [base+max+r6*2] - punpcklwd m3, m3 - punpcklwd m5, m5 - punpcklwd m4, m4 - pshufd m3, m3, q0000 - pshufd m5, m5, q0000 - pshufd m4, m4, q0000 - SCRATCH 3, 11, 0 - SCRATCH 4, 12, 1 - SCRATCH 5, 13, 2 - - cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 - jne .csfl - -%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap -%endif - -%if %1 - mov r6d, dword r11m - movd m0, [fg_dataq+FGData.uv_mult+r6*4] - movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] - punpcklbw m6, m1, m0 - movd m7, [fg_dataq+FGData.uv_offset+r6*4] - punpcklwd m6, m6 - punpcklwd m7, m7 - pshufd m6, m6, q0000 - pshufd m7, m7, q0000 - SCRATCH 6, 14, 3 - SCRATCH 7, 15, 4 -%endif - - mov sbyd, r8m - mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 - test overlapd, overlapd - jz %%no_vertical_overlap -%if ARCH_X86_32 -%if %2 - mova m1, [base+pb_23_22_h] -%else - mova m1, [base+pb_27_17_17_27] -%endif - mova m0, [base+pw_1024] -%else -%if %2 - mova m1, [pb_23_22_h] -%else - mova m1, [pb_27_17_17_27] 
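The loads of FGData.uv_mult, FGData.uv_luma_mult and FGData.uv_offset above feed the pmaddubsw/psraw/paddw/packuswb sequence used in the chroma loops below to build the scaling-LUT index when chroma_scaling_from_luma is not set. A minimal scalar sketch of that step, assuming 8bpc; the function name is illustrative only and this is not dav1d's reference implementation:

#include <stdint.h>

static inline int iclip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* 8bpc sketch: combine the averaged co-located luma and the chroma sample
 * with the per-plane multipliers, shift by 6, add the offset and clip to
 * u8; the result indexes scaling[] before the grain is applied. */
static int fguv_scaling_index_sketch(int luma_avg, int chroma,
                                     int uv_luma_mult, int uv_mult,
                                     int uv_offset)
{
    const int combined = luma_avg * uv_luma_mult + chroma * uv_mult;
    return iclip_u8((combined >> 6) + uv_offset);
}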
-%endif - mova m0, [pw_1024] -%endif - SCRATCH 0, 8, 5 - SCRATCH 1, 9, 6 - test sbyd, sbyd - jnz %%vertical_overlap - ; fall-through - -%%no_vertical_overlap: - mov r8m, overlapd -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap - imul seed, (173 << 24) | 37 -%else - imul seed, sbyd, (173 << 24) | 37 -%endif - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak -%define luma_bakq lumaq - - mov wq, r4m -%if %3 - shl r10mp, 1 -%endif -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak - - mov lstrideq, r10mp -%endif - - mov lumaq, r9mp - lea src_bakq, [srcq+wq] - lea luma_bakq, [lumaq+wq*(1+%2)] - neg wq - sub r0mp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r11m, luma_bakq - mov r4m, wq - - DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 -%else - mov r11mp, src_bakq - mov r12mp, strideq -%endif - -%%loop_x: -%if ARCH_X86_32 - mov seed, r3m -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, unused1, unused2, lstride - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, overlap, unused1, unused2, lstride, luma_bak -%endif - -%%loop_x_odd: - mov hd, r7m - mov grain_lutq, grain_lutmp -%%loop_y: - ; src -%if ARCH_X86_32 - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq-1, r0, r5 - vpgatherdw m5, m6, scalingq-1, r0, r5 -%else - vpgatherdw m7, m4, scalingq-1, r12, r2 - vpgatherdw m5, m6, scalingq-1, r12, r2 -%endif - REPX {psrlw x, 8}, m7, m5 - - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq+ 0] - pcmpgtb m6, m2, m3 - punpcklbw m2, m3, m6 - punpckhbw m3, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; dst = clip_pixel(src, noise) - paddw 
m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add srcq, r2mp - ; we already incremented lumaq above -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*2] -%else - add lumaq, lstrideq -%endif -%endif - add grain_lutq, 82 - dec hw - jg %%loop_y - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif -%if %2 == 0 - ; adjust top_offxy -%if ARCH_X86_32 - add dword [rsp+7*mmsize+1*gprsize], 16 -%else - add r11d, 16 -%endif - add offxyd, 16 - btc dword r8m, 2 - jc %%loop_x_even - test dword r8m, 2 - jz %%loop_x_odd - jmp %%loop_x_odd_v_overlap -%%loop_x_even: -%endif - test dword r8m, 1 - jz %%loop_x - - ; r8m = sbym - test dword r8m, 2 - jne %%loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -%%loop_x_h_overlap: -%if ARCH_X86_32 -%if %2 - lea r6, [offxyd+16] - mov [rsp+7*mmsize+0*gprsize], r6 -%else - mov [rsp+7*mmsize+0*gprsize], offxyd -%endif - - DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut - - mov seed, r3m -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, unused1, unused2, lstride - -%if %2 - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx -%else - mov left_offxyd, offyd -%endif -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, unused1, unused2, lstride - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -%%loop_y_h_overlap: - ; src -%if ARCH_X86_32 - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq-1, r0, r5 - vpgatherdw m5, m6, scalingq-1, r0, r5 -%else - vpgatherdw m7, m4, scalingq-1, r12, r2 - vpgatherdw m5, m6, scalingq-1, r12, r2 -%endif - REPX {psrlw x, 8}, m7, m5 - - ; unpack 
chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] - movu m4, [grain_lutq+offxyq+ 0] -%if ARCH_X86_32 - mov r0, [rsp+7*mmsize+0*gprsize] - movd m2, [grain_lutq+r0+ 0] -%else - movd m2, [grain_lutq+left_offxyq+ 0] -%endif - punpcklbw m2, m4 - pmaddubsw m3, m9, m2 - pmulhrsw m3, m8 - packsswb m3, m3 - shufps m3, m4, q3210 - pxor m4, m4 - pcmpgtb m4, m3 - punpcklbw m2, m3, m4 - punpckhbw m3, m4 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add srcq, r2mp - ; lumaq has already been incremented above -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*2] -%else - add lumaq, lstrideq -%endif -%endif - add grain_lutq, 82 - dec hw - jg %%loop_y_h_overlap - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif -%if %2 == 0 - xor dword r8m, 4 - ; adjust top_offxyd -%if ARCH_X86_32 - add dword [rsp+7*mmsize+1*gprsize], 16 -%else - add r11d, 16 -%endif - add offxyd, 16 -%endif - - ; r8m = sbym - test dword r8m, 2 -%if %2 - jne %%loop_x_hv_overlap - jmp %%loop_x_h_overlap -%else - jne %%loop_x_odd_v_overlap - jmp %%loop_x_odd -%endif - -%%end: - RET - -%%vertical_overlap: -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap -%endif - - or overlapd, 2 ; top_overlap: overlap & 2 - mov r8m, overlapd - movzx sbyd, sbyb -%if ARCH_X86_32 - imul r4, [fg_dataq+FGData.seed], 0x00010001 - DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused -%else - imul seed, [fg_dataq+FGData.seed], 0x00010001 -%endif - imul tmpd, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add tmpd, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and tmpd, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, tmpd -%if ARCH_X86_32 - xor sbyd, seed ; (cur_seed << 16) | top_seed - - DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak - - mov r3m, seed - mov wq, r4m -%if %3 - shl r10mp, 1 -%endif -%else - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak - - mov lstrideq, r10mp -%endif - - mov lumaq, r9mp - lea src_bakq, [srcq+wq] - lea luma_bakq, [lumaq+wq*(1+%2)] - neg wq - sub r0mp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r11m, luma_bakq - mov r4m, wq - - DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 -%else - mov r11mp, src_bakq - mov r12mp, strideq -%endif - -%%loop_x_v_overlap: -%if ARCH_X86_32 - mov seed, r3m - xor tmpd, tmpd -%endif - ; we assume from the block above that bits 8-15 of tmpd are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; 
updated (cur_seed << 16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, top_offxy, unused, lstride - - mov offxd, seed - mov offyd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - -%if ARCH_X86_32 - DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak -%endif - - movzx top_offxyd, offxyw - shr offxyd, 16 -%if ARCH_X86_32 - mov [rsp+7*mmsize+1*gprsize], top_offxyd - - DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut -%endif - -%%loop_x_odd_v_overlap: - mov hd, r7m - mov grain_lutq, grain_lutmp -%if ARCH_X86_32 - mov r5, r5m -%endif -%if %3 - mova m1, [PIC_ptr(pb_23_22)] -%else - mova m1, [PIC_ptr(pb_27_17)] -%endif -%%loop_y_v_overlap: -%if ARCH_X86_32 - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq-1, r0, r5 - vpgatherdw m5, m6, scalingq-1, r0, r5 -%else - vpgatherdw m7, m4, scalingq-1, r12, r2 - vpgatherdw m5, m6, scalingq-1, r12, r2 -%endif - REPX {psrlw x, 8}, m7, m5 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r0, [rsp+7*mmsize+1*gprsize] - movu m4, [grain_lutq+r0] -%else - movu m4, [grain_lutq+top_offxyq] -%endif - punpckhbw m6, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, m1, m6 - pmaddubsw m3, m1, m4 - pmulhrsw m2, m8 - pmulhrsw m3, m8 - packsswb m3, m2 - pxor m6, m6 - pcmpgtb m6, m3 - punpcklbw m2, m3, m6 - punpckhbw m3, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; unpack chroma_source - pxor m4, m4 - punpckhbw m6, m0, m4 - punpcklbw m0, m4 ; m0-1: src as word - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m6, m3 - pmaxsw m0, m13 - pmaxsw m6, m13 - pminsw m0, m12 - pminsw m6, m12 - packuswb m0, m6 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - - dec hw - je %%end_y_v_overlap -%if ARCH_X86_32 - add srcq, r2mp - ; lumaq has already been incremented above -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*2] -%else - add lumaq, lstrideq -%endif -%endif - add grain_lutq, 82 -%if %3 == 0 - btc hd, 16 -%if ARCH_X86_32 - mov r5, r5m -%endif - mova m1, [PIC_ptr(pb_17_27)] - jnc %%loop_y_v_overlap -%endif - 
jmp %%loop_y - -%%end_y_v_overlap: -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end_hv -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif - -%if %2 - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap -%else -%if ARCH_X86_32 - add dword [rsp+7*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - add offxyd, 16 - btc dword r8m, 2 - jnc %%loop_x_odd_v_overlap -%endif - -%%loop_x_hv_overlap: -%if ARCH_X86_32 - DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused - - mov r6, [rsp+7*mmsize+1*gprsize] -%if %2 - lea r0, [r3d+16] - add r6, 16 - mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy -%else - mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy -%endif - mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy - - DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused - - mov seed, r3m - xor tmpd, tmpd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride - -%if %2 - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offxyq+16] -%else - mov topleft_offxyq, top_offxyq - mov left_offxyq, offxyq -%endif - - ; we assume from the block above that bits 8-15 of tmpd are zero'ed -%endif - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; updated (cur_seed << 16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride - - mov offxd, seed - mov offyd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - -%if ARCH_X86_32 - DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak -%endif - - movzx top_offxyd, offxyw - shr offxyd, 16 -%if ARCH_X86_32 - mov [rsp+7*mmsize+1*gprsize], top_offxyd -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -%if ARCH_X86_32 - mov r5, r5m -%endif -%if %3 - mova m3, [PIC_ptr(pb_23_22)] -%else - mova m3, [PIC_ptr(pb_27_17)] -%endif -%%loop_y_hv_overlap: - ; grain = grain_lut[offy+y][offx+x] -%if ARCH_X86_32 - mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy - mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy - movd m1, [grain_lutq+r0] - mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy -%else - movd m1, [grain_lutq+topleft_offxyq] -%endif - movu m2, [grain_lutq+offxyq] -%if ARCH_X86_32 - movu m6, [grain_lutq+r5] - movd m4, [grain_lutq+r0] -%else - movu m6, [grain_lutq+top_offxyq] - movd m4, [grain_lutq+left_offxyq] -%endif - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw m1, m6 - punpcklbw m4, m2 - pmaddubsw m0, m9, m1 - pmaddubsw m1, m9, m4 - REPX {pmulhrsw x, m8}, m0, m1 - packsswb m0, m1 - shufps m4, m0, m2, q3232 - shufps m0, 
m6, q3210 - ; followed by v interpolation (top | cur -> cur) - punpcklbw m2, m0, m4 - punpckhbw m0, m4 - pmaddubsw m4, m3, m0 - pmaddubsw m1, m3, m2 - pmulhrsw m4, m8 - pmulhrsw m1, m8 - packsswb m1, m4 - - ; src -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq-1, r0, r5 - vpgatherdw m5, m6, scalingq-1, r0, r5 -%else -%if %3 - vpgatherdw m7, m4, scalingq-1, r2, r12 - vpgatherdw m5, m6, scalingq-1, r2, r12 -%else - vpgatherdw m7, m4, scalingq-1, r2, r13 - vpgatherdw m5, m6, scalingq-1, r2, r13 -%endif -%endif - REPX {psrlw x, 8}, m7, m5 - - ; unpack grain - pxor m4, m4 - pcmpgtb m4, m1 - punpcklbw m2, m1, m4 - punpckhbw m1, m4 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m1, m5 - pmulhrsw m2, m11 - pmulhrsw m1, m11 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; unpack chroma source - pxor m4, m4 - punpckhbw m5, m0, m4 - punpcklbw m0, m4 ; m0-1: src as word - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m5, m1 - pmaxsw m0, m13 - pmaxsw m5, m13 - pminsw m0, m12 - pminsw m5, m12 - packuswb m0, m5 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add srcq, r2mp - ; lumaq has been adjusted above already -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*(1+%2)] -%else - add lumaq, r10mp -%endif -%endif - add grain_lutq, 82 - dec hw -%if %3 - jg %%loop_y_h_overlap -%else - jle %%end_y_hv_overlap -%if ARCH_X86_32 - mov r5, r5m -%endif - mova m3, [PIC_ptr(pb_17_27)] - btc hd, 16 - jnc %%loop_y_hv_overlap -%if ARCH_X86_64 - mov lstrideq, r10mp -%endif - jmp %%loop_y_h_overlap -%%end_y_hv_overlap: -%if ARCH_X86_64 - mov lstrideq, r10mp -%endif -%endif - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end_hv -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif -%if %2 - jmp %%loop_x_hv_overlap -%else -%if ARCH_X86_32 - add dword [rsp+7*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - add offxyd, 16 - xor dword r8m, 4 - jmp %%loop_x_odd_v_overlap -%endif - -%%end_hv: - RET -%endmacro - - %%FGUV_32x32xN_LOOP 1, %2, %3 -.csfl: - %%FGUV_32x32xN_LOOP 0, %2, %3 -%endmacro - -FGUV_FN 420, 1, 1 - -%if STACK_ALIGNMENT < mmsize -DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 -%endif - -FGUV_FN 422, 1, 0 - -%if STACK_ALIGNMENT < mmsize -DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 -%endif - -FGUV_FN 444, 0, 0 diff -Nru dav1d-0.9.2/src/x86/filmgrain_sse.asm 
dav1d-1.0.0/src/x86/filmgrain_sse.asm --- dav1d-0.9.2/src/x86/filmgrain_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/filmgrain_sse.asm 2022-03-18 14:31:56.010356000 +0000 @@ -0,0 +1,3233 @@ +; Copyright © 2019-2021, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +SECTION_RODATA + +pw_1024: times 8 dw 1024 +pb_27_17_17_27: db 27, 17, 17, 27 + times 6 db 0, 32 +pb_23_22_h: db 23, 22 + times 7 db 0, 32 +pb_27_17: times 8 db 27, 17 +pb_17_27: times 8 db 17, 27 +pb_23_22: times 8 db 23, 22 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +max: dw 255, 240, 235 +min: dw 0, 16 +pw_1: dw 1 + +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 + +SECTION .text + +%if ARCH_X86_32 +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r2d, [fg_dataq+FGData.grain_scale_shift] + movd m2, [base+round+r2*2] + movd m0, [fg_dataq+FGData.seed] + mova m5, [base+pb_mask] + pshuflw m2, m2, q0000 + pshuflw m0, m0, q0000 + mov r2, -73*82 + sub bufq, r2 + lea r3, [base+gaussian_sequence] +.loop: + 
pand m6, m0, m1 + psrlw m3, m6, 10 + por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m6, m4 ; bits 0x0f00 are set + pshufb m3, m5, m6 ; set 15th bit for next 4 seeds + psllq m6, m3, 30 + por m3, m6 + psllq m6, m3, 15 + por m3, m6 ; aggregate each bit into next seed's high bit + pmulhuw m6, m0, m7 + por m3, m6 ; 4 next output seeds + pshuflw m0, m3, q3333 + psrlw m3, 5 +%if ARCH_X86_64 + movq r6, m3 + mov r8, r6 + movzx r5d, r6w + shr r6d, 16 + shr r8, 32 + movzx r7, r8w + shr r8, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + pinsrw m6, [r3+r7*2], 2 + pinsrw m6, [r3+r8*2], 3 +%else + movd r6, m3 + pshuflw m3, m3, q3232 + movzx r5, r6w + shr r6, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + + movd r6, m3 + movzx r5, r6w + shr r6, 16 + + pinsrw m6, [r3+r5*2], 2 + pinsrw m6, [r3+r6*2], 3 +%endif + pmulhrsw m6, m2 + packsswb m6, m6 + movd [bufq+r2], m6 + add r2, 4 + jl .loop + + ; auto-regression code + movsxd r2, [fg_dataq+FGData.ar_coeff_lag] + movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] + lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table] + jmp r2 + +.ar1: +%if ARCH_X86_32 + DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max +%elif WIN64 + DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 + mov bufq, r0 +%else + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov ecx, [fg_dataq+FGData.ar_coeff_shift] +%if ARCH_X86_32 + mov r1m, cf3d + DEFINE_ARGS buf, shift, val3, min, max, x, val0 +%define hd r0mp +%define cf3d r1mp +%elif WIN64 + DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 +%else + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 +%endif + pxor m6, m6 + pcmpgtb m7, m6, m4 + punpcklbw m4, m7 + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pshuflw m3, m3, q0000 + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + movq m0, [bufq+xq-82-1] ; top/left + pcmpgtb m7, m6, m0 + punpcklbw m0, m7 + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend+1] + SCRATCH 7, 15, 7 + movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pxor m7, m7 + pshuflw m6, m6, q0000 + punpcklwd m6, m7 + pcmpgtb m4, m7, m0 + pcmpgtb m5, m7, m1 + punpcklbw m0, m4 + punpcklbw m1, m5 + DEFINE_ARGS buf, fg_data, h, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + 
SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m7, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m6, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m6, m1 + psrldq m5, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m5, m1 + pmaddwd m4, m9 + pmaddwd m6, m10 + pmaddwd m5, m12 + paddd m4, m6 + paddd m2, m5 + paddd m2, m4 + paddd m2, m14 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pcmpgtb m4, m7, m0 + punpcklbw m1, m0, m4 + pmaddwd m3, m1, m13 + paddd m3, m2 + psrldq m1, 4 ; y=0,x=0 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw m3, m1 + packsswb m3, m3 + pslldq m3, 2 + pand m3, m15 + pandn m1, m15, m0 + por m0, m1, m3 + psrldq m0, 1 + ; overwrite 2 pixels, but that's ok + movd [bufq+xq-1], m0 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if ARCH_X86_32 +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 +%elif WIN64 + SUB rsp, 16*6 +%assign stack_size_padded (stack_size_padded+16*6) +%assign stack_size (stack_size+16*6) +%else + ALLOC_STACK -16*6 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend] + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pxor m3, m3 + pcmpgtb m4, m3, m0 + pcmpgtb m3, m2 + pshuflw m6, m6, q0000 + SCRATCH 6, 14, 12 + SCRATCH 7, 15, 13 + punpckhbw m1, m0, m4 + punpcklbw m0, m4 + punpcklbw m2, m3 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m5, m0, q3333 + pshufd m0, m0, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m3 + mova [rsp+ 2*16], m4 + mova [rsp+ 3*16], m5 + pshufd m6, m1, q1111 + pshufd m7, m1, q2222 + pshufd m5, m1, q3333 + pshufd m1, m1, q0000 + pshufd m3, m2, q1111 + psrldq m0, m2, 10 + pinsrw m2, [base+pw_1], 5 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + pinsrw m0, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m6 + SCRATCH 7, 8, 6 + SCRATCH 5, 9, 7 + SCRATCH 2, 10, 8 + SCRATCH 3, 11, 9 + SCRATCH 4, 12, 10 + SCRATCH 0, 13, 11 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m2, m0, m3 + punpcklbw m0, m3 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m5, m5 + pcmpgtb m5, m1 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + palignr m6, m2, m0, 10 + palignr m7, m2, m0, 12 + psrldq m0, 8 + punpcklwd m0, m6 + punpcklwd m7, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m7, [rsp+ 3*16] + paddd m0, m7 + paddd m0, m4 + + psrldq m4, m1, 2 + psrldq m5, m1, 4 + psrldq m6, m1, 6 + psrldq m7, m1, 8 + 
punpcklwd m4, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 4*16] + pmaddwd m6, [rsp+ 5*16] + paddd m4, m6 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m5, m2, m7 + punpcklbw m2, m7 + palignr m7, m3, m1, 10 + palignr m3, m1, 12 + psrldq m1, m2, 2 + punpcklwd m7, m3 + punpcklwd m3, m2, m1 + pmaddwd m7, m8 + pmaddwd m3, m9 + paddd m7, m3 + paddd m0, m7 + + psrldq m6, m2, 4 + psrldq m1, m2, 6 + psrldq m3, m2, 8 + palignr m4, m5, m2, 10 + palignr m5, m5, m2, 12 + + punpcklwd m6, m1 + punpcklwd m3, m4 + punpcklwd m5, m14 + pmaddwd m6, m10 + pmaddwd m3, m11 + pmaddwd m5, m12 + paddd m0, m6 + paddd m3, m5 + paddd m0, m3 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pxor m5, m5 + pcmpgtb m5, m1 + punpcklbw m2, m1, m5 + pmaddwd m2, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb m2, m2 + pslldq m2, 3 + pand m2, m15 + pandn m3, m15, m1 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv + movifnidn r2, r2mp + movifnidn r3, r3mp + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movd m6, [base+round+r5*2] + mova m5, [base+pb_mask] + movd m0, [fg_dataq+FGData.seed] + movd m2, [base+pw_seed_xor+uvq*4] + pxor m0, m2 + pshuflw m6, m6, q0000 + pshuflw m0, m0, q0000 + lea r6, [base+gaussian_sequence] +%if %2 +%if ARCH_X86_64 + mov r7d, 73-35*%3 +%else + mov r3mp, 73-35*%3 +%endif + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -82*73 + sub bufq, r5 +.loop: +%endif + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m3, m2 + psllq m2, m3, 15 + por m3, m2 ; aggregate each bit into next seed's high bit + pmulhuw m2, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + movd r9d, m2 + pshuflw m2, m2, q3232 + movzx r8, r9w + shr r9, 16 + + movd m3, [r6+r8*2] + pinsrw m3, [r6+r9*2], 1 + + movd r9d, m2 + movzx r8, r9w + shr r9, 16 + + pinsrw m3, [r6+r8*2], 2 + pinsrw m3, [r6+r9*2], 3 +%else + movd r2, m2 + pshuflw m2, m2, q3232 + movzx r1, r2w + shr r2, 16 + + movd m3, [r6+r1*2] + pinsrw m3, [r6+r2*2], 1 + + movd r2, m2 + movzx r1, r2w + shr r2, 16 + + pinsrw m3, [r6+r1*2], 2 + pinsrw m3, [r6+r2*2], 3 +%endif + pmulhrsw m3, m6 + packsswb m3, m3 + movd [bufq+r5], m3 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 +%if ARCH_X86_64 + dec r7d +%else + dec r3mp +%endif + jg .loop_y +%else + jl .loop +%endif + +%if ARCH_X86_32 + mov r2, r2mp +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] + jmp r5 + +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -2*16 +%endif + imul uvd, 28 + mov shiftd, 
[fg_dataq+FGData.ar_coeff_shift] + movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd m4, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h, x + pxor m0, m0 + pcmpgtb m0, m5 + punpcklbw m5, m0 + movd m7, [base+pb_1] +%if %2 + movd m6, [base+hmul_bits+2+%3*2] +%endif + pshuflw m5, m5, q0000 + pshuflw m4, m4, q0000 + pshufd m7, m7, q0000 +%if %2 + pshuflw m6, m6, q0000 +%endif + punpcklqdq m5, m5 + punpcklqdq m4, m4 +%if %2 + punpcklqdq m6, m6 +%endif + pcmpeqw m1, m1 + pslldq m1, 12>>%2 + SCRATCH 1, 8, 0 + SCRATCH 4, 9, 1 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: + xor xd, xd +.x_loop_ar0: + ; first 32 pixels +%if %2 + movu m1, [bufyq+xq*2] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + movu m3, [bufyq+xq*2+16] +%if %3 + movu m4, [bufyq+xq*2+82+16] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 +%endif + pmaddubsw m2, m7, m3 +%if %3 + pmaddubsw m3, m7, m4 + paddw m0, m1 + paddw m2, m3 +%endif + pmulhrsw m0, m6 + pmulhrsw m2, m6 +%else + movu m0, [bufyq+xq] + pxor m6, m6 + pcmpgtb m6, m0 + punpckhbw m2, m0, m6 + punpcklbw m0, m6 +%endif + pmullw m0, m5 + pmullw m2, m5 + pmulhrsw m0, m9 + pmulhrsw m2, m9 + movu m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpckhbw m3, m1, m4 +%if %2 + punpcklbw m1, m4 + paddw m2, m3 + paddw m0, m1 +%else + punpcklbw m6, m1, m4 + paddw m2, m3 + paddw m0, m6 +%endif + packsswb m0, m2 +%if %2 + movu [bufq+xq], m0 + add xd, 16 + cmp xd, 32 + jl .x_loop_ar0 + + ; last 6/12 pixels + movu m1, [bufyq+xq*(1+%2)] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 + paddw m0, m1 +%endif + pmulhrsw m0, m6 + pmullw m0, m5 + pmulhrsw m0, m9 + movq m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + paddw m0, m2 + packsswb m0, m0 + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movq [bufq+xq], m2 +%else + add xd, 16 + cmp xd, 80 + je .y_loop_final_ar0 + movu [bufq+xq-16], m0 + jmp .x_loop_ar0 +.y_loop_final_ar0: + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movu [bufq+xq-16], m2 +%endif + + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] + pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 +%if ARCH_X86_32 + mov r3mp, cf3d + DEFINE_ARGS buf, shift, fg_data, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x + mov bufq, r0 +%else + DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + movd m7, [base+pb_1] + movd m6, [base+hmul_bits+2+%3*2] +%endif + psrldq m4, 1 +%if ARCH_X86_32 + DEFINE_ARGS buf, shift, val0, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 +%else + DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 +%endif + pxor m5, m5 + punpcklwd m3, m5 +%if %2 + punpcklwd m6, m6 +%endif + pcmpgtb m5, m4 + punpcklbw m4, m5 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + pshufd m3, m3, q0000 +%if %2 + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif +%if ARCH_X86_32 + add r1mp, 79+82*3 + mov r0mp, 
70-35*%3 +%else + add bufyq, 79+82*3 + mov hd, 70-35*%3 +%endif + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: +%if %2 +%if ARCH_X86_32 + mov r2, r1mp + movq m0, [r2+xq*2] +%if %3 + movq m1, [r2+xq*2+82] +%endif +%else + movq m0, [bufyq+xq*2] +%if %3 + movq m1, [bufyq+xq*2+82] +%endif +%endif + pmaddubsw m2, m7, m0 +%if %3 + pmaddubsw m0, m7, m1 + paddw m2, m0 +%endif + pmulhrsw m2, m6 +%else +%if ARCH_X86_32 + mov r2, r1mp + movd m2, [r2+xq] +%else + movd m2, [bufyq+xq] +%endif + pxor m0, m0 + pcmpgtb m0, m2 + punpcklbw m2, m0 +%endif + + movq m0, [bufq+xq-82-1] ; top/left + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 + psrldq m1, m0, 4 ; top/right + punpcklwd m1, m2 + psrldq m2, m0, 2 ; top + punpcklwd m0, m2 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 +%if ARCH_X86_32 + imul val3d, r3mp +%else + imul val3d, cf3d +%endif + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 +%if ARCH_X86_32 + add r1mp, 82<<%3 + dec r0mp +%else + add bufyq, 82<<%3 + dec hd +%endif + jg .y_loop_ar1 + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp + ALLOC_STACK -8*16 +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + movd m7, [base+round_vals-12+shiftq*2] + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 + pxor m2, m2 + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + pinsrw m1, [base+pw_1], 5 + punpcklwd m7, m7 + pshufd m7, m7, q0000 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + SCRATCH 7, 15, 7 +%if %2 + movd m7, [base+hmul_bits+2+%3*2] + movd m6, [base+pb_1] + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pxor m2, m2 + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m0, m1 + psrldq m3, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + pmaddwd m4, m9 + pmaddwd m0, m10 + pmaddwd m3, m12 + paddd m4, m0 + paddd m2, m3 + paddd m2, m4 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m6, m1 +%if %3 + pmaddubsw m1, m6, m3 + paddw m0, m1 +%endif + pmulhrsw m0, m7 +%else + movd m0, [bufyq+xq] + pxor m1, m1 + pcmpgtb 
m1, m0 + punpcklbw m0, m1 +%endif + punpcklwd m0, m15 + pmaddwd m0, m14 + paddd m2, m0 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] + pxor m4, m4 + movd m5, [base+byte_blend+1] + punpcklbw m5, m5 +.x_loop_ar2_inner: + pcmpgtb m1, m4, m0 + punpcklbw m0, m1 + pmaddwd m3, m0, m13 + paddd m3, m2 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + pslldq m3, 4 + pand m3, m5 + paddw m0, m3 + packsswb m0, m0 + movd [bufq+xq-2], m0 + psrldq m0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 + ALLOC_STACK -15*16 +%else + SUB rsp, 16*7 +%assign stack_size_padded (stack_size_padded+16*7) +%assign stack_size (stack_size+16*7) +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + pshufd m2, m0, q1111 + pshufd m3, m0, q2222 + pshufd m4, m0, q3333 + pshufd m0, m0, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m7, m1, q3333 + pshufd m1, m1, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m2 + mova [rsp+ 2*16], m3 + mova [rsp+ 3*16], m4 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m5 + mova [rsp+ 6*16], m6 + SCRATCH 7, 8, 7 + + movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] + pxor m4, m4 + pcmpgtb m4, m2 + punpckhbw m5, m2, m4 + punpcklbw m2, m4 + pshufd m4, m2, q3232 + punpcklwd m3, m4, m5 + pshuflw m5, m4, q3321 + pshufd m4, m3, q0000 + pshufd m3, m2, q1111 + pshufd m2, m2, q0000 + pinsrw m5, [base+round_vals+shiftq*2-10], 3 + SCRATCH 2, 9, 8 + SCRATCH 3, 10, 9 + SCRATCH 4, 11, 10 + SCRATCH 5, 12, 11 + + movd m2, [base+round_vals-12+shiftq*2] +%if %2 + movd m1, [base+pb_1] + movd m3, [base+hmul_bits+2+%3*2] +%endif + pxor m0, m0 + punpcklwd m2, m0 +%if %2 + punpcklwd m3, m3 +%endif + pshufd m2, m2, q0000 +%if %2 + pshufd m1, m1, q0000 + pshufd m3, m3, q0000 + SCRATCH 1, 13, 12 +%endif + SCRATCH 2, 14, 13 +%if %2 + SCRATCH 3, 15, 14 +%endif + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m4, m4 + pcmpgtb m4, m0 + punpckhbw m3, m0, m4 + punpcklbw m0, m4 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + palignr m2, m3, m0, 10 + palignr m3, m0, 12 + psrldq m0, 8 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m6, m6 + pcmpgtb m6, m1 + punpckhbw m5, m1, m6 + punpcklbw m1, m6 + + punpcklwd m0, m2 + punpcklwd m3, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m3, [rsp+ 3*16] + paddd m0, m3 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m6, m2, m7 + punpcklbw m2, m7 + + palignr m3, m5, m1, 10 + palignr m5, m1, 12 + psrldq m4, m2, 2 + + punpcklwd m3, m5 + punpcklwd m5, m2, m4 + pmaddwd m3, [rsp+ 6*16] + pmaddwd m5, m8 + paddd m3, m5 + paddd m0, m3 + + psrldq m3, m1, 2 + psrldq m4, m1, 4 + psrldq m5, m1, 6 + psrldq m1, 8 + + punpcklwd m3, m4 + punpcklwd m5, m1 + 
pmaddwd m3, [rsp+ 4*16] + pmaddwd m5, [rsp+ 5*16] + paddd m3, m5 + paddd m0, m3 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m7, m13, m1 +%if %3 + pmaddubsw m5, m13, m3 + paddw m7, m5 +%endif + pmulhrsw m7, m15 +%else + movd m7, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m7 + punpcklbw m7, m1 +%endif + + psrldq m1, m2, 4 + psrldq m3, m2, 6 + palignr m4, m6, m2, 10 + palignr m6, m2, 12 + psrldq m2, 8 + + punpcklwd m1, m3 + punpcklwd m2, m4 + punpcklwd m6, m7 + pmaddwd m1, m9 + pmaddwd m2, m10 + pmaddwd m6, m11 + paddd m1, m2 + paddd m0, m6 + paddd m0, m1 + paddd m0, m14 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] + pxor m4, m4 + movd m5, [base+byte_blend] +.x_loop_ar3_inner: + pcmpgtb m2, m4, m1 + punpcklbw m3, m1, m2 + pmaddwd m2, m3, m12 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + packsswb m2, m2 + pandn m3, m5, m1 + pslld m2, 24 + pand m2, m5 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 6 +%define %%tmp %6 +%endif +%rep 4 +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4] +%else + pinsrw %1, [%3+%4], %%idx + 0 +%endif + pinsrw %1, [%3+%5], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +INIT_XMM ssse3 +; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + + mov [rsp+5*mmsize+ 4*gprsize], r0 + mov [rsp+5*mmsize+ 6*gprsize], r1 + mov [rsp+5*mmsize+ 8*gprsize], r2 + mov [rsp+5*mmsize+10*gprsize], r3 + mov [rsp+5*mmsize+11*gprsize], r4 + mov [rsp+5*mmsize+12*gprsize], r5 +%else +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+5*mmsize+ 4*gprsize] +%define r1m [rsp+5*mmsize+ 5*gprsize] +%define r2m [rsp+5*mmsize+ 6*gprsize] +%define r3m [rsp+5*mmsize+ 7*gprsize] +%define r4m [rsp+5*mmsize+ 8*gprsize] +%define r5m [rsp+5*mmsize+ 9*gprsize] +%define r6m [rsp+5*mmsize+10*gprsize] +%define r7m [rsp+5*mmsize+11*gprsize] +%define r8m [rsp+5*mmsize+12*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, picptrq +%else +cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r7, [pb_mask] +%define base r7-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + movd m4, 
[base+max+r6*4] + movd m5, [base+min+r6*2] + punpcklwd m3, m3 + punpcklwd m4, m4 + punpcklwd m5, m5 + pshufd m3, m3, q0000 + pshufd m4, m4, q0000 + pshufd m5, m5, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz .no_vertical_overlap + mova m6, [base+pw_1024] + mova m7, [base+pb_27_17_17_27] + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 + test sbyd, sbyd + jnz .vertical_overlap + ; fall-through + +.no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused +%endif + +.loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .next_blk + + add offxyd, 16 + test dword r8m, 2 ; r8m & 2 = have_top_overlap + jz .loop_x_odd + +%if ARCH_X86_32 + add dword 
[rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jnz .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 + + add offxyd, 16 ; left_offxyd + mov [rsp+5*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+0*gprsize] + movd m7, [grain_lutq+r5] +%else + movd m7, [grain_lutq+left_offxyq] +%endif + punpcklbw m7, m3 + pmaddubsw m6, m15, m7 + pmulhrsw m6, m14 + packsswb m6, m6 + shufps m6, m3, q3210 + pcmpgtb m2, m6 + punpcklbw m7, m6, m2 + punpckhbw m6, m2 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m7, m4 + pmullw m6, m5 + pmulhrsw m7, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m7 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 + + ; since this half-block had left-overlap, the next does not + test dword r8m, 2 ; have_top_overlap + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 
+ imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed, + ; because of the 'and tmpd, 0x00ff00ff' above + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+5*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 +%else + mova m8, [pb_27_17] +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] + movu m7, [grain_lutq+r5] +%else + movu m7, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m7, m3 + punpcklbw m7, m3 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m3, [r5], m6 + pmaddubsw m6, [r5], m7 +%else + pmaddubsw m3, m8, m6 + pmaddubsw m6, m8, m7 +%endif + pmulhrsw m3, m14 + pmulhrsw m6, m14 + packsswb m6, m3 + pcmpgtb m7, m2, m6 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m6, m5 + pmulhrsw m2, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines 
get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 + + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak + + mov r5, [rsp+5*mmsize+1*gprsize] + mov r4, offxyd + add r5, 16 + add r4, 16 + mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy + mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak + + xor tmpd, tmpd + mov seed, r3m +%else + mova m8, [pb_27_17] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 + + ; we assume from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut + + movzx r5, offxyw ; top_offxy + mov [rsp+5*mmsize+1*gprsize], r5 +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw +%endif + shr offxyd, 16 + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy + mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy + movu m6, [grain_lutq+r5] + mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy + movd m4, [grain_lutq+r0] + movd m7, [grain_lutq+r5] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] + movd m7, [grain_lutq+topleft_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m4, m3 + punpcklbw m7, m6 + pmaddubsw m2, m15, m4 + pmaddubsw m4, m15, m7 + pmulhrsw m2, m14 + pmulhrsw m4, m14 + packsswb m2, m2 + packsswb m4, m4 + shufps m2, m3, q3210 + shufps m4, m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m3, m4, m2 + punpckhbw m4, m2 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m7, [r5], m4 + pmaddubsw m4, [r5], m3 +%else + pmaddubsw m7, m8, m4 + pmaddubsw m4, m8, m3 +%endif + pmulhrsw m7, m14 + pmulhrsw m4, m14 + packsswb m4, m7 + pxor m2, m2 + pcmpgtb m7, m2, m4 + punpcklbw m3, m4, m7 + punpckhbw m4, m7 + + ; src + mova m0, [srcq] + punpckhbw m1, m0, m2 + punpcklbw 
m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m5, m0, scalingq-1, r0, r5, m7 + vpgatherdw m6, m1, scalingq-1, r0, r5, m7 +%else + vpgatherdw m5, m0, scalingq-1, r13, r14, m7 + vpgatherdw m6, m1, scalingq-1, r13, r14, m7 +%endif + REPX {psrlw x, 8}, m5, m6 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m3, m5 + pmullw m4, m6 + pmulhrsw m3, m11 + pmulhrsw m4, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, +; sby, luma, lstride, uv_pl, is_id) +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov [rsp+7*mmsize+3*gprsize], r0 + mov [rsp+7*mmsize+5*gprsize], r1 + mov [rsp+7*mmsize+7*gprsize], r2 + mov [rsp+7*mmsize+9*gprsize], r3 + mov [rsp+7*mmsize+10*gprsize], r4 + + mov r0, r8m + mov r1, r9m + mov r2, r10m + mov r4, r11m + mov r3, r12m + mov [rsp+7*mmsize+11*gprsize], r0 + mov [rsp+7*mmsize+12*gprsize], r1 + mov [rsp+7*mmsize+13*gprsize], r2 + mov [rsp+7*mmsize+14*gprsize], r4 +%else +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+7*mmsize+ 3*gprsize] +%define r1m [rsp+7*mmsize+ 4*gprsize] +%define r2m [rsp+7*mmsize+ 5*gprsize] +%define r3m [rsp+7*mmsize+ 6*gprsize] +%define r4m [rsp+7*mmsize+ 7*gprsize] +%define r5m [rsp+7*mmsize+ 8*gprsize] +%define r6m [rsp+7*mmsize+ 9*gprsize] +%define r7m [rsp+7*mmsize+10*gprsize] +%define r8m [rsp+7*mmsize+11*gprsize] +%define r9m [rsp+7*mmsize+12*gprsize] +%define r10m [rsp+7*mmsize+13*gprsize] +%define r11m [rsp+7*mmsize+14*gprsize] +%define r12m [rsp+7*mmsize+15*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, r5 +%else +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, tmp, sby, luma, lstride, uv_pl, is_id + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + lea tmpd, [r6d*2] +%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize + test r3, r3 +%else + cmp dword r12m, 0 ; is_idm +%endif + movd m5, [base+min+r6*2] + cmovne r6d, tmpd + movd m4, [base+max+r6*2] + punpcklwd m3, m3 + punpcklwd m5, m5 + punpcklwd m4, m4 + pshufd m3, m3, 
q0000 + pshufd m5, m5, q0000 + pshufd m4, m4, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + +%if %1 + mov r6d, dword r11m + movd m0, [fg_dataq+FGData.uv_mult+r6*4] + movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklbw m6, m1, m0 + movd m7, [fg_dataq+FGData.uv_offset+r6*4] + punpcklwd m6, m6 + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz %%no_vertical_overlap +%if ARCH_X86_32 +%if %2 + mova m1, [base+pb_23_22_h] +%else + mova m1, [base+pb_27_17_17_27] +%endif + mova m0, [base+pw_1024] +%else +%if %2 + mova m1, [pb_23_22_h] +%else + mova m1, [pb_27_17_17_27] +%endif + mova m0, [pw_1024] +%endif + SCRATCH 0, 8, 5 + SCRATCH 1, 9, 6 + test sbyd, sbyd + jnz %%vertical_overlap + ; fall-through + +%%no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak +%define luma_bakq lumaq + + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride, luma_bak +%endif + +%%loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if 
%1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq+ 0] + pcmpgtb m6, m2, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; we already incremented lumaq above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + ; adjust top_offxy +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jc %%loop_x_even + test dword r8m, 2 + jz %%loop_x_odd + jmp %%loop_x_odd_v_overlap +%%loop_x_even: +%endif + test dword r8m, 1 + jz %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 +%if %2 + lea r6, [offxyd+16] + mov [rsp+7*mmsize+0*gprsize], r6 +%else + mov [rsp+7*mmsize+0*gprsize], offxyd +%endif + + DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + +%if %2 + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%else + mov left_offxyd, offyd +%endif +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src 
+%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq+ 0] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+0*gprsize] + movd m2, [grain_lutq+r0+ 0] +%else + movd m2, [grain_lutq+left_offxyq+ 0] +%endif + punpcklbw m2, m4 + pmaddubsw m3, m9, m2 + pmulhrsw m3, m8 + packsswb m3, m3 + shufps m3, m4, q3210 + pxor m4, m4 + pcmpgtb m4, m3 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + xor dword r8m, 4 + ; adjust top_offxyd +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 +%endif + + ; r8m = sbym + test dword r8m, 2 +%if %2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + jne %%loop_x_odd_v_overlap + jmp %%loop_x_odd +%endif + +%%end: + RET + +%%vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if 
ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor tmpd, tmpd +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+7*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%endif + +%%loop_x_odd_v_overlap: + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %3 + mova m1, [PIC_ptr(pb_23_22)] +%else + mova m1, [PIC_ptr(pb_27_17)] +%endif +%%loop_y_v_overlap: +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+1*gprsize] + movu m4, [grain_lutq+r0] +%else + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, 
m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, m1, m6 + pmaddubsw m3, m1, m4 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + packsswb m3, m2 + pxor m6, m6 + pcmpgtb m6, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; unpack chroma_source + pxor m4, m4 + punpckhbw m6, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m6, m3 + pmaxsw m0, m13 + pmaxsw m6, m13 + pminsw m0, m12 + pminsw m6, m12 + packuswb m0, m6 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + dec hw + je %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 +%if %3 == 0 + btc hd, 16 +%if ARCH_X86_32 + mov r5, r5m +%endif + mova m1, [PIC_ptr(pb_17_27)] + jnc %%loop_y_v_overlap +%endif + jmp %%loop_y + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif + +%if %2 + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jnc %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused + + mov r6, [rsp+7*mmsize+1*gprsize] +%if %2 + lea r0, [r3d+16] + add r6, 16 + mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy +%else + mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy +%endif + mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused + + mov seed, r3m + xor tmpd, tmpd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + +%if %2 + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offxyq+16] +%else + mov topleft_offxyq, top_offxyq + mov left_offxyq, offxyq +%endif + + ; we assume from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, 
offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+7*mmsize+1*gprsize], top_offxyd +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %3 + mova m3, [PIC_ptr(pb_23_22)] +%else + mova m3, [PIC_ptr(pb_27_17)] +%endif +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy + movd m1, [grain_lutq+r0] + mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy +%else + movd m1, [grain_lutq+topleft_offxyq] +%endif + movu m2, [grain_lutq+offxyq] +%if ARCH_X86_32 + movu m6, [grain_lutq+r5] + movd m4, [grain_lutq+r0] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m1, m6 + punpcklbw m4, m2 + pmaddubsw m0, m9, m1 + pmaddubsw m1, m9, m4 + REPX {pmulhrsw x, m8}, m0, m1 + packsswb m0, m1 + shufps m4, m0, m2, q3232 + shufps m0, m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m2, m0, m4 + punpckhbw m0, m4 + pmaddubsw m4, m3, m0 + pmaddubsw m1, m3, m2 + pmulhrsw m4, m8 + pmulhrsw m1, m8 + packsswb m1, m4 + + ; src +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else +%if %3 + vpgatherdw m7, m4, scalingq-1, r2, r12 + vpgatherdw m5, m6, scalingq-1, r2, r12 +%else + vpgatherdw m7, m4, scalingq-1, r2, r13 + vpgatherdw m5, m6, scalingq-1, r2, r13 +%endif +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack grain + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + punpckhbw m1, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m1, m5 + pmulhrsw m2, m11 + pmulhrsw m1, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; unpack chroma source + pxor m4, m4 + punpckhbw m5, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m5, m1 + pmaxsw m0, m13 + pmaxsw m5, m13 + pminsw m0, m12 + pminsw m5, m12 + packuswb m0, m5 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has been adjusted above already +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*(1+%2)] +%else + add lumaq, r10mp +%endif +%endif + add grain_lutq, 82 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + mova m3, [PIC_ptr(pb_17_27)] + btc hd, 16 + jnc %%loop_y_hv_overlap 
+%if ARCH_X86_64 + mov lstrideq, r10mp +%endif + jmp %%loop_y_h_overlap +%%end_y_hv_overlap: +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif +%endif + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + xor dword r8m, 4 + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 422, 1, 0 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 444, 0, 0 diff -Nru dav1d-0.9.2/src/x86/ipred16_avx2.asm dav1d-1.0.0/src/x86/ipred16_avx2.asm --- dav1d-0.9.2/src/x86/ipred16_avx2.asm 2021-09-03 15:51:24.417037000 +0000 +++ dav1d-1.0.0/src/x86/ipred16_avx2.asm 2022-03-18 14:31:56.010356000 +0000 @@ -26,7 +26,7 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 %macro SMOOTH_WEIGHTS 1-* const smooth_weights_1d_16bpc ; sm_weights[] << 7 @@ -134,14 +134,6 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - INIT_YMM avx2 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h movifnidn hd, hm diff -Nru dav1d-0.9.2/src/x86/ipred16_avx512.asm dav1d-1.0.0/src/x86/ipred16_avx512.asm --- dav1d-0.9.2/src/x86/ipred16_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/ipred16_avx512.asm 2022-03-18 14:31:56.010356000 +0000 @@ -0,0 +1,833 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
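[Editorial note, not part of the upstream patch] Every fgy/fguv loop in the film-grain code above boils down to the per-pixel operation named in its inline comments: noise = round2(scaling[src] * grain, scaling_shift), then dst = clip_pixel(src, noise). The following is a minimal scalar sketch in C of that operation for 8-bit pixels; the helper names (round2, apply_grain_pixel) and the exact parameter list are illustrative assumptions, not dav1d's actual C reference code.

#include <stdint.h>

/* Rounded right shift, i.e. the round2() referred to in the asm comments. */
static inline int round2(const int x, const int shift) {
    return (x + (1 << (shift - 1))) >> shift;
}

/* One pixel of grain blending: scale the signed grain sample by the
 * per-intensity scaling LUT, round, add to the source pixel and clamp.
 * clip_min/clip_max stand for the min/max values selected according to
 * clip_to_restricted_range in the function prologues above. */
static inline uint8_t apply_grain_pixel(const uint8_t src, const int8_t grain,
                                        const uint8_t scaling[256],
                                        const int scaling_shift,
                                        const int clip_min, const int clip_max)
{
    const int noise = round2(scaling[src] * grain, scaling_shift);
    int px = src + noise;
    if (px < clip_min) px = clip_min;
    if (px > clip_max) px = clip_max;
    return (uint8_t)px;
}

The SIMD versions do the same work 16 pixels at a time: the prologues broadcast the rounding multiplier and the clip bounds into m11-m13 once per call, and the vpgatherdw macro emulates a scaling-LUT gather, which is why no per-pixel table lookup appears inside the inner loops.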
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3 + db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11 + db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7 + db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15 +smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 + db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 +pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 + db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 + db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 + db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 + times 4 db 10, 11, 12, 13, 2, 3, -1, -1 +filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 + times 4 db 26, 27, 28, 29, 14, 15, -1, -1 +filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9 +pw_1: times 2 dw 1 + dd 10 +filter_rnd: dd 32 + dd 1 + dd 8 + dd 11 +filter_shift: times 2 dw 6 + dd 0 + times 2 dw 4 + dd 9 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 + +cextern smooth_weights_1d_16bpc +cextern smooth_weights_2d_16bpc +cextern filter_intra_taps + +SECTION .text + +%macro PAETH 3 ; top, signed_ldiff, ldiff + paddw m0, m%2, m2 + psubw m1, m0, m3 ; tldiff + psubw m0, m%1 ; tdiff + pabsw m1, m1 + pabsw m0, m0 + pcmpgtw k1, m0, m1 + pminsw m0, m1 + pcmpgtw k2, m%3, m0 + vpblendmw m0{k1}, m%1, m3 + vpblendmw m0{k2}, m2, m0 +%endmacro + +INIT_ZMM avx512icl +cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h +%define base r6-ipred_paeth_16bpc_avx512icl_table + lea r6, [ipred_paeth_16bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastw m3, [tlq] ; topleft + add wq, r6 + jmp wq +.w4: + vpbroadcastq m4, [tlq+2] ; top + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] + psubw m5, m4, m3 + pabsw m6, m5 +.w4_loop: + sub tlq, 16 + vbroadcasti32x4 m2, [tlq] + pshufb m2, m7 ; left + PAETH 4, 5, 6 + vextracti32x4 xmm1, m0, 2 + vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+r6 ], xmm3 + sub hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+r6 ], xmm3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.w8: + vbroadcasti32x4 m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] + psubw m5, m4, m3 + pabsw m6, m5 +.w8_loop: + sub tlq, 8 + vpbroadcastq m2, [tlq] + pshufb m2, m7 + PAETH 4, 5, 6 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + vbroadcasti32x8 
m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + psubw m5, m4, m3 + pabsw m6, m5 +.w16_loop: + sub tlq, 4 + vpbroadcastd m2, [tlq] + pshufb m2, m7 + PAETH 4, 5, 6 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + movu m4, [tlq+2] + psubw m5, m4, m3 + pabsw m6, m5 +.w32_loop: + sub tlq, 2 + vpbroadcastw m2, [tlq] + PAETH 4, 5, 6 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +.w64: + movu m4, [tlq+ 2] + movu m7, [tlq+66] + psubw m5, m4, m3 + psubw m8, m7, m3 + pabsw m6, m5 + pabsw m9, m8 +.w64_loop: + sub tlq, 2 + vpbroadcastw m2, [tlq] + PAETH 4, 5, 6 + mova [dstq+64*0], m0 + PAETH 7, 8, 9 + mova [dstq+64*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 +%define base r6-$$ + lea r6, [$$] + tzcnt wd, wm + mov hd, hm + movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4] + lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] + neg hq + vpbroadcastw m6, [tlq+hq*2] ; bottom + lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq] + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastq m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w4_loop: + vbroadcasti32x4 m3, [weightsq+hq*2] + pshufb m3, m4 + pmulhrsw m3, m5 + paddw m3, m6 + vextracti32x4 xmm0, m3, 3 + vextracti32x4 xmm1, ym3, 1 + vextracti32x4 xmm2, m3, 2 + movhps [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xm3 + add hq, 8 + jg .end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + jl .w4_loop +.end: + RET +.w8: + vbroadcasti32x4 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w8_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m4 + pmulhrsw m0, m5 + paddw m0, m6 + vextracti32x4 [dstq+strideq*0], m0, 3 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +.w16: + vbroadcasti32x8 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w16_loop: + vpbroadcastd m0, [weightsq+hq*2+0] + vpbroadcastd m1, [weightsq+hq*2+4] + pshufb m0, m4 + pshufb m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m6 + paddw m1, m6 + vextracti32x8 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], ym0 + vextracti32x8 [dstq+strideq*2], m1, 1 + mova [dstq+stride3q ], ym1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + movu m5, [tlq+2] + psubw m5, m6 +.w32_loop: + vpbroadcastw m0, [weightsq+hq*2+0] + vpbroadcastw m1, [weightsq+hq*2+2] + vpbroadcastw m2, [weightsq+hq*2+4] + vpbroadcastw m3, [weightsq+hq*2+6] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w32_loop + RET +.w64: + movu m4, [tlq+ 2] + movu m5, [tlq+66] + psubw m4, m6 + psubw m5, m6 +.w64_loop: + vpbroadcastw m1, [weightsq+hq*2+0] + vpbroadcastw m3, [weightsq+hq*2+2] + pmulhrsw m0, m4, m1 + pmulhrsw m1, m5 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m5 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + mova [dstq+strideq*1+64*0], m2 + mova 
[dstq+strideq*1+64*1], m3 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w64_loop + RET + +cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 + lea r6, [$$] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m6, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4] + sub tlq, hq + lea stride3q, [strideq*3] + lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq] + jmp wq +.w4: + movsldup m4, [base+ipred_shuf] + vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2] +.w4_loop: + vbroadcasti32x4 m0, [tlq+hq-16] ; left + pshufb m0, m4 + psubw m0, m6 ; left - right + pmulhrsw m0, m5 + paddw m0, m6 + vextracti32x4 xmm1, m0, 2 + vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + sub hd, 8*2 + jl .end + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.end: + RET +.w8: + movsldup m4, [base+ipred_shuf] + vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2] +.w8_loop: + vpbroadcastq m0, [tlq+hq-8] ; left + pshufb m0, m4 + psubw m0, m6 ; left - right + pmulhrsw m0, m5 + paddw m0, m6 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w8_loop + RET +.w16: + movsldup m4, [base+ipred_shuf] + vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastd m1, [tlq+hq-8] + pshufb m0, m4 + pshufb m1, m4 + psubw m0, m6 + psubw m1, m6 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m6 + paddw m1, m6 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w16_loop + RET +.w32: + movu m5, [base+smooth_weights_1d_16bpc+32*2] +.w32_loop: + vpbroadcastq m3, [tlq+hq-8] + punpcklwd m3, m3 + psubw m3, m6 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w32_loop + RET +.w64: + movu m4, [base+smooth_weights_1d_16bpc+64*2] + movu m5, [base+smooth_weights_1d_16bpc+64*3] +.w64_loop: + vpbroadcastw m1, [tlq+hq-2] + vpbroadcastw m3, [tlq+hq-4] + psubw m1, m6 + psubw m3, m6 + pmulhrsw m0, m4, m1 + pmulhrsw m1, m5 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m5 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + mova [dstq+strideq*1+64*0], m2 + mova [dstq+strideq*1+64*1], m3 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w64_loop + RET + +cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 + lea r6, [$$] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m13, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4] + mov r5d, 0x55555555 + sub tlq, hq + mova m14, [base+smooth_perm] + kmovd k1, r5d + vpbroadcastw m0, [tlq] ; bottom + mov r5, 0x3333333333333333 + pxor m15, m15 + lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq] + kmovq k2, r5 + lea v_weightsq, 
[base+smooth_weights_2d_16bpc+hq*2] + jmp wq +.w4: + vpbroadcastq m5, [tlq+hq+2] + movshdup m3, [base+ipred_shuf] + movsldup m4, [base+ipred_shuf] + vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4] + lea stride3q, [strideq*3] + punpcklwd m5, m0 ; top, bottom +.w4_loop: + vbroadcasti32x4 m0, [v_weightsq] + vpbroadcastq m2, [tlq+hq-8] + mova m1, m13 + pshufb m0, m3 + pmaddwd m0, m5 + pshufb m1{k2}, m2, m4 ; left, right + vpdpwssd m0, m1, m6 + vpermb m0, m14, m0 + pavgw ym0, ym15 + vextracti32x4 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + add v_weightsq, 4*4 + sub hd, 4*2 + jg .w4_loop + RET +.w8: + vbroadcasti32x4 ym5, [tlq+hq+2] + movshdup m6, [base+ipred_shuf] + movsldup m7, [base+ipred_shuf] + pmovzxwd m5, ym5 + vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4] + lea stride3q, [strideq*3] + vpblendmw m5{k1}, m0, m5 ; top, bottom +.w8_loop: + vpbroadcastq m0, [v_weightsq+0] + vpbroadcastq m1, [v_weightsq+8] + vpbroadcastd m3, [tlq+hq-4] + vpbroadcastd m4, [tlq+hq-8] + pshufb m0, m6 + pmaddwd m0, m5 + pshufb m1, m6 + pmaddwd m1, m5 + mova m2, m13 + pshufb m2{k2}, m3, m7 ; left, right + mova m3, m13 + pshufb m3{k2}, m4, m7 + vpdpwssd m0, m2, m8 + vpdpwssd m1, m3, m8 + add v_weightsq, 4*4 + vpermt2b m0, m14, m1 + pavgw m0, m15 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w8_loop + RET +.w16: + pmovzxwd m5, [tlq+hq+2] + mova m6, [base+smooth_weights_2d_16bpc+16*4] + vpblendmw m5{k1}, m0, m5 ; top, bottom +.w16_loop: + vpbroadcastd m0, [v_weightsq+0] + vpbroadcastd m1, [v_weightsq+4] + pmaddwd m0, m5 + pmaddwd m1, m5 + mova m2, m13 + vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right + mova m3, m13 + vpbroadcastw m3{k1}, [tlq+hq-4] + vpdpwssd m0, m2, m6 + vpdpwssd m1, m3, m6 + add v_weightsq, 2*4 + vpermt2b m0, m14, m1 + pavgw m0, m15 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w16_loop + RET +.w32: + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + mova m7, [base+smooth_weights_2d_16bpc+32*4] + mova m8, [base+smooth_weights_2d_16bpc+32*6] + vpblendmw m5{k1}, m0, m5 ; top, bottom + vpblendmw m6{k1}, m0, m6 +.w32_loop: + vpbroadcastd m2, [v_weightsq+0] + vpbroadcastd m3, [v_weightsq+4] + pmaddwd m0, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right + vpdpwssd m0, m4, m7 + vpdpwssd m2, m4, m8 + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-4] + vpdpwssd m1, m4, m7 + vpdpwssd m3, m4, m8 + add v_weightsq, 2*4 + vpermt2b m0, m14, m2 + vpermt2b m1, m14, m3 + pavgw m0, m15 + pavgw m1, m15 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w32_loop + RET +.w64: + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + pmovzxwd m7, [tlq+hq+66] + pmovzxwd m8, [tlq+hq+98] + mova m9, [base+smooth_weights_2d_16bpc+64*4] + vpblendmw m5{k1}, m0, m5 ; top, bottom + mova m10, [base+smooth_weights_2d_16bpc+64*5] + vpblendmw m6{k1}, m0, m6 + mova m11, [base+smooth_weights_2d_16bpc+64*6] + vpblendmw m7{k1}, m0, m7 + mova m12, [base+smooth_weights_2d_16bpc+64*7] + vpblendmw m8{k1}, m0, m8 +.w64_loop: + vpbroadcastd m3, [v_weightsq] + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right + pmaddwd m0, m5, m3 + pmaddwd m2, m6, m3 + 
pmaddwd m1, m7, m3 + pmaddwd m3, m8 + vpdpwssd m0, m4, m9 + vpdpwssd m2, m4, m10 + vpdpwssd m1, m4, m11 + vpdpwssd m3, m4, m12 + add v_weightsq, 1*4 + vpermt2b m0, m14, m2 + vpermt2b m1, m14, m3 + pavgw m0, m15 + pavgw m1, m15 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + sub hd, 1*2 + jg .w64_loop + RET + +cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 + lea r6, [pal_pred_16bpc_avx512icl_table] + tzcnt wd, wm + mova m2, [pal_pred_perm] + movsxd wq, [r6+wq*4] + mova xm3, [palq] + movifnidn hd, hm + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + pmovzxbw ym0, [idxq] + add idxq, 16 + vpermw ym0, ym0, ym3 + vextracti32x4 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xmm1 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + pmovzxbw m0, [idxq] + add idxq, 32 + vpermw m0, m0, m3 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w64 + RET + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. 
+; w4 w8 w16 w32 +; 1 1 2 1 2 5 6 1 2 5 6 9 a d e +; 2 2 3 2 3 6 7 2 3 6 7 a b e f +; 3 3 4 3 4 7 8 3 4 7 8 b c f g +; 4 4 5 4 5 8 9 4 5 8 9 c d g h + +cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top +%define base r6-$$ + lea r6, [$$] +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + movifnidn hd, hm + movu xm0, [tlq-6] + pmovsxbw m7, [base+filter_intra_taps+filterq+32*0] + pmovsxbw m8, [base+filter_intra_taps+filterq+32*1] + mov r5d, r8m ; bitdepth_max + movsldup m9, [base+filter_permA] + movshdup m10, [base+filter_permA] + shr r5d, 11 ; is_12bpc + jnz .12bpc + psllw m7, 2 ; upshift multipliers so that packusdw + psllw m8, 2 ; will perform clipping for free +.12bpc: + vpbroadcastd m5, [base+filter_rnd+r5*8] + vpbroadcastd m6, [base+filter_shift+r5*8] + sub wd, 8 + jl .w4 +.w8: + call .main4 + movsldup m11, [filter_permB] + lea r5d, [hq*2+2] + movshdup m12, [filter_permB] + lea topq, [tlq+2] + mova m13, [filter_permC] + sub hd, 4 + vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1 + sub tlq, r5 +%if WIN64 + push r7 + push r8 +%endif + mov r7, dstq + mov r8d, hd +.w8_loop: + movlps xm4, xm0, [tlq+hq*2] + call .main8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jge .w8_loop + test wd, wd + jz .end + mov r2d, 0x0d + kmovb k1, r2d + lea r2, [strideq*3] +.w16: + movd xmm0, [r7+strideq*1+12] + vpblendd xmm0, [topq+8], 0x0e ; t1 t2 + pinsrw xm4, xmm0, [r7+strideq*0+14], 2 + call .main8 + add r7, 16 + vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3 + mov hd, r8d + mov dstq, r7 + add topq, 16 +.w16_loop: + movd xmm1, [dstq+strideq*2-4] + punpcklwd xm4, xmm1, xmm0 + movd xmm0, [dstq+r2-4] + shufps xm4{k1}, xmm0, xm0, q3210 + call .main8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jge .w16_loop + sub wd, 8 + jg .w16 +.end: + vpermb m2, m11, m0 + mova ym1, ym5 + vpdpwssd m1, m2, m7 + vpermb m2, m12, m0 + vpdpwssd m1, m2, m8 +%if WIN64 + pop r8 + pop r7 +%endif + vextracti32x8 ym2, m1, 1 + paddd ym1, ym2 + packusdw ym1, ym1 + vpsrlvw ym1, ym6 + vpermt2q m0, m13, m1 + vextracti32x4 [dstq+strideq*0], m0, 2 + vextracti32x4 [dstq+strideq*1], ym0, 1 + RET +.w4_loop: + movlps xm0, [tlq-10] + lea dstq, [dstq+strideq*2] + sub tlq, 4 +.w4: + call .main4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.main4: + vpermb m2, m9, m0 + mova ym1, ym5 + vpdpwssd m1, m2, m7 + vpermb m0, m10, m0 + vpdpwssd m1, m0, m8 + vextracti32x8 ym0, m1, 1 + paddd ym0, ym1 + vextracti32x4 xm1, ym0, 1 + packusdw xm0, xm1 ; clip + vpsrlvw xm0, xm6 + ret +ALIGN function_align +.main8: + vpermb m3, m11, m0 + mova ym2, ym5 + vpdpwssd m2, m3, m7 + vpermb m3, m9, m4 + mova ym1, ym5 + vpdpwssd m1, m3, m7 + vpermb m3, m12, m0 + vpdpwssd m2, m3, m8 + vpermb m3, m10, m4 + vpdpwssd m1, m3, m8 + vextracti32x8 ym4, m2, 1 + vextracti32x8 ym3, m1, 1 + paddd ym2, ym4 + paddd ym1, ym3 + packusdw ym1, ym2 ; clip + vpsrlvw ym1, ym6 + vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1 + vextracti32x4 [dstq+strideq*0], m0, 2 + vextracti32x4 [dstq+strideq*1], ym0, 1 + ret + +%endif diff -Nru dav1d-0.9.2/src/x86/ipred16_sse.asm dav1d-1.0.0/src/x86/ipred16_sse.asm --- dav1d-0.9.2/src/x86/ipred16_sse.asm 2021-09-03 15:51:24.417037000 +0000 +++ dav1d-1.0.0/src/x86/ipred16_sse.asm 2022-03-18 14:31:56.010356000 +0000 @@ -70,14 +70,6 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - INIT_XMM ssse3 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, 
ipred_dc_left_16bpc_ssse3_table diff -Nru dav1d-0.9.2/src/x86/ipred_avx512.asm dav1d-1.0.0/src/x86/ipred_avx512.asm --- dav1d-0.9.2/src/x86/ipred_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/ipred_avx512.asm 2022-03-18 14:31:56.014356000 +0000 @@ -0,0 +1,1432 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ +filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 + db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 + db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 + db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 + db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 + db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 + db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 + db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 + db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 + db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 + db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 + db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 + db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 + db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 + db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 + db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 + db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 
14, 0, 0, 0, 0, 14 + db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12 + db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 + db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 +filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31 + db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 + db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 + db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 +filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 +smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 + db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 + db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 + db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 +smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 + db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 +smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 + db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 + db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 + db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 +ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 + db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 + +pb_127_m127: times 2 db 127, -127 +pb_128: times 4 db 128 +pw_128: times 2 dw 128 +pw_255: times 2 dw 255 + +%define pb_1 (ipred_h_shuf+24) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+16) +%define pd_8 (filter_taps+128) + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) + +JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 +JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64 + +SECTION .text + +INIT_ZMM avx512icl +cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h + lea r5, [ipred_dc_left_8bpc_avx512icl_table] + movd xm0, wm + tzcnt wd, wm + inc tlq + movifnidn hd, hm + movu ym1, [tlq] + movd xmm3, wd + movsxd r6, [r5+wq*4] + vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + vpdpbusd ym0, ym1, ym2 + add r6, r5 + add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_left_8bpc_avx512icl_table] + mov hd, hm + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movd xm0, hm + movu ym1, [tlq] + movd xmm3, r6d + movsxd r6, [r5+r6*4] + vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + vpdpbusd ym0, ym1, ym2 + add r6, r5 + add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: 
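+; DC sum reduction shared by dc_left and dc_top: ym0 already holds the rounding
+; bias (h/2 here, or w/2 when entered from dc_top) plus per-dword byte sums from
+; vpdpbusd. The .h64-.h4 chain folds those sums into the low dword, .h4 shifts
+; by the log2(h) (log2(w) for dc_top) value kept in xmm3, byte-broadcasts the
+; DC value and jumps to the width-specific splat loop.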
+ movu ym1, [tlq+32] ; unaligned when jumping here from dc_top + vpdpbusd ym0, ym1, ym2 +.h32: + vextracti32x4 xm1, ym0, 1 + paddd xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddd xm0, xm1 +.h4: + vpsrlvd xm0, xmm3 + lea stride3q, [strideq*3] + vpbroadcastb m0, xm0 + jmp wq + +cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm0, r5d + tzcnt r5d, r5d + movd xmm4, r5d + lea r5, [ipred_dc_8bpc_avx512icl_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd xmm1, [tlq-4] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w4: + movd xmm1, [tlq+1] + vpdpbusd xm0, xmm1, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xmm0, xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xmm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddd xmm1, xm0 + shrx r6d, r6d, r2d + psrlq xmm0, xmm1, 32 + paddd xmm0, xmm1 + movd xmm1, r6d + psrld xmm0, 2 + pmulhuw xmm0, xmm1 +.w4_end: + vpbroadcastb xm0, xmm0 +.s4: + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm0 + movd [dstq+strideq*2], xm0 + movd [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +.h8: + movq xmm1, [tlq-8] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w8: + movq xmm1, [tlq+1] + vextracti32x4 xmm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 + paddd xmm2, xm0 + punpckhqdq xmm0, xmm2, xmm2 + paddd xmm0, xmm2 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w8_end: + vpbroadcastb xm0, xmm0 +.s8: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +.h16: + mova xmm1, [tlq-16] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w16: + movu xmm1, [tlq+1] + vextracti32x4 xmm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 + paddd xmm2, xm0 + punpckhqdq xmm0, xmm2, xmm2 + paddd xmm0, xmm2 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w16_end: + vpbroadcastb xm0, xmm0 +.s16: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +.h32: + mova ym1, [tlq-32] + vpdpbusd ym0, ym1, ym3 + jmp wq +.w32: + movu ym1, [tlq+1] + vpdpbusd ym0, ym1, ym3 + vextracti32x4 xmm1, ym0, 1 + paddd xmm1, xm0 + punpckhqdq xmm0, xmm1, xmm1 + paddd xmm0, xmm1 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w32_end: + vpbroadcastb ym0, xmm0 +.s32: + mova [dstq+strideq*0], ym0 + mova [dstq+strideq*1], ym0 + mova [dstq+strideq*2], ym0 + mova [dstq+stride3q ], ym0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +.h64: + mova ym1, [tlq-64] + mova ym2, [tlq-32] + vpdpbusd ym0, ym1, ym3 + vpdpbusd ym0, ym2, ym3 + jmp wq +.w64: + movu ym1, [tlq+ 1] + movu ym2, [tlq+33] + vpdpbusd ym0, ym1, ym3 + vpdpbusd ym0, ym2, ym3 + vextracti32x4 xmm1, ym0, 1 + paddd xmm1, xm0 + punpckhqdq xmm0, xmm1, xmm1 + paddd xmm0, xmm1 + psrlq xmm1, xmm0, 32 + paddd xmm0, 
xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 64 + je .w64_end + mov r6d, 0x33345556 + shrx r6d, r6d, hd + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w64_end: + vpbroadcastb m0, xmm0 +.s64: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s64 + RET + +cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_8bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_8bpc_avx512icl_table] + tzcnt wd, wm + movu m0, [tlq+1] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3 +%define base r6-ipred_h_8bpc_avx512icl_table + lea r6, [ipred_h_8bpc_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea stride3q, [strideq*3] + sub tlq, hq + add wq, r6 + jmp wq +.w4: + mova xmm1, [base+ipred_h_shuf+16] +.w4_loop: + movd xmm0, [tlq+hq-4] + pshufb xmm0, xmm1 + movd [dstq+strideq*0], xmm0 + pextrd [dstq+strideq*1], xmm0, 1 + pextrd [dstq+strideq*2], xmm0, 2 + pextrd [dstq+stride3q ], xmm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +.w8: + movsldup xmm2, [base+ipred_h_shuf+16] + movshdup xmm3, [base+ipred_h_shuf+16] +.w8_loop: + movd xmm1, [tlq+hq-4] + pshufb xmm0, xmm1, xmm2 + pshufb xmm1, xmm3 + movq [dstq+strideq*0], xmm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + movsldup m1, [base+smooth_shuf] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + pshufb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpbroadcastd ym3, [base+pb_1] + vpord m2, m3, [base+pb_2] {1to16} +.w32_loop: + vpbroadcastd m1, [tlq+hq-4] + pshufb m0, m1, m2 + pshufb m1, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32_loop + RET +.w64: + vpbroadcastd m4, [base+pb_3] + vpbroadcastd m5, [base+pb_2] + vpbroadcastd m6, [base+pb_1] + pxor m7, m7 +.w64_loop: + vpbroadcastd m3, [tlq+hq-4] + pshufb m0, m3, m4 + pshufb m1, m3, m5 + pshufb m2, m3, m6 + pshufb m3, m7 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w64_loop + RET + +%macro PAETH 0 + psubusb m1, m5, m4 + psubusb m0, m4, m5 + por m1, m0 ; tdiff + pavgb m2, m6, m4 + vpcmpub k1, m1, m7, 1 ; tdiff < ldiff + vpblendmb m0{k1}, m4, m6 + vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 + psubusb m3, m5, m2 + psubb m2, m4 + psubusb m2, m5 + por m2, m3 + pminub m1, m7 + paddusb m2, m2 + por m2, m4 ; min(tldiff, 255) + vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff + vmovdqu8 m0{k1}, m5 +%endmacro + +cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 + lea r6, [ipred_paeth_8bpc_avx512icl_table] + tzcnt wd, wm + vpbroadcastb m5, [tlq] ; topleft + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m8, 
[r6-ipred_paeth_8bpc_avx512icl_table+pb_1] + lea topq, [tlq+1] + sub tlq, hq + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +INIT_YMM avx512icl +.w4: + vpbroadcastd m6, [topq] + mova m9, [ipred_h_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + vpbroadcastq m4, [tlq+hq-8] + pshufb m4, m9 ; left + PAETH + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 + sub hd, 8 + jl .w4_ret + vextracti32x4 xmm0, m0, 1 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0], xmm0 + pextrd [dstq+strideq*1], xmm0, 1 + pextrd [dstq+strideq*2], xmm0, 2 + pextrd [dstq+stride3q ], xmm0, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_ret: + RET +INIT_ZMM avx512icl +.w8: + vpbroadcastq m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + vpbroadcastq m4, [tlq+hq-8] + pshufb m4, m9 + PAETH + vextracti32x4 xmm1, m0, 2 + vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + sub hd, 8 + jl .w8_ret + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + jg .w8_loop +.w8_ret: + RET +.w16: + vbroadcasti32x4 m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + vpbroadcastd m4, [tlq+hq-4] + pshufb m4, m9 + PAETH + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + vbroadcasti32x8 m6, [topq] + mova ym9, ym8 + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w32_loop: + vpbroadcastd m4, [tlq+hq-2] + pshufb m4, m9 + PAETH + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + movu m6, [topq] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w64_loop: + vpbroadcastb m4, [tlq+hq-1] + PAETH + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 +%define base r6-ipred_smooth_v_8bpc_avx512icl_table + lea r6, [ipred_smooth_v_8bpc_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m0, [base+pb_127_m127] + vpbroadcastd m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + vpbroadcastb m4, [tlq+hq] ; bottom + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastd m2, [tlq+1] + movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] + punpcklbw m2, m4 ; top, bottom + pmaddubsw m3, m2, m0 + paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; 128 * top + 129 * bottom + 128 +.w4_loop: + vbroadcasti32x4 m0, [weightsq+hq*2] + pshufb m0, m5 + pmaddubsw m0, m2, m0 + paddw m0, m3 + vpermb m0, m6, m0 + vextracti32x4 xmm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xmm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xmm1, 2 + add hq, 8 + jg .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xmm1, 3 + lea dstq, [dstq+strideq*4] + jl .w4_loop +.ret: + RET +.w8: + vpbroadcastq m2, [tlq+1] + 
movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] + punpcklbw m2, m4 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 +.w8_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m5 + pmaddubsw m0, m2, m0 + paddw m0, m3 + vpermb m0, m6, m0 + vextracti32x4 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +.w16: + vbroadcasti32x4 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w16_loop: + vpbroadcastq m1, [weightsq+hq*2] + pshufb m1, m6 + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m7, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + vbroadcasti32x8 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w32_loop: + vpbroadcastd m1, [weightsq+hq*2] + pshufb m1, m6 + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m7, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w32_loop + RET +.w64: + movu m3, [tlq+1] + mova m6, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w64_loop: + vpbroadcastw m1, [weightsq+hq*2] + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m6, m1 + mova [dstq], m0 + add dstq, strideq + inc hq + jl .w64_loop + RET + +cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 +%define base r5-ipred_smooth_h_8bpc_avx512icl_table + lea r5, [ipred_smooth_h_8bpc_avx512icl_table] + mov r6d, wd + tzcnt wd, wd + vpbroadcastb m4, [tlq+r6] ; right + mov hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m5, [base+pb_127_m127] + vpbroadcastd m6, [base+pw_128] + sub tlq, hq + add wq, r5 + vpmovb2m k1, m6 + lea stride3q, [strideq*3] + jmp wq +.w4: + movsldup m3, [smooth_shuf] + vpbroadcastq m7, [smooth_weights+4*2] + mova ym8, [smooth_endA] +.w4_loop: + vpbroadcastq m0, [tlq+hq-8] + mova m2, m4 + vpshufb m2{k1}, m0, m3 ; left, right + pmaddubsw m0, m2, m5 + pmaddubsw m1, m2, m7 + paddw m2, m6 + paddw m0, m2 + paddw m0, m1 + vpermb m0, m8, m0 + vextracti32x4 xmm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xmm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xmm1, 2 + sub hd, 8 + jl .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xmm1, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.ret: + RET +.w8: + movsldup m3, [smooth_shuf] + vbroadcasti32x4 m7, [smooth_weights+8*2] + mova ym8, [smooth_endA] +.w8_loop: + vpbroadcastd m0, [tlq+hq-4] + mova m2, m4 + vpshufb m2{k1}, m0, m3 + pmaddubsw m0, m2, m5 + pmaddubsw m1, m2, m7 + paddw m2, m6 + paddw m0, m2 + paddw m0, m1 + vpermb m0, m8, m0 + vextracti32x4 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps 
[dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + movsldup m7, [smooth_shuf] + vbroadcasti32x4 m8, [smooth_weights+16*2] + vbroadcasti32x4 m9, [smooth_weights+16*3] + mova m10, [smooth_endB] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + mova m3, m4 + vpshufb m3{k1}, m0, m7 + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m8 + pmaddubsw m1, m3, m9 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m10, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + mova m10, [smooth_endA] + vpbroadcastd ym7, [pb_1] + vbroadcasti32x8 m8, [smooth_weights+32*2] + vbroadcasti32x8 m9, [smooth_weights+32*3] + vshufi32x4 m10, m10, q3120 +.w32_loop: + vpbroadcastd m0, [tlq+hq-2] + mova m3, m4 + vpshufb m3{k1}, m0, m7 + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m8 + pmaddubsw m1, m3, m9 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m10, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + mova m7, [smooth_weights+64*2] + mova m8, [smooth_weights+64*3] + mova m9, [smooth_endA] +.w64_loop: + mova m3, m4 + vpbroadcastb m3{k1}, [tlq+hq-1] + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m7 + pmaddubsw m1, m3, m8 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m9, m1 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 +%define base r5-ipred_smooth_8bpc_avx512icl_table + lea r5, [ipred_smooth_8bpc_avx512icl_table] + mov r6d, wd + tzcnt wd, wd + mov hd, hm + vpbroadcastb m6, [tlq+r6] ; right + sub tlq, hq + movsxd wq, [r5+wq*4] + vpbroadcastd m7, [base+pb_127_m127] + vpbroadcastb m0, [tlq] ; bottom + vpbroadcastd m1, [base+pw_255] + add wq, r5 + lea v_weightsq, [base+smooth_weights+hq*2] + vpmovb2m k1, m1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastd m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] + vpbroadcastq m9, [smooth_weights+4*2] + mova ym11, [smooth_endA] + + punpcklbw m8, m0 ; top, bottom + pmaddubsw m10, m8, m7 + paddw m1, m8 ; 1 * top + 256 * bottom + 255 + paddw m10, m1 ; 128 * top + 129 * bottom + 255 +.w4_loop: + vpbroadcastq m1, [tlq+hq-8] + vbroadcasti32x4 m0, [v_weightsq] + add v_weightsq, 16 + mova m2, m6 + vpshufb m2{k1}, m1, m4 ; left, right + pmaddubsw m1, m2, m7 ; 127 * left - 127 * right + pshufb m0, m5 + pmaddubsw m0, m8, m0 + paddw m1, m2 ; 128 * left + 129 * right + pmaddubsw m2, m9 + paddw m0, m10 + paddw m1, m2 + pavgw m0, m1 + vpermb m0, m11, m0 + vextracti32x4 xmm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xmm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xmm1, 2 + sub hd, 8 + jl .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xmm1, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.ret: + RET +.w8: + vpbroadcastq m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] + vbroadcasti32x4 m9, [smooth_weights+8*2] + mova ym11, [smooth_endA] + punpcklbw m8, m0 + pmaddubsw m10, m8, m7 + paddw m1, m8 + paddw m10, m1 +.w8_loop: + vpbroadcastd m1, [tlq+hq-4] + vpbroadcastq m0, [v_weightsq] + add v_weightsq, 8 
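+; The blend below forms m0 = v_weight*top + (256-v_weight)*bottom + 255 (the
+; bias is folded into m10) and m1 = h_weight*left + (256-h_weight)*right;
+; pavgw adds the two with +1 rounding and halves, and the smooth_endA byte
+; permute keeps the high byte of each word, i.e. out = (v + h + 256) >> 9.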
+ mova m2, m6 + vpshufb m2{k1}, m1, m4 + pmaddubsw m1, m2, m7 + pshufb m0, m5 + pmaddubsw m0, m8, m0 + paddw m1, m2 + pmaddubsw m2, m9 + paddw m0, m10 + paddw m1, m2 + pavgw m0, m1 + vpermb m0, m11, m0 + vextracti32x4 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + vbroadcasti32x4 m9, [tlq+hq+1] + movsldup m5, [smooth_shuf] + movshdup m10, [smooth_shuf] + vbroadcasti32x4 m11, [smooth_weights+16*2] + vbroadcasti32x4 m12, [smooth_weights+16*3] + mova m15, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m13, m8, m7 + pmaddubsw m14, m9, m7 + paddw m0, m1, m8 + paddw m1, m9 + paddw m13, m0 + paddw m14, m1 +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastq m1, [v_weightsq] + add v_weightsq, 8 + mova m4, m6 + vpshufb m4{k1}, m0, m5 + pmaddubsw m2, m4, m7 + pshufb m1, m10 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m11 + pmaddubsw m4, m12 + paddw m0, m13 + paddw m1, m14 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m15, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + vbroadcasti32x8 m9, [tlq+hq+1] + movshdup m10, [smooth_shuf] + mova m12, [smooth_weights+32*2] + vpbroadcastd ym5, [pb_1] + mova m15, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m13, m8, m7 + pmaddubsw m14, m9, m7 + vshufi32x4 m11, m12, m12, q2020 + vshufi32x4 m12, m12, q3131 + paddw m0, m1, m8 + paddw m1, m9 + paddw m13, m0 + paddw m14, m1 +.w32_loop: + vpbroadcastd m0, [tlq+hq-2] + vpbroadcastd m1, [v_weightsq] + add v_weightsq, 4 + mova m4, m6 + vpshufb m4{k1}, m0, m5 + pmaddubsw m2, m4, m7 + pshufb m1, m10 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m11 + pmaddubsw m4, m12 + paddw m0, m13 + paddw m1, m14 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m15, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + movu m9, [tlq+hq+1] + mova m11, [smooth_weights+64*2] + mova m2, [smooth_weights+64*3] + mova m14, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m12, m8, m7 + pmaddubsw m13, m9, m7 + vshufi32x4 m10, m11, m2, q2020 + vshufi32x4 m11, m2, q3131 + paddw m0, m1, m8 + paddw m1, m9 + paddw m12, m0 + paddw m13, m1 +.w64_loop: + mova m4, m6 + vpbroadcastb m4{k1}, [tlq+hq-1] + vpbroadcastw m1, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m4, m7 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m10 + pmaddubsw m4, m11 + paddw m0, m12 + paddw m1, m13 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m14, m1 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 + lea r6, [pal_pred_8bpc_avx512icl_table] + tzcnt wd, wm + vbroadcasti32x4 m4, [palq] + movifnidn hd, hm + movsxd wq, [r6+wq*4] + packuswb m4, m4 + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + pshufb xmm0, xm4, [idxq] + add idxq, 16 + movd [dstq+strideq*0], xmm0 + pextrd [dstq+strideq*1], xmm0, 1 + pextrd [dstq+strideq*2], xmm0, 2 + pextrd [dstq+stride3q ], xmm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + 
RET +.w8: + pshufb xmm0, xm4, [idxq+16*0] + pshufb xmm1, xm4, [idxq+16*1] + add idxq, 16*2 + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + movq [dstq+strideq*2], xmm1 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pshufb m0, m4, [idxq] + add idxq, 64 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + pshufb m0, m4, [idxq+64*0] + pshufb m1, m4, [idxq+64*1] + add idxq, 64*2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +.w64: + pshufb m0, m4, [idxq+64*0] + pshufb m1, m4, [idxq+64*1] + pshufb m2, m4, [idxq+64*2] + pshufb m3, m4, [idxq+64*3] + add idxq, 64*4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w64 + RET + +; The ipred_filter code processes 4x2 blocks in the following order +; which increases parallelism compared to doing things row by row. +; Some redundant blocks are calculated for w > 4. +; w4 w8 w16 w32 +; 1 1 2 1 2 3 4 1 2 3 4 9 a b c +; 2 2 3 2 3 4 5 2 3 4 5 a b c d +; 3 3 4 3 4 5 6 3 4 5 6 b c d e +; 4 4 5 4 5 6 7 4 5 6 7 c d e f +; 5 5 6 5 6 7 8 5 6 7 8 d e f g +; 6 6 7 6 7 8 9 6 7 8 9 e f g h +; 7 7 8 7 8 9 a 7 8 9 a f g h i +; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___ +; 9 9 a b h i j +; a b i j +; b j + +cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt +%define base r6-filter_taps + lea r6, [filter_taps] +%ifidn fltd, fltm + movzx fltd, fltb +%else + movzx fltd, byte fltm +%endif + vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0 + movifnidn hd, hm + shl fltd, 6 + vpbroadcastd m6, [base+pd_8] + vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __ + vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4 + vbroadcasti32x4 m8, [r6+fltq+16*1] + vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __ + vbroadcasti32x4 m10, [r6+fltq+16*3] + mova xmm0, xm6 + vpdpbusd xmm0, xmm2, xm7 + mova xmm1, xm6 + vpdpbusd xmm1, xmm2, xm8 + vpdpbusd xmm0, xmm3, xm9 + vpdpbusd xmm1, xmm3, xm10 + packssdw xmm0, xmm1 + cmp wd, 8 + jb .w4 + vpbroadcastd ym2, [tlq+5] + mova m11, [base+filter_perm] + mov r5, 0xffffffffffff000f + psrldq xmm2, 1 ; __ t0 + kmovq k1, r5 ; 0x000f + psraw xm5, xmm0, 4 + packuswb xmm2, xm5 ; __ t0 a0 b0 + pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1 + je .w8 + kxnorb k3, k3, k3 ; 0x00ff + vpbroadcastd xm3, [tlq-4] + kandnq k2, k3, k1 ; 0xffffffffffff0000 + vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __ + mova ym0, ym6 + vpdpbusd ym0, ym2, ym7 + mova ym1, ym6 + vpdpbusd ym1, ym2, ym8 + pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0 + vpbroadcastd m2, [tlq+9] + vpdpbusd ym0, ym3, ym9 + vpdpbusd ym1, ym3, ym10 + vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __ + kunpckbw k4, k1, k3 ; 0x0fff + packssdw ym0, ym1 + psraw ym0, 4 ; a0 d0 a1 b1 + packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1 + pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2 + vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __ + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + psrldq m0, m2, 1 ; __ d0 __ b0 __ t0 + vpbroadcastd m2, [tlq+13] + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + mova m12, [base+filter_end] + lea r5d, [hq-6] + mov r6, dstq + cmovp hd, r5d ; w == 16 ? 
h : h - 6 + packssdw m4, m1 + psraw m4, 4 ; e0 f0 c1 d1 a2 b2 + packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2 + pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3 +.w16_loop: + vpbroadcastd xm3, [tlq-8] + vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __ + mova m1, m6 + vpdpbusd m1, m2, m7 + mova m0, m6 + vpdpbusd m0, m2, m8 + sub tlq, 2 + vpdpbusd m1, m3, m9 + vpdpbusd m0, m3, m10 + packssdw m1, m0 + mova m0, m4 + psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3 + packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3 + pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 + vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3 + vextracti32x4 [dstq+strideq*0], m5, 2 + vextracti32x4 [dstq+strideq*1], m5, 3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + cmp wd, 16 + je .ret + mova xm13, [filter_perm+16] + mova xmm3, [r6+strideq*0] + punpckhdq xmm3, [r6+strideq*1] + vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 + pinsrb xm3, xmm3, [tlq+r5+16], 7 + pshufb xm3, xm13 + vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __ + mova m0, m6 + vpdpbusd m0, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kunpckbw k5, k3, k1 ; 0xff0f + lea r3, [strideq*3] + vpdpbusd m0, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m0, m1 + psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3 + packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 + vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 + vpbroadcastd ym2, [tlq+r5+21] + pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3 + vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3 + vextracti32x4 [dstq+strideq*0], m5, 2 + vextracti32x4 [dstq+strideq*1], m5, 3 + punpckhqdq xmm3, [r6+r3] + pinsrb xmm3, [r6+strideq*2+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __ + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kxnord k3, k3, k4 ; 0xfffff0ff + lea r4, [strideq*5] + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m4, m1 + psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3 + packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3 + vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3 + vpbroadcastd m2, [tlq+r5+25] + pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3 + vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3 + vextracti32x4 [dstq+strideq*2], m5, 2 + vextracti32x4 [dstq+r3 ], m5, 3 + punpckhqdq xmm3, [r6+r4] + pinsrb xmm3, [r6+strideq*4+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __ + mova m0, m6 + vpdpbusd m0, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kunpckwd k1, k1, k2 ; 0x000f0000 + vpdpbusd m0, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m0, m1 + psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3 + packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3 + vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3 + vpbroadcastd m2, [tlq+r5+29] + pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7 + vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3 + vextracti32x4 [dstq+strideq*4], m5, 2 + vextracti32x4 [dstq+r4 ], m5, 3 + lea r0, [strideq+r3*2] +.w32_loop: + punpckhqdq xmm3, [r6+r0] + pinsrb xmm3, [r6+r3*2+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; hf gf ff 
__ fj ej dj __ dn cn bn __ br ar tr __ +.w32_loop_tail: + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m4, m1 + mova m1, m0 + psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7 + packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7 + pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7 + vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7 + vextracti32x4 [r6+strideq*0+16], m5, 2 + vextracti32x4 [r6+strideq*1+16], m5, 3 + lea r6, [r6+strideq*2] + sub r5d, 2 + jg .w32_loop + vpermb m3, m11, m1 + cmp r5d, -6 + jg .w32_loop_tail +.ret: + RET +.w8: + vpermb ym3, ym11, ymm2 +.w8_loop: + vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __ + mova ym0, ym6 + vpdpbusd ym0, ym2, ym7 + mova ym1, ym6 + vpdpbusd ym1, ym2, ym8 + sub tlq, 2 + vpdpbusd ym0, ym3, ym9 + vpdpbusd ym1, ym3, ym10 + mova ym3, ym5 + packssdw ym0, ym1 + psraw ym5, ym0, 4 ; c0 d0 a1 b1 + packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1 + pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 + vpermb ym3, ym11, ym3 ; a0 a1 b0 b1 + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w4_loop: + vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __ + mova xmm0, xm6 + vpdpbusd xmm0, xmm2, xm7 + mova xmm1, xm6 + vpdpbusd xmm1, xmm2, xm8 + sub tlq, 2 + vpdpbusd xmm0, xmm3, xm9 + vpdpbusd xmm1, xmm3, xm10 + packssdw xmm0, xmm1 +.w4: + psraw xmm0, 4 ; a0 b0 + packuswb xmm0, xmm0 + movd [dstq+strideq*0], xmm0 + pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0 + movd [dstq+strideq*1], xmm2 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/ipred_init_tmpl.c dav1d-1.0.0/src/x86/ipred_init_tmpl.c --- dav1d-0.9.2/src/x86/ipred_init_tmpl.c 2021-09-03 15:51:24.417037000 +0000 +++ dav1d-1.0.0/src/x86/ipred_init_tmpl.c 2022-03-18 14:31:56.014356000 +0000 @@ -30,7 +30,8 @@ #define decl_fn(type, name) \ decl_##type##_fn(BF(dav1d_##name, ssse3)); \ - decl_##type##_fn(BF(dav1d_##name, avx2)) + decl_##type##_fn(BF(dav1d_##name, avx2)); \ + decl_##type##_fn(BF(dav1d_##name, avx512icl)) #define init_fn(type0, type1, name, suffix) \ c->type0[type1] = BF(dav1d_##name, suffix) @@ -123,5 +124,23 @@ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2); c->pal_pred = BF(dav1d_pal_pred, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); +#endif + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); + + c->pal_pred = BF(dav1d_pal_pred, avx512icl); #endif } diff -Nru dav1d-0.9.2/src/x86/itx16_avx2.asm dav1d-1.0.0/src/x86/itx16_avx2.asm --- dav1d-0.9.2/src/x86/itx16_avx2.asm 2021-09-03 15:51:24.417037000 +0000 +++ dav1d-1.0.0/src/x86/itx16_avx2.asm 2022-03-18 14:31:56.014356000 +0000 @@ -1,5 +1,6 @@ ; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC +; 
Copyright © 2021, Matthias Dressel ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without @@ -32,6 +33,11 @@ pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482 itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 +idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 +idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 +iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 +iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 @@ -63,6 +69,7 @@ %define pd_1321 (pd_1321_2482 + 4*0) %define pd_2482 (pd_1321_2482 + 4*4) +pd_8: dd 8 pd_m601: dd -601 pd_m1189: dd -1189 pd_m1380: dd -1380 @@ -76,11 +83,15 @@ pd_10239: dd 10239 ; 2048 + 8192 - 1 pd_10240: dd 10240 ; 2048 + 8192 pd_11586: dd 11586 ; 5793 * 2 +pd_34816: dd 34816 ; 2048 + 32768 pd_38912: dd 38912 ; 2048 + 4096 + 32768 -pixel_max: times 2 dw 0x03ff ; 10bpc -clip_min: dd -0x20000 -clip_max: dd 0x1ffff +pixel_10bpc_max: times 2 dw 0x03ff +pixel_12bpc_max: times 2 dw 0x0fff +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +clip_20b_min: dd -0x80000 +clip_20b_max: dd 0x7ffff idct64_mul_16bpc: dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 @@ -134,14 +145,6 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %macro WRAP_XMM 1+ @@ -264,12 +267,12 @@ %endif %endmacro -%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size -cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 5, 0, dst, stride, c, eob, tx2 - %define %%p1 m(i%1_%4_internal_16bpc) +%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth +cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_%5bpc) ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
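The comment above is the core dispatch trick used throughout this file: pass 1 of the first transform type runs, then jumps through tx2q into pass 2 of the second type, and dct_dct combinations take a DC-only shortcut when eob is 0 (the repeated imul-by-181 / add 128 / sar 8 steps scale by 181/256 = 2896/4096, roughly 1/sqrt(2)). A minimal C sketch of that control flow, with invented helper names used purely for illustration:

    #include <stdint.h>

    /* Illustrative names only; the real entry points are generated by the
     * INV_TXFM_FN macro in assembly. */
    typedef void (*itx_pass_fn)(int32_t *coef, uint16_t *dst);

    /* Rounded multiply by 181/256 = 2896/4096, i.e. roughly 1/sqrt(2). */
    static int dc_half(int x) { return (x * 181 + 128) >> 8; }

    static void inv_txfm_add_sketch(int32_t *coef, uint16_t *dst, int eob,
                                    int is_dct_dct,
                                    itx_pass_fn pass1_of_type1,
                                    itx_pass_fn pass2_of_type2,
                                    void (*splat_dc)(uint16_t *dst, int dc))
    {
        if (is_dct_dct && !eob) {
            /* DC-only fast path: one 1/sqrt(2) scaling per pass, then the
             * result is broadcast over the whole block and added to dst. */
            splat_dc(dst, dc_half(dc_half(coef[0])));
            coef[0] = 0; /* mov [cq], eobd with eobd == 0 */
            return;
        }
        /* Normal path: pass 1 of type1, then the indirect jump through tx2q
         * lands in pass 2 of type2 (modelled here as two plain calls). */
        pass1_of_type1(coef, dst);
        pass2_of_type2(coef, dst);
    }

The two dc_half applications in the sketch mirror the back-to-back scaling sequences in the 4x4 dct_dct path; other block sizes use a single multiply by 2896 with a larger shift instead.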
- lea tx2q, [m(i%2_%4_internal_16bpc).pass2] + lea tx2q, [m(i%2_%4_internal_%5bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 @@ -297,7 +300,24 @@ pmulhrsw xm0, xm1 vpbroadcastw xm0, xm0 mova xm1, xm0 - jmp m(iadst_4x4_internal_16bpc).end + jmp m(iadst_4x4_internal_10bpc).end +%endif +%endmacro + +%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x4, 12 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + movd xm0, r6d + vpbroadcastd m0, xm0 + mova m1, m0 + jmp m(iadst_4x4_internal_12bpc).end %endif %endmacro @@ -312,21 +332,20 @@ %macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd vpbroadcastd m%5, [pw_m3784_1567] punpckhwd m%3, m%2, m%1 - psubw m%4, m%1, m%2 - paddw m%1, m%2 - vpbroadcastd m%2, [pw_1567_3784] - punpcklqdq m%1, m%4 - vpbroadcastd m%4, [pw_2896x8] + vpbroadcastd m%4, [pw_1567_3784] + punpcklwd m%2, m%1 + vpbroadcastd m%1, [pw_m2896_2896] pmaddwd m%5, m%3 - pmaddwd m%3, m%2 - pmulhrsw m%1, m%4 ; t0 t1 - paddd m%5, m%6 - paddd m%3, m%6 - psrad m%5, 12 - psrad m%3, 12 + pmaddwd m%3, m%4 + vpbroadcastd m%4, [pw_2896_2896] + pmaddwd m%1, m%2 + pmaddwd m%2, m%4 + REPX {paddd x, m%6}, m%5, m%3, m%1, m%2 + REPX {psrad x, 12 }, m%5, m%3, m%1, m%2 packssdw m%3, m%5 ; t3 t2 - psubsw m%2, m%1, m%3 ; out3 out2 - paddsw m%1, m%3 ; out0 out1 + packssdw m%2, m%1 ; t0 t1 + paddsw m%1, m%2, m%3 ; out0 out1 + psubsw m%2, m%3 ; out3 out2 %endmacro INV_TXFM_4X4_FN dct, dct @@ -334,11 +353,8 @@ INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst -cglobal idct_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 - vpermq m0, [cq+32*0], q3120 - vpermq m1, [cq+32*1], q3120 - vpbroadcastd m5, [pd_2048] - IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 +cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call .main vbroadcasti128 m2, [idct4_shuf] packssdw m0, m1 pshufb m0, m2 @@ -354,7 +370,7 @@ lea r6, [dstq+strideq*2] movq xm3, [r6 +strideq*1] movhps xm3, [r6 +strideq*0] - vpbroadcastd xm5, [pixel_max] + vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 @@ -369,20 +385,29 @@ movhps [r6 +strideq*0], xm1 movq [r6 +strideq*1], xm1 RET +ALIGN function_align +.main: + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m5, [pd_2048] +.main2: + IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 + ret INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity -cglobal iadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main + packssdw m0, m1 vpermd m0, m4, m0 psrld m4, 4 pshufb m0, m4 jmp tx2q .pass2: - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main .end: @@ -392,7 +417,7 @@ lea r6, [dstq+strideq*2] movq xm3, [r6 +strideq*0] movhps xm3, [r6 +strideq*1] - vpbroadcastd xm5, [pixel_max] + vpbroadcastd xm5, [pixel_10bpc_max] pmulhrsw xm0, xm4 pmulhrsw xm1, xm4 pxor m4, m4 @@ -412,9 +437,10 @@ ALIGN function_align .main: mova m2, [cq+16*2] + vbroadcasti128 m5, [cq+16*0] +.main2: mova m0, [pd_1321_2482] vpbroadcastd m3, [pd_3803] - vbroadcasti128 m5, [cq+16*0] vpbroadcastd m1, [pd_m3344] pmulld m4, m0, m2 pmulld m3, m2 @@ -439,7 +465,6 @@ paddd m1, m3 ; out2 out3 psrad m0, 12 psrad m1, 12 - packssdw m0, m1 ret INV_TXFM_4X4_FN flipadst, dct @@ -447,15 +472,16 @@ INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity 
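+; Note: iadst_4x4's .main now returns 32-bit results (the packssdw was hoisted
+; into its 10bpc callers, as in the flipadst function below), which lets the
+; 12bpc versions clamp and permute the dword output before packing.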
-cglobal iflipadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 - call m(iadst_4x4_internal_16bpc).main +cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + packssdw m0, m1 psrld m1, m4, 8 vpermd m0, m1, m0 psrld m4, 4 pshufb m0, m4 jmp tx2q .pass2: - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main vpbroadcastd xm4, [pw_2048] @@ -464,7 +490,7 @@ lea r6, [dstq+strideq*2] movq xm2, [r6 +strideq*1] movhps xm2, [r6 +strideq*0] - vpbroadcastd xm5, [pixel_max] + vpbroadcastd xm5, [pixel_10bpc_max] pmulhrsw xm0, xm4 pmulhrsw xm1, xm4 pxor m4, m4 @@ -487,7 +513,7 @@ INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity -cglobal iidentity_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 vpbroadcastd m1, [pd_5793] pmulld m0, m1, [cq+32*0] pmulld m1, [cq+32*1] @@ -511,7 +537,7 @@ paddsw m0, m1 movq xm3, [r6 +strideq*0] movhps xm3, [r6 +strideq*1] - vpbroadcastd xm4, [pixel_max] + vpbroadcastd xm4, [pixel_10bpc_max] packssdw m5, m5 ; pw_2048 pmulhrsw m0, m5 pxor m5, m5 @@ -530,8 +556,134 @@ movhps [r6 +strideq*1], xm1 RET -%macro INV_TXFM_4X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x8 +INV_TXFM_4X4_12BPC_FN dct, dct +INV_TXFM_4X4_12BPC_FN dct, identity +INV_TXFM_4X4_12BPC_FN dct, adst +INV_TXFM_4X4_12BPC_FN dct, flipadst + +cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(idct_4x4_internal_10bpc).main + mova m3, [idct4_12_shuf] + mova m4, [idct4_12_shuf2] + vpermd m2, m3, m0 + vpermd m1, m4, m1 + jmp m(iadst_4x4_internal_12bpc).pass1_end +.pass2: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct_4x4_internal_10bpc).main2 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_4x4_internal_12bpc).end + +INV_TXFM_4X4_12BPC_FN adst, dct +INV_TXFM_4X4_12BPC_FN adst, adst +INV_TXFM_4X4_12BPC_FN adst, flipadst +INV_TXFM_4X4_12BPC_FN adst, identity + +cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + vpermd m2, m4, m0 + vpermd m1, m4, m1 +.pass1_end: + punpcklqdq m0, m2, m1 + punpckhqdq m1, m2, m1 +.pass1_end2: + vpbroadcastd m3, [clip_18b_min] + vpbroadcastd m4, [clip_18b_max] + pmaxsd m0, m3 + pmaxsd m1, m3 + pminsd m0, m4 + pminsd m1, m4 + jmp tx2q +.pass2: + mova [cq+16*0], m0 + vextracti128 [cq+16*3], m1, 1 + mova m2, m1 + vpermq m5, m0, q1010 + call m(iadst_4x4_internal_10bpc).main2 +.end: + vpbroadcastd m4, [pw_16384] + movq xm2, [dstq+strideq*0] + movq xm3, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movhps xm2, [r6 +strideq*0] ; dst0 dst2 + movhps xm3, [r6 +strideq*1] ; dst1 dst3 + vpbroadcastd m5, [pixel_12bpc_max] + vinserti128 m2, xm3, 1 + psrad m0, 3 + psrad m1, 3 + packssdw m0, m1 ; t0 t2 t1 t3 + pmulhrsw m0, m4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw m0, m2 ; out0 out2 out1 out3 + pmaxsw m0, m4 + pminsw m0, m5 + vextracti128 xm1, m0, 1 ; out1 out3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movhps [r6 +strideq*1], xm1 + RET + +INV_TXFM_4X4_12BPC_FN flipadst, dct +INV_TXFM_4X4_12BPC_FN flipadst, adst +INV_TXFM_4X4_12BPC_FN flipadst, flipadst +INV_TXFM_4X4_12BPC_FN flipadst, identity + +cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + psrld m4, 8 + vpermd m2, m4, m0 + vpermd m1, m4, m1 + punpckhqdq m0, m1, m2 + punpcklqdq m1, m2 
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2 +.pass2: + mova [cq+16*0], m0 + vextracti128 [cq+16*3], m1, 1 + mova m2, m1 + vpermq m5, m0, q1010 + call m(iadst_4x4_internal_10bpc).main2 + vpermq m2, m0, q1032 + vpermq m0, m1, q1032 + mova m1, m2 + jmp m(iadst_4x4_internal_12bpc).end + +INV_TXFM_4X4_12BPC_FN identity, dct +INV_TXFM_4X4_12BPC_FN identity, adst +INV_TXFM_4X4_12BPC_FN identity, flipadst +INV_TXFM_4X4_12BPC_FN identity, identity + +cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 + vpbroadcastd m1, [pd_5793] + pmulld m0, m1, [cq+32*0] + pmulld m1, [cq+32*1] + vpbroadcastd m5, [pd_2048] + mova m3, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + vpermd m2, m3, m0 + vpermd m1, m3, m1 + jmp m(iadst_4x4_internal_12bpc).pass1_end +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + vpbroadcastd m3, [pd_5793] + pmulld m0, m3 + pmulld m1, m3 + paddd m0, m5 ; 2048 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + jmp m(iadst_4x4_internal_12bpc).end + +%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x8, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 @@ -547,8 +699,7 @@ sar r6d, 16 movd xm0, r6d vpbroadcastw xm0, xm0 -.end2: - vpbroadcastd xm3, [pixel_max] + vpbroadcastd xm3, [pixel_%3bpc_max] pxor xm2, xm2 .end_loop: movq xm1, [dstq+strideq*0] @@ -586,7 +737,8 @@ INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst -cglobal idct_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 +cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m3, [pd_2896] pmulld m0, m3, [cq+32*0] pmulld m1, m3, [cq+32*1] @@ -600,7 +752,7 @@ .pass2: packssdw m0, m2 packssdw m1, m3 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m2 ; 2 3 @@ -624,7 +776,7 @@ paddw xm1, xm5 ; 3 2 paddw xm2, xm6 ; 4 5 paddw xm3, xm7 ; 7 6 - vpbroadcastd xm5, [pixel_max] + vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 @@ -644,8 +796,9 @@ INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity -cglobal iadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 - call m(iadst_8x4_internal_16bpc).main +cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_8x4_internal_10bpc).main psrad m0, m4, 12 psrad m1, m5, 12 psrad m2, 12 @@ -670,7 +823,7 @@ paddw xm1, xm5 ; 2 3 paddw xm2, xm6 ; 4 5 paddw xm3, xm7 ; 6 7 - vpbroadcastd xm5, [pixel_max] + vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 @@ -688,7 +841,7 @@ .pass2_main: packssdw m0, m2 packssdw m1, m3 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpckhdq m5, m4, m0 @@ -700,6 +853,9 @@ jmp m(iadst_4x8_internal_8bpc).main_pass2 ALIGN function_align .main: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.main2: vbroadcasti128 m0, [cq+16*0] vbroadcasti128 m2, [cq+16*2] vbroadcasti128 m3, [cq+16*5] @@ -712,13 +868,12 @@ vbroadcasti128 m5, [cq+16*1] vbroadcasti128 m3, [cq+16*3] vpbroadcastd m7, [pd_2048] - vpbroadcastd m8, [clip_min] - vpbroadcastd m9, [clip_max] shufpd m2, m4, 0x0c ; 4 6 shufpd m3, m5, 0x0c ; 3 1 REPX {pmulld x, m6}, m0, m1, m2, m3 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 +.main3: ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 
psubd m4, m0, m2 ; t4 t6 @@ -762,15 +917,16 @@ INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity -cglobal iflipadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 - call m(iadst_8x4_internal_16bpc).main +cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_8x4_internal_10bpc).main psrad m0, m3, 12 psrad m1, m2, 12 psrad m2, m5, 12 psrad m3, m4, 12 jmp tx2q .pass2: - call m(iadst_4x8_internal_16bpc).pass2_main + call m(iadst_4x8_internal_10bpc).pass2_main mova xm4, [pw_2048_m2048] REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 lea r3, [strideq*3] @@ -787,7 +943,7 @@ paddw xm2, xm5 ; 3 2 paddw xm1, xm6 ; 5 4 paddw xm0, xm7 ; 7 6 - vpbroadcastd xm5, [pixel_max] + vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 @@ -807,7 +963,8 @@ INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity -cglobal iidentity_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 +cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m3, [pd_2896] pmulld m0, m3, [cq+32*0] pmulld m1, m3, [cq+32*1] @@ -822,6 +979,11 @@ REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: + vpbroadcastd m6, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: vpbroadcastd m4, [pw_4096] packssdw m0, m2 packssdw m1, m3 @@ -845,17 +1007,112 @@ vpbroadcastq m5, [r6 +r3 ] vpblendd m3, m4, 0x30 vpblendd m3, m5, 0xc0 - vpbroadcastd m5, [pixel_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 - paddw m0, m2 - paddw m1, m3 + paddw m0, m2 ; out0 out1 out4 out5 + paddw m1, m3 ; out2 out3 out6 out7 pmaxsw m0, m4 pmaxsw m1, m4 - pminsw m0, m5 - pminsw m1, m5 - vextracti128 xm2, m0, 1 - vextracti128 xm3, m1, 1 + pminsw m0, m6 + pminsw m1, m6 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + ret + +INV_TXFM_4X8_FN dct, dct, 12 +INV_TXFM_4X8_FN dct, identity, 12 +INV_TXFM_4X8_FN dct, adst, 12 +INV_TXFM_4X8_FN dct, flipadst, 12 + +cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(idct_4x8_internal_10bpc).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vpermq m0, m0, q3102 + vpermq m2, m2, q3102 + vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [pd_2048] + call m(idct_8x4_internal_10bpc).main + psubd m3, m0, m4 ; out7 out6 + paddd m0, m4 ; out0 out1 + paddd m1, m2, m5 ; out3 out2 + psubd m2, m5 ; out4 out5 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x8_internal_12bpc).end + +INV_TXFM_4X8_FN adst, dct, 12 +INV_TXFM_4X8_FN adst, adst, 12 +INV_TXFM_4X8_FN adst, flipadst, 12 +INV_TXFM_4X8_FN adst, identity, 12 + +cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iadst_4x8_internal_10bpc).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + 
vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 +.end: + vpbroadcastd m4, [pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m2 ; 0 1 4 5 (interleaved) + packssdw m1, m3 ; 2 3 6 7 (interleaved) + mova m2, [iadst8_12_shuf] + vpermd m0, m2, m0 ; 0 1 4 5 + vpermd m1, m2, m1 ; 2 3 6 7 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + vinserti128 m4, xm6, 1 + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + vinserti128 m5, xm7, 1 + paddw m0, m4 ; 0 1 4 5 + paddw m1, m5 ; 2 3 6 7 + vpbroadcastd m5, [pixel_12bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, m4}, m0, m1 + REPX {pminsw x, m5}, m0, m1 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 @@ -865,16 +1122,70 @@ movq [r6 +strideq*2], xm3 movhps [r6 +r3 ], xm3 RET +ALIGN function_align +.pass2_main: + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [pd_2048] + jmp m(iadst_4x8_internal_10bpc).main3 + +INV_TXFM_4X8_FN flipadst, dct, 12 +INV_TXFM_4X8_FN flipadst, adst, 12 +INV_TXFM_4X8_FN flipadst, flipadst, 12 +INV_TXFM_4X8_FN flipadst, identity, 12 + +cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iflipadst_4x8_internal_10bpc).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_4x8_internal_12bpc).pass2_main + shufpd m3, m4, m0, 0x05 ; out1 out0 + shufpd m0, m4, 0x05 ; out7 out6 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 ; out5 out4 + psignd m2, m5, m6 ; out3 out2 + jmp m(iadst_4x8_internal_12bpc).end + +INV_TXFM_4X8_FN identity, dct, 12 +INV_TXFM_4X8_FN identity, adst, 12 +INV_TXFM_4X8_FN identity, flipadst, 12 +INV_TXFM_4X8_FN identity, identity, 12 + +cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_4x8_internal_10bpc).pass1 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m2 = in4 in5 + ; m3 = in6 in7 + vpbroadcastd m6, [pixel_12bpc_max] + call m(iidentity_4x8_internal_10bpc).pass2_end + RET -%macro INV_TXFM_4X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x16 +%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x16, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 mov r3d, 16 add r6d, 6144 sar r6d, 13 - jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end + jmp m(inv_txfm_add_dct_dct_4x8_%3bpc).end %endif %endmacro @@ -883,37 +1194,19 @@ INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst -cglobal idct_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 +cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 
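+; pass 1 gets an explicit label because the 12bpc entry points added further
+; down (e.g. idct_4x16_internal_12bpc) jump straight into the 10bpc pass 1 and
+; only differ in their pass 2 clipping and rounding.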
+.pass1: + vpbroadcastd m10, [pd_6144] mova m1, [cq+32*2] mova m3, [cq+32*6] mova m5, [cq+32*3] mova m7, [cq+32*7] - vpbroadcastd m4, [pd_3784] - vpbroadcastd m8, [pd_1567] - vpbroadcastd m9, [pd_2048] - vpbroadcastd m6, [pd_2896] - ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l - ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h + call .pass1_main pmulld m0, m6, [cq+32*0] pmulld m2, m6, [cq+32*4] pmulld m4, m6, [cq+32*1] pmulld m6, [cq+32*5] - vpbroadcastd m8, [pd_6144] - paddd m0, m8 - paddd m4, m8 - paddd m8, m0, m2 - psubd m0, m2 - paddd m9, m4, m6 - psubd m4, m6 - REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h - psubd m2, m0, m1 - paddd m1, m0 - psubd m6, m4, m5 - paddd m5, m4 - paddd m0, m8, m3 - psubd m3, m8, m3 - paddd m4, m9, m7 - psubd m7, m9, m7 + call .pass1_main2 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: @@ -921,7 +1214,7 @@ packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] punpcklwd m4, m2, m3 punpckhwd m2, m3 punpckhwd m5, m0, m1 @@ -940,7 +1233,38 @@ vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 vinserti128 m2, m4, xm5, 1 ; 8 9 b a vinserti128 m3, m6, xm7, 1 ; c d f e - vpbroadcastd m8, [pixel_max] + vpbroadcastd m8, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass1_main: + vpbroadcastd m4, [pd_3784] + vpbroadcastd m8, [pd_1567] + vpbroadcastd m9, [pd_2048] + vpbroadcastd m6, [pd_2896] + ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l + ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h + ret +ALIGN function_align +.pass1_main2: + paddd m0, m10 + paddd m4, m10 + paddd m8, m0, m2 + psubd m0, m2 + paddd m9, m4, m6 + psubd m4, m6 + REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + psubd m2, m0, m1 + paddd m1, m0 + psubd m6, m4, m5 + paddd m5, m4 + paddd m0, m8, m3 + psubd m3, m8, m3 + paddd m4, m9, m7 + psubd m7, m9, m7 + ret +ALIGN function_align +.pass2_end: lea r6, [strideq*3] pxor m7, m7 pmulhrsw m0, m9 @@ -951,7 +1275,7 @@ call .write_4x4 pmulhrsw m0, m3, m9 call .write_4x4 - RET + ret ALIGN function_align .write_4x4: movq xm4, [dstq+strideq*0] @@ -979,8 +1303,11 @@ INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity -cglobal iadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 - call m(iadst_16x4_internal_16bpc).main +cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_6144] + call m(iadst_16x4_internal_10bpc).main_end psrad m0, m4, 13 psrad m1, m5, 13 psrad m2, 13 @@ -993,7 +1320,7 @@ .pass2: call .pass2_main vpbroadcastd m5, [pw_2048] - vpbroadcastd m8, [pixel_max] + vpbroadcastd m8, [pixel_10bpc_max] lea r6, [strideq*3] vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 @@ -1037,7 +1364,7 @@ packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] punpcklwd m4, m2, m3 punpckhwd m2, m3 punpckhwd m5, m0, m1 @@ -1087,6 +1414,8 @@ vbroadcasti128 m7, [cq+16* 3] vbroadcasti128 m8, [cq+16* 1] shufpd m7, m8, 0x0c ; 3 1 +.main2: + ; expects: m12 = clip_min m13 = clip_max vpbroadcastd m11, [pd_2048] ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 @@ -1100,6 +1429,8 @@ paddd m2, m6 ; t4a t6a psubd m6, m3, m7 ; t13a t15a paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 
ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 psubd m7, m0, m2 ; t4 t6 @@ -1110,6 +1441,8 @@ paddd m4, m6 ; t8a t10a psubd m6, m8, m5 ; t13a t15a paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 punpcklqdq m5, m3, m7 ; t12a t4 punpckhqdq m3, m7 ; t14a t6 punpckhqdq m7, m6, m2 ; t15a t7 @@ -1130,6 +1463,8 @@ paddd m8, m0 ; out14 -out15 paddd m0, m4, m2 ; -out1 out0 psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m12}, m6, m5, m3, m4 + REPX {pminsd x, m13}, m6, m5, m3, m4 REPX {pmulld x, m10}, m6, m5, m3, m4 paddd m6, m11 paddd m4, m11 @@ -1148,8 +1483,11 @@ INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity -cglobal iflipadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 - call m(iadst_16x4_internal_16bpc).main +cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_6144] + call m(iadst_16x4_internal_10bpc).main_end psrad m0, m3, 13 psrad m1, m2, 13 psrad m2, m5, 13 @@ -1160,9 +1498,9 @@ psrad m7, m8, 13 jmp tx2q .pass2: - call m(iadst_4x16_internal_16bpc).pass2_main + call m(iadst_4x16_internal_10bpc).pass2_main vpbroadcastd m5, [pw_2048] - vpbroadcastd m8, [pixel_max] + vpbroadcastd m8, [pixel_10bpc_max] lea r6, [strideq*3] vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 @@ -1206,7 +1544,8 @@ INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity -cglobal iidentity_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 +cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m7, [pd_5793] pmulld m0, m7, [cq+32*0] pmulld m4, m7, [cq+32*1] @@ -1236,7 +1575,11 @@ paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - vpbroadcastd m4, [pixel_max] + vpbroadcastd m4, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 @@ -1256,7 +1599,7 @@ call .write_2x4x2 pmulhrsw m0, m7, m8 call .write_2x4x2 - RET + ret ALIGN function_align .write_2x4x2: movq xm1, [dstq+strideq*0] @@ -1279,8 +1622,178 @@ lea dstq, [dstq+strideq*2] ret -%macro INV_TXFM_8X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 8x4 +INV_TXFM_4X16_FN dct, dct, 12 +INV_TXFM_4X16_FN dct, identity, 12 +INV_TXFM_4X16_FN dct, adst, 12 +INV_TXFM_4X16_FN dct, flipadst, 12 + +cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(idct_4x16_internal_10bpc).pass1 +.pass2: + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m5, m0, m2 ; 2 6 + punpckhqdq m12, m0, m2 ; 3 7 + punpcklqdq m0, m8, m9 ; 0 4 + punpckhqdq m10, m8, m9 ; 1 5 + punpcklqdq m2, m1, m3 ; 8 12 + punpckhqdq m13, m1, m3 ; 9 13 + punpcklqdq m9, m4, m6 ; 10 14 + punpckhqdq m4, m6 ; 11 15 + vperm2i128 m1, m5, m9, 0x20 ; 2 10 + vperm2i128 m3, m9, m5, 0x31 ; 14 6 + vpermq m11, m4, q1302 ; 15 11 + ; interleave + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13 + REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 + call m(idct_16x4_internal_10bpc).pass1_main + vpermq m6, m12, q1302 ; 7 3 + vpermq m5, m13, q3120 ; 9 13 + call m(idct_16x4_internal_10bpc).pass1_main2 + call m(idct_16x4_internal_10bpc).pass1_main3 + REPX {psrad x, 3}, m0, 
m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [idct16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [pw_16384] + vpbroadcastd m8, [pixel_12bpc_max] + call m(idct_4x16_internal_10bpc).pass2_end + RET + +INV_TXFM_4X16_FN adst, dct, 12 +INV_TXFM_4X16_FN adst, adst, 12 +INV_TXFM_4X16_FN adst, flipadst, 12 +INV_TXFM_4X16_FN adst, identity, 12 + +cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(iadst_4x16_internal_10bpc).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_16x4 + call m(iadst_4x16_internal_10bpc).main2 + pshufd m4, m5, q1032 + psrad m5, m6, 3 + pshufd m6, m7, q1032 + psrad m7, m8, 3 + REPX {pshufd x, x, q1032}, m0, m2 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 +.pass2_end: + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [iadst16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [pw_16384] + vpbroadcastd m8, [pixel_12bpc_max] + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m1 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m2 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m3 + call m(iadst_4x16_internal_10bpc).write_4x4 + RET +ALIGN function_align +.transpose_16x4: + ; transpose & interleave + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m10, m8, m0 + punpckhqdq m0, m8 + punpcklqdq m11, m9, m2 + punpckhqdq m2, m9 + punpcklqdq m8, m1, m4 + punpckhqdq m4, m1 + punpcklqdq m9, m3, m6 + punpckhqdq m6, m3 + vperm2i128 m5, m0, m2, 0x31 ; 7 5 + vperm2i128 m7, m0, m2, 0x20 ; 3 1 + vperm2i128 m0, m10, m11, 0x20 ; 0 2 + vperm2i128 m2, m10, m11, 0x31 ; 4 6 + vperm2i128 m1, m4, m6, 0x31 ; 15 13 + vperm2i128 m3, m4, m6, 0x20 ; 11 9 + vperm2i128 m4, m8, m9, 0x20 ; 8 10 + vperm2i128 m6, m8, m9, 0x31 ; 12 14 + ret + +INV_TXFM_4X16_FN flipadst, dct, 12 +INV_TXFM_4X16_FN flipadst, adst, 12 +INV_TXFM_4X16_FN flipadst, flipadst, 12 +INV_TXFM_4X16_FN flipadst, identity, 12 + +cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(iflipadst_4x16_internal_10bpc).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_4x16_internal_12bpc).transpose_16x4 + call m(iadst_4x16_internal_10bpc).main2 + pshufd m4, m3, q1032 + psrad m3, m5, 3 + psrad m5, m2, 3 + pshufd m2, m6, q1032 + pshufd m6, m1, q1032 + psrad m1, m7, 3 + psrad m7, m0, 3 + pshufd m0, m8, q1032 + REPX {psrad x, 3}, m0, m2, m4, m6 + jmp m(iadst_4x16_internal_12bpc).pass2_end + +INV_TXFM_4X16_FN identity, dct, 12 +INV_TXFM_4X16_FN identity, adst, 12 +INV_TXFM_4X16_FN identity, flipadst, 12 +INV_TXFM_4X16_FN identity, identity, 12 + +cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(iidentity_4x16_internal_10bpc).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m8, [pd_11586] + vpbroadcastd m9, 
[pd_2048] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m8, [pw_16384] + vpbroadcastd m4, [pixel_12bpc_max] + call m(iidentity_4x16_internal_10bpc).pass2_end + RET + +%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 8x4, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 @@ -1295,7 +1808,7 @@ movd xm0, r6d vpbroadcastw m0, xm0 .end: - vpbroadcastd m4, [pixel_max] + vpbroadcastd m4, [pixel_%3bpc_max] pxor m3, m3 mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 @@ -1321,7 +1834,10 @@ INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst -cglobal idct_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 +cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.pass1: vbroadcasti128 m1, [cq+16*1] vbroadcasti128 m0, [cq+16*5] vbroadcasti128 m2, [cq+16*3] @@ -1340,10 +1856,10 @@ REPX {paddd x, m7}, m1, m3, m0, m2 REPX {psrad x, 12}, m1, m3, m0, m2 call .main - psubd m3, m0, m4 ; out7 out6 - paddd m0, m4 ; out0 out1 - paddd m1, m2, m5 ; out3 out2 - psubd m2, m5 ; out4 out5 + psubd m3, m0, m4 ; out7 out6 (interleaved) + paddd m0, m4 ; out0 out1 (interleaved) + paddd m1, m2, m5 ; out3 out2 (interleaved) + psubd m2, m5 ; out4 out5 (interleaved) pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp tx2q @@ -1358,13 +1874,11 @@ IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 vpermq m0, m0, q3120 ; out0 out1 vpermq m2, m1, q2031 ; out2 out3 - jmp m(iadst_8x4_internal_16bpc).end + jmp m(iadst_8x4_internal_10bpc).end ALIGN function_align .main: ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1 IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7 - vpbroadcastd m8, [clip_min] - vpbroadcastd m9, [clip_max] vpbroadcastd m6, [pd_2896] punpcklqdq m4, m1, m3 ; t4a t7a punpckhqdq m1, m3 ; t5a t6a @@ -1388,13 +1902,13 @@ INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity -cglobal iadst_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 - call m(iadst_4x8_internal_16bpc).main - vpblendd m3, m0, m4, 0x33 ; out6 out7 - vpblendd m0, m4, 0xcc ; out0 out1 - pshufd m1, m5, q1032 - psignd m2, m6 ; out4 out5 - psignd m1, m6 ; out2 out3 +cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10bpc).main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 jmp tx2q .pass2: call .pass2_main @@ -1404,13 +1918,13 @@ vpbroadcastd m1, [pw_2048] pmulhrsw m0, m1 pmulhrsw m1, m2 + vpbroadcastd m5, [pixel_10bpc_max] .end2: mova xm2, [dstq+strideq*0] vinserti128 m2, [dstq+strideq*1], 1 lea r6, [dstq+strideq*2] mova xm3, [r6 +strideq*0] vinserti128 m3, [r6 +strideq*1], 1 - vpbroadcastd m5, [pixel_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 @@ -1429,7 +1943,7 @@ vbroadcasti128 m4, [deint_shuf] packssdw m0, m1 packssdw m2, m3 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] vperm2i128 m1, m0, m2, 0x31 vinserti128 m0, xm2, 1 pshufb m0, m4 @@ -1445,6 +1959,7 @@ vpbroadcastd m4, [pd_2048] REPX {paddd x, m4}, m0, m3, m2, m1 REPX {psrad x, 12}, m0, m3, m2, m1 +.main2: vbroadcasti128 m6, [pd_1321] vbroadcasti128 m7, [pd_2482] pmulld m4, m0, m6 ; 1321*in0 @@ -1478,27 +1993,28 @@ INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity 
-cglobal iflipadst_8x4_internal_16bpc, 0, 5, 10, dst, stride, c, eob, tx2 - call m(iadst_4x8_internal_16bpc).main - shufpd m3, m4, m0, 0x05 - shufpd m0, m4, 0x05 - psignd m2, m6 - pshufd m6, m6, q1032 - pshufd m1, m2, q1032 - psignd m2, m5, m6 +cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10bpc).main + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 jmp tx2q .pass2: - call m(iadst_8x4_internal_16bpc).pass2_main + call m(iadst_8x4_internal_10bpc).pass2_main vpermq m2, m0, q2031 vpermq m0, m1, q2031 - jmp m(iadst_8x4_internal_16bpc).end + jmp m(iadst_8x4_internal_10bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity -cglobal iidentity_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 +cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m4, [pd_2896] vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 @@ -1511,6 +2027,7 @@ REPX {paddd x, x }, m0, m1, m2, m3 jmp tx2q .pass2: + vpbroadcastd m5, [pixel_10bpc_max] vpbroadcastd m4, [pw_1697x8] packssdw m0, m1 packssdw m2, m3 @@ -1518,9 +2035,10 @@ pmulhrsw m4, m2 paddsw m0, m1 paddsw m2, m4 + packssdw m7, m7 ; pw_2048 +.pass2_end: punpckhwd m1, m0, m2 punpcklwd m0, m2 - packssdw m7, m7 ; pw_2048 lea r6, [dstq+strideq*2] punpckhwd m2, m0, m1 punpcklwd m0, m1 @@ -1532,7 +2050,6 @@ vinserti128 m2, [r6 +strideq*0], 1 mova xm3, [dstq+strideq*1] vinserti128 m3, [r6 +strideq*1], 1 - vpbroadcastd m5, [pixel_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 @@ -1547,8 +2064,138 @@ vextracti128 [r6 +strideq*1], m1, 1 RET -%macro INV_TXFM_8X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 8x8 +INV_TXFM_8X4_FN dct, dct, 12 +INV_TXFM_8X4_FN dct, identity, 12 +INV_TXFM_8X4_FN dct, adst, 12 +INV_TXFM_8X4_FN dct, flipadst, 12 + +cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + jmp m(idct_8x4_internal_10bpc).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12bpc).transpose_4x8 + IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(iadst_8x4_internal_12bpc).end + +INV_TXFM_8X4_FN adst, dct, 12 +INV_TXFM_8X4_FN adst, adst, 12 +INV_TXFM_8X4_FN adst, flipadst, 12 +INV_TXFM_8X4_FN adst, identity, 12 + +cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + call m(iadst_4x8_internal_10bpc).main2 + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 +.end: + vpbroadcastd m4, [pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m2, m4 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m1, m1, q3120 ; out2 out3 + vpbroadcastd m5, [pixel_12bpc_max] + jmp m(iadst_8x4_internal_10bpc).end2 +ALIGN function_align +.pass2_main: + call .transpose_4x8 + jmp m(iadst_8x4_internal_10bpc).main2 +ALIGN 
function_align +.transpose_4x8: + ; deinterleave + pshufd m0, m0, q3120 + pshufd m1, m1, q3120 + pshufd m2, m2, q3120 + pshufd m3, m3, q3120 + ; transpose + punpcklqdq m4, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m5, m2, m3 + punpckhqdq m2, m3 + vperm2i128 m1, m0, m2, 0x20 ; out1 + vperm2i128 m3, m0, m2, 0x31 ; out3 + vperm2i128 m2, m4, m5, 0x31 ; out2 + vperm2i128 m0, m4, m5, 0x20 ; out0 + ret + +INV_TXFM_8X4_FN flipadst, dct, 12 +INV_TXFM_8X4_FN flipadst, adst, 12 +INV_TXFM_8X4_FN flipadst, flipadst, 12 +INV_TXFM_8X4_FN flipadst, identity, 12 + +cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + call m(iadst_4x8_internal_10bpc).main2 + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12bpc).pass2_main + psrad m0, m3, 12 + psrad m3, m4, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + jmp m(iadst_8x4_internal_12bpc).end + +INV_TXFM_8X4_FN identity, dct, 12 +INV_TXFM_8X4_FN identity, adst, 12 +INV_TXFM_8X4_FN identity, flipadst, 12 +INV_TXFM_8X4_FN identity, identity, 12 + +cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_8x4_internal_10bpc).pass1 +.pass2: + ; m0 = in0 in1 (interleaved) + ; m1 = in2 in3 (interleaved) + ; m2 = in4 in5 (interleaved) + ; m3 = in6 in7 (interleaved) + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + vpbroadcastd m4, [pd_5793] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 15}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_12bpc_max] + vpbroadcastd m7, [pw_16384] + packssdw m0, m1 + packssdw m2, m3 + jmp m(iidentity_8x4_internal_10bpc).pass2_end + +%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 8x8, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 @@ -1562,7 +2209,7 @@ sar r6d, 16 movd xm0, r6d vpbroadcastw m0, xm0 - vpbroadcastd m3, [pixel_max] + vpbroadcastd m3, [pixel_%3bpc_max] pxor m2, m2 .dconly_loop: mova xm1, [dstq+strideq*0] @@ -1619,7 +2266,10 @@ INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst -cglobal idct_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 +cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] @@ -1629,31 +2279,29 @@ mova m6, [cq+32*6] mova m7, [cq+32*7] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] call .main call .round_shift1 jmp tx2q .pass2: call .transpose_8x8_packed call m(idct_8x8_internal_8bpc).main - vpbroadcastd m12, [pw_2048] - vpermq m0, m0, q3120 - vpermq m1, m1, q2031 - vpermq m2, m2, q3120 - vpermq m3, m3, q2031 - pmulhrsw m0, m12 - pmulhrsw m1, m12 + vpbroadcastd m12, [pw_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 call .write_8x4_start - pmulhrsw m0, m2, m12 - pmulhrsw m1, m3, m12 + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 call .write_8x4 RET ALIGN function_align .write_8x4_start: - vpbroadcastd m11, [pixel_max] - lea r6, [strideq*3] - pxor m10, m10 + 
vpbroadcastd m11, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m10, m10 .write_8x4: mova xm8, [dstq+strideq*0] vinserti128 m8, [dstq+strideq*1], 1 @@ -1682,7 +2330,7 @@ packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 @@ -1746,12 +2394,15 @@ INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity -cglobal iadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 +cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: call .main call .main_end jmp tx2q .pass2: - call m(idct_8x8_internal_16bpc).transpose_8x8_packed + call m(idct_8x8_internal_10bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 @@ -1761,10 +2412,10 @@ REPX {vpermq x, x, q3120}, m0, m1, m2, m3 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .main: @@ -1777,8 +2428,6 @@ mova m3, [cq+32*3] mova m4, [cq+32*4] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] .main2: IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 psrld m8, 11 ; pd_1 @@ -1806,12 +2455,15 @@ INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity -cglobal iflipadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 - call m(iadst_8x8_internal_16bpc).main +cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_8x8_internal_10bpc).main call .main_end jmp tx2q .pass2: - call m(idct_8x8_internal_16bpc).transpose_8x8_packed + call m(idct_8x8_internal_10bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 @@ -1824,10 +2476,10 @@ vpermq m3, m0, q2031 pmulhrsw m0, m8, m12 pmulhrsw m1, m9, m12 - call m(idct_8x8_internal_16bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .main_end: @@ -1855,7 +2507,8 @@ INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity -cglobal iidentity_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 +cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 +.pass1: mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] @@ -1866,10 +2519,12 @@ mova m7, [cq+32*7] jmp tx2q .pass2: + packssdw m3, m7 + vpbroadcastd m7, [pixel_10bpc_max] +.pass2_main: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 - packssdw m3, m7 vpbroadcastd m12, [pw_4096] punpckhwd m4, m0, m1 punpcklwd m0, m1 @@ -1891,7 +2546,6 @@ call .write_2x8x2_zero RET .write_2x8x2_start: - vpbroadcastd m7, [pixel_max] lea r6, [strideq*5] pxor m6, m6 .write_2x8x2_zero: @@ -1918,8 +2572,162 @@ lea dstq, [dstq+strideq*2] ret -%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset - INV_TXFM_FN %1, %2, %3, 8x16 +%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4] + punpckldq m%9, m%1, m%2 ; aibj emfn + punpckhdq m%1, m%2 ; ckdl gohp + punpckldq m%10, m%3, m%4 ; qyrz uCvD + punpckhdq m%3, m%4 ; sAtB wExF + punpckldq m%11, m%5, m%6 ; GOHP KSLT + 
punpckhdq m%5, m%6 ; IQJR MUNV + punpckldq m%12, m%7, m%8 ; WeXf aibj + punpckhdq m%7, m%8 ; YgZh ckdl + punpcklqdq m%2, m%9, m%10 ; aiqy emuC + punpckhqdq m%9, m%10 ; bjrz fnvD + punpcklqdq m%4, m%1, m%3 ; cksA gowE + punpckhqdq m%10, m%1, m%3 ; dltB hpxF + punpcklqdq m%6, m%11, m%12 ; GOWe KSai + punpckhqdq m%11, m%12 ; HPXf LTbj + punpcklqdq m%8, m%5, m%7 ; IQYg MUck + punpckhqdq m%12, m%5, m%7 ; JRZh NVdl + vperm2i128 m%1, m%2, m%6, 0x20 ; out0 + vperm2i128 m%5, m%2, m%6, 0x31 ; out4 + vperm2i128 m%2, m%9, m%11, 0x20 ; out1 + vperm2i128 m%6, m%9, m%11, 0x31 ; out5 + vperm2i128 m%3, m%4, m%8, 0x20 ; out2 + vperm2i128 m%7, m%4, m%8, 0x31 ; out6 + vperm2i128 m%4, m%10, m%12, 0x20 ; out3 + vperm2i128 m%8, m%10, m%12, 0x31 ; out7 +%endmacro + +INV_TXFM_8X8_FN dct, dct, 12 +INV_TXFM_8X8_FN dct, identity, 12 +INV_TXFM_8X8_FN dct, adst, 12 +INV_TXFM_8X8_FN dct, flipadst, 12 + +cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_8x8_internal_10bpc).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_8x8 + vpbroadcastd m11, [pd_2048] + call m(idct_8x8_internal_10bpc).main + call .round_shift4 + jmp m(iadst_8x8_internal_12bpc).pass2_end +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_12bpc_max] + lea r6, [strideq*3] + pxor m10, m10 + ret +ALIGN function_align +.transpose_8x8: + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + ret +ALIGN function_align +.round_shift4: + vpbroadcastd m1, [pd_8] + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X8_FN adst, dct, 12 +INV_TXFM_8X8_FN adst, adst, 12 +INV_TXFM_8X8_FN adst, flipadst, 12 +INV_TXFM_8X8_FN adst, identity, 12 + +cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_8x8_internal_10bpc).pass1 +.pass2: + call .pass2_main +.pass2_end: + packssdw m0, m1 + packssdw m1, m2, m3 + REPX {vpermq x, x, q3120}, m0, m1 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + packssdw m0, m4, m5 + packssdw m1, m6, m7 + REPX {vpermq x, x, q3120}, m0, m1 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +ALIGN function_align +.pass2_main: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_12bpc).transpose_8x8 + vpbroadcastd m11, [pd_2048] +.pass2_main2: + call m(iadst_8x8_internal_10bpc).main2 + pslld m9, m8, 3 ; pd_8 + paddd m0, m9 + psubd m1, m9, m1 ; 8+x + paddd m6, m9 + psubd m7, m9, m7 + REPX {psrad x, 4}, m0, m1, m6, m7 + vpbroadcastd m9, [pd_34816] + psubd m8, m9, m8 ; 34815 + paddd m2, m9 + psubd m3, m8, m3 + paddd m4, m9 + psubd m5, m8, m5 + REPX {psrad x, 16}, m2, m3, m4, m5 + ret + +INV_TXFM_8X8_FN flipadst, dct, 12 +INV_TXFM_8X8_FN flipadst, adst, 12 +INV_TXFM_8X8_FN flipadst, flipadst, 12 +INV_TXFM_8X8_FN flipadst, identity, 12 + +cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, 
[clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_8x8_internal_10bpc).pass1 +.pass2: + call m(iadst_8x8_internal_12bpc).pass2_main + packssdw m7, m7, m6 + packssdw m6, m1, m0 + packssdw m1, m5, m4 + vpermq m0, m7, q3120 + vpermq m1, m1, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + packssdw m0, m3, m2 + vpermq m0, m0, q3120 + vpermq m1, m6, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + RET + +INV_TXFM_8X8_FN identity, dct, 12 +INV_TXFM_8X8_FN identity, adst, 12 +INV_TXFM_8X8_FN identity, flipadst, 12 +INV_TXFM_8X8_FN identity, identity, 12 + +cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(iidentity_8x8_internal_10bpc).pass1 +.pass2: + packssdw m3, m7 + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(iidentity_8x8_internal_10bpc).pass2_main + +%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 @@ -1927,7 +2735,7 @@ add r6d, 2048 sar r6d, 12 imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly + jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly %endif %endmacro @@ -1936,12 +2744,13 @@ INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst -cglobal idct_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] cmp eobd, 43 jl .fast add cq, 32 @@ -1978,16 +2787,16 @@ .end: pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m0, m4, m12 pmulhrsw m1, m5, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m0, m6, m12 pmulhrsw m1, m7, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .transpose: @@ -1999,7 +2808,7 @@ packssdw m5, m13 packssdw m6, m14 packssdw m7, m15 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] punpckhwd m8, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 @@ -2035,8 +2844,8 @@ pmulld m5, m14, [cq+32*10] pmulld m6, m14, [cq+32*12] pmulld m7, m14, [cq+32*14] - call m(idct_8x8_internal_16bpc).main_rect2 - jmp m(idct_8x8_internal_16bpc).round_shift1 + call m(idct_8x8_internal_10bpc).main_rect2 + jmp m(idct_8x8_internal_10bpc).round_shift1 ALIGN function_align .main_evenhalf: paddd m1, m6, m7 ; idct8 out1 @@ -2129,17 +2938,18 @@ INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity, 35 -cglobal iadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] cmp eobd, 43 jl .fast add cq, 32 call .pass1_main - call m(iadst_8x8_internal_16bpc).main_end + call m(iadst_8x8_internal_10bpc).main_end sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 @@ -2150,7 +2960,7 @@ mova [cq+32*13], m6 mova m15, m7 call .pass1_main - call m(iadst_8x8_internal_16bpc).main_end + call 
m(iadst_8x8_internal_10bpc).main_end mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] @@ -2161,12 +2971,12 @@ jmp tx2q .fast: call .pass1_main - call m(iadst_8x8_internal_16bpc).main_end + call m(iadst_8x8_internal_10bpc).main_end pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: - call m(idct_8x16_internal_16bpc).transpose + call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [pw_2048] @@ -2174,7 +2984,7 @@ REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 psubw m12, m8 - jmp m(idct_8x16_internal_16bpc).end + jmp m(idct_8x16_internal_10bpc).end ALIGN function_align .pass1_main: pmulld m0, m14, [cq+32* 0] @@ -2187,24 +2997,25 @@ pmulld m4, m14, [cq+32* 8] REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 - jmp m(iadst_8x8_internal_16bpc).main2 + jmp m(iadst_8x8_internal_10bpc).main2 INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity, 35 -cglobal iflipadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] cmp eobd, 43 jl .fast add cq, 32 - call m(iadst_8x16_internal_16bpc).pass1_main - call m(iflipadst_8x8_internal_16bpc).main_end + call m(iadst_8x16_internal_10bpc).pass1_main + call m(iflipadst_8x8_internal_10bpc).main_end sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 @@ -2214,8 +3025,8 @@ mova [cq+32*11], m5 mova [cq+32*13], m6 mova m15, m7 - call m(iadst_8x16_internal_16bpc).pass1_main - call m(iflipadst_8x8_internal_16bpc).main_end + call m(iadst_8x16_internal_10bpc).pass1_main + call m(iflipadst_8x8_internal_10bpc).main_end mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] @@ -2225,13 +3036,13 @@ mova m14, [cq+32*13] jmp tx2q .fast: - call m(iadst_8x16_internal_16bpc).pass1_main - call m(iflipadst_8x8_internal_16bpc).main_end + call m(iadst_8x16_internal_10bpc).pass1_main + call m(iflipadst_8x8_internal_10bpc).main_end pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: - call m(idct_8x16_internal_16bpc).transpose + call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m12, [pw_2048] @@ -2249,7 +3060,7 @@ vpermq m6, m10, q3120 vpermq m7, m11, q3120 psubw m12, m13 - jmp m(idct_8x16_internal_16bpc).end + jmp m(idct_8x16_internal_10bpc).end INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst @@ -2266,7 +3077,8 @@ paddsw m%1, m%2 %endmacro -cglobal iidentity_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 0] pmulld m8, m15, [cq+32* 1] @@ -2300,51 +3112,266 @@ packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 - packssdw m7, m15 + packssdw m13, m7, m15 vpbroadcastd m8, [pw_1697x16] - REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 + vpbroadcastd m7, [pixel_10bpc_max] + vpbroadcastd m12, [pw_2048] + call .pass2_end + RET +ALIGN function_align +.pass2_end: punpckhwd m9, m0, m1 
punpcklwd m0, m1 - punpckhwd m1, m6, m7 - punpcklwd m6, m7 - punpckhwd m7, m4, m5 + punpckhwd m1, m6, m13 + punpcklwd m6, m13 + punpckhwd m13, m4, m5 punpcklwd m4, m5 punpcklwd m5, m2, m3 punpckhwd m2, m3 - vpbroadcastd m12, [pw_2048] punpckhdq m3, m0, m5 punpckldq m0, m5 punpckhdq m11, m9, m2 punpckldq m9, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 - punpckldq m6, m7, m1 - punpckhdq m7, m1 + punpckldq m6, m13, m1 + punpckhdq m13, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m8, m9, m6 punpckhqdq m9, m6 - punpcklqdq m10, m11, m7 - punpckhqdq m11, m7 + punpcklqdq m10, m11, m13 + punpckhqdq m11, m13 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(iidentity_8x8_internal_16bpc).write_2x8x2_start + call m(iidentity_8x8_internal_10bpc).write_2x8x2_start pmulhrsw m0, m12, m2 pmulhrsw m1, m12, m3 - call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero pmulhrsw m0, m12, m8 pmulhrsw m1, m12, m9 lea dstq, [dstq+strideq*4] - call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero pmulhrsw m0, m12, m10 pmulhrsw m1, m12, m11 - call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero + ret + +INV_TXFM_8X16_FN dct, dct, 0, 12 +INV_TXFM_8X16_FN dct, identity, 35, 12 +INV_TXFM_8X16_FN dct, adst, 0, 12 +INV_TXFM_8X16_FN dct, flipadst, 0, 12 + +cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_8x16_internal_10bpc).pass1 +.pass2: + lea r6, [rsp+32*4] + call .transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*10], m2 + mova [cq+32*12], m4 + mova [cq+32*14], m6 + pmaxsd m0, m12, [cq+32* 1] + pmaxsd m4, m12, m1 + pmaxsd m1, m12, [cq+32* 3] + pmaxsd m2, m12, [cq+32* 5] + pmaxsd m6, m12, m5 + pmaxsd m5, m12, m3 + pmaxsd m3, m12, [cq+32* 7] + pmaxsd m7, m12 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 2] + pmaxsd m2, m12, [cq+32* 4] + pmaxsd m3, m12, [cq+32* 6] + pmaxsd m4, m12, [cq+32* 8] + pmaxsd m5, m12, [cq+32*10] + pmaxsd m6, m12, [cq+32*12] + pmaxsd m7, m12, [cq+32*14] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + vpbroadcastd m11, [pd_8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_16x8_internal_10bpc).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 +.end: + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m2, q3120 + vpermq m1, m3, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 RET +ALIGN function_align +.transpose: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + call 
m(idct_8x8_internal_12bpc).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + mova [cq+32* 3], m3 + mova [cq+32* 4], m4 + mova [cq+32* 5], m5 + mova [cq+32* 6], m6 + mova [cq+32* 7], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, m12 + mova m5, m13 + mova m6, m14 + mova m7, m15 + jmp m(idct_8x8_internal_12bpc).transpose_8x8 + +INV_TXFM_8X16_FN adst, dct, 0, 12 +INV_TXFM_8X16_FN adst, adst, 0, 12 +INV_TXFM_8X16_FN adst, flipadst, 0, 12 +INV_TXFM_8X16_FN adst, identity, 35, 12 + +cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_8x16_internal_10bpc).pass1 +.pass2: + lea r6, [rsp+32*4] + call .pass2_main + call m(iadst_16x8_internal_10bpc).pass1_rotations +.pass2_end: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp m(idct_8x16_internal_12bpc).end +ALIGN function_align +.pass2_main: + call m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*15], m7 + pmaxsd m0, m13, [cq+32* 2] ; 2 + pmaxsd m3, m13, m1 ; 9 + pmaxsd m1, m13, m5 ; 13 + pmaxsd m4, m13, m2 ; 10 + pmaxsd m2, m13, [cq+32* 6] ; 6 + pmaxsd m5, m13, [cq+32* 5] ; 5 + pmaxsd m6, m13, m6 ; 14 + pmaxsd m7, m13, [cq+32* 1] ; 1 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m12, [pd_2048] + vpbroadcastd m15, [pd_2896] + call m(iadst_16x8_internal_10bpc).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m1, m13, [cq+32*15] ; 15 + pmaxsd m2, m13, [cq+32* 4] ; 4 + pmaxsd m3, m13, [cq+32*11] ; 11 + pmaxsd m4, m13, [cq+32* 8] ; 8 + pmaxsd m5, m13, [cq+32* 7] ; 7 + pmaxsd m6, m13, [cq+32*12] ; 12 + pmaxsd m7, m13, [cq+32* 3] ; 3 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_16x8_internal_10bpc).main_part2 + vpbroadcastd m14, [pd_34816] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_34815 + pslld m15, 3 ; pd_8 + ret + +INV_TXFM_8X16_FN flipadst, dct, 0, 12 +INV_TXFM_8X16_FN flipadst, adst, 0, 12 +INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 +INV_TXFM_8X16_FN flipadst, identity, 35, 12 + +cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_8x16_internal_10bpc).pass1 +.pass2: + lea r6, [rsp+32*4] + call m(iadst_8x16_internal_12bpc).pass2_main + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + jmp m(iadst_8x16_internal_12bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct, 0, 12 +INV_TXFM_8X16_FN identity, adst, 0, 12 +INV_TXFM_8X16_FN identity, flipadst, 0, 12 +INV_TXFM_8X16_FN identity, identity, 0, 12 + +cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_8x16_internal_10bpc).pass1 +.pass2: + call .pass2_main + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m13, m7, m15 + vpbroadcastd m7, [pixel_12bpc_max] + vpbroadcastd m12, [pw_16384] + call m(iidentity_8x16_internal_10bpc).pass2_end + RET +ALIGN function_align +.pass2_main: + mova [cq], m7 + vpbroadcastd m7, [clip_18b_min] + REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmaxsd m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [clip_18b_max] + REPX {pminsd 
x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pminsd m15, [cq] + mova [cq], m7 + vpbroadcastd m7, [pd_11586] + REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmulld m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [pd_2048] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + paddd m15, [cq] + REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret -%macro INV_TXFM_16X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 16x4 +%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 16x4, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 @@ -2358,7 +3385,7 @@ sar r6d, 16 movd xm0, r6d vpbroadcastw m0, xm0 - vpbroadcastd m4, [pixel_max] + vpbroadcastd m4, [pixel_%3bpc_max] pxor m3, m3 .dconly_loop: paddw m1, m0, [dstq+strideq*0] @@ -2381,7 +3408,10 @@ INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst -cglobal idct_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 +cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.pass1: vbroadcasti128 m0, [cq+16* 0] vbroadcasti128 m4, [cq+16* 4] vbroadcasti128 m1, [cq+16* 2] @@ -2394,15 +3424,7 @@ shufpd m1, m5, 0x0c ; 2 10 shufpd m2, m6, 0x0c ; 8 12 shufpd m3, m7, 0x0c ; 14 6 - vpbroadcastd m7, [pd_2048] - call m(idct_8x4_internal_16bpc).main - pcmpeqd m6, m6 - psubd m0, m6 - psubd m2, m6 - psubd m3, m0, m4 ; idct8 out7 out6 - paddd m0, m4 ; idct8 out0 out1 - paddd m1, m2, m5 ; idct8 out3 out2 - psubd m2, m5 ; idct8 out4 out5 + call .pass1_main vbroadcasti128 m10, [cq+16* 1] vbroadcasti128 m4, [cq+16* 5] vbroadcasti128 m11, [cq+16*15] @@ -2415,6 +3437,47 @@ vbroadcasti128 m6, [cq+16* 7] vbroadcasti128 m4, [cq+16* 3] shufpd m6, m4, 0x0c ; 7 3 + call .pass1_main2 + pcmpeqd m4, m4 + REPX {psubd x, m4}, m0, m1, m2, m3 + call .pass1_main3 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .transpose_4x16_packed + lea r6, [deint_shuf+128] + call m(idct_16x4_internal_8bpc).main +.end: + vpbroadcastd m4, [pw_2048] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_10bpc_max] +.end2: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] +.end3: + lea r6, [dstq+strideq*2] + paddw m2, [r6 +strideq*0] + paddw m3, [r6 +strideq*1] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + REPX {pminsw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [r6 +strideq*0], m2 + mova [r6 +strideq*1], m3 + RET +ALIGN function_align +.pass1_main: + vpbroadcastd m7, [pd_2048] + call m(idct_8x4_internal_10bpc).main + psubd m3, m0, m4 ; idct8 out7 out6 + paddd m0, m4 ; idct8 out0 out1 + paddd m1, m2, m5 ; idct8 out3 out2 + psubd m2, m5 ; idct8 out4 out5 + ret +ALIGN function_align +.pass1_main2: ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 psubd m4, m10, m5 ; t9 -t10 @@ -2439,6 +3502,9 @@ pmulld m6, m12 REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10 REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10 + ret +ALIGN function_align +.pass1_main3: paddd m5, m7 psubd m4, m5, m6 paddd m5, m6 @@ -2453,32 +3519,7 @@ psubd m4, m3, m10 ; out8 out9 paddd m3, m10 ; out7 out6 REPX {pshufd x, x, q1032}, m1, m3, m5, m7 - REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 - jmp tx2q -.pass2: - call 
.transpose_4x16_packed - lea rax, [deint_shuf+128] - call m(idct_16x4_internal_8bpc).main -.end: - vpbroadcastd m4, [pw_2048] - REPX {pmulhrsw x, m4}, m0, m1, m2, m3 -.end2: - paddw m0, [dstq+strideq*0] - paddw m1, [dstq+strideq*1] -.end3: - lea r6, [dstq+strideq*2] - paddw m2, [r6 +strideq*0] - paddw m3, [r6 +strideq*1] - vpbroadcastd m5, [pixel_max] - pxor m4, m4 - REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 - REPX {pmaxsw x, m4}, m0, m1, m2, m3 - REPX {pminsw x, m5}, m0, m1, m2, m3 - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m1 - mova [r6 +strideq*0], m2 - mova [r6 +strideq*1], m3 - RET + ret ALIGN function_align .transpose_4x16_packed: vbroadcasti128 m8, [deint_shuf] @@ -2502,8 +3543,11 @@ INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity -cglobal iadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 - call m(iadst_4x16_internal_16bpc).main +cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_4x16_internal_10bpc).main psrad m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3 paddd m4, m5, m11 @@ -2515,10 +3559,10 @@ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: - call m(idct_16x4_internal_16bpc).transpose_4x16_packed - lea rax, [deint_shuf+128] + call m(idct_16x4_internal_10bpc).transpose_4x16_packed + lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main - jmp m(idct_16x4_internal_16bpc).end + jmp m(idct_16x4_internal_10bpc).end ALIGN function_align .main: vbroadcasti128 m6, [pd_1321] @@ -2555,11 +3599,14 @@ paddd m7, m8 ; t0 psubd m5, m3 psubd m9, m6 ; t1 - vpbroadcastd m6, [pd_6144] pmulld m2, m1 pmulld m0, m1 ; t2 pmulld m3, m1, [cq+32*2] pmulld m1, [cq+32*3] ; -t3 + ret +ALIGN function_align +.main_end: + ; expects: m6 = rnd paddd m5, m6 paddd m9, m6 paddd m10, m4, m5 @@ -2581,8 +3628,11 @@ INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity -cglobal iflipadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 - call m(iadst_4x16_internal_16bpc).main +cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_4x16_internal_10bpc).main psrad m11, 11 ; pd_1 paddd m4, m3, m11 paddd m3, m5, m11 @@ -2592,10 +3642,10 @@ paddd m1, m7, m11 paddd m7, m0, m11 paddd m0, m8, m11 - jmp m(iadst_16x4_internal_16bpc).pass1_end + jmp m(iadst_16x4_internal_10bpc).pass1_end .pass2: - call m(idct_16x4_internal_16bpc).transpose_4x16_packed - lea rax, [deint_shuf+128] + call m(idct_16x4_internal_10bpc).transpose_4x16_packed + lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [pw_2048] pmulhrsw m5, m3, m4 @@ -2604,14 +3654,16 @@ pmulhrsw m3, m0, m4 paddw m0, m5, [dstq+strideq*0] paddw m1, m6, [dstq+strideq*1] - jmp m(idct_16x4_internal_16bpc).end3 + vpbroadcastd m5, [pixel_10bpc_max] + jmp m(idct_16x4_internal_10bpc).end3 INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity -cglobal iidentity_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 +cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m8, [pd_11586] vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m1, [cq+32*1], q3120 ; 2 3 @@ -2627,7 +3679,7 @@ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: - call m(idct_16x4_internal_16bpc).transpose_4x16_packed + call 
m(idct_16x4_internal_10bpc).transpose_4x16_packed vpbroadcastd m7, [pw_1697x8] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 @@ -2637,10 +3689,160 @@ paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 - jmp m(idct_16x4_internal_16bpc).end + jmp m(idct_16x4_internal_10bpc).end + +INV_TXFM_16X4_FN dct, dct, 12 +INV_TXFM_16X4_FN dct, identity, 12 +INV_TXFM_16X4_FN dct, adst, 12 +INV_TXFM_16X4_FN dct, flipadst, 12 + +cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + jmp m(idct_16x4_internal_10bpc).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + ; deinterleave + REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 + ; transpose + punpcklqdq m8, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m9, m2, m3 + punpckhqdq m2, m3 + punpcklqdq m10, m4, m5 + punpckhqdq m4, m5 + punpcklqdq m11, m6, m7 + punpckhqdq m6, m7 + vperm2i128 m3, m0, m2, 0x31 ; out6 + vperm2i128 m1, m0, m2, 0x20 ; out2 + vperm2i128 m7, m4, m6, 0x31 ; out7 + vperm2i128 m5, m4, m6, 0x20 ; out3 + vperm2i128 m13, m10, m11, 0x31 ; out5 + vperm2i128 m12, m10, m11, 0x20 ; out1 + vperm2i128 m11, m8, m9, 0x31 ; out4 + vperm2i128 m10, m8, m9, 0x20 ; out0 + call m(idct_4x16_internal_10bpc).pass1_main + pmulld m0, m6, m10 + pmulld m2, m6, m11 + pmulld m4, m6, m12 + pmulld m6, m13 + vpbroadcastd m10, [pd_2048] + call m(idct_4x16_internal_10bpc).pass1_main2 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m4, [pw_16384] + vpbroadcastd m5, [pixel_12bpc_max] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + jmp m(idct_16x4_internal_10bpc).end2 -%macro INV_TXFM_16X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 16x8 +INV_TXFM_16X4_FN adst, dct, 12 +INV_TXFM_16X4_FN adst, adst, 12 +INV_TXFM_16X4_FN adst, flipadst, 12 +INV_TXFM_16X4_FN adst, identity, 12 + +cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_16x4_internal_10bpc).pass1 +.pass2: + call .pass2_main + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + jmp m(idct_16x4_internal_10bpc).end2 +ALIGN function_align +.pass2_main: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7 + pmaxsd m8, m4, m12 + pmaxsd m9, m5, m12 + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12bpc).transpose_4x8 + mova [cq+32*0], m0 + mova [cq+32*2], m1 + mova [cq+32*4], m2 + mova [cq+32*6], m3 + pminsd m0, m8, m13 + pminsd m1, m9, m13 + pminsd m2, m6, m13 + pminsd m3, m7, m13 + call m(iadst_8x4_internal_12bpc).transpose_4x8 + mova [cq+32*1], m0 + mova [cq+32*3], m1 + mova [cq+32*5], m2 + mova [cq+32*7], m3 + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_2048] + call m(iadst_16x4_internal_10bpc).main_end + psrad m0, m4, 15 + psrad m1, m5, 15 + psrad m2, 15 + psrad m3, 15 + psrad m4, m8, 15 + psrad m5, m9, 15 + psrad m6, 15 + psrad m7, 15 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m4, [pw_16384] + vpbroadcastd m5, [pixel_12bpc_max] + ret + +INV_TXFM_16X4_FN flipadst, dct, 12 +INV_TXFM_16X4_FN flipadst, adst, 12 +INV_TXFM_16X4_FN flipadst, flipadst, 12 +INV_TXFM_16X4_FN flipadst, identity, 12 + +cglobal 
iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_16x4_internal_10bpc).pass1 +.pass2: + call m(iadst_16x4_internal_12bpc).pass2_main + vpermq m7, m0, q3120 + vpermq m6, m1, q3120 + vpermq m1, m2, q3120 + vpermq m0, m3, q3120 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m6, m4 + pmulhrsw m3, m7, m4 + jmp m(idct_16x4_internal_10bpc).end2 + +INV_TXFM_16X4_FN identity, dct, 12 +INV_TXFM_16X4_FN identity, adst, 12 +INV_TXFM_16X4_FN identity, flipadst, 12 +INV_TXFM_16X4_FN identity, identity, 12 + +cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(iidentity_16x4_internal_10bpc).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m8, [pd_5793] + vpbroadcastd m9, [pd_2048] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_16x4_internal_10bpc).transpose_4x16_packed + vpbroadcastd m4, [pw_16384] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_12bpc_max] + jmp m(idct_16x4_internal_10bpc).end2 + +%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 16x8, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 @@ -2648,7 +3850,7 @@ add r6d, 2048 sar r6d, 12 imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly + jmp m(inv_txfm_add_dct_dct_16x4_%3bpc).dconly %endif %endmacro @@ -2657,7 +3859,10 @@ INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst -cglobal idct_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: vpbroadcastd m14, [pd_2896] pmulld m0, m14, [cq+32* 1] pmulld m1, m14, [cq+32* 3] @@ -2668,10 +3873,8 @@ pmulld m6, m14, [cq+32*13] pmulld m7, m14, [cq+32*15] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] lea r6, [rsp+32*4] - call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+32* 0] pmulld m1, m14, [cq+32* 2] pmulld m2, m14, [cq+32* 4] @@ -2680,10 +3883,33 @@ pmulld m5, m14, [cq+32*10] pmulld m6, m14, [cq+32*12] pmulld m7, m14, [cq+32*14] - call m(idct_8x8_internal_16bpc).main_rect2 - call m(idct_8x16_internal_16bpc).main_evenhalf + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf psrld m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call .pass1_rotations + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m10, [pw_2048] +.end: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call .write_16x4_start +.end2: + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m10 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m10 + call .write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: mova m14, [r6-32*4] mova m13, [r6-32*3] mova m12, [r6-32*2] @@ -2707,28 +3933,10 @@ paddd m6, m8 ; out6 psubd m8, m7, [r6+32*3] ; out8 paddd m7, [r6+32*3] ; out7 - REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ - m8, m9, m10, m11, m12, m13, 
m14, m15 - jmp tx2q -.pass2: - call .transpose - call m(idct_16x8_internal_8bpc).main - vpbroadcastd m10, [pw_2048] -.end: - pmulhrsw m0, m10 - pmulhrsw m1, m10 - pmulhrsw m2, m10 - pmulhrsw m3, m10 - call .write_16x4_start - pmulhrsw m0, m4, m10 - pmulhrsw m1, m5, m10 - pmulhrsw m2, m6, m10 - pmulhrsw m3, m7, m10 - call .write_16x4_zero - RET + ret ALIGN function_align .transpose: - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] .transpose2: packssdw m0, m8 packssdw m1, m9 @@ -2774,7 +3982,7 @@ ret ALIGN function_align .write_16x4_start: - vpbroadcastd m9, [pixel_max] + vpbroadcastd m9, [pixel_10bpc_max] lea r3, [strideq*3] pxor m8, m8 .write_16x4_zero: @@ -2799,34 +4007,22 @@ INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity -cglobal iadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: lea r6, [rsp+32*4] call .main vpbroadcastd m14, [pd_6144] psrld m15, 11 ; pd_1 psubd m13, m14, m15 ; pd_6143 - paddd m0, m15 - psubd m1, m15, m1 - paddd m2, m15 - psubd m3, m15, m3 - paddd m4, m14 - psubd m5, m13, m5 - paddd m6, m14 - psubd m7, m13, m7 - paddd m8, m14, m9 - psubd m9, m13, m10 - paddd m10, m14, m11 - psubd m11, m13, m12 - paddd m12, m15, [r6-32*1] - psubd m13, m15, [r6-32*2] - paddd m14, m15, [r6-32*3] - psubd m15, [r6-32*4] + call .pass1_rotations .pass1_end: REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 jmp tx2q .pass2: - call m(idct_16x8_internal_16bpc).transpose + call m(idct_16x8_internal_10bpc).transpose call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] @@ -2836,15 +4032,35 @@ pmulhrsw m1, m11 pmulhrsw m2, m10 pmulhrsw m3, m11 - call m(idct_16x8_internal_16bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m4, m10 pmulhrsw m1, m5, m11 pmulhrsw m2, m6, m10 pmulhrsw m3, m7, m11 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero RET ALIGN function_align +.pass1_rotations: + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] + ret +ALIGN function_align .main: + ; expects: m13 = clip_min m14 = clip_max vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 2] pmulld m1, m15, [cq+32*13] @@ -2855,8 +4071,6 @@ pmulld m6, m15, [cq+32*14] pmulld m7, m15, [cq+32* 1] vpbroadcastd m12, [pd_2048] - vpbroadcastd m13, [clip_min] - vpbroadcastd m14, [clip_max] REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 call .main_part1 @@ -3013,32 +4227,19 @@ INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity -cglobal iflipadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: lea r6, [rsp+32*4] - call m(iadst_16x8_internal_16bpc).main + call m(iadst_16x8_internal_10bpc).main vpbroadcastd m14, [pd_6144] psrld m15, 11 psubd m13, m14, m15 - psubd m8, m13, m7 - paddd m7, m14, m9 - paddd m9, m14, m6 - 
psubd m6, m13, m10 - psubd m10, m13, m5 - paddd m5, m14, m11 - paddd m11, m14, m4 - psubd m4, m13, m12 - psubd m12, m15, m3 - paddd m3, m15, [r6-32*1] - paddd m13, m15, m2 - psubd m2, m15, [r6-32*2] - psubd m14, m15, m1 - mova m1, m15 - paddd m15, m0 - psubd m0, m1, [r6-32*4] - paddd m1, [r6-32*3] - jmp m(iadst_16x8_internal_16bpc).pass1_end + call .pass1_rotations + jmp m(iadst_16x8_internal_10bpc).pass1_end .pass2: - call m(idct_16x8_internal_16bpc).transpose + call m(idct_16x8_internal_10bpc).transpose call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] @@ -3052,20 +4253,41 @@ pmulhrsw m2, m5, m11 mova m5, m3 pmulhrsw m3, m4, m10 - call m(idct_16x8_internal_16bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m5, m11 pmulhrsw m1, m6, m10 pmulhrsw m2, m7, m11 pmulhrsw m3, m12, m10 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero RET +ALIGN function_align +.pass1_rotations: + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + ret INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity -cglobal iidentity_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 0] pmulld m1, m15, [cq+32* 1] @@ -3104,19 +4326,169 @@ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: - call m(idct_16x8_internal_16bpc).transpose + call m(idct_16x8_internal_10bpc).transpose + vpbroadcastd m10, [pw_4096] + jmp m(idct_16x8_internal_10bpc).end + +INV_TXFM_16X8_FN dct, dct, 12 +INV_TXFM_16X8_FN dct, identity, 12 +INV_TXFM_16X8_FN dct, adst, 12 +INV_TXFM_16X8_FN dct, flipadst, 12 + +cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_16x8_internal_10bpc).pass1 +.pass2: + call m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_12bpc).round_shift4 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_12bpc).round_shift4 +.end: + packssdw m0, [cq+32* 8] + packssdw m1, [cq+32* 9] + packssdw m2, [cq+32*10] + packssdw m3, [cq+32*11] + packssdw m4, [cq+32*12] + packssdw m5, [cq+32*13] + packssdw m6, [cq+32*14] + packssdw m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call .write_16x4_start + call 
m(idct_16x8_internal_10bpc).write_16x4_zero + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + vpermq m2, m6, q3120 + vpermq m3, m7, q3120 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_12bpc_max] + lea r3, [strideq*3] + pxor m8, m8 + ret + +INV_TXFM_16X8_FN adst, dct, 12 +INV_TXFM_16X8_FN adst, adst, 12 +INV_TXFM_16X8_FN adst, flipadst, 12 +INV_TXFM_16X8_FN adst, identity, 12 + +cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x8_internal_10bpc).pass1 +.pass2: + call .pass2_main + jmp m(idct_16x8_internal_12bpc).end +ALIGN function_align +.pass2_main: + call m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12bpc).pass2_main2 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12bpc).pass2_main2 + ret + +INV_TXFM_16X8_FN flipadst, dct, 12 +INV_TXFM_16X8_FN flipadst, adst, 12 +INV_TXFM_16X8_FN flipadst, flipadst, 12 +INV_TXFM_16X8_FN flipadst, identity, 12 + +cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x8_internal_10bpc).pass1 +.pass2: + call m(iadst_16x8_internal_12bpc).pass2_main + packssdw m13, m0, [cq+32* 8] + packssdw m12, m1, [cq+32* 9] + packssdw m11, m2, [cq+32*10] + packssdw m10, m3, [cq+32*11] + packssdw m3, m4, [cq+32*12] + packssdw m2, m5, [cq+32*13] + packssdw m1, m6, [cq+32*14] + packssdw m0, m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpermq m0, m10, q3120 + vpermq m1, m11, q3120 + vpermq m2, m12, q3120 + vpermq m3, m13, q3120 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET + +INV_TXFM_16X8_FN identity, dct, 12 +INV_TXFM_16X8_FN identity, adst, 12 +INV_TXFM_16X8_FN identity, flipadst, 12 +INV_TXFM_16X8_FN identity, identity, 12 + +cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_16x8_internal_10bpc).pass1 +.pass2: + call m(idct_16x8_internal_10bpc).transpose2 vpbroadcastd m10, [pw_4096] - jmp m(idct_16x8_internal_16bpc).end + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + jmp m(idct_16x8_internal_10bpc).end2 -%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset - INV_TXFM_FN %1, %2, %3, 16x16 +%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 16x16, %4 %ifidn %1_%2, dct_dct imul r6d, [cq], 2896 mov [cq], eobd ; 0 mov r3d, 16 add r6d, 10240 sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 + jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2 %endif %endmacro @@ 
-3125,10 +4497,11 @@ INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst -cglobal idct_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 +cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*4] sub eobd, 36 @@ -3214,7 +4587,7 @@ jmp tx2q .pass2: call .transpose - lea rax, [pw_5+128] + lea r6, [pw_5+128] mova [rsp], m15 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] @@ -3231,22 +4604,23 @@ pmulhrsw m1, m12 pmulhrsw m2, m12 pmulhrsw m3, m12 - call m(idct_16x8_internal_16bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_start +.write_16x16_2: pmulhrsw m0, m12, m4 pmulhrsw m1, m12, m5 pmulhrsw m2, m12, m6 pmulhrsw m3, m12, m7 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+gprsize+32*0] pmulhrsw m1, m12, [rsp+gprsize+32*1] pmulhrsw m2, m12, m10 pmulhrsw m3, m12, m11 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+gprsize+32*2] pmulhrsw m1, m12, m13 pmulhrsw m2, m12, m14 pmulhrsw m3, m12, m15 - jmp m(idct_16x8_internal_16bpc).write_16x4_zero + jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .transpose: test eobd, eobd @@ -3338,7 +4712,7 @@ vinserti128 m15, [r6+16], 0 ret .transpose_fast: - call m(idct_16x8_internal_16bpc).transpose2 + call m(idct_16x8_internal_10bpc).transpose2 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 ret @@ -3352,7 +4726,7 @@ mova m5, [cq+64*11] mova m6, [cq+64*13] mova m7, [cq+64*15] - call m(idct_8x16_internal_16bpc).main_oddhalf + call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+64* 0] mova m1, [cq+64* 2] mova m2, [cq+64* 4] @@ -3361,8 +4735,8 @@ mova m5, [cq+64*10] mova m6, [cq+64*12] mova m7, [cq+64*14] - call m(idct_8x8_internal_16bpc).main - call m(idct_8x16_internal_16bpc).main_evenhalf + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf psrld m10, m11, 10 ; pd_2 REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 ret @@ -3371,9 +4745,10 @@ INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst -cglobal iadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 - vpbroadcastd m13, [clip_min] - vpbroadcastd m14, [clip_max] +cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: vpbroadcastd m15, [pd_2896] lea r6, [rsp+32*4] sub eobd, 36 @@ -3447,8 +4822,8 @@ sub r6, 32*8 jmp tx2q .pass2: - call m(idct_16x16_internal_16bpc).transpose - lea rax, [pw_5+128] + call m(idct_16x16_internal_10bpc).transpose + lea r6, [pw_5+128] mova [rsp], m15 call m(iadst_16x16_internal_8bpc).main call m(iadst_16x16_internal_8bpc).main_pass2_end @@ -3463,22 +4838,22 @@ mova [rsp+32*1], m9 pmulhrsw m2, m12 pmulhrsw m3, m13 - call m(idct_16x8_internal_16bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m12, m4 pmulhrsw m1, m13, m5 pmulhrsw m2, m12, m6 pmulhrsw m3, m13, m7 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+32*0] pmulhrsw m1, m13, [rsp+32*1] pmulhrsw m2, m12, m10 pmulhrsw m3, m13, m11 - call 
m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+32*2] pmulhrsw m1, m13, [rsp+32*3] pmulhrsw m2, m12, m14 pmulhrsw m3, m13, m15 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero RET ALIGN function_align .main: @@ -3491,7 +4866,7 @@ mova m6, [cq+64*14] mova m7, [cq+64* 1] vpbroadcastd m12, [pd_2048] - call m(iadst_16x8_internal_16bpc).main_part1 + call m(iadst_16x8_internal_10bpc).main_part1 mova m0, [cq+64* 0] mova m1, [cq+64*15] mova m2, [cq+64* 4] @@ -3500,21 +4875,22 @@ mova m5, [cq+64* 7] mova m6, [cq+64*12] mova m7, [cq+64* 3] - jmp m(iadst_16x8_internal_16bpc).main_part2 + jmp m(iadst_16x8_internal_10bpc).main_part2 INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst -cglobal iflipadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 - vpbroadcastd m13, [clip_min] - vpbroadcastd m14, [clip_max] +cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: vpbroadcastd m15, [pd_2896] lea r6, [rsp+32*4] sub eobd, 36 jl .fast add cq, 32 - call m(iadst_16x16_internal_16bpc).main + call m(iadst_16x16_internal_10bpc).main sub cq, 32 vpbroadcastd m8, [pd_10240] paddd m11, m8 @@ -3556,7 +4932,7 @@ mova [r6+32*3], m0 .fast: add r6, 32*8 - call m(iadst_16x16_internal_16bpc).main + call m(iadst_16x16_internal_10bpc).main vpbroadcastd m14, [pd_10240] vpbroadcastd m13, [pd_10239] psrld m15, 10 ; pd_2 @@ -3577,10 +4953,10 @@ paddd m15, m0 psubd m0, m1, [r6-32*4] paddd m1, [r6-32*3] - jmp m(iadst_16x16_internal_16bpc).pass1_end + jmp m(iadst_16x16_internal_10bpc).pass1_end .pass2: - call m(idct_16x16_internal_16bpc).transpose - lea rax, [pw_5+128] + call m(idct_16x16_internal_10bpc).transpose + lea r6, [pw_5+128] mova [rsp], m15 call m(iadst_16x16_internal_8bpc).main call m(iadst_16x16_internal_8bpc).main_pass2_end @@ -3598,28 +4974,29 @@ pmulhrsw m3, m12 mova m14, m8 mova m15, m9 - call m(idct_16x8_internal_16bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m13, m11 pmulhrsw m1, m12, m10 pmulhrsw m2, m13, m15 pmulhrsw m3, m12, m14 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m13, m7 pmulhrsw m1, m12, m6 pmulhrsw m2, m13, m5 pmulhrsw m3, m12, m4 - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m13, [rsp+32*3] pmulhrsw m1, m12, [rsp+32*2] pmulhrsw m2, m13, [rsp+32*1] pmulhrsw m3, m12, [rsp+32*0] - call m(idct_16x8_internal_16bpc).write_16x4_zero + call m(idct_16x8_internal_10bpc).write_16x4_zero RET INV_TXFM_16X16_FN identity, dct, -92 INV_TXFM_16X16_FN identity, identity -cglobal iidentity_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 +cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 +.pass1: vpbroadcastd m15, [pd_11586] vpbroadcastd m7, [pd_10240] lea r6, [rsp+32*4] @@ -3665,7 +5042,7 @@ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: - call m(idct_16x16_internal_16bpc).transpose + call m(idct_16x16_internal_10bpc).transpose mova [cq+32*0], m15 mova [cq+32*1], m0 @@ -3681,7 +5058,376 @@ paddsw m1, m1 paddsw m15, m1 mova m1, [cq+32*1] - jmp m(idct_16x16_internal_16bpc).end + jmp m(idct_16x16_internal_10bpc).end + +INV_TXFM_16X16_FN dct, dct, 0, 12 +INV_TXFM_16X16_FN dct, identity, 28, 12 
+INV_TXFM_16X16_FN dct, adst, 0, 12 +INV_TXFM_16X16_FN dct, flipadst, 0, 12 + +cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_16x16_internal_10bpc).pass1 +.pass2: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 + call .pass2_main + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + call .pass2_main + jmp m(iadst_16x16_internal_12bpc).end +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_16384] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + jmp m(idct_16x16_internal_10bpc).write_16x16_2 +ALIGN function_align +.pass2_main: + call m(idct_8x8_internal_12bpc).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m2 + mova [cq+32* 2], m4 + mova [cq+32* 3], m6 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, m1 + pmaxsd m1, m12, m3 + pmaxsd m2, m12, m5 + pmaxsd m3, m12, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m4, [r6-32*3] + mova m10, [r6-32*2] + mova m5, [r6-32*1] + mova m12, [r6+32*0] + mova m6, [r6+32*1] + mova m14, [r6+32*2] + mova m7, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 + mova [cq+32* 4], m8 + mova [cq+32* 5], m10 + mova [cq+32* 6], m12 + mova [cq+32* 7], m14 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m4, m5, m6, m7 + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast: + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 1] + pmaxsd m2, m12, [cq+32* 2] + pmaxsd m3, m12, [cq+32* 3] + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow2 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m12, [cq+32* 4] + pmaxsd m5, m12, [cq+32* 5] + pmaxsd m6, m12, [cq+32* 6] + pmaxsd m7, m12, [cq+32* 7] + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast2: + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + psrad m11, 8 ; pd_8 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_16x8_internal_10bpc).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret + +INV_TXFM_16X16_FN adst, dct, 0, 12 +INV_TXFM_16X16_FN adst, adst, 0, 12 +INV_TXFM_16X16_FN adst, flipadst, 0, 12 + +cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + 
vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x16_internal_10bpc).pass1 +.pass2: + call .pass2_part1 + call m(iadst_16x8_internal_10bpc).pass1_rotations + call .pass2_part2 + call m(iadst_16x8_internal_10bpc).pass1_rotations +.pass2_part3: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 +.end: + packssdw m15, m14 + packssdw m14, m13, m12 + packssdw m13, m11, m10 + packssdw m12, m9, m8 + packssdw m11, m7, m6 + packssdw m10, m5, m4 + packssdw m7, m3, m2 + packssdw m6, m1, m0 + vpblendd m0, m6, [r5-32*4], 0x33 + vpblendd m1, m6, [r5-32*4], 0xcc + vpblendd m2, m7, [r5-32*3], 0x33 + vpblendd m3, m7, [r5-32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m10, [r5-32*2], 0x33 + vpblendd m1, m10, [r5-32*2], 0xcc + vpblendd m2, m11, [r5-32*1], 0x33 + vpblendd m3, m11, [r5-32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m12, [r5+32*0], 0x33 + vpblendd m1, m12, [r5+32*0], 0xcc + vpblendd m2, m13, [r5+32*1], 0x33 + vpblendd m3, m13, [r5+32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m14, [r5+32*2], 0x33 + vpblendd m1, m14, [r5+32*2], 0xcc + vpblendd m2, m15, [r5+32*3], 0x33 + vpblendd m3, m15, [r5+32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.pass2_part1: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 +.pass2_main: + call m(idct_8x8_internal_12bpc).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m3 + mova [cq+32* 2], m4 + mova [cq+32* 3], m7 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + pmaxsd m0, m13, m2 + pmaxsd m2, m13, m6 + pmaxsd m5, m13, m5 + pmaxsd m7, m13, m1 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m3, [r6-32*3] + mova m4, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m1, [r6+32*1] + mova m6, [r6+32*2] + mova m15, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 + mova [cq+32* 4], m8 + mova [cq+32* 5], m11 + mova [cq+32* 6], m12 + mova [cq+32* 7], m15 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + REPX {pmaxsd x, m13}, m1, m3, m4, m6 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast: + vpbroadcastd m12, [pd_2048] + vpbroadcastd m15, [pd_2896] + call m(iadst_16x8_internal_10bpc).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m7, m13, [cq+32* 1] ; 3 + pmaxsd m2, m13, [cq+32* 2] ; 4 + pmaxsd m5, m13, [cq+32* 3] ; 7 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow2 + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m13, [cq+32* 4] ; 8 + pmaxsd m3, m13, [cq+32* 5] ; 11 + pmaxsd m6, m13, [cq+32* 6] ; 12 + pmaxsd m1, m13, [cq+32* 7] ; 15 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast2: + call 
m(iadst_16x8_internal_10bpc).main_part2 + vpbroadcastd m14, [pd_34816] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_34815 + pslld m15, 3 ; pd_8 + ret +ALIGN function_align +.pass2_part2: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + jmp .pass2_main + +INV_TXFM_16X16_FN flipadst, dct, 0, 12 +INV_TXFM_16X16_FN flipadst, adst, 0, 12 +INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 + +cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x16_internal_10bpc).pass1 +.pass2: + call m(iadst_16x16_internal_12bpc).pass2_part1 + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + call m(iadst_16x16_internal_12bpc).pass2_part2 + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + jmp m(iadst_16x16_internal_12bpc).pass2_part3 + +INV_TXFM_16X16_FN identity, dct, -92, 12 +INV_TXFM_16X16_FN identity, identity, 0, 12 + +cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + jmp m(iidentity_16x16_internal_10bpc).pass1 +.pass2: + call m(iidentity_8x16_internal_12bpc).pass2_main + call m(idct_16x16_internal_10bpc).transpose_fast + test eobd, eobd + jl .pass2_fast + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + mova m10, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m13, [r6+32*1] + mova m14, [r6+32*2] + mova m15, [r6+32*3] + sub r6, 32*8 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + mova m2, [r6-32*2] + mova m3, [r6-32*1] + mova m4, [r6+32*0] + mova m5, [r6+32*1] + mova m6, [r6+32*2] + mova m7, [r6+32*3] + call m(iidentity_8x16_internal_12bpc).pass2_main + call m(idct_16x8_internal_10bpc).transpose2 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 + mova m13, m5 + mova m14, m6 + mova m15, m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] +.pass2_fast: + call m(idct_16x16_internal_12bpc).write_16x16 + RET %macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift mova m%4, [r6+32*(%1-4)] @@ -3704,14 +5450,14 @@ packssdw m%2, m%4 ; out15 - n, out31 - n %endmacro -cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] vbroadcasti128 m14, [idct32_shuf] mov r4, cq call .pass1_main @@ -3738,7 +5484,7 @@ pxor m4, m4 .pass1_end_fast: vpbroadcastd m10, [pw_2048] - lea rax, [deint_shuf+128] + lea r6, 
[deint_shuf+128] REPX {mova x, m4}, m5, m6, m7 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .end @@ -3775,7 +5521,7 @@ mova m1, [r6-32*3] ; out4 out6 mova m2, [r6-32*2] ; out8 out10 mova m3, [r6-32*1] ; out12 out14 - lea rax, [deint_shuf+128] + lea r6, [deint_shuf+128] mova m11, [rsp+32*3] ; out13 out15 vpbroadcastd m10, [pw_2048] call m(inv_txfm_add_dct_dct_8x32_8bpc).main @@ -3790,42 +5536,42 @@ vpermq m1, m1, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4_start vpermq m0, m2, q3120 vpermq m1, m3, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m4, q3120 vpermq m1, m5, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m6, q3120 vpermq m1, m7, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [rsp+32*1], q3120 vpermq m1, [rsp+32*2], q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [rsp+32*3], q3120 vpermq m1, [rsp+32*4], q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [rsp+32*0], q3120 vpermq m1, m13, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m14, q3120 vpermq m1, m15, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 - call m(idct_8x8_internal_16bpc).write_8x4 + call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: imul r6d, [cq], 2896 @@ -3833,7 +5579,7 @@ mov r3d, 32 add r6d, 10240 sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly2 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 ALIGN function_align .pass1_main: mova m0, [cq+128*0] @@ -3845,7 +5591,7 @@ mova m6, [cq+128*6] mova m7, [cq+128*7] add cq, 32 - call m(idct_8x8_internal_16bpc).main + call m(idct_8x8_internal_10bpc).main psrld m1, m11, 10 ; pd_2 REPX {paddd x, m1}, m0, m6, m5, m3 paddd m1, m6, m7 ; out1 @@ -4134,9 +5880,9 @@ vinserti128 m1, xm15, 1 ret -cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m5, [pw_5] - vpbroadcastd m7, [pixel_max] + vpbroadcastd m7, [pixel_10bpc_max] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -4201,7 +5947,7 @@ vextracti128 [dstq+r4 ], m3, 1 ret -cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full imul r6d, [cq], 2896 @@ -4216,7 +5962,7 @@ sar r6d, 16 movd xm0, r6d vpbroadcastw m0, xm0 - vpbroadcastd m4, [pixel_max] + vpbroadcastd m4, [pixel_10bpc_max] pxor m3, m3 .dconly_loop: paddw m1, m0, [dstq+32*0] @@ -4242,11 +5988,11 @@ mova m6, [cq+32*25] mova m7, [cq+32*31] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*4] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+32* 3] mova m1, [cq+32* 5] mova m2, [cq+32*11] @@ -4255,7 +6001,7 @@ mova m5, [cq+32*21] mova m6, [cq+32*27] mova m7, [cq+32*29] - call 
m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 mova m0, [cq+32* 2] mova m1, [cq+32* 6] mova m2, [cq+32*10] @@ -4264,7 +6010,7 @@ mova m5, [cq+32*22] mova m6, [cq+32*26] mova m7, [cq+32*30] - call m(idct_8x16_internal_16bpc).main_oddhalf + call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+32* 0] mova m1, [cq+32* 4] mova m2, [cq+32* 8] @@ -4273,10 +6019,10 @@ mova m5, [cq+32*20] mova m6, [cq+32*24] mova m7, [cq+32*28] - call m(idct_8x8_internal_16bpc).main - call m(idct_8x16_internal_16bpc).main_evenhalf - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end - lea rax, [deint_shuf+128] + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end + lea r6, [deint_shuf+128] vpbroadcastd m11, [pw_2048] mov r4, dstq call .pass2 @@ -4288,7 +6034,7 @@ mova m5, [r5-32*2] ; 26 27 mova m6, [r5-32*3] ; 22 23 mova m7, [r5-32*4] ; 24 25 - call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose lea dstq, [r4+32] call .pass2 RET @@ -4296,16 +6042,16 @@ .pass2: call m(idct_16x8_internal_8bpc).main REPX {pmulhrsw x, m11}, m0, m1, m2, m3 - call m(idct_16x8_internal_16bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m11, m4 pmulhrsw m1, m11, m5 pmulhrsw m2, m11, m6 pmulhrsw m3, m11, m7 - jmp m(idct_16x8_internal_16bpc).write_16x4_zero + jmp m(idct_16x8_internal_10bpc).write_16x4_zero -cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m5, [pw_4096] - vpbroadcastd m7, [pixel_max] + vpbroadcastd m7, [pixel_10bpc_max] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -4326,7 +6072,7 @@ packssdw m3, [cq-32*1] REPX {pmulhrsw x, m5}, m0, m1, m2, m3 REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 - call m(inv_txfm_add_identity_identity_8x32_16bpc).main + call m(inv_txfm_add_identity_identity_8x32_10bpc).main add dstq, 16 sub eobd, 64 jge .loop @@ -4351,14 +6097,14 @@ mova [r2+%6], m%1 %endmacro -cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*16] lea r4, [r6+32*8] @@ -4381,7 +6127,7 @@ add r6d, 2048 sar r6d, 12 imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly .eob44: mova [r4+16*0], xm0 mova [r4+16*1], xm3 @@ -4403,7 +6149,7 @@ mova m2, [r4+32*2] mova m3, [r4+32*3] .fast: - lea rax, [pw_5+128] + lea r6, [pw_5+128] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 @@ -4455,7 +6201,7 @@ mova m5, [r4-32*3] mova m6, [r4-32*2] mova m7, [r4-32*1] - lea rax, [pw_5 + 128] + lea r6, [pw_5 + 128] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea r3, [rsp+32*8] mova m8, [r3+32*0] @@ -4493,7 +6239,7 @@ pmulld m5, m14, [cq+128*11] pmulld m6, m14, [cq+128*13] pmulld m7, m14, [cq+128*15] - call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 2] pmulld m2, m14, [cq+128* 4] @@ 
-4502,8 +6248,8 @@ pmulld m5, m14, [cq+128*10] pmulld m6, m14, [cq+128*12] pmulld m7, m14, [cq+128*14] - call m(idct_8x8_internal_16bpc).main_rect2 - call m(idct_8x16_internal_16bpc).main_evenhalf + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf psrld m15, m11, 11 ; pd_1 mova m8, [r6-32*4] mova m9, [r6-32*3] @@ -4592,7 +6338,7 @@ mova [rsp+gprsize+32*2], m7 mova [rsp+gprsize+32*3], m15 vpbroadcastd m15, [pw_2048] - vpbroadcastd m7, [pixel_max] + vpbroadcastd m7, [pixel_10bpc_max] IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 @@ -4621,11 +6367,11 @@ IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 ret -cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] vpbroadcastd m11, [pw_8192] - vpbroadcastd m7, [pixel_max] + vpbroadcastd m7, [pixel_10bpc_max] lea r6, [strideq*5] pxor m6, m6 paddw m10, m11, m11 ; pw_16384 @@ -4690,26 +6436,26 @@ punpckhwd m2, m1 punpckhqdq m1, m0, m4 punpcklqdq m0, m4 - call m(iidentity_8x8_internal_16bpc).write_2x8x2 + call m(iidentity_8x8_internal_10bpc).write_2x8x2 punpcklqdq m0, m3, m2 punpckhqdq m1, m3, m2 - jmp m(iidentity_8x8_internal_16bpc).write_2x8x2 + jmp m(iidentity_8x8_internal_10bpc).write_2x8x2 -cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob %undef cmp - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*4] call .main cmp eobd, 36 jge .full - call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] - lea rax, [pw_5+128] + lea r6, [pw_5+128] mov r7, dstq call m(idct_16x16_internal_8bpc).main call .write_16x16 @@ -4721,7 +6467,7 @@ mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] - call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] jmp .end @@ -4734,7 +6480,7 @@ imul r6d, 2896 add r6d, 6144 sar r6d, 13 - jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .full: add cq, 32 mova [r4+32*3], m0 @@ -4748,7 +6494,7 @@ call .main sub r4, 32*16 ; topleft 16x8 call .transpose_16x16 - lea rax, [pw_5+128] + lea r6, [pw_5+128] mov r7, dstq call m(idct_16x16_internal_8bpc).main call .write_16x16 @@ -4802,7 +6548,7 @@ mova m6, [r4-32*3] mova m7, [r4-32*4] mova [rsp+gprsize], m15 - jmp m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose ALIGN function_align .main: vpbroadcastd m14, [pd_2896] @@ -4815,7 +6561,7 @@ pmulld m5, m14, [cq+64*23] pmulld m6, m14, [cq+64*25] pmulld m7, m14, [cq+64*31] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 pmulld m0, m14, [cq+64* 3] pmulld m1, m14, [cq+64* 5] pmulld m2, m14, [cq+64*11] @@ -4824,7 +6570,7 @@ pmulld m5, m14, [cq+64*21] pmulld m6, m14, [cq+64*27] pmulld m7, m14, [cq+64*29] - call 
m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 pmulld m0, m14, [cq+64* 2] pmulld m1, m14, [cq+64* 6] pmulld m2, m14, [cq+64*10] @@ -4833,7 +6579,7 @@ pmulld m5, m14, [cq+64*22] pmulld m6, m14, [cq+64*26] pmulld m7, m14, [cq+64*30] - call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+64* 0] pmulld m1, m14, [cq+64* 4] pmulld m2, m14, [cq+64* 8] @@ -4842,8 +6588,8 @@ pmulld m5, m14, [cq+64*20] pmulld m6, m14, [cq+64*24] pmulld m7, m14, [cq+64*28] - call m(idct_8x8_internal_16bpc).main_rect2 - call m(idct_8x16_internal_16bpc).main_evenhalf + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf pxor m8, m8 mov r7d, 64*30 .main_zero_loop: @@ -4895,35 +6641,35 @@ mova [rsp+gprsize+32*1], m9 mova [rsp+gprsize+32*2], m12 vpbroadcastd m12, [pw_2048] - vpbroadcastd m9, [pixel_max] + vpbroadcastd m9, [pixel_10bpc_max] lea r3, [strideq*3] pxor m8, m8 pmulhrsw m0, m12 pmulhrsw m1, m12 pmulhrsw m2, m12 pmulhrsw m3, m12 - call m(idct_16x8_internal_16bpc).write_16x4 + call m(idct_16x8_internal_10bpc).write_16x4 pmulhrsw m0, m12, m4 pmulhrsw m1, m12, m5 pmulhrsw m2, m12, m6 pmulhrsw m3, m12, m7 - call m(idct_16x8_internal_16bpc).write_16x4 + call m(idct_16x8_internal_10bpc).write_16x4 pmulhrsw m0, m12, [rsp+gprsize+32*0] pmulhrsw m1, m12, [rsp+gprsize+32*1] pmulhrsw m2, m12, m10 pmulhrsw m3, m12, m11 - call m(idct_16x8_internal_16bpc).write_16x4 + call m(idct_16x8_internal_10bpc).write_16x4 pmulhrsw m0, m12, [rsp+gprsize+32*2] pmulhrsw m1, m12, m13 pmulhrsw m2, m12, m14 pmulhrsw m3, m12, m15 - jmp m(idct_16x8_internal_16bpc).write_16x4 + jmp m(idct_16x8_internal_10bpc).write_16x4 -cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] vpbroadcastd m10, [pw_2048] - vpbroadcastd m7, [pixel_max] + vpbroadcastd m7, [pixel_10bpc_max] lea r6, [strideq*5] pxor m6, m6 mov r5, dstq @@ -4974,15 +6720,15 @@ REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3 REPX {pmulhrsw x, m10}, m0, m1, m2, m3 REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - jmp m(inv_txfm_add_identity_identity_16x32_16bpc).main2 + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 -cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob %undef cmp - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*7] call .main cmp eobd, 36 @@ -4999,7 +6745,7 @@ imul r6d, [cq], 2896 mov [cq], eobd ; 0 mov r3d, 32 - jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .fast: lea r4, [rsp+32*71] pxor m0, m0 @@ -5012,13 +6758,13 @@ lea r3, [rsp+32*3] mov r4, r6 lea r5, [r6+32*8] - lea rax, [pw_5+128] + lea r6, [pw_5+128] call .pass2_oddhalf call .pass2_evenhalf imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq - call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end sub dstq, r3 lea r2, [r2+r3+32] add dstq, 32 @@ -5026,7 +6772,7 @@ call .pass2_oddhalf call .pass2_evenhalf lea r3, [strideq*3] - call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + call 
m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end RET ALIGN function_align .main: @@ -5040,7 +6786,7 @@ mova m7, [cq+128*31] vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+128* 3] mova m1, [cq+128* 5] mova m2, [cq+128*11] @@ -5049,7 +6795,7 @@ mova m5, [cq+128*21] mova m6, [cq+128*27] mova m7, [cq+128*29] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 mova m0, [cq+128* 2] mova m1, [cq+128* 6] mova m2, [cq+128*10] @@ -5058,7 +6804,7 @@ mova m5, [cq+128*22] mova m6, [cq+128*26] mova m7, [cq+128*30] - call m(idct_8x16_internal_16bpc).main_oddhalf + call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+128* 0] mova m1, [cq+128* 4] mova m2, [cq+128* 8] @@ -5067,9 +6813,9 @@ mova m5, [cq+128*20] mova m6, [cq+128*24] mova m7, [cq+128*28] - call m(idct_8x8_internal_16bpc).main - call m(idct_8x16_internal_16bpc).main_evenhalf - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end pxor m15, m15 mov r7d, 128*29 .main_zero_loop: @@ -5096,7 +6842,7 @@ mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] - call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 @@ -5146,10 +6892,10 @@ mova [rsp+gprsize], m15 jmp m(idct_16x16_internal_8bpc).main -cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob +cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob %undef cmp vpbroadcastd m5, [pw_8192] - vpbroadcastd m7, [pixel_max] + vpbroadcastd m7, [pixel_10bpc_max] pxor m6, m6 lea r6, [strideq*3] lea r5, [strideq*5] @@ -5213,7 +6959,7 @@ mova m3, [cq+128*6] packssdw m3, [cq+128*7] REPX {pmulhrsw x, m5}, m0, m1, m2, m3 - jmp m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero + jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) %if %1 & 1 @@ -5243,7 +6989,7 @@ paddw m%6, [%%d1+%10] pxor m%2, m%2 REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 - vpbroadcastd m%2, [pixel_max] + vpbroadcastd m%2, [pixel_10bpc_max] REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 mova [%%d0+%7 ], m%3 mova [%%d1+%8 ], m%4 @@ -5251,14 +6997,14 @@ mova [%%d1+%10], m%6 %endmacro -cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*6] call .main @@ -5278,7 +7024,7 @@ mov r3d, 64 add r6d, 10240 sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 .fast: lea r4, [rsp+32*38] pxor m0, m0 @@ -5288,7 +7034,7 @@ cmp r6, r4 jl .fast_loop .pass2: - lea rax, [pw_5+128] + lea r6, [pw_5+128] mova m0, [rsp+32* 2] ; in0 mova m1, [rsp+32* 6] ; in4 mova m2, [rsp+32*10] ; in8 @@ -5339,7 +7085,7 @@ mova m5, [rsp+32*25] ; in23 mova m6, [rsp+32*27] ; in25 mova m7, [rsp+32* 9] ; in7 - lea rax, [idct64_mul - 8] + lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 
32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 @@ -5351,7 +7097,7 @@ mova m5, [rsp+32*21] ; in19 mova m6, [rsp+32*31] ; in29 mova m7, [rsp+32* 5] ; in3 - add rax, 8 + add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 @@ -5371,7 +7117,7 @@ mova m5, [cq+128*11] mova m6, [cq+128*13] mova m7, [cq+128*15] - call m(idct_8x16_internal_16bpc).main_oddhalf + call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+128* 0] mova m1, [cq+128* 2] mova m2, [cq+128* 4] @@ -5380,8 +7126,8 @@ mova m5, [cq+128*10] mova m6, [cq+128*12] mova m7, [cq+128*14] - call m(idct_8x8_internal_16bpc).main - call m(idct_8x16_internal_16bpc).main_evenhalf + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf pxor m15, m15 mov r7d, 128*13 .main_zero_loop: @@ -5430,7 +7176,7 @@ REPX {psrad x, 2}, m3, m15, m10, m4 packssdw m3, m15 packssdw m4, m10 - call m(idct_16x8_internal_16bpc).transpose3 + call m(idct_16x8_internal_10bpc).transpose3 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6-32*2], m2 @@ -5445,7 +7191,7 @@ vpbroadcastd m11, [pw_1567_3784] vpbroadcastd m12, [pw_m3784_1567] vpbroadcastd m13, [pw_2896_2896] - lea rax, [pw_5+128] + lea r6, [pw_5+128] lea r2, [dstq+r7] .main_part2_pass2_loop: vpbroadcastd m14, [pw_m2896_2896] @@ -5592,13 +7338,13 @@ jl .main_part2_loop ret -cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob %undef cmp - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*6] call .main cmp eobd, 36 @@ -5620,7 +7366,7 @@ imul r6d, 2896 add r6d, 6144 sar r6d, 13 - jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .fast: lea r4, [rsp+32*70] pxor m0, m0 @@ -5630,7 +7376,7 @@ cmp r6, r4 jl .fast_loop .pass2: - lea rax, [pw_5 + 128] + lea r6, [pw_5 + 128] mov r10, rsp lea r8, [strideq*4] lea r9, [strideq*5] @@ -5687,7 +7433,7 @@ mova m5, [r10+32*41] ; in23 mova m6, [r10+32*51] ; in25 mova m7, [r10+32* 9] ; in7 - lea rax, [idct64_mul - 8] + lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 @@ -5699,11 +7445,11 @@ mova m5, [r10+32*37] ; in19 mova m6, [r10+32*55] ; in29 mova m7, [r10+32* 5] ; in3 - add rax, 8 + add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 add r10, 32*8 sub r4, 32*98 ; rsp+32*16 sub dstq, r8 @@ -5723,7 +7469,7 @@ pmulld m5, m14, [cq+128*23] pmulld m6, m14, [cq+128*25] pmulld m7, m14, [cq+128*31] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 pmulld m0, m14, [cq+128* 3] pmulld m1, m14, [cq+128* 5] pmulld m2, m14, [cq+128*11] @@ -5732,7 +7478,7 @@ pmulld m5, m14, [cq+128*21] pmulld m6, m14, [cq+128*27] pmulld m7, m14, [cq+128*29] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 pmulld m0, m14, [cq+128* 2] pmulld m1, m14, [cq+128* 6] pmulld m2, m14, [cq+128*10] @@ -5741,7 +7487,7 @@ pmulld m5, m14, [cq+128*22] pmulld m6, m14, [cq+128*26] pmulld m7, m14, [cq+128*30] - call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + call 
m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 4] pmulld m2, m14, [cq+128* 8] @@ -5760,10 +7506,10 @@ sub r7d, 128*4 jg .main_zero_loop add cq, 32 - call m(idct_8x8_internal_16bpc).main_rect2 - call m(idct_8x16_internal_16bpc).main_evenhalf - call m(inv_txfm_add_dct_dct_32x16_16bpc).main_end - call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 @@ -5780,7 +7526,7 @@ mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] - call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 @@ -5791,7 +7537,7 @@ mova [r5+32*3], m7 ret -cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .normal imul r6d, [cq], 2896 @@ -5809,7 +7555,7 @@ movaps [rsp+8], xmm6 %endif vpbroadcastw m0, xm0 - vpbroadcastd m6, [pixel_max] + vpbroadcastd m6, [pixel_10bpc_max] pxor m5, m5 .dconly_loop: paddw m1, m0, [dstq+32*0] @@ -5833,8 +7579,8 @@ PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*4] call .main @@ -5855,7 +7601,7 @@ .pass2: lea r7, [r6-32*64] lea r4, [r6-32*32] - lea rax, [pw_5+128] + lea r6, [pw_5+128] mov r5, dstq .pass2_loop: mova m0, [r7-32*4] @@ -5879,7 +7625,7 @@ mova [rsp], m15 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] - call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16 + call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16 add r5, 32 mov dstq, r5 cmp r7, r4 @@ -5892,38 +7638,38 @@ mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 7] mova m1, [cq+64*25] mova m2, [cq+64*23] mova m3, [cq+64* 9] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 3] mova m1, [cq+64*29] mova m2, [cq+64*19] mova m3, [cq+64*13] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 mova m0, [cq+64* 2] mova m1, [cq+64*14] mova m2, [cq+64*18] mova m3, [cq+64*30] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast mova m0, [cq+64* 6] mova m1, [cq+64*10] mova m2, [cq+64*22] mova m3, [cq+64*26] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast mova m0, [cq+64* 4] mova m1, [cq+64*12] mova m2, [cq+64*20] mova m3, [cq+64*28] - call m(idct_8x16_internal_16bpc).main_oddhalf_fast + call m(idct_8x16_internal_10bpc).main_oddhalf_fast mova m0, [cq+64* 0] mova m1, [cq+64* 8] mova m2, [cq+64*16] @@ 
-5943,9 +7689,9 @@ add cq, 32 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal_16bpc).main + call m(idct_8x8_internal_10bpc).main add r6, 32*8 - call m(idct_8x16_internal_16bpc).main_evenhalf + call m(idct_8x16_internal_10bpc).main_evenhalf mova [r6+32*2], m1 mova [r6+32*1], m2 mova [r6+32*0], m3 @@ -6028,7 +7774,7 @@ REPX {psrad x, %1}, m6, m8, m7, m9 packssdw m6, m8 packssdw m7, m9 - call m(idct_16x8_internal_16bpc).transpose3 + call m(idct_16x8_internal_10bpc).transpose3 mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 @@ -6046,14 +7792,14 @@ IDCT64_SHIFT_TRANSPOSE 2 ret -cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*7] call .main @@ -6076,7 +7822,7 @@ imul r6d, 2896 add r6d, 6144 sar r6d, 13 - jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 .fast: pxor m0, m0 lea r4, [rsp+32*135] @@ -6088,7 +7834,7 @@ .pass2: lea r7, [r6-32*32] lea r5, [r6+32*8] - lea rax, [pw_5+128] + lea r6, [pw_5+128] imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq @@ -6129,7 +7875,7 @@ add r7, 32*8 mova [rsp], m15 call m(idct_16x16_internal_8bpc).main - call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end sub dstq, r3 lea r2, [r2+r3+32] add dstq, 32 @@ -6143,38 +7889,38 @@ pmulld m1, m14, [cq+128*31] pmulld m2, m14, [cq+128*17] pmulld m3, m14, [cq+128*15] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 7] pmulld m1, m14, [cq+128*25] pmulld m2, m14, [cq+128*23] pmulld m3, m14, [cq+128* 9] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 5] pmulld m1, m14, [cq+128*27] pmulld m2, m14, [cq+128*21] pmulld m3, m14, [cq+128*11] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 3] pmulld m1, m14, [cq+128*29] pmulld m2, m14, [cq+128*19] pmulld m3, m14, [cq+128*13] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 pmulld m0, m14, [cq+128* 2] pmulld m1, m14, [cq+128*14] pmulld m2, m14, [cq+128*18] pmulld m3, m14, [cq+128*30] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast_rect2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2 pmulld m0, m14, [cq+128* 6] pmulld m1, m14, [cq+128*10] pmulld m2, m14, [cq+128*22] pmulld m3, m14, [cq+128*26] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast_rect2 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2 pmulld m0, m14, [cq+128* 4] pmulld m1, m14, [cq+128*12] pmulld m2, m14, [cq+128*20] pmulld m3, m14, [cq+128*28] - call m(idct_8x16_internal_16bpc).main_oddhalf_fast_rect2 + call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 8] pmulld m2, m14, [cq+128*16] @@ -6191,18 +7937,18 @@ psrld 
m15, m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 - call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end2 + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2 IDCT64_SHIFT_TRANSPOSE 1 ret -cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob +cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_min] - vpbroadcastd m13, [clip_max] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*7] call .main @@ -6220,7 +7966,7 @@ imul r6d, [cq], 2896 mov [cq], eobd ; 0 mov r3d, 64 - jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly .fast: pxor m0, m0 lea r4, [rsp+32*135] @@ -6231,7 +7977,7 @@ jl .fast_loop .pass2: lea r10, [r6-32*32] - lea rax, [pw_5+128] + lea r6, [pw_5+128] lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 @@ -6286,7 +8032,7 @@ mova m5, [r10-32*29] ; in23 mova m6, [r10-32* 3] ; in25 mova m7, [r10-32*93] ; in7 - lea rax, [idct64_mul - 8] + lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 @@ -6298,11 +8044,11 @@ mova m5, [r10-32*33] ; in19 mova m6, [r10+32* 1] ; in29 mova m7, [r10-32*97] ; in3 - add rax, 8 + add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 add r10, 32*8 sub dstq, r8 sub r4, 32*44 @@ -6317,38 +8063,38 @@ mova m1, [cq+128*31] mova m2, [cq+128*17] mova m3, [cq+128*15] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 7] mova m1, [cq+128*25] mova m2, [cq+128*23] mova m3, [cq+128* 9] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 5] mova m1, [cq+128*27] mova m2, [cq+128*21] mova m3, [cq+128*11] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 3] mova m1, [cq+128*29] mova m2, [cq+128*19] mova m3, [cq+128*13] - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 - call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 mova m0, [cq+128* 2] mova m1, [cq+128*14] mova m2, [cq+128*18] mova m3, [cq+128*30] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast mova m0, [cq+128* 6] mova m1, [cq+128*10] mova m2, [cq+128*22] mova m3, [cq+128*26] - call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast mova m0, [cq+128* 4] mova m1, [cq+128*12] mova m2, [cq+128*20] mova m3, [cq+128*28] - call m(idct_8x16_internal_16bpc).main_oddhalf_fast + call m(idct_8x16_internal_10bpc).main_oddhalf_fast mova m0, [cq+128* 0] mova m1, [cq+128* 8] mova m2, [cq+128*16] @@ -6362,7 +8108,7 @@ mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop - call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end - jmp m(inv_txfm_add_dct_dct_64x16_16bpc).shift_transpose + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose %endif ; 
ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/itx16_sse.asm dav1d-1.0.0/src/x86/itx16_sse.asm --- dav1d-0.9.2/src/x86/itx16_sse.asm 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/itx16_sse.asm 2022-03-18 14:31:56.018356000 +0000 @@ -101,9 +101,11 @@ pw_1567_3784: times 4 dw 1567, 3784 pw_m3784_1567: times 4 dw -3784, 1567 +pw_2896_2896: times 4 dw 2896, 2896 +pw_m2896_2896: times 4 dw -2896, 2896 -clip_min: times 4 dd -0x20000 -clip_max: times 4 dd 0x1ffff +clip_18b_min: times 4 dd -0x20000 +clip_18b_max: times 4 dd 0x1ffff idct64_mul_16bpc: dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 @@ -172,14 +174,6 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx) %define m(x) m_suffix(x, SUFFIX) @@ -429,22 +423,19 @@ ; m0 = in0 in1 ; m1 = in2 in3 ; m5 = pd_2048 - mova m4, [o(pw_m3784_1567)] punpckhwd m2, m1, m0 - psubw m3, m0, m1 - paddw m0, m1 - punpcklqdq m0, m3 - pmaddwd m4, m2 + punpcklwd m1, m0 + pmaddwd m4, m2, [o(pw_m3784_1567)] pmaddwd m2, [o(pw_1567_3784)] - pmulhrsw m0, [o(pw_2896x8)] ; t0 t1 - paddd m4, m5 - paddd m2, m5 - psrad m4, 12 - psrad m2, 12 - packssdw m2, m4 ; t3 t2 - psubsw m1, m0, m2 ; tmp3 tmp2 - paddsw m0, m2 ; tmp0 tmp1 + pmaddwd m0, m1, [o(pw_m2896_2896)] + pmaddwd m1, [o(pw_2896_2896)] + REPX {paddd x, m5}, m4, m2, m0, m1 packssdw m5, m5 ; pw_2048 + REPX {psrad x, 12}, m4, m2, m0, m1 + packssdw m2, m4 ; t3 t2 + packssdw m1, m0 ; t0 t1 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 pmulhrsw m0, m5 pmulhrsw m1, m5 movq m2, [dstq+strideq*0] @@ -1348,8 +1339,8 @@ lea r3, [rsp+gprsize] %else mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+0*16] @@ -1473,9 +1464,9 @@ mova m0, m4 paddd m4, m7, m3 ; t7 psubd m7, m3 ; t6a - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3 }, m1, m2, m7, m4 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3 }, m1, m2, m7, m4 mova [r3+3*16], m2 mova [r3+1*16], m4 @@ -1495,9 +1486,9 @@ psubd m1, m5 ; t5a paddd m4, m7, m3 ; t7 psubd m7, m3 ; t6a - mova m6, [o(clip_min)] + mova m6, [o(clip_18b_min)] REPX {pmaxsd x, m6 }, m1, m2, m7, m4 - mova m6, [o(clip_max)] + mova m6, [o(clip_18b_max)] REPX {pminsd x, m6 }, m1, m2, m7, m4 mova m6, [r3+3*16] mova [r3+3*16], m2 @@ -1526,9 +1517,9 @@ paddd m6, m5, m2 ; dct4 out1 psubd m5, m2 ; dct4 out2 - mova m1, [o(clip_min)] + mova m1, [o(clip_18b_min)] REPX {pmaxsd x, m1 }, m0, m6, m5, m3 - mova m1, [o(clip_max)] + mova m1, [o(clip_18b_max)] REPX {pminsd x, m1 }, m0, m6, m5, m3 ret .round: @@ -1668,10 +1659,10 @@ paddd m5, m1 ; t3 psubd m1, m7, m3 ; t5 paddd m7, m3 ; t1 - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 mova [r3+1*16], m7 - mova m7, [o(clip_max)] + mova m7, [o(clip_18b_max)] pmaxsd m3, [r3+0*16] REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 pminsd m7, [r3+1*16] @@ -1695,9 +1686,9 @@ paddd m0, m6 ; out0 psubd m6, m1, m4 ; t6 paddd m1, m4 ; -out1 - mova m4, [o(clip_min)] + mova m4, [o(clip_18b_min)] REPX {pmaxsd x, m4 }, m5, m3, m6, m2 - mova m4, [o(clip_max)] + mova m4, [o(clip_18b_max)] REPX {pminsd x, m4 }, m5, m3, m6, m2 mova m4, [o(pd_2896)] REPX {pmulld x, m4 }, m5, m3, m6, m2 @@ -1843,8 +1834,8 @@ .pass1_full: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, 
[o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif %undef cmp @@ -2189,8 +2180,8 @@ .pass1_full: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif %undef cmp @@ -2523,8 +2514,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif ; setup stack pointer @@ -2705,11 +2696,11 @@ paddd m5, m1 ; t12 psubd m1, m7, m3 ; t14 paddd m7, m3 ; t15 - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7 pmaxsd m3, [r3+0*16] mova [r3+0*16], m3 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7 pminsd m3, [r3+0*16] mova [r3+0*16], m0 @@ -2734,11 +2725,11 @@ psubd m7, m5 ; t12a mova [r3+0*16], m3 mova m3, [r3+1*16] - mova m5, [o(clip_min)] + mova m5, [o(clip_18b_min)] REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6 pmaxsd m5, [r3+0*16] mova [r3+0*16], m5 - mova m5, [o(clip_max)] + mova m5, [o(clip_18b_max)] REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6 pminsd m5, [r3+0*16] mova [r3+0*16], m5 @@ -2904,8 +2895,8 @@ .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 2*16] @@ -3098,11 +3089,11 @@ mova m3, [r3+11*16] psubd m7, m5, m3 ; t13a paddd m5, m3 ; t5a - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5 pmaxsd m3, [r3+8*16] mova [r3+8*16], m3 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5 pminsd m3, [r3+8*16] mova [r3+8*16], m3 @@ -3125,11 +3116,11 @@ mova m0, [r3+8*16] mova m1, [r3+9*16] mova m4, [r3+10*16] - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6 pmaxsd m3, [r3+11*16] mova [r3+8*16], m3 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6 pminsd m3, [r3+8*16] mova [r3+8*16], m0 @@ -3176,11 +3167,11 @@ psubd m7, m1 ; t2a paddd m1, [r3+8*16] ; out0 mova [r3+4*16], m1 - mova m1, [o(clip_min)] + mova m1, [o(clip_18b_min)] REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7 pmaxsd m1, [r3+10*16] mova [r3+10*16], m1 - mova m1, [o(clip_max)] + mova m1, [o(clip_18b_max)] REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7 pminsd m1, [r3+10*16] mova [r3+10*16], m1 @@ -3275,11 +3266,11 @@ paddd m2, m6 ; t6a psubd m6, m7, m3 ; t15a paddd m7, m3 ; t7a - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7 pmaxsd m3, [r3+4*16] mova [r3+4*16], m3 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7 pminsd m3, [r3+4*16] mova [r3+4*16], m3 @@ -3302,11 +3293,11 @@ mova m1, [r3+5*16] mova m3, [r3+6*16] mova m2, [r3+7*16] - mova m0, [o(clip_min)] + mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5 pmaxsd m0, [r3+4*16] mova [r3+4*16], m0 - mova m0, [o(clip_max)] + mova m0, [o(clip_18b_max)] REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5 pminsd m0, [r3+4*16] mova [r3+4*16], m0 @@ -3557,8 +3548,8 @@ .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova 
m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*32+r5] @@ -3652,8 +3643,8 @@ .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 2*32+r5] @@ -4027,8 +4018,8 @@ .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif @@ -4250,8 +4241,8 @@ .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 2*64+r5] @@ -5033,8 +5024,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+0*128+r5*8] @@ -5218,8 +5209,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*128+r5*8] @@ -5390,8 +5381,8 @@ mova m7, [cq+32*31+r5*8] %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp @@ -5622,11 +5613,11 @@ paddd m1, m5 ; t28 psubd m5, m3, m7 ; t30 paddd m7, m3 ; t31 - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pmaxsd m3, [r3+0*16] mova [r3+0*16], m3 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pminsd m3, [r3+0*16] mova [r3+0*16], m0 @@ -5649,11 +5640,11 @@ paddd m0, m6 ; t16a psubd m6, m7, m2 ; t28a paddd m7, m2 ; t31a - mova m2, [o(clip_min)] + mova m2, [o(clip_18b_min)] REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pmaxsd m2, [r3+0*16] mova [r3+0*16], m2 - mova m2, [o(clip_max)] + mova m2, [o(clip_18b_max)] REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pminsd m2, [r3+0*16] mova [r3+16*0], m0 @@ -5824,11 +5815,11 @@ paddd m1, m5 ; t20 psubd m5, m3, m7 ; t22 paddd m7, m3 ; t23 - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pmaxsd m3, [r3+ 8*16] mova [r3+ 8*16], m3 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pminsd m3, [r3+ 8*16] mova [r3+ 8*16], m0 @@ -5851,11 +5842,11 @@ paddd m0, m6 ; t24a psubd m6, m7, m2 ; t20a paddd m7, m2 ; t23a - mova m2, [o(clip_min)] + mova m2, [o(clip_18b_min)] REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pmaxsd m2, [r3+ 8*16] mova [r3+ 8*16], m2 - mova m2, [o(clip_max)] + mova m2, [o(clip_18b_max)] REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pminsd m2, [r3+ 8*16] mova [r3+ 8*16], m0 @@ -5866,8 +5857,8 @@ ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20 mova [r3+10*16], m3 - mova m0, [o(clip_min)] - mova m2, [o(clip_max)] + mova m0, [o(clip_18b_min)] + mova m2, [o(clip_18b_max)] mova m5, [r3+16*2] ; t18a mova m7, [r3+16*3] ; t19 psubd m3, m5, m1 ; t21 @@ -6021,8 +6012,8 @@ mova [r3+22*16], m6 mova [r3+23*16], m7 mova m1, [o(pd_2)] - mova m2, [o(clip_min)] - mova m3, [o(clip_max)] + mova m2, [o(clip_18b_min)] + mova m3, [o(clip_18b_max)] mov r4, 
15*16 .loop_dct32_end: @@ -6103,8 +6094,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+64* 1+r5*8] @@ -6320,8 +6311,8 @@ mova [r3+22*16], m6 mova [r3+23*16], m7 pcmpeqd m1, m1 ; -1 - mova m2, [o(clip_min)] - mova m3, [o(clip_max)] + mova m2, [o(clip_18b_min)] + mova m3, [o(clip_18b_max)] mov r4, 15*16 .loop_dct32_end: @@ -6418,8 +6409,8 @@ mova m7, [cq+128*31+r5*8] %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp @@ -6636,8 +6627,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*128+r5*8] @@ -6919,8 +6910,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+128* 1+r5*8] @@ -7136,8 +7127,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif @@ -7339,11 +7330,11 @@ paddd m3, m2 ; t35 psubd m2, m4, m0 ; t61 paddd m4, m0 ; t60 - mova m0, [o(clip_min)] + mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4 pmaxsd m0, [r3+1*16] mova [r3+0*16], m0 - mova m0, [o(clip_max)] + mova m0, [o(clip_18b_max)] REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4 pminsd m0, [r3+0*16] mova [r3+0*16], m0 @@ -7373,11 +7364,11 @@ paddd m1, m6 ; t33 psubd m6, m5, m2 ; t61 paddd m2, m5 ; t62 - mova m5, [o(clip_min)] + mova m5, [o(clip_18b_min)] REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2 pmaxsd m5, [r3+0*16] mova [r3+0*16], m5 - mova m5, [o(clip_max)] + mova m5, [o(clip_18b_max)] REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2 pminsd m5, [r3+0*16] mova [r3+16*0], m0 @@ -7476,11 +7467,11 @@ paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 - mova m6, [o(clip_min)] + mova m6, [o(clip_18b_min)] REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7 pmaxsd m6, [r3+0*16] mova [r3+0*16], m6 - mova m6, [o(clip_max)] + mova m6, [o(clip_18b_max)] REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7 pminsd m6, [r3+0*16] mova [r3+0*16], m0 @@ -7503,11 +7494,11 @@ paddd m6, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 - mova m3, [o(clip_min)] + mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1 pmaxsd m3, [r3+1*16] mova [r3+0*16], m3 - mova m3, [o(clip_max)] + mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1 pminsd m3, [r3+0*16] mova [r4-16* 8], m2 @@ -7577,8 +7568,8 @@ mova m1, [r3+16*44] ; idct16 15 - n paddd m4, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n - mova m5, [o(clip_min)] - mova m6, [o(clip_max)] + mova m5, [o(clip_18b_min)] + mova m6, [o(clip_18b_max)] REPX {pmaxsd x, m5}, m4, m0 REPX {pminsd x, m6}, m4, m0 paddd m1, m4, m3 ; idct32 out0 + n @@ -7720,8 +7711,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, [o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif @@ -7956,8 +7947,8 @@ .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] - mova m12, [o(clip_min)] - mova m13, 
[o(clip_max)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif diff -Nru dav1d-0.9.2/src/x86/itx_avx2.asm dav1d-1.0.0/src/x86/itx_avx2.asm --- dav1d-0.9.2/src/x86/itx_avx2.asm 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/itx_avx2.asm 2022-03-18 14:31:56.018356000 +0000 @@ -131,16 +131,7 @@ ; single rip-relative lea and then address things relative from that with ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. %define o_base deint_shuf + 128 -%define o(x) (rax - (o_base) + (x)) - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - +%define o(x) (r6 - (o_base) + (x)) %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; flags: 1 = swap, 2 = interleave, 4: coef_regs @@ -364,7 +355,7 @@ %macro INV_TXFM_FN 3 ; type1, type2, size cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%3_internal_8bpc) - lea rax, [o_base] + lea r6, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. lea tx2q, [m(i%2_%3_internal_8bpc).pass2] @@ -1019,7 +1010,7 @@ vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 vperm2i128 m4, m1, m4, 0x31 vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 - pshufd m3, m3, q1032 ; in12 in15 in13 in14 + pshufd m3, m3, q1032 ; in15 in12 in13 in14 pshufd m2, m4, q1032 ; in11 in8 in9 in10 cglobal_label .main2 vpbroadcastd m8, [o(pd_2048)] @@ -3029,7 +3020,7 @@ %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob @@ -3182,7 +3173,7 @@ mova [rsp+gprsize+1*32], m1 mova m0, [rsp+gprsize+2*32] mova [rsp+gprsize+2*32], m6 - lea r5, [rax-(o_base)+pw_201_4091x8] + lea r5, [r6-(o_base)+pw_201_4091x8] ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a @@ -3292,7 +3283,7 @@ %endmacro cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -3628,7 +3619,7 @@ %endmacro cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ @@ -3992,7 +3983,7 @@ %endmacro cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -4092,7 +4083,7 @@ cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob %undef cmp - lea rax, [o_base] + lea r6, [o_base] vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m12, [o(pw_8192)] @@ -4103,7 +4094,7 @@ add eobd, -279 ; if (eob > 278) adc r4b, al ; iteration_count++ lea r3, [strideq*3] - mov rax, cq + mov r6, cq paddw m11, m12, m12 ; pw_16384 .loop: mova xm0, [cq+64* 0] @@ -4138,30 +4129,30 @@ sub cq, 32 pxor m0, m0 mov r0d, 8 - cmp cq, rax + cmp cq, r6 ja .zero_loop .zero_loop_half: - mova [rax+64*0], m0 - mova [rax+64*1], m0 - add rax, 64*4 - mova [rax-64*2], m0 - mova [rax-64*1], m0 + mova [r6+64*0], m0 + mova [r6+64*1], m0 + add r6, 64*4 + mova [r6-64*2], m0 + mova [r6-64*1], m0 sub r0d, 2 jg .zero_loop_half RET 
.zero_loop: - mova [rax+32*0], m0 - mova [rax+32*1], m0 - mova [rax+32*2], m0 - mova [rax+32*3], m0 - add rax, 32*4 + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 dec r0d jg .zero_loop RET cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob %undef cmp - lea rax, [o_base] + lea r6, [o_base] vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m11, [o(pw_2048)] @@ -4172,7 +4163,7 @@ lea r4d, [r4+r3*2] lea r3, [strideq*3] mov r5, dstq - mov rax, cq + mov r6, cq .loop: mova xm0, [cq+32* 0] mova xm1, [cq+32* 1] @@ -4214,17 +4205,17 @@ pxor m0, m0 add cd, 384 .zero_loop: - mova [rax+32*0], m0 - mova [rax+32*1], m0 - mova [rax+32*2], m0 - mova [rax+32*3], m0 - add rax, 32*4 + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 sub cd, 128 jge .zero_loop RET cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -4375,7 +4366,7 @@ lea eobd, [eobq*2-8] lea r4, [strideq*3] mov r5, dstq - lea rax, [cq+32] + lea r6, [cq+32] .loop: mova xm0, [cq+64* 0] mova xm1, [cq+64* 1] @@ -4412,23 +4403,23 @@ .ret: pxor m0, m0 mov r0d, 16 - cmp cq, rax + cmp cq, r6 jne .zero_loop .zero_loop_topleft: - mova [rax-32*1], m0 - mova [rax+32*1], m0 - mova [rax+32*3], m0 - mova [rax+32*5], m0 - add rax, 64*4 + mova [r6-32*1], m0 + mova [r6+32*1], m0 + mova [r6+32*3], m0 + mova [r6+32*5], m0 + add r6, 64*4 sub r0d, 4 jg .zero_loop_topleft RET .zero_loop: - mova [rax-32*1], m0 - mova [rax+32*0], m0 - mova [rax+32*1], m0 - mova [rax+32*2], m0 - add rax, 32*4 + mova [r6-32*1], m0 + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + add r6, 32*4 dec r0d jg .zero_loop RET @@ -4488,7 +4479,7 @@ %endmacro cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -4628,9 +4619,9 @@ vinserti128 m5, [r3+32*3+ 0], 1 vinserti128 m6, [r3+32*0+16], 1 .fast3: - add rax, o_idct64_offset + add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 - add rax, 8 + add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova xm0, [r2-32*2+ 0] @@ -4762,7 +4753,7 @@ ret %define o_base pw_5 + 128 .main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub - sub rax, o_idct64_offset + 8 + sub r6, o_idct64_offset + 8 vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m13, [o(pw_2896_2896)] @@ -4810,7 +4801,7 @@ ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a ret .main_part2_pass2: - sub rax, o_idct64_offset + 8 + sub r6, o_idct64_offset + 8 vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m13, [o(pw_2896_2896)] @@ -4834,7 +4825,7 @@ ret cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -4914,9 +4905,9 @@ mova m7, [cq+32* 7] pxor m8, m8 REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 - add rax, o_idct64_offset + add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 - add rax, 8 + add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [cq+32* 5] @@ -4998,7 +4989,7 @@ RET cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -5170,9 +5161,9 @@ mova m5, 
[r8-32*1] mova m6, [r8+32*0] .fast4: - add rax, o_idct64_offset + add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 - add rax, 8 + add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [r7-32*2] @@ -5201,7 +5192,7 @@ RET cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -5261,10 +5252,10 @@ pmulhrsw m7, [cq+64* 7] pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 - add rax, o_idct64_offset + add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] - add rax, 8 + add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 pmulhrsw m0, m7, [cq+64* 5] @@ -5383,7 +5374,7 @@ ret cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] + lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] @@ -5440,9 +5431,9 @@ mova m7, [cq+64* 7] pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 - add rax, o_idct64_offset + add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 - add rax, 8 + add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [cq+64* 5] @@ -5535,9 +5526,9 @@ mova m5, [r3-32*1] ; 23 mova m6, [r3+32*0] ; 25 .fast3: - add rax, o_idct64_offset + add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 - add rax, 8 + add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [r2-32*2] ; 5 diff -Nru dav1d-0.9.2/src/x86/itx_avx512.asm dav1d-1.0.0/src/x86/itx_avx512.asm --- dav1d-0.9.2/src/x86/itx_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/itx_avx512.asm 2022-03-18 14:31:56.018356000 +0000 @@ -0,0 +1,7320 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 +int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 + db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 + db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 + db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 +int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 + db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 + db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 + db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 +int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 + db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 + db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 + db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 +idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 + db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 + db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 + db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63 +idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9 + db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17 + db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51 + db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35 +idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35 + db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21 + db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51 + db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37 +end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60 + db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61 + db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63 + db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62 + +; packed 4-bit qword shuffle indices +permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262 + dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373 + dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb + dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea +permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604 + dq 0xc824352d56128751, 0xd906171e74301e15 + dq 0x6271604b03472d62, 0x735342782165b426 + dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37 +permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486 + dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597 + dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e + dq 0x5115049dd9045b79, 0x733726bffb263d1f +permD: dq 0x0cda098800041504, 0x0edb09b2028c3726 + dq 0x0f11fa9c01150415, 0x0988f326039d2637 + dq 0x05640f1108269d8c, 0x05290edb0aaebfae + dq 0x0005000509378c9d, 0xffffffff0bbfaebf + +pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 +gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 +gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 +gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10 + +int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 +int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 +int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7 +deint_shuf: db 
0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0 + +pb_32: times 4 db 32 +pw_2048: times 2 dw 2048 +pw_4096: times 2 dw 4096 +pw_8192: times 2 dw 8192 +pw_16384: times 2 dw 16384 +pw_1697x16: times 2 dw 1697*16 +pw_1697x8: times 2 dw 1697*8 +pw_2896x8: times 2 dw 2896*8 +pd_2048: dd 2048 + +%define pw_5 (permD+52) +%define pd_m1 (permD+60) +%define pw_3803_1321 (permD+44) +%define pw_2482_3803 (permD+12) +%define pw_2440_3290 (permD+ 4) +%define pw_m3290_2440 (permD+28) +%define pw_3857_1380 (permD+36) +%define pw_m1380_3857 (permD+20) + +pw_8192_m8192: dw 8192, -8192 +pw_m8192_8192: dw -8192, 8192 +pw_16384_m16384: dw 16384, -16384 +pw_m16384_16384: dw -16384, 16384 + +pw_m1321_2482: dw -1321, 2482 +pw_m3344_3344: dw -3344, 3344 +pw_2482_3344: dw 2482, 3344 +pw_m3803_3344: dw -3803, 3344 +pd_3344: dd 3344 +pw_m1321_m3344: dw -1321, -3344 +pw_2896_m2896: dw 2896, -2896 + +pw_1567_m3784: dw 1567, -3784 +pw_3784_m1567: dw 3784, -1567 +pw_4017_m799: dw 4017, -799 +pw_2276_m3406: dw 2276, -3406 +pw_m799_m4017: dw -799, -4017 +pw_m3406_m2276: dw -3406, -2276 + +%macro COEF_PAIR 2-3 0 +pw_%1_%2: dw %1, %2 +pw_m%2_%1: dw -%2, %1 +%if %3 +pw_m%1_m%2: dw -%1, -%2 +%endif +%endmacro + +COEF_PAIR 2896, 2896 +COEF_PAIR 1567, 3784, 1 +COEF_PAIR 3784, 1567 +COEF_PAIR 201, 4091 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 3035, 2751 +COEF_PAIR 3513, 2106 +COEF_PAIR 4052, 601 +COEF_PAIR 3166, 2598, 1 +COEF_PAIR 3920, 1189, 1 +COEF_PAIR 2276, 3406 +COEF_PAIR 4017, 799 + +%macro COEF_X8 1-* +%rep %0 + dw %1*8, %1*8 + %rotate 1 +%endrep +%endmacro + +pw_m2276x8: COEF_X8 -2276 +pw_3406x8: COEF_X8 3406 +pw_4017x8: COEF_X8 4017 +pw_799x8: COEF_X8 799 +pw_3784x8: COEF_X8 3784 +pw_1567x8: COEF_X8 1567 + +pw_4076x8: COEF_X8 4076 +pw_401x8: COEF_X8 401 +pw_m2598x8: COEF_X8 -2598 +pw_3166x8: COEF_X8 3166 +pw_3612x8: COEF_X8 3612 +pw_1931x8: COEF_X8 1931 +pw_m1189x8: COEF_X8 -1189 +pw_3920x8: COEF_X8 3920 + +pw_4091x8: COEF_X8 4091 +pw_201x8: COEF_X8 201 +pw_m2751x8: COEF_X8 -2751 +pw_3035x8: COEF_X8 3035 +pw_3703x8: COEF_X8 3703 +pw_1751x8: COEF_X8 1751 +pw_m1380x8: COEF_X8 -1380 +pw_3857x8: COEF_X8 3857 +pw_3973x8: COEF_X8 3973 +pw_995x8: COEF_X8 995 +pw_m2106x8: COEF_X8 -2106 +pw_3513x8: COEF_X8 3513 +pw_3290x8: COEF_X8 3290 +pw_2440x8: COEF_X8 2440 +pw_m601x8: COEF_X8 -601 +pw_4052x8: COEF_X8 4052 + +pw_401_4076x8: dw 401*8, 4076*8 +pw_m2598_3166x8: dw -2598*8, 3166*8 +pw_1931_3612x8: dw 1931*8, 3612*8 +pw_m1189_3920x8: dw -1189*8, 3920*8 +pw_799_4017x8: dw 799*8, 4017*8 +pw_m2276_3406x8: dw -2276*8, 3406*8 + +pw_201_4091x8: dw 201*8, 4091*8 +pw_m601_4052x8: dw -601*8, 4052*8 +pw_995_3973x8: dw 995*8, 3973*8 +pw_m1380_3857x8: dw -1380*8, 3857*8 +pw_1751_3703x8: dw 1751*8, 3703*8 +pw_m2106_3513x8: dw -2106*8, 3513*8 +pw_2440_3290x8: dw 2440*8, 3290*8 +pw_m2751_3035x8: dw -2751*8, 3035*8 + +pw_101_4095x8: dw 101*8, 4095*8 +pw_m2824_2967x8: dw -2824*8, 2967*8 +pw_1660_3745x8: dw 1660*8, 3745*8 +pw_m1474_3822x8: dw -1474*8, 3822*8 +pw_897_3996x8: dw 897*8, 3996*8 +pw_m2191_3461x8: dw -2191*8, 3461*8 +pw_2359_3349x8: dw 2359*8, 3349*8 +pw_m700_4036x8: dw -700*8, 4036*8 +pw_501_4065x8: dw 501*8, 4065*8 +pw_m2520_3229x8: dw -2520*8, 3229*8 +pw_2019_3564x8: dw 2019*8, 3564*8 +pw_m1092_3948x8: dw -1092*8, 3948*8 +pw_1285_3889x8: dw 1285*8, 3889*8 +pw_m1842_3659x8: dw -1842*8, 3659*8 +pw_2675_3102x8: dw 2675*8, 3102*8 +pw_m301_4085x8: dw -301*8, 4085*8 + +idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474 +COEF_PAIR 401, 4076, 1 +COEF_PAIR 799, 4017 + 
COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996 +dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017 + COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092 +COEF_PAIR 1931, 3612, 1 +COEF_PAIR 3406, 2276 + COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889 +dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276 + +SECTION .text + +%define o_base int8_permA+64*18 +%define o(x) (r5 - (o_base) + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, +; 16 = special_mul1, 32 = special_mul2 +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags + mova m%2, m%4 +%if %7 & 16 + vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd} + mova m%3, m%4 +%if %7 & 32 + vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} +%else + vpdpwssd m%3, m%1, m%6 +%endif +%elif %7 & 32 + vpdpwssd m%2, m%1, m%5 + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} +%elif %6 < 32 + vpdpwssd m%2, m%1, m%5 + mova m%3, m%4 + vpdpwssd m%3, m%1, m%6 +%elif %7 & 1 + vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd} + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd} +%else + vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd} + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd} +%endif +%if %7 & 2 + psrld m%2, 12 + pslld m%3, 4 + vpshrdd m%1, m%3, m%2, 16 +%elif %7 & 4 + ; compared to using shifts (as above) this has better throughput, + ; but worse latency and requires setting up the opmask/index + ; registers, so only use this method for the larger transforms + pslld m%1, m%2, 4 + vpmultishiftqb m%1{k7}, m13, m%3 +%else + psrad m%2, 12 + psrad m%3, 12 +%if %7 & 8 == 0 + packssdw m%1, m%3, m%2 +%endif +%endif +%endmacro + +; flags: same as ITX_MUL2X_PACK +%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags +%if %11 & 1 + vpbroadcastd m%4, [o(pw_%9_%10)] + vpbroadcastd m%4{k1}, [o(pw_%7_%8)] + vpbroadcastd m%5, [o(pw_m%10_%9)] + vpbroadcastd m%5{k1}, [o(pw_m%8_%7)] +%else + vpbroadcastd m%4, [o(pw_m%10_%9)] + vpbroadcastd m%4{k1}, [o(pw_m%8_%7)] + vpbroadcastd m%5, [o(pw_%9_%10)] + vpbroadcastd m%5{k1}, [o(pw_%7_%8)] +%endif + ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11 +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 + punpcklwd m%3, m%2, m%1 + punpckhwd m%2, m%1 +%if %7 < 32 + mova m%1, m%5 + vpdpwssd m%1, m%3, m%7 + mova m%4, m%5 + vpdpwssd m%4, m%2, m%7 +%else + mova m%1, m%5 + vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd} + mova m%4, m%5 + vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd} +%endif + psrad m%1, 12 + psrad m%4, 12 + packssdw m%1, m%4 + mova m%4, m%5 +%if %7 < 32 + vpdpwssd m%4, m%2, m%6 + mova m%2, m%5 + vpdpwssd m%2, m%3, m%6 +%else + vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd} + mova m%2, m%5 + vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd} +%endif + psrad m%4, 12 + psrad m%2, 12 +%if %0 == 8 + packssdw m%8, m%2, m%4 +%else + packssdw m%2, m%4 +%endif +%endmacro + +%macro WRAP_XMM 1+ + %xdefine %%reset RESET_MM_PERMUTATION + INIT_XMM cpuname + DEFINE_MMREGS xmm + AVX512_MM_PERMUTATION + %1 + %%reset +%endmacro + +%macro WRAP_YMM 1+ + INIT_YMM cpuname + %1 + INIT_ZMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE 
%%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base + %define %%p1 m(i%1_%3_internal_8bpc) + lea baseq, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%3_internal_8bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [cq], eobd + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0 + vpbroadcastd m4, [o(pd_2048)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m4, m1, m0 ; in2 in0 + punpckhwd m5, m1, m0 ; in3 in1 +.main2: + vpbroadcastd m3, [o(pd_2048)] + mova m0, m3 + vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd} + mova m2, m3 + vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd} + mova m1, m3 + vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd} + vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd} + vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd} + vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd} + vpdpwssd m1, m5, [o(pd_3344)] {bcstd} + vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd} + REPX {psrad x, 12}, m0, m2, m1, m3 + packssdw m0, m2 ; out0 out1 + packssdw m1, m3 ; out2 out3 +%endmacro + +INIT_XMM avx512icl +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + pxor ymm16, ymm16 + mova [cq], ymm16 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor ymm16, ymm16 + mova [cq], ymm16 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call 
m(iadst_4x4_internal_8bpc).main +.end: + pxor ymm16, ymm16 + mova [cq], ymm16 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct + movd xmm1, [o(pw_2896x8)] + pmulhrsw xmm0, xmm1, [cq] + movd xmm2, [o(pw_2048)] + pmulhrsw xmm0, xmm1 + pmulhrsw xmm0, xmm1 + pmulhrsw xmm0, xmm2 + vpbroadcastw ym0, xmm0 + mova ym1, ym0 + jmp m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT8_1D_PACKED 0 + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 +.main2: + vpbroadcastd m6, [o(pd_2048)] + ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 + ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 +%if mmsize > 16 + vbroadcasti32x4 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + punpckhqdq m2, m4, m0 ; t7 t6 + punpcklqdq m4, m0 ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(pd_2048)] +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti32x4 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + vprord m1, 16 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + mova m2, m6 + vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} + mova m5, m6 + vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + mova m5, m6 + vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} + mova m3, m6 + vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} + psrad m5, 12 + psrad m3, 12 + packssdw m1, m3, m5 ; out2 -out3 +%else + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + 
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t6 t7 + paddsw m4, m5 ; -out1 out6 + vpbroadcastd m5, [o(pw_2896x8)] + punpckhqdq m3, m2, m1 ; t3 t7 + punpcklqdq m2, m1 ; t2 t6 + paddsw m1, m2, m3 ; t2+t3 t6+t7 + psubsw m2, m3 ; t2-t3 t6-t7 + punpckhqdq m3, m4, m0 ; out6 -out7 + punpcklqdq m0, m4 ; out0 -out1 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx512icl +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT4_1D_PACKED + vbroadcasti32x4 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(pw_2048)] + vinserti32x4 m0, m0, xm2, 1 + vinserti32x4 m1, m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8bpc).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vinserti32x4 m0, xm2, 1 + vinserti32x4 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + punpcklqdq m4, m5 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + vpbroadcastd m3, strided + pmulld m5, m3, [o(pd_0to15)] + kxnorb k1, k1, k1 + kmovb k2, k1 + vpgatherdd m3{k1}, [dstq+m5] + pxor m4, m4 + mova [cq], zmm20 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpscatterdd [dstq+m5]{k2}, m0 + RET +ALIGN function_align +.main_pass1: + punpckhwd xm0, xm4, xm3 ; 0 7 + punpckhwd xm1, xm5, xm2 ; 2 5 + punpcklwd xm2, xm5 ; 4 3 + punpcklwd xm3, xm4 ; 6 1 + WRAP_XMM IADST8_1D_PACKED 1 + punpcklqdq xm3, xm4, xm0 ; out6 -out7 + punpckhqdq xm0, xm4 ; out0 -out1 + ret +ALIGN function_align +.main_pass2: + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8bpc).main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vinserti32x4 m3, xm1, 1 + vinserti32x4 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp 
m(iadst_4x8_internal_8bpc).end + +INIT_ZMM avx512icl +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m0, [o(pw_2896x8)] + pmulhrsw m0, [cq] + mova m1, [o(int8_permB)] + vpbroadcastd m2, [o(pw_1697x8)] + vpermb m0, m1, m0 + pmulhrsw m2, m0 + paddsw m0, m2 + vextracti32x8 ym1, m0, 1 + jmp tx2q +.pass2: + vpbroadcastd ym4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + imul r6d, 181 + add r6d, 128+2048 + sar r6d, 8+4 + vpbroadcastw m0, r6d + mova m1, m0 + jmp m(iadst_4x16_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT16_1D_PACKED 0 + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 +.main2: + vpbroadcastd m10, [o(pd_2048)] +.main3: + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 ; 0x33... + ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a + ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a + ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a + ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a +.main4: + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m4, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2 + psubsw m0, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 +.main5: + ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a + ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a +%if mmsize > 16 + vbroadcasti32x4 m5, [o(deint_shuf)] +%else + mova m5, [o(deint_shuf)] +%endif + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + paddsw m3, m2, m4 ; t9 t14 + psubsw m2, m4 ; t10 t13 + pshufb m8, m5 + pshufb m7, m5 + pshufb m3, m5 + ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6 + ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 + ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a + punpckhqdq m2, m7, m1 ; t7 t6 + punpcklqdq m7, m1 ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + packssdw m5, m11 ; t12 t13a + packssdw m4, m0 ; t11 t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova ym1, [cq+32*2] + vinserti32x8 m1, [cq+32*0], 1 + mova m0, [o(int16_perm)] + mova ym2, [cq+32*3] + vinserti32x8 m2, [cq+32*1], 1 + vpbroadcastd m4, [o(pd_2048)] + vpermb m1, m0, m1 ; c0 
a0 c1 a1 c2 a2 c3 a3 + vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2 + vpbroadcastd m4, [o(pw_16384)] + psubsw m3, m1, m2 + paddsw m1, m2 ; out0 out1 + vprord m3, 16 ; out2 out3 + punpckldq m0, m1, m3 + punpckhdq m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + jmp tx2q +.pass2: + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, ym1, 1 + vextracti32x4 xm4, m0, 2 + vextracti32x4 xm5, m1, 2 + vextracti32x4 xm6, m0, 3 + vextracti32x4 xm7, m1, 3 + call .main + vinserti32x4 ym0, xm2, 1 + vinserti32x4 ym1, xm3, 1 + vinserti32x4 ym4, xm6, 1 + vinserti32x4 ym5, xm7, 1 + vinserti32x8 m0, ym4, 1 + vinserti32x8 m1, ym5, 1 + vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 + jmp m(iadst_4x16_internal_8bpc).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [o(permB)] + vpermq m0, m1, [cq+64*0] + vpermq m1, m1, [cq+64*1] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m3, [o(pw_16384)] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m3 + pmulhrsw m0, m3 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(pw_2048)] + psrlq m10, 4 + psubw m6, m8, m5 +.end: + vpbroadcastd m7, [o(pw_2896x8)] + paddsw ym1, ym2, ym4 + psubsw ym2, ym4 + vinserti32x8 m1, ym2, 1 + pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10 + psrlq m0, m10, 4 + vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d + vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f + punpcklqdq m5, m6 +.end2: + pmulhrsw m0, m5 + pmulhrsw m1, m5 +.end3: + vpbroadcastd m3, strided + pmulld m5, m3, [o(pd_0to15)] + kxnorw k1, k1, k1 + kmovw k2, k1 + vpgatherdd m3{k1}, [dstq+m5] + pxor m4, m4 + mova [cq+64*0], m4 + mova [cq+64*1], m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpscatterdd [dstq+m5]{k2}, m0 + RET +ALIGN function_align +.main: + movu m3, [o(permB+1)] + psrlq m10, m3, 4 +.main2: + vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 + vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + kxnorb k1, k1, k1 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + kshiftrb k1, k1, 4 + vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 + vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 +INIT_YMM avx512icl + vpcmpub k7, m13, m9, 6 ; 0x33... 
+ pxor m8, m8 + ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5 + ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5 + ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5 + ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m4, m1 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5 + psubw m7, m8, m7 + ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4 + vpbroadcastd m6, [o(pw_3784_m1567)] + vpbroadcastd m6{k1}, [o(pw_m3784_1567)] + psubsw m1, m0, m4 ; t5 t4 t7 t6 + paddsw m0, m4 ; t1 t0 t3 t2 + psubsw m4, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14 + vbroadcasti32x4 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a + vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a + vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14 + vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m4, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m1, m2 ; out12 -out3 -out13 out2 + psubsw m1, m2 ; t7 t6 t15a t14a + punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a + punpcklqdq m4, m1 ; t3a t7 t11 t15a +INIT_ZMM avx512icl + vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [o(permB)] + vpermq m0, m1, [cq+64*0] + vpermq m1, m1, [cq+64*1] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m3, [o(pw_16384)] + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8bpc).main + vpbroadcastd m6, [o(pw_2048)] + psrlq m10, 12 + psubw m5, m8, m6 + jmp m(iadst_4x16_internal_8bpc).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m2, [o(int16_perm)] + vpermb m1, m2, [cq+64*0] + vpermb m2, m2, [cq+64*1] + vpbroadcastd m4, [o(pw_1697x8)] + vpbroadcastd m0, [o(pd_m1)] + pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is + vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal + pmulhrsw m4, m2 ; it still works, but if the input is -1 the + vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes + vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless + vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here. 
+ punpckldq m0, m1, m2 + punpckhdq m1, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x16)] + vpbroadcastd m5, [o(pw_2048)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m0 + paddsw m1, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x16_internal_8bpc).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti32x4 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_8x4_internal_8bpc).end3 +%endif +%endmacro + +INIT_YMM avx512icl +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(pw_2896x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct_4x8_internal_8bpc).main + vbroadcasti32x4 m4, [o(deint_shuf)] + vinserti32x4 m3, m1, xm3, 1 + vinserti32x4 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti32x4 m0, xm2, 1 + vinserti32x4 m1, xm3, 1 + pxor m3, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + psubsw m3, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 +.end2: + vpbroadcastd m2, [o(pw_2048)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +.end3: + pxor m2, m2 + mova [cq], zmm18 + lea r6, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + RET +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti32x4 m3, m3, xm1, 1 + vinserti32x4 m2, m2, xm0, 1 + punpckhwd m1, m3, m2 + punpcklwd m3, m2 + pxor m0, m0 + psubsw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m2, m1 + vpermq m1, m0, q2031 + vpermq m0, m2, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst 
+INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm0, [cq+16*1] + vinserti32x4 m2, [cq+16*2], 1 + vinserti32x4 m0, [cq+16*3], 1 + vpbroadcastd m3, [o(pw_2896x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + paddsw m0, m0 + paddsw m1, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct_dct +INIT_ZMM avx512icl + movsx r6d, word [cq] + mov [cq], eobd +.dconly: + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 +.dconly2: + vpbroadcastd ym2, strided + imul r6d, 181 + pmulld ym5, ym2, [o(pd_0to15)] + kxnorb k1, k1, k1 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m3, m3 + vpbroadcastw m4, r6d +.dconly_loop: + kmovb k2, k1 + vpgatherdq m2{k1}, [dstq+ym5] + punpcklbw m0, m2, m3 + punpckhbw m1, m2, m3 + paddw m0, m4 + paddw m1, m4 + packuswb m0, m1 + kmovb k1, k2 + vpscatterdq [dstq+ym5]{k2}, m0 + lea dstq, [dstq+strideq*8] + sub r3d, 8 + jg .dconly_loop + RET +INIT_YMM avx512icl +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m1, [cq+32*1], q3120 ; 2 3 + call .main + shufps m4, m0, m1, q0220 + shufps m5, m0, m1, q1331 + shufps m1, m2, m3, q0220 + shufps m3, m2, m3, q1331 + vbroadcasti32x4 m0, [o(deint_shuf)] + vpbroadcastd m2, [o(pw_16384)] + REPX {pshufb x, m0}, m4, m5, m1, m3 + REPX {pmulhrsw x, m2}, m4, m5, m1, m3 + vinserti32x4 m0, m4, xm1, 1 + vshufi32x4 m2, m4, m1, 0x03 + vinserti32x4 m1, m5, xm3, 1 + vshufi32x4 m3, m5, m3, 0x03 + jmp tx2q +.pass2: + call .main + vpbroadcastd m4, [o(pw_2048)] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + jmp m(iadst_8x8_internal_8bpc).end2 +ALIGN function_align +.main: + IDCT8_1D_PACKED + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call .main_pass1 + vpbroadcastd m5, [o(pw_16384_m16384)] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + REPX {pmulhrsw x, m5}, m3, m4, m0, m1 + vshufi32x4 m2, m3, m0, 0x03 + vinserti32x4 m0, m3, xm0, 1 + vshufi32x4 m3, m4, m1, 0x03 + vinserti32x4 m1, m4, xm1, 1 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call .main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vpbroadcastd xm4, [o(pw_4096)] + psubw m4, m5 ; lower half = 2048, upper half = -2048 +.end: + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + pmulhrsw m2, m4 + pmulhrsw m3, m4 +.end4: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + lea r6, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 5 + RET +ALIGN 
function_align +.main_pass1: + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 + IADST8_1D_PACKED 1 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 + ret +ALIGN function_align +.main_pass2: + IADST8_1D_PACKED 2 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call m(iadst_8x8_internal_8bpc).main_pass1 + vpbroadcastd m5, [o(pw_m16384_16384)] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + punpckhwd m0, m4, m3 + punpcklwd m4, m3 + punpckhwd m3, m2, m1 + punpcklwd m2, m1 + REPX {pmulhrsw x, m5}, m0, m4, m3, m2 + vinserti32x4 m1, m0, xm3, 1 + vshufi32x4 m3, m0, m3, 0x03 + vinserti32x4 m0, m4, xm2, 1 + vshufi32x4 m2, m4, m2, 0x03 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vpbroadcastd xm5, [o(pw_4096)] + psubw m4, m5 ; lower half = -2048, upper half = 2048 + vpermq m5, m3, q2031 + vpermq m3, m0, q2031 + vpermq m0, m2, q2031 + vpermq m2, m1, q2031 + pmulhrsw m1, m0, m4 + pmulhrsw m0, m5, m4 + jmp m(iadst_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti32x4 m3, [cq+16*4], 1 + vinserti32x4 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti32x4 m4, [cq+16*6], 1 + vinserti32x4 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_8x8_internal_8bpc).end + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 16 + add r6d, 128 + sar r6d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly +%endif +%endmacro + +%macro ITX_8X16_LOAD_COEFS 0 + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m0, m4, [cq+32*0] + add cq, 32*4 + pmulhrsw m7, m4, [cq+32*3] + pmulhrsw m1, m4, [cq-32*3] + pmulhrsw m6, m4, [cq+32*2] + pmulhrsw m2, m4, [cq-32*2] + pmulhrsw m5, m4, [cq+32*1] + pmulhrsw m3, m4, [cq-32*1] + pmulhrsw m4, [cq+32*0] +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m3, [o(permB)] + vpermq m0, m3, [cq+64*0] + vpbroadcastd m4, [o(pw_2896x8)] + vpermq m1, m3, [cq+64*1] + vpermq m2, m3, [cq+64*2] + vpermq m3, m3, [cq+64*3] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3 + punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3 + punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3 + punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3 + REPX {pmulhrsw x, m5}, m4, m0, m2, m1 + punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3 + punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1 + punpckhwd 
m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3 + punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1 + punpckhdq m1, m0, m2 ; 1 5 9 13 + punpckldq m0, m2 ; 0 4 8 12 + punpckldq m2, m3, m4 ; 2 6 10 14 + punpckhdq m3, m4 ; 3 7 11 15 + jmp tx2q +.pass2: + vprord m5, [o(int16_perm)], 16 + vshufi32x4 m2, m2, q1320 ; 2 10 14 6 + vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 + vshufi32x4 m1, m3, q0132 ; 9 13 7 3 + vpermb m9, m5, m0 + vpermb m7, m5, m2 + vpermb m8, m5, m4 + vpermb m0, m5, m1 + vextracti32x8 ym6, m9, 1 + vextracti32x8 ym3, m7, 1 + vextracti32x8 ym5, m8, 1 + vextracti32x8 ym1, m0, 1 + call .main2 + mova ym8, [o(gather8a)] + lea r3, [dstq+strideq*4] + pmovzxdq m9, ym8 + pshufd ym8, ym8, q1230 + vpermt2q m0, m9, m4 + vpermt2q m1, m9, m5 + vpermt2q m2, m9, m6 + vpermt2q m3, m9, m7 +.end: + vpbroadcastd m7, [o(pw_2048)] +.end2: + pmulhrsw m0, m7 + pmulhrsw m1, m7 +.end3: + pmulhrsw m2, m7 + pmulhrsw m3, m7 +.end4: + vpbroadcastd ym6, strided + kxnorb k1, k1, k1 + pxor m4, m4 + pmulld ym8, ym6 + kmovb k2, k1 + vpgatherdq m6{k1}, [dstq+ym8] + kmovb k1, k2 + vpgatherdq m7{k2}, [r3+ym8] + mova [cq+64*0], m4 + mova [cq+64*1], m4 + kmovb k2, k1 + mova [cq+64*2], m4 + mova [cq+64*3], m4 + punpcklbw m5, m6, m4 + punpckhbw m6, m4 + paddw m0, m5 + paddw m1, m6 + packuswb m0, m1 + vpscatterdq [dstq+ym8]{k1}, m0 + punpcklbw m6, m7, m4 + punpckhbw m7, m4 + paddw m2, m6 + paddw m3, m7 + packuswb m2, m3 + vpscatterdq [r3+ym8]{k2}, m2 + RET +ALIGN function_align +.main: + WRAP_YMM IDCT16_1D_PACKED + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_8bpc).main_pass1 + vbroadcasti32x4 m6, [o(int_shuf1)] + vpbroadcastd m7, [o(pw_16384_m16384)] + punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 + pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 +.pass1_end: + REPX {pmulhrsw x, m7}, m3, m5, m4, m2 + punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m5 + punpckhqdq m3, m5 + jmp tx2q +.pass2: + call .main_pass2 + vpbroadcastd m6, [o(pw_2048)] + psrlq m10, 4 + psubw m7, m8, m6 +.pass2_end: + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m5, m2 ; out8 -out11 -out9 out10 + mova ym8, [o(gather8c)] + lea r3, [dstq+strideq] + psrlq m2, m10, 4 + vpermi2q m2, m0, m3 ; 1 3 13 15 + vpermt2q m0, m10, m3 ; 0 2 12 14 + psrlq m3, m10, 8 + vpermi2q m3, m1, m5 ; 5 7 9 11 + psrlq m10, 12 + vpermt2q m1, m10, m5 ; 4 6 8 10 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + jmp m(idct_8x16_internal_8bpc).end3 +ALIGN function_align +.main_pass1: + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m5, m2, [cq+64*0] + pmulhrsw m3, m2, [cq+64*3] + pmulhrsw m1, m2, [cq+64*1] + pmulhrsw m2, [cq+64*2] + movu m4, [o(permA+3)] + psrlq m10, m4, 4 + mova m6, m4 + vpermi2q m4, m5, m3 ; in0 in12 in2 in14 + vpermt2q m5, m10, m3 ; in15 in3 in13 in1 + vpermi2q m6, m1, m2 ; in4 in8 in6 in10 + vpermt2q m1, m10, m2 ; in11 in7 in9 in5 + jmp .main +ALIGN function_align +.main_pass2: + mova m4, [o(permC)] + psrlq m5, m4, 4 + vpermi2q m4, m0, m2 ; in0 in12 in2 in14 + psrlq m6, m5, 4 + vpermi2q m5, m1, m3 ; in15 in3 in13 in1 + psrlq m10, m6, 4 + vpermi2q m6, m0, m2 ; in4 
in8 in6 in10 + vpermt2q m1, m10, m3 ; in11 in7 in9 in5 +.main: + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + kxnorb k1, k1, k1 + punpcklwd m0, m4, m5 ; in0 in15 in2 in13 + punpckhwd m4, m5 ; in12 in3 in14 in1 + punpcklwd m5, m6, m1 ; in4 in11 in6 in9 + punpckhwd m6, m1 ; in8 in7 in10 in5 + vpcmpub k7, m13, m9, 6 ; 0x33... + pxor m8, m8 + ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 + ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5 + ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5 + ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5 + psubsw m2, m0, m6 ; t9a t8a t11a t10a + paddsw m0, m6 ; t1a t0a t3a t2a + psubsw m3, m5, m4 ; t13a t12a t15a t14a + paddsw m5, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5 + psubw m7, m8, m7 + ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4 + vpbroadcastd m6, [o(pw_3784_m1567)] + vpbroadcastd m6{k1}, [o(pw_m3784_1567)] + psubsw m1, m0, m5 ; t5 t4 t7 t6 + paddsw m0, m5 ; t1 t0 t3 t2 + psubsw m4, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a + ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15 + vbroadcasti32x4 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a + vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a + vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15 + vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12 + pshufd m2, m2, q1032 ; t7a t6a t15 t14 + psubsw m4, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m1, m2 ; out12 -out3 -out13 out2 + psubsw m1, m2 ; t7 t6 t15a t14a + punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a + punpcklqdq m4, m1 ; t3a t7 t11 t15a + ret + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_8bpc).main_pass1 + vbroadcasti32x4 m6, [o(int_shuf2)] + vpbroadcastd m7, [o(pw_m16384_16384)] + punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 + pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 + jmp m(iadst_8x16_internal_8bpc).pass1_end +.pass2: + call m(iadst_8x16_internal_8bpc).main_pass2 + vpbroadcastd m7, [o(pw_2048)] + psrlq m10, 36 + psubw m6, m8, m7 + jmp m(iadst_8x16_internal_8bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [o(int16_perm)] + vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 + vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 + vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 + vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 + vpbroadcastd m5, [o(pw_2896x8)] + punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m2, m4, m0 ; e0 f0 g0 h0 a1 f1 g1 h1 + punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3 + REPX {pmulhrsw x, m5}, m1, m2, m3, m4 + punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 + punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1 + punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2 + punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(pw_1697x16)] + mova ym8, [o(gather8b)] + lea r3, [dstq+strideq*2] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, 
m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x16_internal_8bpc).end + +%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] + pmovzxbw m%3, [dstq+%5] +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif + pmovzxbw m%4, [dstq+%6] +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vpermq m%3, m%3, q3120 + mova [dstq+%5], xm%3 + vextracti32x4 [dstq+%6], m%3, 1 +%endmacro + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2 +%endif +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct_4x16_internal_8bpc).main + vpbroadcastd m8, [o(pw_16384)] + vinserti32x4 ym1, xm3, 1 ; 3 2 7 6 + vinserti32x4 ym5, xm7, 1 ; b a f e + vinserti32x4 ym0, xm2, 1 ; 0 1 4 5 + vinserti32x4 ym4, xm6, 1 ; 8 9 c d + vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e + vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d + pmulhrsw m1, m8 + pmulhrsw m0, m8 + pshufd m1, m1, q1032 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + mova m2, [o(permA)] + jmp m(iadst_16x4_internal_8bpc).end + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+64*0] + mova m1, [cq+64*1] + movshdup m3, [o(permB)] + psrlq m10, m3, 4 + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m6, [o(pw_16384_m16384)] + psrlq m0, m10, 4 + psrlq m10, 8 +.pass1_end: + punpcklwd ym5, ym4, ym2 + punpckhwd ym4, ym2 + vinserti32x8 m5, ym4, 1 + mova m1, m9 + vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} + mova m4, m9 + vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16} + psrad m1, 12 + psrad m4, 12 + packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5 + vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d + vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + jmp tx2q +.pass2: + call .main + movu m2, [o(permA+1)] +.end: + vpbroadcastd m3, [o(pw_2048)] + pmulhrsw m0, m3 + pmulhrsw m1, m3 +.end2: + psrlq m3, m2, 4 + vpermi2q m2, m0, m1 + vpermi2q m3, m0, m1 +.end3: + lea r3, [dstq+strideq*2] + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + vinserti32x4 m1, [r3 +strideq*0], 2 + vinserti32x4 m1, [r3 +strideq*1], 3 + pxor m4, m4 + mova [cq+64*0], m4 + mova [cq+64*1], m4 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [r3 +strideq*0], m0, 2 + vextracti32x4 [r3 +strideq*1], m0, 3 + RET +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+64*0] + mova m1, [cq+64*1] 
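Nearly every constant multiplication in this file (the pw_2896x8 1/sqrt(2) scaling, the pw_1697x8/pw_1697x16 identity gains, the cosine pairs inside the butterflies) is a pmulhrsw, i.e. a rounded Q15 multiply. A minimal scalar sketch of that primitive, with illustrative names that are not part of dav1d:

#include <stdint.h>

/* Scalar model of one pmulhrsw lane: round(a * b / 32768), kept in 16 bits. */
static int16_t q15_mulhrs(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
}

/* pw_2896x8 = 2896*8 = 23168, and 23168/32768 ~= 0.7071 ~= 1/sqrt(2). */
static int16_t scale_inv_sqrt2(int16_t x) { return q15_mulhrs(x, 2896 * 8); }

/* pw_1697x8 = 13576 ~= sqrt(2)-1 in Q15, so x + q15_mulhrs(x, 13576) is the
 * sqrt(2) gain applied by the iidentity_*x4 second passes in this file.     */
static int16_t identity4(int16_t x) { return x + q15_mulhrs(x, 1697 * 8); }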
+ movshdup m3, [o(permB)] + psrlq m10, m3, 4 + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m6, [o(pw_m16384_16384)] + psrlq m0, m10, 12 + psrlq m10, 16 + jmp m(iadst_16x4_internal_8bpc).pass1_end +.pass2: + call m(iadst_16x4_internal_8bpc).main + movu m2, [o(permA+2)] + jmp m(iadst_16x4_internal_8bpc).end + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [cq+64*0] + mova m2, [cq+64*1] + vpbroadcastd m3, [o(pw_1697x16)] + vpbroadcastd m4, [o(pw_16384)] + mova m5, [o(idtx_16x4p)] + shufps m0, m1, m2, q2020 + shufps m1, m2, q3131 + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddsw m0, m2 + paddsw m1, m3 + vpermb m0, m5, m0 + vpermb m1, m5, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + movu m2, [o(permA+1)] + jmp m(iadst_16x4_internal_8bpc).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 8 +.dconly: + imul r6d, 181 + add r6d, 128 + sar r6d, 8 +.dconly2: + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 +.dconly3: + imul r6d, 181 + lea r2, [strideq*3] + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + vinserti32x4 m1, [dstq+strideq*2], 2 + vinserti32x4 m1, [dstq+r2 ], 3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub r3d, 4 + jg .dconly_loop + RET +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m0, [cq+32*0], q3120 + add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst + +cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m1, [o(pw_2896x8)] + vpermq m0, [cq+64*0], q3120 + vpermq m2, [cq+64*1], q3120 + vpermq m4, [cq+64*2], q3120 + vpermq m6, [cq+64*3], q3120 + REPX {pmulhrsw x, m1}, m0, m2, m4, m6 + vextracti32x8 ym1, m0, 1 + vextracti32x8 ym3, m2, 1 + vextracti32x8 ym5, m4, 1 + vextracti32x8 ym7, m6, 1 + call m(idct_8x16_internal_8bpc).main + vbroadcasti32x4 m8, [o(int_shuf1)] + vbroadcasti32x4 m9, [o(int_shuf2)] + vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3 + vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3 + vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3 + vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3 + vpbroadcastd m2, [o(pw_16384)] + pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3 + pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3 + pshufb m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3 + REPX {pmulhrsw x, m2}, m0, m1, m6, m7 + punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 
j3 k3 l3 + jmp tx2q +.pass2: + vshufi32x4 m0, m2, m4, q2020 ; 0 1 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 ; 2 3 + vshufi32x4 m3, m5, q3131 ; 6 7 + call .main + movshdup m4, [o(permC)] + psrlq m6, m4, 4 + vpermq m5, m4, q1032 + vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3 + vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1 + psrlq m6, m5, 4 + vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3 + vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1 + vpbroadcastd m6, [o(pw_2048)] +.end: + REPX {pmulhrsw x, m6}, m0, m4, m1, m5 +.end2: + lea r3, [dstq+strideq*4] + lea r4, [strideq*3] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r4 ], 1 + vinserti32x4 m3, [r3 +strideq*0], 2 + vinserti32x4 m6, [r3 +strideq*2], 2 + vinserti32x4 m3, [r3 +strideq*1], 3 + vinserti32x4 m6, [r3 +r4 ], 3 + pxor m7, m7 + mova [cq+64*0], m7 + mova [cq+64*1], m7 + mova [cq+64*2], m7 + mova [cq+64*3], m7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m4, m3 + packuswb m0, m4 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [r3 +strideq*0], m0, 2 + vextracti32x4 [r3 +strideq*1], m0, 3 + punpcklbw m3, m6, m7 + punpckhbw m6, m7 + paddw m1, m3 + paddw m5, m6 + packuswb m1, m5 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r4 ], ym1, 1 + vextracti32x4 [r3 +strideq*2], m1, 2 + vextracti32x4 [r3 +r4 ], m1, 3 + RET +ALIGN function_align +.main: + IDCT8_1D_PACKED + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_8x16_internal_8bpc).main_pass1 + vpbroadcastd m7, [o(pw_16384_m16384)] + psrlq m10, 4 +.pass1_end: + punpcklwd m5, m4, m2 + punpckhwd m4, m2 + mova m1, m9 + vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} + mova m6, m9 + vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16} + mova m2, m9 + vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16} + vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16} + psrad m1, 12 + psrad m6, 12 + packssdw m1, m6 ; out8 -out7 -out9 out6 + psrad m2, 12 + psrad m9, 12 + packssdw m2, m9 ; -out11 out4 out10 -out5 + psrlq m4, m10, 4 + vpermi2q m4, m0, m2 + vpermt2q m0, m10, m2 + psrlq m5, m10, 8 + vpermi2q m5, m1, m3 + psrlq m10, 12 + vpermt2q m1, m10, m3 + punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpcklwd m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3 + punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3 + REPX {pmulhrsw x, m7}, m2, m3, m4, m5 + jmp tx2q +.pass2: + vshufi32x4 m0, m2, m4, q2020 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 + vshufi32x4 m3, m5, q3131 ; 6 7 + pshufd m4, m0, q1032 ; 1 0 + pshufd m5, m1, q1032 ; 3 2 + call .main_pass2 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + psrlq m6, m4, 4 + mova m5, m4 + vpermi2q m4, m0, m2 + vpermt2q m0, m6, m2 + vpermi2q m5, m1, m3 + vpermt2q m1, m6, m3 + jmp m(idct_16x8_internal_8bpc).end2 +ALIGN function_align +.main_pass1: + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m3, m4, [cq+64*0] + pmulhrsw m1, m4, [cq+64*3] + pmulhrsw m2, m4, [cq+64*1] + pmulhrsw m4, [cq+64*2] + mova m5, [o(int16_perm)] + kxnorb k1, k1, k1 + vpblendmd m0{k1}, m1, m3 ; 0 7 + vmovdqa32 m3{k1}, m1 ; 6 1 + vpblendmd m1{k1}, m4, m2 ; 2 5 + vmovdqa32 m2{k1}, m4 ; 4 3 + 
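The store epilogues above (punpcklbw with zero, paddw, packuswb, vextracti32x4) all perform the same operation: widen the 8-bit prediction in dst, add the already rounded 16-bit residual, and write the result back with unsigned saturation while the coefficient buffer is zeroed for the next block. A hedged scalar sketch of that epilogue, with hypothetical names:

#include <stdint.h>
#include <stddef.h>

static uint8_t clip_pixel(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Scalar equivalent of the add-to-destination epilogue: res holds the
 * rounded residual rows, dst the 8-bit prediction being updated in place. */
static void add_residual(uint8_t *dst, ptrdiff_t stride,
                         const int16_t *res, int w, int h)
{
    for (int y = 0; y < h; y++, dst += stride, res += w)
        for (int x = 0; x < w; x++)
            dst[x] = clip_pixel(dst[x] + res[x]);
}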
REPX {vpermb x, m5, x}, m0, m1, m2, m3 + IADST8_1D_PACKED 1 + ret +ALIGN function_align +.main_pass2: + IADST8_1D_PACKED 2 + movshdup m4, [o(permC)] + pxor m5, m5 + psubd m5, m6 + packssdw m6, m5 + pmulhrsw m2, m6 + pmulhrsw m3, m6 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_8x16_internal_8bpc).main_pass1 + vpbroadcastd m7, [o(pw_m16384_16384)] + psrlq m10, 20 + jmp m(iadst_16x8_internal_8bpc).pass1_end +.pass2: + vshufi32x4 m0, m2, m4, q2020 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 + vshufi32x4 m3, m5, q3131 ; 6 7 + pshufd m4, m0, q1032 ; 1 0 + pshufd m5, m1, q1032 ; 3 2 + call m(iadst_16x8_internal_8bpc).main_pass2 + pmulhrsw m5, m6, m0 + pmulhrsw m0, m6, m1 + psrlq m1, m4, 12 + psrlq m4, 8 + mova m7, m4 + vpermi2q m4, m0, m3 + vpermt2q m0, m1, m3 + vpermi2q m1, m5, m2 + vpermt2q m5, m7, m2 + jmp m(idct_16x8_internal_8bpc).end2 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m0, [o(pw_2896x8)] + pmulhrsw m3, m0, [cq+64*0] + pmulhrsw m4, m0, [cq+64*1] + pmulhrsw m5, m0, [cq+64*2] + pmulhrsw m0, [cq+64*3] + vpbroadcastd m7, [o(pw_1697x16)] + vpbroadcastd m8, [o(pw_16384)] + shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5 + shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7 + shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5 + shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7 + mova m9, [o(int8_permA)] + pmulhrsw m0, m7, m2 + pmulhrsw m1, m7, m3 + pmulhrsw m6, m7, m4 + pmulhrsw m7, m5 + REPX {pmulhrsw x, m8}, m0, m1, m6, m7 + paddsw m2, m0 + paddsw m3, m1 + paddsw m4, m6 + paddsw m5, m7 + REPX {vpermb x, m9, x}, m2, m3, m4, m5 + jmp tx2q +.pass2: + mova m7, [o(permB)] + vpbroadcastd m6, [o(pw_4096)] + vpermq m0, m7, m2 + vpermq m4, m7, m4 + vpermq m1, m7, m3 + vpermq m5, m7, m5 + jmp m(idct_16x8_internal_8bpc).end + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 16 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst + +cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m7, [o(permB)] + vpermq m0, m7, [cq+64*0] + vpermq m1, m7, [cq+64*1] + vpermq m2, m7, [cq+64*2] + vpermq m3, m7, [cq+64*3] + vpermq m4, m7, [cq+64*4] + vpermq m5, m7, [cq+64*5] + vpermq m6, m7, [cq+64*6] + vpermq m7, m7, [cq+64*7] + call .main + vbroadcasti32x4 m12, [o(int_shuf1)] + vbroadcasti32x4 m11, [o(int_shuf2)] + vpbroadcastd m13, [o(pw_8192)] + pshufb m0, m12 + pshufb m8, m1, m11 + pshufb m2, m12 + pshufb m9, m3, m11 + pshufb m4, m12 + pshufb m10, m5, m11 + pshufb m6, m12 + pshufb m11, m7, m11 + REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 + punpckhdq m1, m0, m8 + punpckldq m0, m8 + punpckhdq m3, m2, m9 + punpckldq m2, m9 + punpckhdq m5, m4, m10 + punpckldq m4, m10 + punpckhdq m7, m6, m11 + punpckldq m6, m11 + jmp tx2q +.pass2: + vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc + vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 + vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec + vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 + vshufi32x4 m9, 
m5, m7, q3232 ; ia ie ma me + vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 + vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee + vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 15 + call .main + mova m8, [o(permD)] + psrlq m12, m8, 4 + psrlq m9, m8, 8 + psrlq m13, m8, 12 + mova m10, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + mova m11, m9 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m13, m3 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m12, m6 + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 +.end: + vpbroadcastd m12, [o(pw_2048)] +.end2: + REPX {pmulhrsw x, m12}, m0, m1, m4, m5 +.end3: + REPX {pmulhrsw x, m12}, m8, m9, m10, m11 + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + lea r5, [dstq+strideq*8] + lea r6, [r4 +strideq*8] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r3 ], 1 + vinserti32x4 m3, [r4+strideq*0], 2 + vinserti32x4 m6, [r4+strideq*2], 2 + vinserti32x4 m3, [r4+strideq*1], 3 + vinserti32x4 m6, [r4+r3 ], 3 + mova xm12, [r5+strideq*0] + mova xm13, [r5+strideq*2] + vinserti32x4 ym12, [r5+strideq*1], 1 + vinserti32x4 ym13, [r5+r3 ], 1 + vinserti32x4 m12, [r6+strideq*0], 2 + vinserti32x4 m13, [r6+strideq*2], 2 + vinserti32x4 m12, [r6+strideq*1], 3 + vinserti32x4 m13, [r6+r3 ], 3 + pxor m7, m7 + REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m8, m3 + packuswb m0, m8 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + paddw m1, m2 + paddw m9, m6 + packuswb m1, m9 + punpcklbw m2, m12, m7 + punpckhbw m12, m7 + paddw m2, m4 + paddw m10, m12 + packuswb m2, m10 + punpcklbw m3, m13, m7 + punpckhbw m13, m7 + paddw m3, m5 + paddw m11, m13 + packuswb m3, m11 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r3 ], ym1, 1 + vextracti32x4 [r4+strideq*0], m0, 2 + vextracti32x4 [r4+strideq*1], m0, 3 + vextracti32x4 [r4+strideq*2], m1, 2 + vextracti32x4 [r4+r3 ], m1, 3 + mova [r5+strideq*0], xm2 + vextracti32x4 [r5+strideq*1], ym2, 1 + mova [r5+strideq*2], xm3 + vextracti32x4 [r5+r3 ], ym3, 1 + vextracti32x4 [r6+strideq*0], m2, 2 + vextracti32x4 [r6+strideq*1], m2, 3 + vextracti32x4 [r6+strideq*2], m3, 2 + vextracti32x4 [r6+r3 ], m3, 3 + RET +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vpbroadcastd m10, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 +.main_fast4: + vpbroadcastd m2, [o(pw_401_4076x8)] + vpbroadcastd m4, [o(pw_m1189_3920x8)] + vpbroadcastd m3, [o(pw_799_4017x8)] + pmulhrsw m2, m8 ; t8a t15a + pmulhrsw m4, m1 ; t11a t12a + pmulhrsw m7, m3 ; t4a t7a + pxor m6, m6 + psubsw m0, m2, m4 ; t11a t12a + paddsw m8, m2, m4 ; t8a t15a + mova m1, m7 + jmp .main5 +ALIGN function_align +.main_fast: ; bottom half is zero + vpbroadcastd m10, [o(pd_2048)] +.main_fast3: + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 +.main_fast5: + vpbroadcastd m2, [o(pw_401_4076x8)] + vpbroadcastd m4, [o(pw_m2598_3166x8)] + vpbroadcastd m11, [o(pw_1931_3612x8)] + vpbroadcastd m12, [o(pw_m1189_3920x8)] + pmulhrsw m8, m2 ; t8a t15a + vpbroadcastd m2, [o(pw_799_4017x8)] + pmulhrsw m0, m4 ; t9a t14a + vpbroadcastd m4, [o(pw_m2276_3406x8)] + pmulhrsw m5, m11 ; 
t10a t13a + pmulhrsw m1, m12 ; t11a t12a + pmulhrsw m7, m2 ; t4a t7a + pmulhrsw m3, m4 ; t5a t6a + jmp .main4 +ALIGN function_align +.main: + IDCT16_1D_PACKED + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call .main_pass1 + vpbroadcastd m10, [o(pw_8192_m8192)] + punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 + punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 + punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 + punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 + punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 + punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .main_pass2 + mova m10, [o(permD)] + psrlq m8, m10, 8 + psrlq m12, m10, 12 + psrlq m13, m10, 4 + mova m9, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m12, m3 + vpbroadcastd m12, [o(pw_2048)] + mov r3d, 0xff00ff00 + mova m11, m10 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m13, m6 + kmovd k1, r3d + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 + pxor m7, m7 + vpsubw m12{k1}, m7, m12 + jmp m(idct_16x16_internal_8bpc).end2 +ALIGN function_align +.main_pass1: + mova m4, [o(permB)] + psrlq m3, m4, 4 + vpermq m0, m4, [cq+64*0] + vpermq m7, m3, [cq+64*7] + vpermq m6, m4, [cq+64*6] + vpermq m1, m3, [cq+64*1] + vpermq m2, m4, [cq+64*2] + vpermq m5, m3, [cq+64*5] + vpermq m4, m4, [cq+64*4] + vpermq m3, m3, [cq+64*3] + call .main + vpbroadcastd m13, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + mova m2, m10 + vpdpwssd m2, m5, m13 ; -out5 + mova m8, m10 + vpdpwssd m8, m11, m13 ; out4 + mova m9, m10 + vpdpwssd m9, m5, m12 ; out10 + mova m5, m10 + vpdpwssd m5, m11, m12 ; -out11 + mova m11, m10 + vpdpwssd m11, m3, m13 ; -out7 + mova m14, m10 + vpdpwssd m14, m4, m13 ; out6 + mova m13, m10 + vpdpwssd m13, m3, m12 ; out8 + vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 + REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 + packssdw m2, m8 ; -out5 out4 + packssdw m5, m9, m5 ; out10 -out11 + packssdw m3, m11, m14 ; -out7 out6 + packssdw m4, m13, m10 ; out8 -out9 + ret +ALIGN function_align +.main_pass2: + vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc + vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 + vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec + vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 + vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me + vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 + vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee + vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 15 + REPX {pshufd x, x, q1032}, m1, m3, m5, m7 + call .main + vpbroadcastd m8, [o(pw_2896x8)] + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + punpcklqdq m9, 
m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + shufps m3, m4, q3210 ; t3a t11 + psubsw m4, m2, m3 ; out8 -out9 + paddsw m3, m2 ; -out7 out6 + paddsw m2, m5, m9 ; -out5 out4 + psubsw m5, m9 ; out10 -out11 + REPX {pmulhrsw x, m8}, m2, m3, m4, m5 + ret +ALIGN function_align +.main: + vpbroadcastd m10, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + vpcmpub k7, m13, m10, 6 ; 0x33... + ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 + ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 + ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 + ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 + ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 + ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 + ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 + psubsw m4, m0, m5 ; t9a t8a + paddsw m0, m5 ; t1a t0a + psubsw m5, m1, m6 ; t11a t10a + paddsw m1, m6 ; t3a t2a + psubsw m6, m2, m7 ; t13a t12a + paddsw m2, m7 ; t5a t4a + psubsw m7, m3, m8 ; t15a t14a + paddsw m3, m8 ; t7a t6a + ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 + ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 + ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 + ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 + psubsw m8, m1, m3 ; t7 t6 + paddsw m1, m3 ; t3 t2 + psubsw m3, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m2, m5, m7 ; t14a t15a + paddsw m7, m5 ; t10a t11a + psubsw m5, m4, m6 ; t12a t13a + paddsw m4, m6 ; t8a t9a + ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a + ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a + ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 + ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 + vbroadcasti32x4 m12, [o(deint_shuf)] + paddsw m6, m4, m7 ; -out1 out14 + psubsw m4, m7 ; t10 t11 + psubsw m11, m3, m8 ; t7 t6 + paddsw m8, m3 ; out12 -out3 + psubsw m3, m0, m1 ; t3a t2a + paddsw m0, m1 ; -out15 out0 + paddsw m1, m2, m5 ; -out13 out2 + psubsw m5, m2 ; t15a t14a + pshufb m0, m12 + pshufb m6, m12 + pshufb m8, m12 + pshufb m1, m12 + shufps m7, m6, m0, q1032 ; out14 -out15 + shufps m0, m6, m0, q3210 ; -out1 out0 + punpcklqdq m6, m8, m1 ; out12 -out13 + punpckhqdq m1, m8, m1 ; -out3 out2 + ret + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x16_internal_8bpc).main_pass1 + vpbroadcastd m10, [o(pw_m8192_8192)] + punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 + punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 + punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 + punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 + punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 + punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 + punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m5, m8, m9 ; i2 j2 
k2 l2 i3 j3 k3 l3 + jmp m(iadst_16x16_internal_8bpc).pass1_end +.pass2: + call m(iadst_16x16_internal_8bpc).main_pass2 + mova m10, [o(permD)] + psrlq m8, m10, 8 + psrlq m12, m10, 12 + psrlq m13, m10, 4 + mova m9, m8 + vpermi2q m8, m7, m5 ; 0 1 4 5 + vpermt2q m7, m12, m5 + vpermi2q m9, m6, m4 ; 2 3 6 7 + vpermt2q m6, m12, m4 + vpbroadcastd m12, [o(pw_2048)] + mov r3d, 0x00ff00ff + mova m11, m10 + vpermi2q m10, m3, m1 ; 8 9 12 13 + vpermt2q m3, m13, m1 + kmovd k1, r3d + vpermi2q m11, m2, m0 ; 10 11 14 15 + vpermt2q m2, m13, m0 + pxor m0, m0 + vpsubw m12{k1}, m0, m12 + pmulhrsw m0, m7, m12 + pmulhrsw m1, m6, m12 + pmulhrsw m4, m3, m12 + pmulhrsw m5, m2, m12 + jmp m(idct_16x16_internal_8bpc).end3 + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m8, [o(int16_perm)] + vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 + vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 + vpbroadcastd m0, [o(pw_1697x16)] + vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 + vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 + vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 + vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 + vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 + vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 + pmulhrsw m9, m0, m1 + pmulhrsw m10, m0, m2 + pmulhrsw m11, m0, m3 + pmulhrsw m12, m0, m4 + pmulhrsw m13, m0, m5 + pmulhrsw m14, m0, m6 + pmulhrsw m15, m0, m7 + pmulhrsw m0, m8 + REPX {psraw x, 1}, m9, m10, m11, m12 + pavgw m1, m9 + pavgw m2, m10 + pavgw m3, m11 + pavgw m4, m12 + REPX {psraw x, 1}, m13, m14, m15, m0 + pavgw m5, m13 + pavgw m6, m14 + pavgw m7, m15 + pavgw m8, m0 + punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 + punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 + jmp tx2q +ALIGN function_align +.pass2: + vpbroadcastd m11, [o(pw_1697x16)] + pmulhrsw m12, m11, m0 + pmulhrsw m13, m11, m1 + pmulhrsw m14, m11, m2 + pmulhrsw m15, m11, m3 + pmulhrsw m8, m11, m4 + pmulhrsw m9, m11, m5 + pmulhrsw m10, m11, m6 + pmulhrsw m11, m7 + REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + paddsw m0, m12 + paddsw m1, m13 + paddsw m2, m14 + paddsw m3, m15 + paddsw m8, m4 + movu m4, [o(permD+2)] + paddsw m9, m5 + paddsw m6, m10 + paddsw m7, m11 + psrlq m12, m4, 4 + mova m5, m4 + mova m10, m4 + mova m11, m4 + vpermi2q m4, m0, m2 ; 8 9 12 13 + vpermt2q m0, m12, m2 ; 0 1 4 5 + vpermi2q m5, m1, m3 ; 10 11 14 15 + vpermt2q m1, m12, m3 ; 2 3 6 7 + vpermi2q m10, m8, m6 + vpermt2q m8, m12, m6 + vpermi2q m11, m9, m7 + vpermt2q m9, m12, m7 + jmp m(idct_16x16_internal_8bpc).end + +%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] + vpbroadcastd m%3, [o(pw_%4_%5x8)] + punpcklwd m%1, m%2, m%2 + pmulhrsw m%1, m%3 + vpbroadcastd m%3, [o(pw_%6_%7x8)] + punpckhwd m%2, m%2 + pmulhrsw m%2, m%3 +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + cmp eobd, 107 + jb .fast + mova m5, [cq+64*5] + mova m3, [cq+64*3] + mova m1, [cq+64*1] + mova m7, [cq+64*7] + mova m2, [cq+64*2] + mova m6, [cq+64*6] + mova m0, [cq+64*0] + mova m4, [cq+64*4] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova m8, [o(idct_8x32p)] + vpbroadcastd m9, 
[o(pw_8192)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 + punpckldq m8, m0, m1 ; ab + punpckhdq m0, m1 + punpckldq m1, m2, m3 ; cd + punpckhdq m2, m3 + punpckldq m3, m4, m5 ; ef + punpckhdq m4, m5 + punpckldq m5, m6, m7 ; gh + punpckhdq m6, m7 + REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 + punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 + punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 + punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 + punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17 + punpcklqdq m20, m3, m5 + punpckhqdq m16, m3, m5 + punpcklqdq m19, m4, m6 + punpckhqdq m17, m4, m6 + vinserti32x4 ym8, ym18, xm20, 1 + vshufi32x4 ym1, ym18, ym20, 0x03 + vinserti32x4 ym9, ym14, xm16, 1 + vshufi32x4 ym3, ym14, ym16, 0x03 + vinserti32x4 ym0, ym21, xm19, 1 + vshufi32x4 ym5, ym21, ym19, 0x03 + vinserti32x4 ym7, ym15, xm17, 1 + vshufi32x4 ym6, ym15, ym17, 0x03 + call m(idct_8x16_internal_8bpc).main2 + psrlq m12, [o(permB)], 60 + vpermt2q m14, m12, m16 + vpermt2q m21, m12, m19 + vpermt2q m15, m12, m17 + vpermi2q m12, m18, m20 + vextracti32x8 ym16, m14, 1 + vextracti32x8 ym19, m21, 1 + vextracti32x8 ym17, m15, 1 + vextracti32x8 ym20, m12, 1 + call .main2 + jmp .end +.fast: ; right half is zero + mova m0, [o(int16_perm)] + mova ym2, [cq+64*4] + vinserti32x8 m2, [cq+64*0], 1 + mova ym3, [cq+64*6] + vinserti32x8 m3, [cq+64*2], 1 + mova ym4, [cq+64*3] + vinserti32x8 m4, [cq+64*5], 1 + mova ym5, [cq+64*7] + vinserti32x8 m5, [cq+64*1], 1 + REPX {vpermb x, m0, x}, m2, m3, m4, m5 + call m(idct_16x8_internal_8bpc).main2 + vbroadcasti32x4 m4, [o(int_shuf3)] + vbroadcasti32x4 m5, [o(int_shuf4)] + pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 + pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 + pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 + pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 + vpbroadcastd m4, [o(pw_8192)] + psrlq m5, [o(permB)], 60 + punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 + punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 + punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 + punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 + REPX {pmulhrsw x, m4}, m6, m17, m2, m16 + vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 + vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 + vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 + vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 + pxor ym4, ym4 + vpermt2q m2, m5, m6 ; 8 10 + vpermt2q m16, m5, m17 ; 9 11 + mova ym5, ym4 + mova ym6, ym4 + mova ym7, ym4 + vextracti32x8 ym3, m2, 1 ; 12 14 + vextracti32x8 ym17, m16, 1 ; 13 15 + call m(idct_8x16_internal_8bpc).main + call .main_fast +.end: + vpbroadcastd ym12, strided + vpbroadcastd m13, [o(pw_2048)] + pmulld ym7, ym12, [o(gather8d)] + REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11 + lea r3, [dstq+strideq*4] + shl strideq, 4 + lea r4, [dstq+strideq] + add r1, r3 + kxnorb k1, k1, k1 + pxor m6, m6 + kmovb k2, k1 + vpgatherdq m12{k1}, [r0+ym7] + kmovb k1, k2 + vpgatherdq m13{k2}, [r3+ym7] + kmovb k2, k1 + vpgatherdq m14{k1}, [r4+ym7] + kmovb k1, k2 + vpgatherdq m15{k2}, [r1+ym7] + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m4, m12, m6 + punpckhbw m12, m6 + paddw m0, m4 + paddw m1, m12 + packuswb m0, m1 + kmovb k2, k1 + vpscatterdq [r0+ym7]{k1}, m0 + punpcklbw m4, m13, m6 + punpckhbw m13, m6 + paddw m2, m4 + paddw m3, m13 + packuswb m2, m3 + kmovb k1, k2 + vpscatterdq [r3+ym7]{k2}, m2 + punpcklbw m4, m14, m6 + punpckhbw m14, m6 + paddw m8, m4 + paddw m9, m14 + packuswb m8, m9 + kmovb k2, k1 + vpscatterdq [r4+ym7]{k1}, m8 + punpcklbw m4, m15, m6 + punpckhbw m15, m6 + paddw m10, m4 + paddw m11, m15 + packuswb m10, m11 + vpscatterdq 
[r1+ym7]{k2}, m10 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 32 + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 +INIT_YMM avx512icl +ALIGN function_align +.main_fast: ; bottom half is zero + ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + jmp .main3 +ALIGN function_align +.main: + punpcklwd m12, m21, m14 ; in31 in1 + punpckhwd m14, m21 ; in3 in29 + punpcklwd m21, m20, m15 ; in27 in5 + punpckhwd m15, m20 ; in7 in25 + punpcklwd m20, m19, m16 ; in23 in9 + punpckhwd m16, m19 ; in11 in21 + punpcklwd m19, m18, m17 ; in19 in13 + punpckhwd m17, m18 ; in15 in17 +.main2: + ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a + ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a + ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a + ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a + ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a + ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a + ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a + ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a +.main3: + psubsw m11, m12, m17 ; t17 t30 + paddsw m12, m17 ; t16 t31 + psubsw m17, m15, m20 ; t18 t29 + paddsw m20, m15 ; t19 t28 + psubsw m15, m21, m16 ; t21 t26 + paddsw m21, m16 ; t20 t27 + psubsw m16, m14, m19 ; t22 t25 + paddsw m14, m19 ; t23 t24 + ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a + vpbroadcastd m8, [o(pw_m3784_1567)] + psubsw m19, m12, m20 ; t19a t28a + paddsw m20, m12 ; t16a t31a + psubsw m12, m14, m21 ; t20a t27a + paddsw m14, m21 ; t23a t24a + psubsw m21, m11, m17 ; t18 t29 + paddsw m11, m17 ; t17 t30 + psubsw m17, m16, m15 ; t21 t26 + paddsw m16, m15 ; t22 t25 + ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 + ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a + vbroadcasti32x4 m18, [o(deint_shuf)] + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m9, [o(pw_2896_2896)] + psubsw m15, m20, m14 ; t23 t24 + paddsw m20, m14 ; t16 t31 + psubsw m14, m11, m16 ; t22a t25a + paddsw m11, m16 ; t17a t30a + psubsw m16, m21, m17 ; t21 t26 + paddsw m21, m17 ; t18 t29 + psubsw m17, m19, m12 ; t20a t27a + paddsw m19, m12 ; t19a t28a + REPX {pshufb x, m18}, m20, m11, m21, m19 + ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a + ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 + packssdw m18, m13 ; t23a t22 + packssdw m12, m15 ; t24a t25 + ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a + ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 + packssdw m16, m13 ; t20 t21a + packssdw m14, m15 ; t27 t26a + punpcklqdq m13, m19, m21 ; t19a t18 + punpckhqdq m19, m21 ; t28a t29 + punpcklqdq m21, m20, m11 ; t16 t17a + punpckhqdq m20, m11 ; t31 t30a + psubsw m15, m1, m19 ; out28 out29 + paddsw m1, m19 ; out3 out2 + psubsw m9, m6, m13 ; out19 out18 + paddsw m6, m13 ; out12 out13 + psubsw m10, m5, m16 ; out20 out21 + paddsw m5, m16 ; out11 out10 + psubsw m19, m3, m12 ; out24 out25 + paddsw m3, m12 ; out7 out6 + 
psubsw m8, m7, m21 ; out16 out17 + paddsw m7, m21 ; out15 out14 + psubsw m21, m0, m20 ; out31 out30 + paddsw m0, m20 ; out0 out1 + psubsw m11, m4, m18 ; out23 out22 + paddsw m4, m18 ; out8 out9 + psubsw m18, m2, m14 ; out27 out26 + paddsw m2, m14 ; out4 out5 +INIT_ZMM avx512icl + movu m16, [o(permD+3)] + vpermt2q m0, m16, m4 ; 0 1 8 9 + vpermt2q m8, m16, m19 ; 16 17 24 25 + vpermt2q m1, m16, m5 ; 3 2 11 10 + vpermt2q m9, m16, m18 ; 19 18 27 26 + vpermt2q m2, m16, m6 ; 4 5 12 13 + vpermt2q m10, m16, m15 ; 20 21 28 29 + vpermt2q m3, m16, m7 ; 7 6 15 14 + vpermt2q m11, m16, m21 ; 23 22 31 30 + vzeroupper + ret + +%macro LOAD_PACKED_16X2 3 ; dst, row[1-2] + vbroadcasti32x4 ym%1, [cq+16*%2] + vbroadcasti32x4 ym8, [cq+16*%3] + shufpd ym%1, ym8, 0x0c +%endmacro + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob +%undef cmp + test eobd, eobd + jz .dconly + lea r5, [o_base] + LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 + LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 + LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 15, 5, 7 ; in5 in7 + LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 + LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 + pxor m4, m4 + REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 + cmp eobd, 107 + jb .fast + LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 + LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 + LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 + LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 + call m(idct_8x16_internal_8bpc).main + LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 + LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 + LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 + LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main + jmp .pass2 +.fast: ; bottom half is zero + mova ym5, ym4 + mova ym6, ym4 + mova ym7, ym4 + call m(idct_8x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast +.pass2: + vpbroadcastd m12, [o(pw_8192)] + vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31 + vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30 + vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29 + vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28 + vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27 + vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26 + vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15 + vshufi32x4 m0, m8, q2020 ; 0 8 16 24 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 + call .main + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + lea r3, [dstq+strideq*4] + movshdup m12, [o(permD)] + pmovzxbw m8, [dstq+strideq*0] + pmovzxbw m9, [dstq+strideq*1] + pmovzxbw m10, [dstq+strideq*2] + pmovzxbw m11, [dstq+r2 ] + paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + pmovzxbw m8, [r3+strideq*0] + pmovzxbw m9, [r3+strideq*1] + pmovzxbw m10, [r3+strideq*2] + pmovzxbw m11, [r3+r2 ] + paddw m4, m8 + paddw m5, m9 + paddw m6, m10 + paddw m7, m11 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m12, m0 + vpermq m2, m12, m2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym2 + vextracti32x8 [dstq+r2 ], m2, 1 + packuswb m4, m5 + packuswb m6, m7 + vpermq m4, m12, m4 + vpermq m6, m12, m6 + mova [r3+strideq*0], ym4 + vextracti32x8 [r3+strideq*1], m4, 1 + mova [r3+strideq*2], ym6 + vextracti32x8 [r3+r2 ], m6, 1 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 8 +.dconly2: + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 +.dconly3: + imul r6d, 
181 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova ym1, [dstq+strideq*0] + vinserti32x8 m1, [dstq+strideq*1], 1 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +ALIGN function_align +.main: + vpbroadcastd m10, [o(pd_2048)] +.main2: + ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a + ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a + ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3 + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0 +.main3: + paddsw m8, m1, m5 ; t4 + psubsw m1, m5 ; t5a + paddsw m9, m7, m3 ; t7 + psubsw m7, m3 ; t6a + ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6 + psubsw m5, m0, m2 ; dct4 out2 + paddsw m2, m0 ; dct4 out1 + paddsw m0, m4, m6 ; dct4 out0 + psubsw m4, m6 ; dct4 out3 + psubsw m6, m2, m1 ; out6 + paddsw m1, m2 ; out1 + paddsw m2, m5, m7 ; out2 + psubsw m5, m7 ; out5 + psubsw m7, m0, m9 ; out7 + paddsw m0, m9 ; out0 + paddsw m3, m4, m8 ; out3 + psubsw m4, m8 ; out4 + ret + +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c + vpbroadcastd m7, [pw_5] + paddsw m0, m7, [cq+64*0] + paddsw m1, m7, [cq+64*1] + vpbroadcastd ym9, strided + paddsw m2, m7, [cq+64*2] + paddsw m3, m7, [cq+64*3] + paddsw m4, m7, [cq+64*4] + paddsw m5, m7, [cq+64*5] + paddsw m6, m7, [cq+64*6] + paddsw m7, [cq+64*7] + pmulld ym14, ym9, [pd_0to15] + lea r3, [dstq+strideq*1] + lea r4, [dstq+strideq*2] + kxnorb k1, k1, k1 + pxor m13, m13 + add r1, r4 ; dstq+strideq*3 + kmovb k2, k1 + vpgatherdq m9{k1}, [r0+ym14*4] + kmovb k1, k2 + vpgatherdq m10{k2}, [r3+ym14*4] + kmovb k2, k1 + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 + REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + vpgatherdq m11{k1}, [r4+ym14*4] + kmovb k1, k2 + vpgatherdq m12{k2}, [r1+ym14*4] + REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m8, m9, m13 ; 0 8 16 24 + punpckhbw m9, m13 ; 4 12 20 28 + paddw m0, m8 + paddw m4, m9 + packuswb m0, m4 + kmovb k2, k1 + vpscatterdq [r0+ym14*4]{k1}, m0 + punpcklbw m8, m10, m13 ; 1 9 17 25 + punpckhbw m10, m13 ; 5 13 21 29 + paddw m1, m8 + paddw m5, m10 + packuswb m1, m5 + kmovb k1, k2 + vpscatterdq [r3+ym14*4]{k2}, m1 + punpcklbw m8, m11, m13 ; 2 10 18 26 + punpckhbw m11, m13 ; 6 14 22 30 + paddw m2, m8 + paddw m6, m11 + packuswb m2, m6 + kmovb k2, k1 + vpscatterdq [r4+ym14*4]{k1}, m2 + punpcklbw m8, m12, m13 ; 3 11 19 27 + punpckhbw m12, m13 ; 7 15 23 31 + paddw m3, m8 + paddw m7, m12 + packuswb m3, m7 + vpscatterdq [r1+ym14*4]{k2}, m3 + RET + +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c + vpbroadcastd m0, [pw_4096] + pmulhrsw m3, m0, [cq+64*0] + pmulhrsw m4, m0, [cq+64*4] + pmulhrsw m6, m0, [cq+64*1] + pmulhrsw m5, m0, [cq+64*5] + pmulhrsw m7, m0, [cq+64*2] + pmulhrsw m2, m0, [cq+64*6] + pmulhrsw m8, m0, [cq+64*3] + pmulhrsw m0, [cq+64*7] + mova m13, [int8_permA] + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + punpckldq m1, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m6, m5 + punpckhdq m6, m5 + punpckldq m5, m7, m2 + punpckhdq m7, m2 + punpckldq m2, m8, m0 + punpckhdq m8, m0 + mova ym9, [dstq+strideq*0] + vinserti32x8 m9, [dstq+strideq*2], 1 + mova ym10, [dstq+strideq*1] + vinserti32x8 m10, [dstq+r3 ], 1 + mova ym11, [r4+strideq*0] + vinserti32x8 m11, [r4+strideq*2], 1 + mova ym12, [r4+strideq*1] + vinserti32x8 m12, 
[r4+r3 ], 1 + REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 + pxor m13, m13 + REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklqdq m0, m1, m4 ; a0 a2 c0 c2 + punpckhqdq m1, m4 ; b0 b2 d0 d2 + punpcklqdq m4, m5, m2 ; a1 a3 c1 c3 + punpckhqdq m5, m2 ; b1 b3 d1 d3 + punpcklqdq m2, m3, m6 ; e0 e2 g0 g2 + punpckhqdq m3, m6 ; f0 f2 h0 h2 + punpcklqdq m6, m7, m8 ; e1 e3 g1 g3 + punpckhqdq m7, m8 ; f1 f3 h1 h3 + punpcklbw m8, m9, m13 + punpckhbw m9, m13 + paddw m0, m8 + paddw m4, m9 + packuswb m0, m4 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*2], m0, 1 + punpcklbw m8, m10, m13 + punpckhbw m10, m13 + paddw m1, m8 + paddw m5, m10 + packuswb m1, m5 + mova [dstq+strideq*1], ym1 + vextracti32x8 [dstq+r3 ], m1, 1 + punpcklbw m8, m11, m13 + punpckhbw m11, m13 + paddw m2, m8 + paddw m6, m11 + packuswb m2, m6 + mova [r4+strideq*0], ym2 + vextracti32x8 [r4+strideq*2], m2, 1 + punpcklbw m8, m12, m13 + punpckhbw m12, m13 + paddw m3, m8 + paddw m7, m12 + packuswb m3, m7 + mova [r4+strideq*1], ym3 + vextracti32x8 [r4+r3 ], m3, 1 + RET + +%macro IDCT_16x32_END 3 ; src[1-2], row + mova xm8, [dstq+strideq*0] + vinserti32x4 ym8, [dstq+strideq*1], 1 + mova xm9, [dstq+r3 ] + vinserti32x4 ym9, [dstq+strideq*2], 1 + pmulhrsw m%1, m10 + pmulhrsw m%2, m10 + vpermb m8, m11, m8 + vpermb m9, m11, m9 + mova [cq+64*(%3*2+0)], m13 + mova [cq+64*(%3*2+1)], m13 + paddw m8, m%1 + paddw m9, m%2 + packuswb m8, m9 + vpermd m8, m12, m8 + mova [dstq+strideq*0], xm8 + vextracti32x4 [dstq+strideq*1], ym8, 1 + vextracti32x4 [dstq+strideq*2], m8, 2 + vextracti32x4 [dstq+r3 ], m8, 3 +%if %1 != 20 + lea dstq, [dstq+strideq*4] +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m15, [o(pw_2896x8)] + cmp eobd, 151 + jb .fast + pmulhrsw m5, m15, [cq+64*10] + pmulhrsw m3, m15, [cq+64* 6] + pmulhrsw m1, m15, [cq+64* 2] + pmulhrsw m7, m15, [cq+64*14] + pmulhrsw m2, m15, [cq+64* 4] + pmulhrsw m6, m15, [cq+64*12] + pmulhrsw m0, m15, [cq+64* 0] + pmulhrsw m4, m15, [cq+64* 8] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + pmulhrsw m14, m15, [cq+64* 1] + pmulhrsw m21, m15, [cq+64*15] + pmulhrsw m18, m15, [cq+64* 9] + pmulhrsw m17, m15, [cq+64* 7] + pmulhrsw m16, m15, [cq+64* 5] + pmulhrsw m19, m15, [cq+64*11] + pmulhrsw m20, m15, [cq+64*13] + pmulhrsw m15, [cq+64* 3] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova m8, [o(idct_16x32p)] + vpbroadcastd m9, [o(pw_16384)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m14, m15, m16, m17, m18, m19, m20, m21 + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m1, m2, m3 + punpckhdq m2, m3 + REPX {pmulhrsw x, m9}, m8, m0, m1, m2 + punpckldq m3, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m6, m7 + punpckhdq m6, m7 + REPX {pmulhrsw x, m9}, m3, m4, m5, m6 + punpckldq m7, m14, m15 + punpckhdq m14, m15 + punpckldq m15, m16, m17 + punpckhdq m16, m17 + REPX {pmulhrsw x, m9}, m7, m14, m15, m16 + punpckldq m17, m18, m19 + punpckhdq m18, m19 + punpckldq m19, m20, m21 + punpckhdq m20, m21 + REPX {pmulhrsw x, m9}, m17, m18, m19, m20 + punpcklqdq m21, m8, m1 + punpckhqdq m8, m1 + punpcklqdq m1, m0, m2 + punpckhqdq m0, m2 + punpcklqdq m2, m3, m5 + punpckhqdq m3, m5 + punpcklqdq m5, m4, m6 + punpckhqdq m4, m6 + punpcklqdq m6, m7, m15 + punpckhqdq m7, m15 + punpcklqdq m15, m14, m16 + punpckhqdq m14, m16 + punpcklqdq m16, m17, m19 + punpckhqdq m17, m19 + punpcklqdq m19, m18, m20 + punpckhqdq m18, m20 + vinserti32x8 m20, m21, ym2, 1 + vshufi32x4 
m21, m2, q3232 + vinserti32x8 m2, m8, ym3, 1 + vshufi32x4 m8, m3, q3232 + vinserti32x8 m3, m1, ym5, 1 + vshufi32x4 m1, m5, q3232 + vinserti32x8 m5, m0, ym4, 1 + vshufi32x4 m0, m4, q3232 + vinserti32x8 m4, m6, ym16, 1 + vshufi32x4 m6, m16, q3232 + vinserti32x8 m16, m7, ym17, 1 + vshufi32x4 m7, m17, q3232 + vinserti32x8 m17, m15, ym19, 1 + vshufi32x4 m15, m19, q3232 + vinserti32x8 m19, m14, ym18, 1 + vshufi32x4 m14, m18, q3232 + vshufi32x4 m18, m21, m6, q3131 ; 27 5 + vshufi32x4 m21, m6, q2020 ; 31 1 + vshufi32x4 m6, m8, m7, q2020 ; 24 8 + vshufi32x4 m8, m7, q3131 ; 30 2 + vshufi32x4 m7, m1, m15, q2020 ; 28 4 + vshufi32x4 m1, m15, q3131 ; 6 26 + vshufi32x4 m15, m0, m14, q2020 ; 7 25 + vshufi32x4 m0, m14, q3131 ; 14 18 + vshufi32x4 m14, m20, m4, q2020 ; 3 29 + vshufi32x4 m20, m4, q3131 ; 23 9 + vshufi32x4 m9, m3, m17, q2020 ; 16 0 + vshufi32x4 m3, m17, q3131 ; 12 20 + vshufi32x4 m17, m5, m19, q2020 ; 15 17 + vshufi32x4 m5, m19, q3131 ; 22 10 + vshufi32x4 m19, m2, m16, q2020 ; 19 13 + vshufi32x4 m16, m2, m16, q3131 ; 11 21 + call m(idct_16x16_internal_8bpc).main3 + call .main_oddhalf + jmp .pass2 +.fast: ; right half is zero + mova ym8, [cq+64*15] + vinserti32x8 m8, [cq+64* 1], 1 + mova m2, [o(int16_perm)] + mova ym9, [cq+64* 8] + vinserti32x8 m9, [cq+64* 0], 1 + mova ym0, [cq+64* 7] + vinserti32x8 m0, [cq+64* 9], 1 + mova ym7, [cq+64*14] + vinserti32x8 m7, [cq+64* 2], 1 + mova ym1, [cq+64* 3] + vinserti32x8 m1, [cq+64*13], 1 + mova ym3, [cq+64* 6] + vinserti32x8 m3, [cq+64*10], 1 + mova ym5, [cq+64*11] + vinserti32x8 m5, [cq+64* 5], 1 + mova ym6, [cq+64*12] + vinserti32x8 m6, [cq+64* 4], 1 + REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 + REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 + call m(idct_16x16_internal_8bpc).main2 + vbroadcasti32x4 m8, [o(int_shuf3)] + vbroadcasti32x4 m9, [o(int_shuf4)] + vpbroadcastd m11, [o(pw_16384)] + pshufb m0, m8 + pshufb m1, m9 + pshufb m2, m8 + pshufb m3, m9 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + pshufb m4, m8 + pshufb m5, m9 + pshufb m6, m8 + pshufb m7, m9 + REPX {pmulhrsw x, m11}, m4, m5, m6, m7 + punpckhdq m17, m0, m1 + punpckldq m0, m1 + punpckhdq m16, m2, m3 + punpckldq m2, m3 + punpckhdq m18, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m6, m7 + punpckldq m6, m7 + vinserti32x8 m1, m0, ym2, 1 + vshufi32x4 m3, m0, m2, q3232 + vinserti32x8 m2, m4, ym6, 1 + vshufi32x4 m4, m6, q3232 + vinserti32x8 m15, m17, ym16, 1 + vshufi32x4 m17, m16, q3232 + vinserti32x8 m16, m18, ym5, 1 + vshufi32x4 m18, m5, q3232 + vshufi32x4 m0, m1, m2, q2020 ; 0 2 + vshufi32x4 m1, m2, q3131 ; 4 6 + vshufi32x4 m2, m3, m4, q2020 ; 8 10 + vshufi32x4 m3, m4, q3131 ; 12 14 + vshufi32x4 m14, m15, m16, q2020 ; 1 3 + vshufi32x4 m15, m16, q3131 ; 5 7 + vshufi32x4 m16, m17, m18, q2020 ; 9 11 + vshufi32x4 m17, m18, q3131 ; 13 15 + pxor m6, m6 + punpckhwd m8, m0, m0 + punpcklwd m9, m6, m0 + punpckhwd m0, m3, m3 + punpckhwd m5, m2, m2 + punpcklwd m7, m1, m1 + punpckhwd m1, m1 + punpcklwd m3, m3 + punpcklwd m6, m2 + call m(idct_16x16_internal_8bpc).main_fast5 + punpcklwd m21, m14, m14 + punpckhwd m14, m14 + punpcklwd m18, m15, m15 + punpckhwd m15, m15 + punpcklwd m20, m16, m16 + punpckhwd m16, m16 + punpcklwd m19, m17, m17 + punpckhwd m17, m17 + call .main_oddhalf_fast +.pass2: + vpbroadcastd m10, [o(pw_2048)] + mova m11, [o(end_16x32p)] + lea r3, [strideq*3] + pxor m13, m13 + psrld m12, m11, 8 + IDCT_16x32_END 0, 1, 0 + IDCT_16x32_END 2, 3, 1 + IDCT_16x32_END 4, 5, 2 + IDCT_16x32_END 6, 7, 3 + IDCT_16x32_END 14, 15, 4 + IDCT_16x32_END 16, 17, 5 + IDCT_16x32_END 18, 19, 6 + 
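When eob is zero only the DC coefficient is set, and the .dconly paths (such as the one just below, which tail-jumps into the shared 16x8 code) replace the whole transform with a single constant add: 181/256 ~= 1/sqrt(2) supplies the normalization and the last shift folds in the final >>4 output rounding. A scalar sketch under the assumption that it mirrors the 16x32 chain of three 181-steps; names are illustrative only:

#include <stdint.h>
#include <stddef.h>

static uint8_t clip_pixel(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* dc-only shortcut: scale coeff[0] once and add it to every pixel.
 * The rounding steps follow the 16x32 route through the 16x8 labels
 * .dconly -> .dconly2 -> .dconly3, as far as can be read above.      */
static void dconly_add_16x32(uint8_t *dst, ptrdiff_t stride, int16_t dc_coeff)
{
    int dc = dc_coeff;
    dc = (dc * 181 + 128) >> 8;              /* 1/sqrt(2)                     */
    dc = (dc * 181 + 128 + 256) >> (8 + 1);  /* 1/sqrt(2) with an extra >>1   */
    dc = (dc * 181 + 128 + 2048) >> (8 + 4); /* 1/sqrt(2) + final >>4 rounding */
    for (int y = 0; y < 32; y++, dst += stride)
        for (int x = 0; x < 16; x++)
            dst[x] = clip_pixel(dst[x] + dc);
}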
IDCT_16x32_END 20, 21, 7 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 32 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly +ALIGN function_align +.main_oddhalf_fast2: ; bottom three-quarters are zero + vpbroadcastd m8, [o(pw_201_4091x8)] + vpbroadcastd m20, [o(pw_m1380_3857x8)] + vpbroadcastd m9, [o(pw_995_3973x8)] + vpbroadcastd m16, [o(pw_m601_4052x8)] + pmulhrsw m21, m8 ; t16a, t31a + pmulhrsw m20, m15 ; t19a, t28a + pmulhrsw m18, m9 ; t20a, t27a + pmulhrsw m14, m16 ; t23a, t24a + mova m8, m21 + mova m17, m20 + mova m15, m18 + mova m16, m14 + jmp .main3 +ALIGN function_align +.main_oddhalf_fast: ; bottom half is zero + vpbroadcastd m8, [o(pw_201_4091x8)] + vpbroadcastd m9, [o(pw_m2751_3035x8)] + vpbroadcastd m11, [o(pw_1751_3703x8)] + vpbroadcastd m12, [o(pw_m1380_3857x8)] + pmulhrsw m21, m8 ; t16a, t31a + vpbroadcastd m8, [o(pw_995_3973x8)] + pmulhrsw m17, m9 ; t17a, t30a + vpbroadcastd m9, [o(pw_m2106_3513x8)] + pmulhrsw m20, m11 ; t18a, t29a + vpbroadcastd m11, [o(pw_2440_3290x8)] + pmulhrsw m15, m12 ; t19a, t28a + vpbroadcastd m12, [o(pw_m601_4052x8)] + pmulhrsw m18, m8 ; t20a, t27a + pmulhrsw m16, m9 ; t21a, t26a + pmulhrsw m19, m11 ; t22a, t25a + pmulhrsw m14, m12 ; t23a, t24a + jmp .main2 +ALIGN function_align +.main_oddhalf: + ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a + ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a + ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a + ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a + ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a + ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a + ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a + ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a +.main2: + psubsw m8, m21, m17 ; t17 t30 + paddsw m21, m17 ; t16 t31 + psubsw m17, m15, m20 ; t18 t29 + paddsw m20, m15 ; t19 t28 + psubsw m15, m18, m16 ; t21 t26 + paddsw m18, m16 ; t20 t27 + psubsw m16, m14, m19 ; t22 t25 + paddsw m14, m19 ; t23 t24 +.main3: + ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a + vpbroadcastd m11, [o(pw_m3784_1567)] + psubsw m19, m21, m20 ; t19a t28a + paddsw m21, m20 ; t16a t31a + psubsw m20, m14, m18 ; t20a t27a + paddsw m14, m18 ; t23a t24a + psubsw m18, m8, m17 ; t18 t29 + paddsw m8, m17 ; t17 t30 + psubsw m17, m16, m15 ; t21 t26 + paddsw m15, m16 ; t22 t25 + ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 + ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a + vbroadcasti32x4 m9, [o(deint_shuf)] + psubsw m16, m21, m14 ; t23 t24 + paddsw m14, m21 ; t16 t31 + psubsw m21, m8, m15 ; t22a t25a + paddsw m15, m8 ; t17a t30a + psubsw m8, m18, m17 ; t21 t26 + paddsw m18, m17 ; t18 t29 + paddsw m17, m19, m20 ; t19a t28a + psubsw m19, m20 ; t20a t27a + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + REPX {pshufb x, m9}, m14, m15, m18, m17 + mova m9, m10 + vpdpwssd m9, m16, m11 + mova m20, m10 + vpdpwssd m20, m21, m11 + psrad m9, 12 + psrad m20, 12 + packssdw m9, m20 ; t23a t22 + mova m20, m10 + vpdpwssd m20, m16, m12 + mova m16, m10 + vpdpwssd m16, m21, m12 + psrad m20, 12 + psrad m16, 12 + packssdw m16, m20, m16 ; t24a t25 + ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a + ITX_MUL2X_PACK 
19, 8, 11, 10, 11, 12, 8 ; t20 t27 + packssdw m11, m20 ; t27 t26a + packssdw m8, m21 ; t20 t21a + punpcklqdq m20, m14, m15 ; t16 t17a + punpckhqdq m14, m15 ; t31 t30a + punpckhqdq m15, m17, m18 ; t28a t29 + punpcklqdq m17, m18 ; t19a t18 + psubsw m21, m0, m14 ; out31 out30 + paddsw m0, m14 ; out0 out1 + psubsw m14, m7, m20 ; out16 out17 + paddsw m7, m20 ; out15 out14 + psubsw m20, m1, m15 ; out28 out29 + paddsw m1, m15 ; out3 out2 + psubsw m15, m6, m17 ; out19 out18 + paddsw m6, m17 ; out12 out13 + psubsw m17, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m18, m3, m16 ; out24 out25 + paddsw m3, m16 ; out7 out6 + psubsw m16, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m19, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + ret + +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + mova m21, [o(permB)] + vpermq m1, m21, [cq+64* 0] ; 0 1 + vpermq m14, m21, [cq+64* 1] ; 2 3 + vpermq m20, m21, [cq+64* 2] ; 4 5 + vpermq m15, m21, [cq+64* 3] ; 6 7 + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m2, m21, [cq+64* 4] ; 8 9 + vpermq m16, m21, [cq+64* 5] ; 10 11 + vpermq m3, m21, [cq+64* 6] ; 12 13 + vpermq m17, m21, [cq+64* 7] ; 14 15 + REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 + pxor m12, m12 + REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 + cmp eobd, 151 + jb .fast + vpermq m9, m21, [cq+64* 8] ; 16 17 + vpermq m19, m21, [cq+64* 9] ; 18 19 + vpermq m4, m21, [cq+64*10] ; 20 21 + vpermq m5, m21, [cq+64*11] ; 22 23 + vpermq m6, m21, [cq+64*12] ; 24 25 + vpermq m18, m21, [cq+64*13] ; 26 27 + vpermq m7, m21, [cq+64*14] ; 28 29 + vpermq m21, m21, [cq+64*15] ; 30 31 + REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 + REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 + punpcklwd m8, m21, m14 ; 30 2 + punpckhwd m21, m1 ; 31 1 + punpcklwd m0, m17, m19 ; 14 18 + punpckhwd m17, m9 ; 15 17 + punpcklwd m9, m1 ; 16 0 + punpckhwd m14, m7 ; 3 29 + punpcklwd m1, m15, m18 ; 6 26 + punpckhwd m15, m6 ; 7 25 + punpcklwd m6, m2 ; 24 8 + punpckhwd m19, m3 ; 19 13 + punpcklwd m3, m4 ; 12 20 + punpckhwd m18, m20 ; 27 5 + punpcklwd m7, m20 ; 28 4 + punpckhwd m20, m5, m2 ; 23 9 + punpcklwd m5, m16 ; 22 10 + punpckhwd m16, m4 ; 11 21 + call m(idct_16x16_internal_8bpc).main2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + jmp .pass2 +.fast: ; bottom half zero + punpcklwd m8, m14, m14 ; 2 + punpcklwd m0, m17, m17 ; 14 + punpcklwd m5, m16, m16 ; 10 + punpcklwd m9, m12, m1 ; __ 0 + punpckhwd m21, m1, m1 ; 1 + punpcklwd m1, m15, m15 ; 6 + punpcklwd m7, m20, m20 ; 4 + punpckhwd m19, m3, m3 ; 13 + punpcklwd m3, m3 ; 12 + punpcklwd m6, m12, m2 ; __ 8 + punpckhwd m18, m20, m20 ; 5 + punpckhwd m20, m2, m2 ; 9 + call m(idct_16x16_internal_8bpc).main_fast + punpckhwd m15, m15 ; 7 + punpckhwd m14, m14 ; 3 + punpckhwd m16, m16 ; 11 + punpckhwd m17, m17 ; 15 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast +.pass2: + vpbroadcastd m9, [o(pw_16384)] + call .transpose_round + vshufi32x4 m16, m14, m2, q3131 ; 5 + vshufi32x4 m14, m2, q2020 ; 1 + vshufi32x4 m2, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m1, m18, q3131 ; 6 + vshufi32x4 m1, m18, q2020 ; 2 + vshufi32x4 m18, m20, m6, q2020 ; 9 + vshufi32x4 m20, m6, q3131 ; 13 + vshufi32x4 m6, m21, m4, q3131 ; 12 + vshufi32x4 m4, m21, m4, q2020 ; 8 + vshufi32x4 m21, m19, m7, q3131 ; 15 + vshufi32x4 m19, m7, q2020 ; 11 + vshufi32x4 m7, m5, m15, q3131 ; 14 + vshufi32x4 m5, m15, q2020 ; 10 + vshufi32x4 m15, m17, m9, 
q2020 ; 3 + vshufi32x4 m17, m9, q3131 ; 7 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 + call .main_oddhalf + vpbroadcastd m12, [o(pw_2048)] + movshdup m13, [o(permD)] + lea r2, [strideq*3] + pmovzxbw m8, [dstq+strideq*0] + pmovzxbw m9, [dstq+strideq*1] + pmovzxbw m10, [dstq+strideq*2] + pmovzxbw m11, [dstq+r2 ] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3 + lea r3, [dstq+strideq*4] + paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + pmovzxbw m8, [r3+strideq*0] + pmovzxbw m9, [r3+strideq*1] + pmovzxbw m10, [r3+strideq*2] + pmovzxbw m11, [r3+r2 ] + REPX {pmulhrsw x, m12}, m4, m5, m6, m7 + lea r4, [dstq+strideq*8] + packuswb m0, m1 + paddw m4, m8 + paddw m5, m9 + packuswb m2, m3 + paddw m6, m10 + paddw m7, m11 + pmovzxbw m8, [r4+strideq*0] + pmovzxbw m9, [r4+strideq*1] + pmovzxbw m10, [r4+strideq*2] + pmovzxbw m11, [r4+r2 ] + REPX {pmulhrsw x, m12}, m14, m15, m16, m17 + lea r5, [r3+strideq*8] + packuswb m4, m5 + paddw m14, m8 + paddw m15, m9 + packuswb m6, m7 + paddw m16, m10 + paddw m17, m11 + pmovzxbw m8, [r5+strideq*0] + pmovzxbw m9, [r5+strideq*1] + pmovzxbw m10, [r5+strideq*2] + pmovzxbw m11, [r5+r2 ] + REPX {pmulhrsw x, m12}, m18, m19, m20, m21 + packuswb m14, m15 + paddw m18, m8 + paddw m19, m9 + packuswb m16, m17 + paddw m20, m10 + paddw m21, m11 + packuswb m18, m19 + packuswb m20, m21 + REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym2 + vextracti32x8 [dstq+r2 ], m2, 1 + mova [r3+strideq*0], ym4 + vextracti32x8 [r3+strideq*1], m4, 1 + mova [r3+strideq*2], ym6 + vextracti32x8 [r3+r2 ], m6, 1 + mova [r4+strideq*0], ym14 + vextracti32x8 [r4+strideq*1], m14, 1 + mova [r4+strideq*2], ym16 + vextracti32x8 [r4+r2 ], m16, 1 + mova [r5+strideq*0], ym18 + vextracti32x8 [r5+strideq*1], m18, 1 + mova [r5+strideq*2], ym20 + vextracti32x8 [r5+r2 ], m20, 1 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 +ALIGN function_align +.main_oddhalf_fast2: ; bottom three-quarters are zero + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m2, [o(pw_4017x8)] + vpbroadcastd m3, [o(pw_799x8)] + vpbroadcastd m18, [o(pw_4076x8)] + vpbroadcastd m19, [o(pw_401x8)] + vpbroadcastd m20, [o(pw_m1189x8)] + vpbroadcastd m16, [o(pw_3920x8)] + pmulhrsw m9, m0 ; t0 + pmulhrsw m2, m1 ; t7a + pmulhrsw m1, m3 ; t4a + pmulhrsw m18, m14 ; t15a + pmulhrsw m14, m19 ; t8a + pmulhrsw m20, m15 ; t11a + pmulhrsw m15, m16 ; t12a + psubsw m7, m9, m2 ; idct8 out7 + paddsw m0, m9, m2 ; idct8 out0 + psubsw m4, m9, m1 ; idct8 out4 + paddsw m3, m9, m1 ; idct8 out3 + ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6 + mova m21, m18 + mova m19, m14 + mova m16, m15 + mova m8, m20 + psubsw m6, m9, m1 ; idct8 out6 + paddsw m1, m9 ; idct8 out1 + psubsw m5, m9, m2 ; idct8 out5 + paddsw m2, m9 ; idct8 out2 + jmp .main3 +ALIGN function_align +.main_oddhalf_fast: ; bottom half is zero + vpbroadcastd m5, [o(pw_m2276x8)] + vpbroadcastd m11, [o(pw_3406x8)] + vpbroadcastd m7, [o(pw_4017x8)] + vpbroadcastd m12, [o(pw_799x8)] + vpbroadcastd m6, [o(pw_3784x8)] + vpbroadcastd m10, [o(pw_1567x8)] + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m5, m3 ; t5a + pmulhrsw m3, m11 ; t6a + pmulhrsw m7, m1 ; t7a + pmulhrsw m1, m12 ; t4a + pmulhrsw m6, m2 ; t3 + pmulhrsw m2, m10 ; t2 + pmulhrsw m4, m0 ; t0 + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, 
[o(pw_m2896_2896)] + vpbroadcastd m10, [o(pd_2048)] + mova m0, m4 ; t1 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main3 + vpbroadcastd m21, [o(pw_4076x8)] + vpbroadcastd m8, [o(pw_401x8)] + vpbroadcastd m18, [o(pw_m2598x8)] + vpbroadcastd m9, [o(pw_3166x8)] + vpbroadcastd m19, [o(pw_3612x8)] + vpbroadcastd m11, [o(pw_1931x8)] + vpbroadcastd m20, [o(pw_m1189x8)] + vpbroadcastd m12, [o(pw_3920x8)] + pmulhrsw m21, m14 ; t15a + pmulhrsw m14, m8 ; t8a + pmulhrsw m18, m17 ; t9a + pmulhrsw m17, m9 ; t14a + pmulhrsw m19, m16 ; t13a + pmulhrsw m16, m11 ; t10a + pmulhrsw m20, m15 ; t11a + pmulhrsw m15, m12 ; t12a + jmp .main2 +ALIGN function_align +.main_oddhalf: + ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a + ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a +.main2: + paddsw m8, m20, m16 ; t11 + psubsw m20, m16 ; t10 + paddsw m16, m15, m19 ; t12 + psubsw m15, m19 ; t13 + psubsw m19, m14, m18 ; t9 + paddsw m14, m18 ; t8 + psubsw m18, m21, m17 ; t14 + paddsw m21, m17 ; t15 +.main3: + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a + vpbroadcastd m11, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + psubsw m17, m14, m8 ; t11a + paddsw m8, m14 ; t8a + paddsw m14, m18, m15 ; t9 + psubsw m18, m15 ; t10 + psubsw m15, m19, m20 ; t13 + paddsw m19, m20 ; t14 + paddsw m20, m21, m16 ; t15a + psubsw m16, m21, m16 ; t12a + ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a + ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 + psubsw m21, m0, m20 ; out15 + paddsw m0, m20 ; out0 + psubsw m20, m1, m19 ; out14 + paddsw m1, m19 ; out1 + psubsw m19, m2, m18 ; out13 + paddsw m2, m18 ; out2 + psubsw m18, m3, m17 ; out12 + paddsw m3, m17 ; out3 + psubsw m17, m4, m16 ; out11 + paddsw m4, m16 ; out4 + psubsw m16, m5, m15 ; out10 + paddsw m5, m15 ; out5 + psubsw m15, m6, m14 ; out9 + paddsw m6, m14 ; out6 + psubsw m14, m7, m8 ; out8 + paddsw m7, m8 ; out7 + ret +.transpose_round: + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m14, m16 + punpckhwd m14, m16 + punpcklwd m16, m15, m17 + punpckhwd m15, m17 + punpcklwd m17, m19, m21 + punpckhwd m19, m21 + punpckhwd m21, m18, m20 + punpcklwd m18, m20 + punpcklwd m20, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + REPX {pmulhrsw x, m9}, m20, m8, m1, m0 + punpcklwd m6, m7, m15 + punpckhwd m7, m15 + punpcklwd m15, m14, m16 + punpckhwd m14, m16 + REPX {pmulhrsw x, m9}, m2, m3, m5, m4 + punpckhwd m16, m18, m19 + punpcklwd m18, m19 + punpcklwd m19, m21, m17 + punpckhwd m21, m17 + REPX {pmulhrsw x, m9}, m6, m7, m15, m14 + punpcklwd m17, m8, m0 ; a2 a6 aa ae + punpckhwd m8, m0 ; a3 a7 ab af + punpcklwd m0, m20, m1 ; a0 a4 a8 ac + punpckhwd m20, m1 ; a1 a5 a9 ad + REPX {pmulhrsw x, m9}, m16, m18, m19, m21 + punpcklwd m1, m2, m5 ; b0 b4 b8 bc + punpckhwd m2, m5 ; b1 b5 b9 bd + punpcklwd m5, m3, m4 ; b2 b6 ba be + punpckhwd m3, m4 ; b3 b7 bb bf + punpcklwd m4, m6, m15 ; c0 c4 c8 cc + punpckhwd m6, m15 ; c1 c5 c9 cd + punpcklwd m15, m7, m14 ; c2 c6 ca ce + punpckhwd m7, m14 ; c3 c7 cb cf + punpcklwd m14, m18, m19 ; d0 d4 d8 dc + punpckhwd 
m18, m19 ; d1 d5 d9 dd + punpcklwd m9, m16, m21 ; d2 d6 da de + punpckhwd m16, m21 ; d3 d7 db df + vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc + vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 + vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 + vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be + vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 + vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf + vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 + vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc + vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 + vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd + vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 + vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd + vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 + vshufi32x4 m15, m9, q3232 ; ca ce da de + vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 + vshufi32x4 m7, m16, q3232 ; cb cf db df + ret + +%macro IDTX_16x32 4 ; src/dst[1-4] + pmulhrsw m%1, m15, [cq+64*%1] + pmulhrsw m%2, m15, [cq+64*%2] + pmulhrsw m%3, m15, [cq+64*%3] + pmulhrsw m%4, m15, [cq+64*%4] + pmulhrsw m18, m16, m%1 + pmulhrsw m19, m16, m%2 + pmulhrsw m20, m16, m%3 + pmulhrsw m21, m16, m%4 + REPX {pmulhrsw x, m17}, m18, m19, m20, m21 + paddsw m%1, m18 + paddsw m%2, m19 + paddsw m%3, m20 + paddsw m%4, m21 +%endmacro + +%macro IDTX_16x32_STORE 2 ; src[1-2] + mova xm17, [dstq+r3*0] + vinserti128 ym17, [dstq+r3*4], 1 + vinserti32x4 m17, [dstq+r3*8], 2 + vinserti32x4 m17, [dstq+r4*8], 3 + mova [cq+64*(%1*2+0)], m18 + mova [cq+64*(%1*2+1)], m18 + punpcklbw m16, m17, m18 + punpckhbw m17, m18 + paddw m16, m%1 + paddw m17, m%2 + packuswb m16, m17 + mova [dstq+r3*0], xm16 + vextracti128 [dstq+r3*4], ym16, 1 + vextracti32x4 [dstq+r3*8], m16, 2 + vextracti32x4 [dstq+r4*8], m16, 3 +%if %1 != 7 + add dstq, strideq +%endif +%endmacro + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c + vpbroadcastd m15, [pw_2896x8] + vpbroadcastd m16, [pw_1697x16] + vpbroadcastd m17, [pw_16384] + IDTX_16x32 0, 1, 2, 3 + IDTX_16x32 4, 5, 6, 7 + IDTX_16x32 8, 9, 10, 11 + IDTX_16x32 12, 13, 14, 15 + vpbroadcastd m16, [pw_8192] + call .transpose_2x8x8_round + lea r3, [strideq*2] + lea r4, [strideq*3] + pxor m18, m18 + IDTX_16x32_STORE 0, 8 + IDTX_16x32_STORE 1, 9 + IDTX_16x32_STORE 2, 10 + IDTX_16x32_STORE 3, 11 + IDTX_16x32_STORE 4, 12 + IDTX_16x32_STORE 5, 13 + IDTX_16x32_STORE 6, 14 + IDTX_16x32_STORE 7, 15 + RET +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m17, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m17, m1 + punpckhdq m17, m1 + REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m17 + punpcklqdq m6, m17 + punpckhwd m17, m12, m13 + punpcklwd m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m17, m9 + punpckhdq m17, m9 + REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m17 + punpcklqdq 
m14, m17 + ret + +%macro IDTX_32x16 4 ; dst[1-4] + pmulhrsw m%2, m12, [cq+32*(%1+ 0)] + pmulhrsw m18, m12, [cq+32*(%1+16)] + pmulhrsw m%4, m12, [cq+32*(%3+ 0)] + pmulhrsw m19, m12, [cq+32*(%3+16)] + REPX {paddsw x, x}, m%2, m18, m%4, m19 + mova m%1, m14 + vpermi2q m%1, m%2, m18 + vpermt2q m%2, m16, m18 +%if %3 != 14 + mova m%3, m14 +%endif + vpermi2q m%3, m%4, m19 + vpermt2q m%4, m16, m19 + pmulhrsw m18, m17, m%1 + pmulhrsw m19, m17, m%2 + pmulhrsw m20, m17, m%3 + pmulhrsw m21, m17, m%4 + REPX {paddsw x, x}, m%1, m%2, m%3, m%4 + paddsw m%1, m18 + paddsw m%2, m19 + paddsw m%3, m20 + paddsw m%4, m21 +%endmacro + +%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 + mova ym19, [dstq+strideq*0] + vinserti32x8 m19, [dstq+strideq*8], 1 +%if %3 == 0 + mova [cq+64*(%1*2+0)], m20 + mova [cq+64*(%1*2+1)], m20 +%endif + punpcklbw m18, m19, m20 + punpckhbw m19, m20 + paddw m18, m%1 + paddw m19, m%2 + packuswb m18, m19 + mova [dstq+strideq*0], ym18 + vextracti32x8 [dstq+strideq*8], m18, 1 +%if %3 || %1 != 7 + add dstq, strideq +%endif +%endmacro + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c + vpbroadcastd m12, [pw_2896x8] + movu m14, [permB+7] + vpbroadcastd m17, [pw_1697x16] + psrlq m16, m14, 4 + IDTX_32x16 0, 1, 2, 3 + IDTX_32x16 4, 5, 6, 7 + IDTX_32x16 8, 9, 10, 11 + IDTX_32x16 12, 13, 14, 15 + vpbroadcastd m16, [pw_2048] + call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round + pxor m20, m20 + IDTX_32x16_STORE 0, 8 + IDTX_32x16_STORE 1, 9 + IDTX_32x16_STORE 2, 10 + IDTX_32x16_STORE 3, 11 + IDTX_32x16_STORE 4, 12 + IDTX_32x16_STORE 5, 13 + IDTX_32x16_STORE 6, 14 + IDTX_32x16_STORE 7, 15 + RET + +%macro IDCT_32x32_END 4 ; src, mem, stride[1-2] + pmovzxbw m10, [dstq+%3] + pmovzxbw m11, [r3 +%4] +%if %2 < 8 + paddsw m8, m%2, m%1 + psubsw m9, m%2, m%1 +%else + mova m9, [cq+64*(%2*2-16)] + paddsw m8, m9, m%1 + psubsw m9, m%1 +%endif + pmulhrsw m8, m12 + pmulhrsw m9, m12 +%if %2 >= 8 +%if %2 == 8 + pxor m0, m0 +%endif + mova [cq+64*(%2*2-16)], m0 + mova [cq+64*(%2*2-15)], m0 +%endif + paddw m8, m10 + paddw m9, m11 + packuswb m8, m9 + vpermq m8, m13, m8 + mova [dstq+%3], ym8 + vextracti32x8 [r3 +%4], m8, 1 +%if %2 == 3 || %2 == 7 || %2 == 11 + add dstq, r5 + sub r3, r5 +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + WIN64_SPILL_XMM 30 + cmp eobd, 136 + jb .fast + mova m5, [cq+64*20] + mova m3, [cq+64*12] + mova m1, [cq+64* 4] + mova m7, [cq+64*28] + mova m2, [cq+64* 8] + mova m6, [cq+64*24] + mova m0, [cq+64* 0] + mova m4, [cq+64*16] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova m14, [cq+64* 2] + mova m21, [cq+64*30] + mova m18, [cq+64*18] + mova m17, [cq+64*14] + mova m16, [cq+64*10] + mova m19, [cq+64*22] + mova m20, [cq+64*26] + mova m15, [cq+64* 6] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m22, [cq+64* 1] + mova m21, [cq+64*31] + mova m14, [cq+64*17] + mova m29, [cq+64*15] + mova m26, [cq+64* 9] + mova m17, [cq+64*23] + mova m18, [cq+64*25] + mova m25, [cq+64* 7] + mova m24, [cq+64* 5] + mova m19, [cq+64*27] + mova m16, [cq+64*21] + mova m27, [cq+64*11] + mova m28, [cq+64*13] + mova m15, [cq+64*19] + mova m20, [cq+64*29] + mova m23, [cq+64* 3] + call .main_oddhalf + vpbroadcastd m10, [o(pw_8192)] + psubsw m13, m0, m29 ; 31 + paddsw m0, m29 ; 
0 + psubsw m29, m1, m28 ; 30 + paddsw m1, m28 ; 1 + psubsw m28, m2, m27 ; 29 + paddsw m2, m27 ; 2 + psubsw m27, m3, m26 ; 28 + paddsw m3, m26 ; 3 + psubsw m26, m4, m25 ; 27 + paddsw m4, m25 ; 4 + psubsw m25, m5, m24 ; 26 + paddsw m5, m24 ; 5 + psubsw m24, m6, m23 ; 25 + paddsw m6, m23 ; 6 + psubsw m23, m7, m22 ; 24 + paddsw m7, m22 ; 7 + pxor m9, m9 + punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 + REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 + punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 + REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 + punpckhwd m3, m23, m24 + punpcklwd m23, m24 + punpckhwd m24, m25, m26 + punpcklwd m25, m26 + REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 + punpckhwd m26, m27, m28 + punpcklwd m27, m28 + punpckhwd m28, m29, m13 + punpcklwd m29, m13 + REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 + punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 + REPX {pmulhrsw x, m10}, m0, m4, m8, m22 + punpckhdq m13, m23, m25 + punpckldq m23, m25 + punpckhdq m25, m27, m29 + punpckldq m27, m29 + REPX {pmulhrsw x, m10}, m13, m23, m25, m27 + punpckhdq m9, m3, m24 + punpckldq m3, m24 + punpckhdq m24, m26, m28 + punpckldq m26, m28 + punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 + punpckhqdq m23, m27 ; d01 d09 d17 d25 + punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 + punpcklqdq m13, m25 ; d02 d10 d18 d26 + punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 + punpcklqdq m3, m26 ; d04 d12 d20 d28 + punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 + punpcklqdq m9, m24 ; d06 d14 d22 d30 + REPX {pmulhrsw x, m10}, m25, m3, m26 + mova [cq+64* 9], m23 + mova [cq+64*11], m27 + mova [cq+64*13], m25 + mova [cq+64*15], m26 + punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 + punpcklqdq m8, m22 ; a04 a12 a20 a28 + punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 + punpcklqdq m0, m4 ; a00 a08 a16 a24 + punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 + punpcklqdq m7, m2 ; a02 a10 a18 a26 + punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 + punpcklqdq m6, m1 ; a06 a14 a22 a30 + mova m2, [cq+64* 0] + mova m11, [cq+64* 2] + mova m12, [cq+64* 4] + mova m29, [cq+64* 6] + mova m27, [cq+64* 8] + mova m26, [cq+64*10] + mova m4, [cq+64*12] + mova m28, [cq+64*14] + psubsw m1, m2, m21 ; 23 + paddsw m2, m21 ; 8 + psubsw m21, m11, m20 ; 22 + paddsw m11, m20 ; 9 + psubsw m20, m12, m19 ; 21 + paddsw m12, m19 ; 10 + psubsw m19, m29, m18 ; 20 + paddsw m29, m18 ; 11 + psubsw m18, m27, m17 ; 19 + paddsw m27, m17 ; 12 + psubsw m17, m26, m16 ; 18 + paddsw m26, m16 ; 13 + paddsw m16, m4, m15 ; 14 + psubsw m4, m15 ; 17 + pmulhrsw m15, m6, m10 + psubsw m6, m28, m14 ; 16 + paddsw m28, m14 ; 15 + pmulhrsw m14, m7, m10 + punpcklwd m7, m6, m4 + punpckhwd m6, m4 + punpckhwd m4, m17, m18 + punpcklwd m17, m18 + punpckhwd m18, m19, m20 + punpcklwd m19, m20 + punpckhwd m20, m21, m1 + punpcklwd m21, m1 + punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 + punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 + punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 + punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 + punpckhwd 
m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 + punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 + punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 + punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 + pmulhrsw m23, m10 + pmulhrsw m25, m10 + punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 + REPX {pmulhrsw x, m10}, m28, m2, m12, m27 + punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 + punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 + punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 + punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 + REPX {pmulhrsw x, m10}, m16, m1, m11, m29 + punpckhdq m26, m19, m21 + punpckldq m19, m21 + punpckhdq m21, m6, m4 + punpckldq m6, m4 + REPX {pmulhrsw x, m10}, m26, m19, m21, m6 + punpckhdq m4, m18, m20 + punpckldq m18, m20 + punpckhdq m20, m7, m17 + punpckldq m7, m17 + REPX {pmulhrsw x, m10}, m4, m18, m20, m7 + punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 + punpckhqdq m28, m12 ; b03 b11 b19 b27 + punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 + punpcklqdq m2, m27 ; b00 b08 b16 b24 + punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 + punpcklqdq m1, m29 ; b04 b12 b20 b28 + punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 + punpcklqdq m16, m11 ; b06 b14 b22 b30 + mova [cq+64* 1], m12 + mova [cq+64* 3], m28 + mova [cq+64* 5], m27 + mova [cq+64* 7], m29 + punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 + punpcklqdq m20, m26 ; c02 c10 c18 c26 + punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 + punpcklqdq m7, m19 ; c00 c08 c16 c24 + punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 + punpcklqdq m6, m18 ; c04 c12 c20 c28 + punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 + punpcklqdq m21, m4 ; c06 c14 c22 c30 + pmulhrsw m19, m9, m10 + vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 + vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 + vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 + vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 + vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 + vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 + vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 + vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 + vshufi32x4 m3, m1, m6, q3131 ; 12 + vshufi32x4 m1, m6, q2020 ; 4 + vshufi32x4 m6, m4, m2, q3131 ; 24 + vshufi32x4 m4, m2, q2020 ; 16 + vshufi32x4 m2, m0, m7, q3131 ; 8 + vshufi32x4 m0, m7, q2020 ; 0 + vshufi32x4 m7, m5, m8, q3131 ; 28 + vshufi32x4 m5, m8, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 + vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 + vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 + vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 + vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30 + vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 + vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 + vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 + vshufi32x4 m16, m14, m20, q3131 ; 10 + vshufi32x4 m14, m20, q2020 ; 2 + vshufi32x4 m20, m18, m17, q3131 ; 26 + vshufi32x4 m18, m17, q2020 ; 18 + vshufi32x4 m17, m15, m21, q3131 ; 14 + vshufi32x4 m15, m21, q2020 ; 6 + vshufi32x4 m21, m19, m13, q3131 ; 30 + vshufi32x4 m19, m13, q2020 ; 22 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m15, [cq+64* 1] + mova m16, [cq+64* 3] + mova m17, [cq+64* 5] + mova m19, [cq+64* 7] + mova m20, [cq+64* 9] + mova m21, [cq+64*11] + mova m13, [cq+64*13] + mova m18, 
[cq+64*15] + vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 + vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 + vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 + vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 + vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 + vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 + vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 + vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 + vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 + vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 + vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 + vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 + vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 + vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 + vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 + vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 + vshufi32x4 m18, m14, m26, q3131 ; 25 + vshufi32x4 m14, m26, q2020 ; 17 + vshufi32x4 m19, m15, m27, q3131 ; 27 + vshufi32x4 m15, m27, q2020 ; 19 + vshufi32x4 m20, m16, m28, q3131 ; 29 + vshufi32x4 m16, m28, q2020 ; 21 + vshufi32x4 m21, m17, m29, q3131 ; 31 + vshufi32x4 m17, m29, q2020 ; 23 + vshufi32x4 m26, m22, m8, q3131 ; 9 + vshufi32x4 m22, m8, q2020 ; 1 + vshufi32x4 m27, m23, m9, q3131 ; 11 + vshufi32x4 m23, m9, q2020 ; 3 + vshufi32x4 m28, m24, m11, q3131 ; 13 + vshufi32x4 m24, m11, q2020 ; 5 + vshufi32x4 m29, m25, m12, q3131 ; 15 + vshufi32x4 m25, m12, q2020 ; 7 + call .main_oddhalf + jmp .end +.fast: ; bottom/right halves are zero + mova m14, [o(dup16_perm)] + pmovzxwd m9, [cq+64* 0] + pmovzxwd m6, [cq+64* 8] + vpermb m8, m14, [cq+64* 2] + vpermb ym0, ym14, [cq+64*14] + vpermb ym5, ym14, [cq+64*10] + vpermb m1, m14, [cq+64* 6] + vpermb m7, m14, [cq+64* 4] + vpermb ym3, ym14, [cq+64*12] + pslld m9, 16 + pslld m6, 16 + call m(idct_16x16_internal_8bpc).main_fast + vpermb m21, m14, [cq+64* 1] + vpermb ym17, ym14, [cq+64*15] + vpermb ym20, ym14, [cq+64* 9] + vpermb m15, m14, [cq+64* 7] + vpermb m18, m14, [cq+64* 5] + vpermb ym16, ym14, [cq+64*11] + vpermb ym19, ym14, [cq+64*13] + vpermb m14, m14, [cq+64* 3] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m9, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round + vshufi32x4 m22, m14, m2, q2020 ; 1 + vshufi32x4 m24, m14, m2, q3131 ; 5 + vshufi32x4 m23, m17, m9, q2020 ; 3 + vshufi32x4 m25, m17, m9, q3131 ; 7 + vshufi32x4 m16, m5, m15, q2020 ; 10 + vshufi32x4 m17, m5, m15, q3131 ; 14 + vshufi32x4 m14, m1, m18, q2020 ; 2 + vshufi32x4 m15, m1, m18, q3131 ; 6 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m21, m4, q3131 ; 12 + vshufi32x4 m2, m21, m4, q2020 ; 8 + vshufi32x4 m26, m20, m6, q2020 ; 9 + vshufi32x4 m28, m20, m6, q3131 ; 13 + vshufi32x4 m27, m19, m7, q2020 ; 11 + vshufi32x4 m29, m19, m7, q3131 ; 15 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + call .main_oddhalf_fast +.end: + lea r4, [strideq*3] + vpbroadcastd m12, [o(pw_2048)] + movshdup m13, [o(permD)] + lea r3, [dstq+r4*8] + lea r5, [strideq+r4] ; stride*4 + add r3, r5 ; dst+stride*28 + IDCT_32x32_END 29, 0, strideq*0, r4 + IDCT_32x32_END 28, 1, strideq*1, strideq*2 + IDCT_32x32_END 27, 2, strideq*2, strideq*1 + IDCT_32x32_END 26, 3, r4 , strideq*0 + IDCT_32x32_END 25, 4, strideq*0, r4 + IDCT_32x32_END 24, 5, strideq*1, strideq*2 + IDCT_32x32_END 23, 6, strideq*2, strideq*1 + IDCT_32x32_END 22, 7, r4 , strideq*0 + IDCT_32x32_END 
21, 8, strideq*0, r4 + IDCT_32x32_END 20, 9, strideq*1, strideq*2 + IDCT_32x32_END 19, 10, strideq*2, strideq*1 + IDCT_32x32_END 18, 11, r4 , strideq*0 + IDCT_32x32_END 17, 12, strideq*0, r4 + IDCT_32x32_END 16, 13, strideq*1, strideq*2 + IDCT_32x32_END 15, 14, strideq*2, strideq*1 + IDCT_32x32_END 14, 15, r4 , strideq*0 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 32 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 +ALIGN function_align +.main_oddhalf_fast2: ; bottom three-quarters are zero + vpbroadcastd m21, [o(pw_4091x8)] + vpbroadcastd m8, [o(pw_201x8)] + vpbroadcastd m18, [o(pw_m1380x8)] + vpbroadcastd m9, [o(pw_3857x8)] + vpbroadcastd m19, [o(pw_3973x8)] + vpbroadcastd m11, [o(pw_995x8)] + vpbroadcastd m28, [o(pw_m601x8)] + vpbroadcastd m12, [o(pw_4052x8)] + pmulhrsw m21, m22 ; t31a + pmulhrsw m22, m8 ; t16a + pmulhrsw m18, m25 ; t19a + pmulhrsw m25, m9 ; t28a + pmulhrsw m19, m24 ; t27a + pmulhrsw m24, m11 ; t20a + pmulhrsw m28, m23 ; t23a + pmulhrsw m23, m12 ; t24a + mova m15, m21 + mova m8, m22 + mova m14, m18 + mova m27, m25 + mova m29, m19 + mova m26, m24 + mova m16, m28 + mova m20, m23 + jmp .main3 +ALIGN function_align +.main_oddhalf_fast: ; bottom half is zero + vpbroadcastd m21, [o(pw_4091x8)] + vpbroadcastd m8, [o(pw_201x8)] + vpbroadcastd m14, [o(pw_m2751x8)] + vpbroadcastd m9, [o(pw_3035x8)] + vpbroadcastd m17, [o(pw_3703x8)] + vpbroadcastd m11, [o(pw_1751x8)] + vpbroadcastd m18, [o(pw_m1380x8)] + vpbroadcastd m12, [o(pw_3857x8)] + pmulhrsw m21, m22 ; t31a + vpbroadcastd m19, [o(pw_3973x8)] + pmulhrsw m22, m8 ; t16a + vpbroadcastd m8, [o(pw_995x8)] + pmulhrsw m14, m29 ; t30a + vpbroadcastd m16, [o(pw_m2106x8)] + pmulhrsw m29, m9 ; t17a + vpbroadcastd m9, [o(pw_3513x8)] + pmulhrsw m17, m26 ; t29a + vpbroadcastd m15, [o(pw_3290x8)] + pmulhrsw m26, m11 ; t18a + vpbroadcastd m11, [o(pw_2440x8)] + pmulhrsw m18, m25 ; t19a + vpbroadcastd m20, [o(pw_m601x8)] + pmulhrsw m25, m12 ; t28a + vpbroadcastd m12, [o(pw_4052x8)] + pmulhrsw m19, m24 ; t27a + pmulhrsw m24, m8 ; t20a + pmulhrsw m16, m27 ; t21a + pmulhrsw m27, m9 ; t26a + pmulhrsw m15, m28 ; t25a + pmulhrsw m28, m11 ; t22a + pmulhrsw m20, m23 ; t23a + pmulhrsw m23, m12 ; t24a + jmp .main2 +ALIGN function_align +.main_oddhalf: + ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a + ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a + ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a + ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a +.main2: + psubsw m8, m22, m14 ; t17 + paddsw m22, m14 ; t16 + paddsw m14, m18, m26 ; t19 + psubsw m18, m26 ; t18 + psubsw m26, m24, m16 ; t21 + paddsw m24, m16 ; t20 + psubsw m16, m20, m28 ; t22 + paddsw m28, m20 ; t23 + psubsw m20, m23, m15 ; t25 + paddsw m23, m15 ; t24 + psubsw m15, m21, m29 ; t30 + paddsw m21, m29 ; t31 + psubsw m29, m19, m27 ; t26 + paddsw m19, m27 ; t27 + paddsw m27, m25, m17 ; t28 + psubsw m25, m17 ; t29 +.main3: + ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a + ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a + ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a + ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m11, [o(pw_1567_3784)] + psubsw m17, m21, m27 ; t28a + paddsw m21, m27 ; t31a + psubsw m27, m15, m25 ; t18 + 
paddsw m15, m25 ; t17 + psubsw m25, m20, m29 ; t21 + paddsw m20, m29 ; t22 + psubsw m29, m8, m18 ; t29 + paddsw m8, m18 ; t30 + psubsw m18, m22, m14 ; t19a + paddsw m22, m14 ; t16a + psubsw m14, m28, m24 ; t20a + paddsw m24, m28 ; t23a + paddsw m28, m16, m26 ; t25 + psubsw m16, m26 ; t26 + psubsw m26, m23, m19 ; t27a + paddsw m23, m19 ; t24a + ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a + ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 + vpbroadcastd m11, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a + ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 + vpbroadcastd m12, [o(pw_m2896_2896)] + vpbroadcastd m11, [o(pw_2896_2896)] + psubsw m19, m27, m25 ; t26 + paddsw m27, m25 ; t29 + psubsw m25, m17, m26 ; t20a + paddsw m17, m26 ; t19a + paddsw m26, m18, m14 ; t28a + psubsw m18, m14 ; t27a + paddsw m14, m22, m24 ; t16 + psubsw m22, m24 ; t23 + psubsw m24, m29, m16 ; t21 + paddsw m16, m29 ; t18 + paddsw m29, m21, m23 ; t31 + psubsw m21, m23 ; t24 + psubsw m23, m15, m20 ; t22a + paddsw m15, m20 ; t17a + psubsw m20, m8, m28 ; t25a + paddsw m28, m8 ; t30a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 + ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a + ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25 + ret + +%macro IDTX_32x32 2 ; dst[1-2] + vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which + vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to + vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements + vmovdqa32 ym18, [cq+64*(%2+16)] + vpermt2q m%1, m21, m17 + vpermt2q m%2, m21, m18 +%endmacro + +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c + movu m21, [permB+7] + vpbroadcastd m16, [pw_8192] + pxor m20, m20 +.loop: + IDTX_32x32 0, 1 + IDTX_32x32 2, 3 + IDTX_32x32 4, 5 + IDTX_32x32 6, 7 + IDTX_32x32 8, 9 + IDTX_32x32 10, 11 + IDTX_32x32 12, 13 + IDTX_32x32 14, 15 + call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round + IDTX_32x16_STORE 0, 8, 1 + IDTX_32x16_STORE 1, 9, 1 + IDTX_32x16_STORE 2, 10, 1 + IDTX_32x16_STORE 3, 11, 1 + IDTX_32x16_STORE 4, 12, 1 + IDTX_32x16_STORE 5, 13, 1 + IDTX_32x16_STORE 6, 14, 1 + IDTX_32x16_STORE 7, 15, 1 + lea dstq, [dstq+strideq*8] + btc cq, 5 + jnc .loop + mov r0d, 8 +.zero_loop: + mova [cq+64*0], m20 + mova [cq+64*1], m20 + mova [cq+64*2], m20 + mova [cq+64*3], m20 + add cq, 64*4 + dec r0d + jg .zero_loop + RET + +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + WIN64_SPILL_XMM 30 + cmp eobd, 151 + jb .fast + mova m5, [cq+64*10] + mova m3, [cq+64* 6] + mova m1, [cq+64* 2] + mova m7, [cq+64*14] + mova m2, [cq+64* 4] + mova m6, [cq+64*12] + mova m0, [cq+64* 0] + mova m4, [cq+64* 8] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova m14, [cq+64* 1] + mova m21, [cq+64*15] + mova m18, [cq+64* 9] + mova m17, [cq+64* 7] + mova m16, [cq+64* 5] + mova m19, [cq+64*11] + mova m20, [cq+64*13] + mova m15, [cq+64* 3] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + vpbroadcastd m9, [o(pw_8192)] +%macro TRANSPOSE_8x4_ROUND 4 + punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3 + punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7 + 
REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4 +%endmacro + TRANSPOSE_8x4_ROUND 0, 1, 2, 3 + TRANSPOSE_8x4_ROUND 4, 5, 6, 7 + TRANSPOSE_8x4_ROUND 14, 15, 16, 17 + TRANSPOSE_8x4_ROUND 18, 19, 20, 21 + vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4 + vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12 + vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5 + vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13 + vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6 + vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14 + vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7 + vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15 + vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4 + vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12 + vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5 + vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13 + vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6 + vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14 + vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7 + vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15 + vshufi32x4 m22, m26, m4, q2020 ; 0 1 + vshufi32x4 m26, m4, q3131 ; 8 9 + vshufi32x4 m23, m27, m5, q2020 ; 2 3 + vshufi32x4 m27, m5, q3131 ; 10 11 + vshufi32x4 m24, m28, m6, q2020 ; 4 5 + vshufi32x4 m28, m6, q3131 ; 12 13 + vshufi32x4 m25, m29, m7, q2020 ; 6 7 + vshufi32x4 m29, m7, q3131 ; 14 15 + vshufi32x4 m4, m0, m14, q2020 ; 16 17 + vshufi32x4 m3, m0, m14, q3131 ; 24 25 + vshufi32x4 m20, m1, m15, q2020 ; 18 19 + vshufi32x4 m19, m1, m15, q3131 ; 26 27 + vshufi32x4 m5, m2, m16, q2020 ; 20 21 + vshufi32x4 m0, m2, m16, q3131 ; 28 29 + vshufi32x4 m16, m8, m17, q2020 ; 22 23 + vshufi32x4 m17, m8, m17, q3131 ; 30 31 + pxor m6, m6 + mova [cq+64* 0], m4 + mova [cq+64* 2], m5 + mova [cq+64* 4], m3 + mova [cq+64* 6], m0 + punpcklwd m8, m24, m24 ; 4 + punpcklwd m0, m0 ; 28 + punpcklwd m5, m5 ; 20 + punpcklwd m1, m28, m28 ; 12 + punpcklwd m7, m26, m26 ; 8 + punpcklwd m3, m3 ; 24 + punpcklwd m9, m6, m22 ; __ 0 + punpcklwd m6, m4 ; __ 16 + call m(idct_16x16_internal_8bpc).main_fast3 + mova [cq+64* 1], m20 + mova [cq+64* 3], m16 + mova [cq+64* 5], m19 + mova [cq+64* 7], m17 + punpcklwd m21, m23, m23 ; 2 + punpcklwd m17, m17 ; 30 + punpcklwd m20, m20 ; 18 + punpcklwd m15, m29, m29 ; 14 + punpcklwd m18, m27, m27 ; 10 + punpcklwd m16, m16 ; 22 + punpcklwd m19, m19 ; 26 + punpcklwd m14, m25, m25 ; 6 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + mova m21, [cq+64* 7] + mova m14, [cq+64* 0] + mova m17, [cq+64* 3] + mova m18, [cq+64* 4] + mova m19, [cq+64* 5] + mova m16, [cq+64* 2] + mova m15, [cq+64* 1] + mova m20, [cq+64* 6] + REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ + m24, m19, m16, m27, m28, m15, m20, m23 + call .main_oddhalf + jmp .end +.fast: ; right half is zero + mova ym8, [cq+64*15] + vinserti32x8 m8, [cq+64* 1], 1 + mova m2, [o(int16_perm)] + mova ym9, [cq+64* 8] + vinserti32x8 m9, [cq+64* 0], 1 + mova ym0, [cq+64* 7] + vinserti32x8 m0, [cq+64* 9], 1 + mova ym7, [cq+64*14] + vinserti32x8 m7, [cq+64* 2], 1 + mova ym1, [cq+64* 3] + vinserti32x8 m1, [cq+64*13], 1 + mova ym3, [cq+64* 6] + vinserti32x8 m3, [cq+64*10], 1 + mova ym5, [cq+64*11] + vinserti32x8 m5, [cq+64* 5], 1 + mova ym6, [cq+64*12] + vinserti32x8 m6, [cq+64* 4], 1 + REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 + call m(idct_16x16_internal_8bpc).main2 + vbroadcasti32x4 m8, [o(int_shuf3)] + vbroadcasti32x4 m9, [o(int_shuf4)] + vpbroadcastd m11, [o(pw_8192)] + pshufb m0, m8 + pshufb m1, m9 + pshufb m2, m8 + 
pshufb m3, m9 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + pshufb m4, m8 + pshufb m5, m9 + pshufb m6, m8 + pshufb m7, m9 + REPX {pmulhrsw x, m11}, m4, m5, m6, m7 + punpckhdq m28, m0, m1 + punpckldq m0, m1 + punpckhdq m27, m2, m3 + punpckldq m2, m3 + punpckhdq m22, m4, m5 + punpckldq m4, m5 + punpckhdq m23, m6, m7 + punpckldq m6, m7 + vinserti32x8 m14, m0, ym2, 1 + vshufi32x4 m15, m0, m2, q3232 + vinserti32x8 m2, m4, ym6, 1 + vshufi32x4 m4, m6, q3232 + vshufi32x4 m21, m14, m2, q2020 ; 0 2 + vshufi32x4 m14, m2, q3131 ; 4 6 + vshufi32x4 m18, m15, m4, q2020 ; 8 10 + vshufi32x4 m15, m4, q3131 ; 12 14 + pxor m9, m9 + punpcklwd m8, m14, m14 ; 4 + punpcklwd m1, m15, m15 ; 12 + punpcklwd m7, m18, m18 ; 8 + punpcklwd m9, m21 ; __ 0 + call m(idct_16x16_internal_8bpc).main_fast4 + punpckhwd m21, m21 ; 2 + punpckhwd m15, m15 ; 14 + punpckhwd m18, m18 ; 10 + punpckhwd m14, m14 ; 6 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vinserti32x8 m24, m28, ym27, 1 + vshufi32x4 m28, m27, q3232 + vinserti32x8 m27, m22, ym23, 1 + vshufi32x4 m22, m23, q3232 + vshufi32x4 m23, m24, m27, q2020 ; 1 3 + vshufi32x4 m24, m27, q3131 ; 5 7 + vshufi32x4 m27, m28, m22, q2020 ; 9 11 + vshufi32x4 m28, m22, q3131 ; 13 15 + punpcklwd m22, m23, m23 ; 1 + punpckhwd m29, m28, m28 ; 15 + punpcklwd m26, m27, m27 ; 9 + punpckhwd m25, m24, m24 ; 7 + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + punpcklwd m24, m24 ; 5 + punpckhwd m27, m27 ; 11 + punpcklwd m28, m28 ; 13 + punpckhwd m23, m23 ; 3 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + call .main_oddhalf_fast +.end: + imul r6, strideq, 60 + mova m10, [o(end_16x32p)] + vpbroadcastd m11, [o(pw_2048)] + lea r3, [strideq*3] + pxor m12, m12 + add r6, dstq ; dst+stride*60 + psrldq m13, m10, 1 + lea r4, [strideq+r3] ; stride*4 +%macro IDCT_16x64_END 3 ; idct32, idct64, tmp +%if %1 & 1 + %define %%s0 r3 + %define %%s1 strideq*2 + %define %%s2 strideq*1 + %define %%s3 strideq*0 +%else + %define %%s0 strideq*0 + %define %%s1 strideq*1 + %define %%s2 strideq*2 + %define %%s3 r3 +%if %1 + add dstq, r4 + sub r6, r4 +%endif +%endif +%if %1 < 8 + pmulhrsw m8, m11, m%1 + pmulhrsw m9, m11, m%2 +%else + mova m9, [cq+64*%1] + paddsw m8, m9, m%2 ; out 0+n, 1+n + psubsw m9, m%2 ; out 63-n, 62-n + pmulhrsw m8, m11 + pmulhrsw m9, m11 +%endif + mova xm29, [dstq+%%s0] + vinserti128 ym29, [dstq+%%s1], 1 + mova xm%3, [r6 +%%s3] + vinserti128 ym%3, [r6 +%%s2], 1 + vpermb m29, m10, m29 + vpermb m%3, m10, m%3 + mova [cq+64*%1], m12 + paddw m29, m8 + paddw m%3, m9 + packuswb m29, m%3 + vpermd m29, m13, m29 + mova [dstq+%%s0], xm29 + vextracti128 [dstq+%%s1], ym29, 1 + vextracti32x4 [r6 +%%s2], m29, 2 + vextracti32x4 [r6 +%%s3], m29, 3 +%endmacro + IDCT_16x64_END 0, 29, 0 + IDCT_16x64_END 1, 28, 28 + IDCT_16x64_END 2, 27, 28 + IDCT_16x64_END 3, 26, 28 + IDCT_16x64_END 4, 25, 28 + IDCT_16x64_END 5, 24, 28 + IDCT_16x64_END 6, 23, 28 + IDCT_16x64_END 7, 22, 28 + IDCT_16x64_END 8, 21, 28 + IDCT_16x64_END 9, 20, 28 + IDCT_16x64_END 10, 19, 28 + IDCT_16x64_END 11, 18, 28 + IDCT_16x64_END 12, 17, 28 + IDCT_16x64_END 13, 16, 28 + IDCT_16x64_END 14, 15, 28 + IDCT_16x64_END 15, 14, 28 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 64 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 +ALIGN function_align +.main_oddhalf_fast: ; bottom three-quarters are zero + vpbroadcastd m8, [o(pw_101_4095x8)] + vpbroadcastd m21, [o(pw_m1474_3822x8)] + vpbroadcastd 
m14, [o(pw_897_3996x8)] + vpbroadcastd m17, [o(pw_m700_4036x8)] + vpbroadcastd m18, [o(pw_501_4065x8)] + vpbroadcastd m19, [o(pw_m1092_3948x8)] + vpbroadcastd m16, [o(pw_1285_3889x8)] + vpbroadcastd m15, [o(pw_m301_4085x8)] + pmulhrsw m8, m22 ; t32a t63a + pmulhrsw m21, m29 ; t35a t60a + pmulhrsw m14, m26 ; t36a t59a + pmulhrsw m17, m25 ; t39a t56 + pmulhrsw m18, m24 ; t40a t55a + pmulhrsw m19, m27 ; t43a t52a + pmulhrsw m16, m28 ; t44a t51a + pmulhrsw m15, m23 ; t47a t48a + mova m22, m8 + mova m29, m21 + mova m26, m14 + mova m25, m17 + mova m24, m18 + mova m27, m19 + mova m28, m16 + mova m20, m15 + jmp .main_oddhalf2 +ALIGN function_align +.main_oddhalf: + vpbroadcastd m8, [o(pw_101_4095x8)] + vpbroadcastd m9, [o(pw_m2824_2967x8)] + vpbroadcastd m11, [o(pw_1660_3745x8)] + vpbroadcastd m12, [o(pw_m1474_3822x8)] + pmulhrsw m22, m8 ; t32a t63a + vpbroadcastd m8, [o(pw_897_3996x8)] + pmulhrsw m21, m9 ; t33a t62a + vpbroadcastd m9, [o(pw_m2191_3461x8)] + pmulhrsw m14, m11 ; t34a t61a + vpbroadcastd m11, [o(pw_2359_3349x8)] + pmulhrsw m29, m12 ; t35a t60a + vpbroadcastd m12, [o(pw_m700_4036x8)] + pmulhrsw m26, m8 ; t36a t59a + vpbroadcastd m8, [o(pw_501_4065x8)] + pmulhrsw m17, m9 ; t37a t58a + vpbroadcastd m9, [o(pw_m2520_3229x8)] + pmulhrsw m18, m11 ; t38a t57a + vpbroadcastd m11, [o(pw_2019_3564x8)] + pmulhrsw m25, m12 ; t39a t56a + vpbroadcastd m12, [o(pw_m1092_3948x8)] + pmulhrsw m24, m8 ; t40a t55a + vpbroadcastd m8, [o(pw_1285_3889x8)] + pmulhrsw m19, m9 ; t41a t54a + vpbroadcastd m9, [o(pw_m1842_3659x8)] + pmulhrsw m16, m11 ; t42a t53a + vpbroadcastd m11, [o(pw_2675_3102x8)] + pmulhrsw m27, m12 ; t43a t52a + vpbroadcastd m12, [o(pw_m301_4085x8)] + pmulhrsw m28, m8 ; t44a t51a + pmulhrsw m15, m9 ; t45a t50a + pmulhrsw m20, m11 ; t46a t49a + pmulhrsw m23, m12 ; t47a t48a + psubsw m8, m22, m21 ; t33 t62 + paddsw m22, m21 ; t32 t63 + psubsw m21, m29, m14 ; t34 t61 + paddsw m29, m14 ; t35 t60 + psubsw m14, m26, m17 ; t37 t58 + paddsw m26, m17 ; t36 t59 + psubsw m17, m25, m18 ; t38 t57 + paddsw m25, m18 ; t39 t56 + psubsw m18, m24, m19 ; t41 t54 + paddsw m24, m19 ; t40 t55 + psubsw m19, m27, m16 ; t42 t53 + paddsw m27, m16 ; t43 t52 + psubsw m16, m28, m15 ; t45 t50 + paddsw m28, m15 ; t44 t51 + psubsw m15, m23, m20 ; t46 t49 + paddsw m20, m23 ; t47 t48 +.main_oddhalf2: + ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a + ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a + ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a + ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a + ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a + ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a + ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a + ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a + vpbroadcastd m11, [o(pw_m4017_799)] + psubsw m23, m25, m26 ; t36a t59a + paddsw m25, m26 ; t39a t56a + psubsw m26, m24, m27 ; t43a t52a + paddsw m27, m24 ; t40a t55a + psubsw m24, m20, m28 ; t44a t51a + paddsw m20, m28 ; t47a t48a + psubsw m28, m8, m21 ; t34 t61 + paddsw m8, m21 ; t33 t62 + psubsw m21, m17, m14 ; t37 t58 + paddsw m17, m14 ; t38 t57 + psubsw m14, m18, m19 ; t42 t53 + paddsw m18, m19 ; t41 t54 + psubsw m19, m15, m16 ; t45 t50 + paddsw m15, m16 ; t46 t49 + psubsw m16, m22, m29 ; t35a t60a + paddsw m22, m29 ; t32a t63a + ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60 + ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a + ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59 + ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a 
t58a + vpbroadcastd m11, [o(pw_m2276_3406)] + ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52 + ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a + ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51 + ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + psubsw m29, m22, m25 ; t39 t56 + paddsw m22, m25 ; t32 t63 + psubsw m25, m20, m27 ; t40 t55 + paddsw m20, m27 ; t47 t48 + psubsw m27, m8, m17 ; t38a t57a + paddsw m8, m17 ; t33a t62a + psubsw m17, m15, m18 ; t41a t54a + paddsw m15, m18 ; t46a t49a + paddsw m18, m16, m23 ; t35a t60a + psubsw m16, m23 ; t36a t59a + psubsw m23, m24, m26 ; t43a t52a + paddsw m24, m26 ; t44a t51a + paddsw m26, m28, m21 ; t34 t61 + psubsw m28, m21 ; t37 t58 + psubsw m21, m19, m14 ; t42 t53 + paddsw m19, m14 ; t45 t50 + ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a + ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57 + ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59 + ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a + vpbroadcastd m11, [o(pw_m1567_m3784)] + ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a + ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54 + ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52 + ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a + vbroadcasti32x4 m13, [o(deint_shuf)] + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + paddsw m14, m22, m20 ; t32a t63a + psubsw m22, m20 ; t47a t48a + psubsw m20, m8, m15 ; t46 t49 + paddsw m8, m15 ; t33 t62 + paddsw m15, m18, m24 ; t35 t60 + psubsw m18, m24 ; t44 t51 + psubsw m24, m26, m19 ; t45a t50a + paddsw m26, m19 ; t34a t61a + REPX {pshufb x, m13}, m14, m8, m15, m26 + psubsw m19, m29, m25 ; t40 t55 + paddsw m25, m29 ; t39 t56 + psubsw m29, m27, m17 ; t41a t54a + paddsw m27, m17 ; t38a t57a + psubsw m17, m16, m23 ; t43a t52a + paddsw m16, m23 ; t36a t59a + psubsw m9, m28, m21 ; t42 t53 + paddsw m28, m21 ; t37 t58 + REPX {pshufb x, m13}, m25, m27, m16, m28 + ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48 + ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a + packssdw m21, m22 ; t47 t46a + packssdw m13, m23 ; t48 t49a + ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a + ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50 + packssdw m20, m18 ; t44a t45 + packssdw m22, m23 ; t51a t50 + ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a + ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54 + packssdw m18, m19 ; t40a t41 + packssdw m24, m23 ; t55a t54 + ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52 + ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a + packssdw m19, m17 ; t43 t42a + packssdw m23, m29 ; t52 t53a + punpcklqdq m17, m25, m27 ; t39 t38a + punpckhqdq m25, m27 ; t56 t57a + punpckhqdq m27, m15, m26 ; t60 t61a + punpcklqdq m15, m26 ; t35 t34a + punpckhqdq m26, m16, m28 ; t59a t58 + punpcklqdq m16, m28 ; t36a t37 + punpckhqdq m28, m14, m8 ; t63a t62 + punpcklqdq m14, m8 ; t32a t33 + psubsw m29, m0, m28 ; out63 out62 + paddsw m0, m28 ; out0 out1 + psubsw m28, m1, m27 ; out60 out61 + paddsw m1, m27 ; out3 out2 + psubsw m27, m2, m26 ; out59 out58 + paddsw m2, m26 ; out4 out5 + psubsw m26, m3, m25 ; out56 out57 + paddsw m3, m25 ; out7 out6 + psubsw m25, m4, m24 ; out55 out54 + paddsw m4, m24 ; out8 out9 + psubsw m24, m5, m23 ; out52 out53 + paddsw m5, m23 ; out11 out10 + psubsw m23, m6, m22 ; out51 out50 + paddsw m6, m22 ; out12 out13 + psubsw m22, m7, m13 ; out48 out49 + paddsw m7, m13 ; out15 out14 + ret + +cglobal 
inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jnz .normal + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 16 +.dconly: + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 +.dconly2: + imul r6d, 181 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova m1, [dstq] + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + WIN64_SPILL_XMM 31 + mova m19, [o(dup16_perm)] + mova m24, [cq+64* 2] + mova m28, [cq+64* 6] + mova m26, [cq+64* 4] + mova m22, [cq+64* 0] + mova m23, [cq+64* 1] + mova m29, [cq+64* 7] + mova m27, [cq+64* 5] + mova m25, [cq+64* 3] + vpermb m8, m19, m24 ; 4 + vpermb m1, m19, m28 ; 12 + vpermb m7, m19, m26 ; 8 + vpermb m9, m19, m22 ; __ 0 + vpermb m21, m19, m23 ; 2 + vpermb m15, m19, m29 ; 14 + vpermb m18, m19, m27 ; 10 + vpermb m14, m19, m25 ; 6 + pslld m9, 16 + vpord m30, m19, [o(pb_32)] {1to16} + REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23 + cmp eobd, 151 + jb .fast + vpermb m0, m19, [cq+64*14] ; 28 + vpermb m5, m19, [cq+64*10] ; 20 + vpermb m3, m19, [cq+64*12] ; 24 + vpermb m6, m19, [cq+64* 8] ; __ 16 + pslld m6, 16 + call m(idct_16x16_internal_8bpc).main_fast + vpermb m17, m19, [cq+64*15] ; 30 + vpermb m20, m19, [cq+64* 9] ; 18 + vpermb m16, m19, [cq+64*11] ; 22 + vpermb m19, m19, [cq+64*13] ; 26 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + vpermb m21, m30, [cq+64*15] + vpermb m14, m30, [cq+64* 8] + vpermb m17, m30, [cq+64*11] + vpermb m18, m30, [cq+64*12] + vpermb m19, m30, [cq+64*13] + vpermb m16, m30, [cq+64*10] + vpermb m15, m30, [cq+64* 9] + vpermb m20, m30, [cq+64*14] + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf + jmp .end +.fast: ; bottom half is zero + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast +.end: + mova [cq+64* 8], m4 + mova [cq+64* 9], m5 + mova [cq+64*10], m6 + mova [cq+64*11], m7 + mova [cq+64*12], m26 + mova [cq+64*13], m27 + mova [cq+64*14], m28 + mova [cq+64*15], m29 + vpbroadcastd m13, [o(pw_8192)] + call .pass1_end + call .pass2 + mova [cq+64* 0], m0 + mova [cq+64* 1], m1 + mova [cq+64* 2], m2 + mova [cq+64* 3], m3 + mova [cq+64* 4], m4 + mova [cq+64* 5], m5 + mova [cq+64* 6], m6 + mova [cq+64* 7], m7 + pmulhrsw m0, m13, [cq+64* 8] + pmulhrsw m1, m13, [cq+64* 9] + pmulhrsw m2, m13, [cq+64*10] + pmulhrsw m3, m13, [cq+64*11] + vpbroadcastd m30, [o(pw_2048)] + pmulhrsw m4, m13, m22 + pmulhrsw m5, m13, m23 + pmulhrsw m6, m13, m24 + pmulhrsw m7, m13, m25 + pmulhrsw m22, m30, m14 + pmulhrsw m14, m13, m26 + pmulhrsw m23, m30, m15 + pmulhrsw m15, m13, m27 + pmulhrsw m24, m30, m16 + pmulhrsw m16, m13, m28 + pmulhrsw m25, m30, m17 + pmulhrsw m17, m13, m29 + pmulhrsw m26, m30, m18 + pmulhrsw m18, m13, [cq+64*12] + pmulhrsw m27, m30, m19 + pmulhrsw m19, m13, [cq+64*13] + pmulhrsw m28, m30, m20 + pmulhrsw m20, m13, [cq+64*14] + pmulhrsw m29, m30, m21 + pmulhrsw m21, m13, [cq+64*15] + call .transpose_round + call 
.pass2 + pxor m10, m10 + lea r3, [strideq*3] +%macro IDCT_64x16_END 4 + mova m9, [dstq+%4] +%if %1 < 8 + pmulhrsw m%3, m30, [cq+64*%1] +%endif + pmulhrsw m%2, m30 + mova [cq+64*%1], m10 + punpcklbw m8, m9, m10 + punpckhbw m9, m10 + paddw m8, m%3 + paddw m9, m%2 + packuswb m8, m9 + mova [dstq+%4], m8 +%if %1 == 3 || %1 == 7 || %1 == 11 + lea dstq, [dstq+strideq*4] +%endif +%endmacro + IDCT_64x16_END 0, 0, 11, strideq*0 + IDCT_64x16_END 1, 1, 11, strideq*1 + IDCT_64x16_END 2, 2, 11, strideq*2 + IDCT_64x16_END 3, 3, 11, r3 + IDCT_64x16_END 4, 4, 11, strideq*0 + IDCT_64x16_END 5, 5, 11, strideq*1 + IDCT_64x16_END 6, 6, 11, strideq*2 + IDCT_64x16_END 7, 7, 11, r3 + IDCT_64x16_END 8, 14, 22, strideq*0 + IDCT_64x16_END 9, 15, 23, strideq*1 + IDCT_64x16_END 10, 16, 24, strideq*2 + IDCT_64x16_END 11, 17, 25, r3 + IDCT_64x16_END 12, 18, 26, strideq*0 + IDCT_64x16_END 13, 19, 27, strideq*1 + IDCT_64x16_END 14, 20, 28, strideq*2 + IDCT_64x16_END 15, 21, 29, r3 + RET +ALIGN function_align +.pass1_end: + mova m4, [cq+64* 0] + mova m5, [cq+64* 1] + mova m6, [cq+64* 2] + mova m7, [cq+64* 3] + mova m8, [cq+64* 4] + mova m9, [cq+64* 5] + mova m11, [cq+64* 6] + mova m12, [cq+64* 7] + psubsw m29, m4, m21 ; out47 out46 + paddsw m4, m21 ; out16 out17 + psubsw m28, m5, m20 ; out44 out45 + paddsw m5, m20 ; out19 out18 + REPX {pmulhrsw x, m13}, m0, m1, m2, m3 + psubsw m27, m6, m19 ; out43 out42 + paddsw m6, m19 ; out20 out21 + psubsw m26, m7, m18 ; out40 out41 + paddsw m7, m18 ; out23 out22 + pmulhrsw m18, m13, m22 + pmulhrsw m19, m13, m23 + pmulhrsw m20, m13, m24 + pmulhrsw m21, m13, m25 + paddsw m25, m12, m14 ; out31 out30 + psubsw m14, m12, m14 ; out32 out33 + paddsw m24, m11, m15 ; out28 out29 + psubsw m15, m11, m15 ; out35 out34 + REPX {pmulhrsw x, m13}, m4, m5, m6, m7 + paddsw m23, m9, m16 ; out27 out26 + psubsw m16, m9, m16 ; out36 out37 + paddsw m22, m8, m17 ; out24 out25 + psubsw m17, m8, m17 ; out39 out38 + REPX {pmulhrsw x, m13}, m14, m15, m16, m17 +.transpose_round: +%macro TRANSPOSE_8x4_PACKED 4 + punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3 + punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3 + punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3 + punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3 + punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3 + punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1 + punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3 + punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1 + punpcklwd m%3, m%4, m%2 ; 2 + punpckhwd m%4, m%2 ; 3 + punpckhwd m%2, m%1, m8 ; 1 + punpcklwd m%1, m8 ; 0 +%endmacro + TRANSPOSE_8x4_PACKED 0, 1, 2, 3 + TRANSPOSE_8x4_PACKED 18, 19, 20, 21 + TRANSPOSE_8x4_PACKED 4, 5, 6, 7 + TRANSPOSE_8x4_PACKED 14, 15, 16, 17 + vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03 + vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01 + vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13 + vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11 + vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23 + vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21 + vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33 + vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31 + vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03 + vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01 + vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13 + vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11 + vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23 + vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21 + vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33 + vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 + ret +.pass2: + vshufi32x4 m7, m5, m19, q3131 ; 14 + vshufi32x4 m5, m19, q2020 ; 10 + vshufi32x4 
m21, m6, m20, q3131 ; 15 + vshufi32x4 m19, m6, m20, q2020 ; 11 + vshufi32x4 m20, m4, m18, q3131 ; 13 + vshufi32x4 m18, m4, m18, q2020 ; 9 + vshufi32x4 m6, m8, m2, q3131 ; 12 + vshufi32x4 m4, m8, m2, q2020 ; 8 + vshufi32x4 m2, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m1, m16, q3131 ; 6 + vshufi32x4 m1, m16, q2020 ; 2 + vshufi32x4 m16, m9, m15, q3131 ; 5 + vshufi32x4 m14, m9, m15, q2020 ; 1 + vshufi32x4 m15, m11, m17, q2020 ; 3 + vshufi32x4 m17, m11, m17, q3131 ; 7 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 + jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob + vpbroadcastd m23, [o(pw_2896x8)] +%undef cmp + cmp eobd, 136 + jb .fast + pmulhrsw m5, m23, [cq+64*20] + pmulhrsw m3, m23, [cq+64*12] + pmulhrsw m1, m23, [cq+64* 4] + pmulhrsw m7, m23, [cq+64*28] + pmulhrsw m2, m23, [cq+64* 8] + pmulhrsw m6, m23, [cq+64*24] + pmulhrsw m0, m23, [cq+64* 0] + pmulhrsw m4, m23, [cq+64*16] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + pmulhrsw m14, m23, [cq+64* 2] + pmulhrsw m21, m23, [cq+64*30] + pmulhrsw m18, m23, [cq+64*18] + pmulhrsw m17, m23, [cq+64*14] + pmulhrsw m16, m23, [cq+64*10] + pmulhrsw m19, m23, [cq+64*22] + pmulhrsw m20, m23, [cq+64*26] + pmulhrsw m15, m23, [cq+64* 6] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + pmulhrsw m22, m23, [cq+64* 1] + pmulhrsw m21, m23, [cq+64*31] + pmulhrsw m14, m23, [cq+64*17] + pmulhrsw m29, m23, [cq+64*15] + pmulhrsw m26, m23, [cq+64* 9] + pmulhrsw m17, m23, [cq+64*23] + pmulhrsw m18, m23, [cq+64*25] + pmulhrsw m25, m23, [cq+64* 7] + pmulhrsw m24, m23, [cq+64* 5] + pmulhrsw m19, m23, [cq+64*27] + pmulhrsw m16, m23, [cq+64*21] + pmulhrsw m27, m23, [cq+64*11] + pmulhrsw m28, m23, [cq+64*13] + pmulhrsw m15, m23, [cq+64*19] + pmulhrsw m20, m23, [cq+64*29] + pmulhrsw m23, [cq+64* 3] + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + vpbroadcastd m12, [o(pw_16384)] + psubsw m13, m0, m29 ; 31 + paddsw m0, m29 ; 0 + psubsw m29, m1, m28 ; 30 + paddsw m1, m28 ; 1 + psubsw m28, m2, m27 ; 29 + paddsw m2, m27 ; 2 + psubsw m27, m3, m26 ; 28 + paddsw m3, m26 ; 3 + psubsw m26, m4, m25 ; 27 + paddsw m4, m25 ; 4 + psubsw m25, m5, m24 ; 26 + paddsw m5, m24 ; 5 + psubsw m24, m6, m23 ; 25 + paddsw m6, m23 ; 6 + psubsw m23, m7, m22 ; 24 + paddsw m7, m22 ; 7 + pxor m9, m9 + punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 + REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 + punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 + REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 + punpckhwd m3, m23, m24 + punpcklwd m23, m24 + punpckhwd m24, m25, m26 + punpcklwd m25, m26 + REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 + punpckhwd m26, m27, m28 + punpcklwd m27, m28 + punpckhwd m28, m29, m13 + punpcklwd m29, m13 + REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 + punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 + 
REPX {pmulhrsw x, m12}, m7, m0, m2, m4 + punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 + REPX {pmulhrsw x, m12}, m6, m8, m1, m22 + punpckhdq m13, m23, m25 + punpckldq m23, m25 + punpckhdq m25, m27, m29 + punpckldq m27, m29 + REPX {pmulhrsw x, m12}, m13, m23, m25, m27 + punpckhdq m9, m3, m24 + punpckldq m3, m24 + punpckhdq m24, m26, m28 + punpckldq m26, m28 + REPX {pmulhrsw x, m12}, m9, m3, m24, m26 + punpckhqdq m5, m23, m27 ; d01 d09 d17 d25 + punpcklqdq m23, m27 ; d00 d08 d16 d24 + punpcklqdq m27, m13, m25 ; d02 d10 d18 d26 + punpckhqdq m13, m25 ; d03 d11 d19 d27 + punpcklqdq m25, m3, m26 ; d04 d12 d20 d28 + punpckhqdq m3, m26 ; d05 d13 d21 d29 + punpcklqdq m26, m9, m24 ; d06 d14 d22 d30 + punpckhqdq m9, m24 ; d07 d15 d23 d31 + mova [cq+64* 3], m23 + mova [cq+64*13], m27 + mova [cq+64* 7], m25 + mova [cq+64*15], m26 + punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 + punpcklqdq m8, m22 ; a04 a12 a20 a28 + punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 + punpcklqdq m0, m4 ; a00 a08 a16 a24 + punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 + punpcklqdq m7, m2 ; a02 a10 a18 a26 + punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 + punpcklqdq m6, m1 ; a06 a14 a22 a30 + mova [cq+64* 1], m0 + mova [cq+64* 9], m7 + mova [cq+64* 5], m8 + mova [cq+64*11], m6 + mova m2, [cq+64* 0] + mova m11, [cq+64* 2] + mova m8, [cq+64* 4] + mova m29, [cq+64* 6] + mova m27, [cq+64* 8] + mova m26, [cq+64*10] + mova m4, [cq+64*12] + mova m28, [cq+64*14] + psubsw m1, m2, m21 ; 23 + paddsw m2, m21 ; 8 + psubsw m21, m11, m20 ; 22 + paddsw m11, m20 ; 9 + psubsw m20, m8, m19 ; 21 + paddsw m8, m19 ; 10 + psubsw m19, m29, m18 ; 20 + paddsw m29, m18 ; 11 + psubsw m18, m27, m17 ; 19 + paddsw m27, m17 ; 12 + psubsw m17, m26, m16 ; 18 + paddsw m26, m16 ; 13 + psubsw m16, m4, m15 ; 17 + paddsw m4, m15 ; 14 + psubsw m15, m28, m14 ; 16 + paddsw m28, m14 ; 15 + punpcklwd m14, m15, m16 + punpckhwd m15, m16 + punpckhwd m16, m17, m18 + punpcklwd m17, m18 + punpckhwd m18, m19, m20 + punpcklwd m19, m20 + punpckhwd m20, m21, m1 + punpcklwd m21, m1 + punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 + punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 + punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 + punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 + punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 + punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 + punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 + punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 + punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1 + REPX {pmulhrsw x, m12}, m28, m2, m8, m27 + punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 + punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 + punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 + punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 + REPX {pmulhrsw x, m12}, m4, m1, m11, m29 + punpckhdq m26, m19, m21 + punpckldq m19, m21 + punpckhdq m21, m15, m16 + punpckldq m15, m16 + REPX {pmulhrsw x, m12}, m26, m19, m21, m15 + punpckhdq m16, m18, m20 + punpckldq m18, m20 + punpckhdq m20, m14, m17 + punpckldq m14, m17 + REPX {pmulhrsw x, m12}, m16, m18, m20, m14 + punpckhqdq m17, m28, m8 ; b03 b11 b19 b27 + punpcklqdq m28, m8 ; b02 b10 b18 b26 + punpckhqdq m8, m2, m27 ; b01 b09 b17 b25 + punpcklqdq m2, m27 ; b00 b08 b16 b24 + punpcklqdq m27, m1, m29 ; b04 b12 b20 b28 + punpckhqdq m1, m29 ; b05 b13 b21 b29 + punpcklqdq 
m29, m4, m11 ; b06 b14 b22 b30 + punpckhqdq m4, m11 ; b07 b15 b23 b31 + mova [cq+64* 0], m2 + mova [cq+64* 8], m28 + mova [cq+64* 4], m27 + mova [cq+64*10], m29 + punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 + punpcklqdq m20, m26 ; c02 c10 c18 c26 + punpckhqdq m26, m14, m19 ; c01 c09 c17 c25 + punpcklqdq m14, m19 ; c00 c08 c16 c24 + punpckhqdq m28, m15, m18 ; c05 c13 c21 c29 + punpcklqdq m15, m18 ; c04 c12 c20 c28 + punpckhqdq m29, m21, m16 ; c07 c15 c23 c31 + punpcklqdq m21, m16 ; c06 c14 c22 c30 + mova [cq+64* 2], m14 + mova [cq+64*12], m20 + mova [cq+64* 6], m15 + mova [cq+64*14], m21 + vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25 + vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09 + vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27 + vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11 + vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29 + vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13 + vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31 + vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15 + vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09 + vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25 + vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11 + vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27 + vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13 + vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29 + vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15 + vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31 + mov r4, rsp + vshufi32x4 m0, m22, m19, q2020 ; 1 + vshufi32x4 m1, m17, m29, q3131 ; 31 + vshufi32x4 m2, m14, m26, q2020 ; 17 + vshufi32x4 m3, m25, m18, q3131 ; 15 + call .main_part1 + vshufi32x4 m0, m25, m18, q2020 ; 7 + vshufi32x4 m1, m14, m26, q3131 ; 25 + vshufi32x4 m2, m17, m29, q2020 ; 23 + vshufi32x4 m3, m22, m19, q3131 ; 9 + call .main_part1 + vshufi32x4 m0, m24, m21, q2020 ; 5 + vshufi32x4 m1, m15, m27, q3131 ; 27 + vshufi32x4 m2, m16, m28, q2020 ; 21 + vshufi32x4 m3, m23, m20, q3131 ; 11 + call .main_part1 + vshufi32x4 m0, m23, m20, q2020 ; 3 + vshufi32x4 m1, m16, m28, q3131 ; 29 + vshufi32x4 m2, m15, m27, q2020 ; 19 + vshufi32x4 m3, m24, m21, q3131 ; 13 + call .main_part1 + call .main_part2 + mova m0, [cq+64* 1] ; a0 + mova m15, [cq+64* 0] ; b0 + mova m3, [cq+64* 2] ; c0 + mova m16, [cq+64* 3] ; d0 + mova m14, [cq+64* 5] ; a4 + mova m8, [cq+64* 4] ; b4 + mova m17, [cq+64* 6] ; c4 + mova m1, [cq+64* 7] ; d4 + vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24 + vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08 + vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24 + vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08 + vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28 + vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12 + vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28 + vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12 + vshufi32x4 m1, m0, m3, q3131 ; 8 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m2, m15, q3131 ; 24 + vshufi32x4 m2, m15, q2020 ; 16 + vshufi32x4 m15, m14, m17, q3131 ; 12 + vshufi32x4 m14, m17, q2020 ; 4 + vshufi32x4 m17, m16, m8, q3131 ; 28 + vshufi32x4 m16, m8, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova m8, [cq+64* 8] + mova m9, [cq+64*12] + mova m11, [cq+64*10] + mova m12, [cq+64*14] + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m22, [cq+64* 9] + mova m27, [cq+64*13] + mova m23, [cq+64*11] + mova m24, [cq+64*15] + vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26 + vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10 + vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26 + vinserti32x8 m9, ym27, 
1 ; c02 c10 d02 d10 + vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30 + vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14 + vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30 + vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14 + vshufi32x4 m28, m26, m8, q3131 ; 26 + vshufi32x4 m26, m8, q2020 ; 18 + vshufi32x4 m24, m22, m9, q3131 ; 10 + vshufi32x4 m22, m9, q2020 ; 2 + vshufi32x4 m29, m27, m11, q3131 ; 30 + vshufi32x4 m27, m11, q2020 ; 22 + vshufi32x4 m25, m23, m12, q3131 ; 14 + vshufi32x4 m23, m12, q2020 ; 6 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + jmp .end +.fast: ; bottom/right halves are zero + pmulhrsw ym9, ym23, [cq+64* 0] + pmulhrsw ym6, ym23, [cq+64* 8] + mova m14, [o(dup16_perm)] + pmulhrsw ym8, ym23, [cq+64* 2] + pmulhrsw xm0, xm23, [cq+64*14] + pmulhrsw xm5, xm23, [cq+64*10] + pmulhrsw ym1, ym23, [cq+64* 6] + pmulhrsw ym7, ym23, [cq+64* 4] + pmulhrsw xm3, xm23, [cq+64*12] + pmovzxwd m9, ym9 + pmovzxwd m6, ym6 + vpermb m8, m14, m8 + punpcklwd xm0, xm0 + vpermb ym5, ym14, ym5 + vpermb m1, m14, m1 + vpermb m7, m14, m7 + punpcklwd xm3, xm3 + pslld m9, 16 + pslld m6, 16 + call m(idct_16x16_internal_8bpc).main_fast + vpmulhrsw ym21, ym23, [cq+64* 1] + {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which + {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to + {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements + {evex}vpmulhrsw ym18, ym23, [cq+64* 5] + {evex}vpmulhrsw xm16, xm23, [cq+64*11] + {evex}vpmulhrsw xm19, xm23, [cq+64*13] + {evex}vpmulhrsw ym23, [cq+64* 3] + vpermb m21, m14, m21 + punpcklwd xm17, xm17 + vpermb ym20, ym14, ym20 + vpermb m15, m14, m15 + vpermb m18, m14, m18 + vpermb ym16, ym14, ym16 + punpcklwd xm19, xm19 + vpermb m14, m14, m23 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m9, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round + vshufi32x4 m16, m0, m3, q2020 ; 0 + vshufi32x4 m26, m0, m3, q3131 ; 4 + vshufi32x4 m0, m14, m2, q2020 ; 1 + vshufi32x4 m14, m2, q3131 ; 5 + vshufi32x4 m3, m19, m7, q3131 ; 15 + vshufi32x4 m19, m7, q2020 ; 11 + vshufi32x4 m27, m17, m9, q2020 ; 3 + vshufi32x4 m17, m9, q3131 ; 7 + vshufi32x4 m28, m20, m6, q2020 ; 9 + vshufi32x4 m20, m6, q3131 ; 13 + vshufi32x4 m22, m1, m18, q2020 ; 2 + vshufi32x4 m23, m1, m18, q3131 ; 6 + vshufi32x4 m24, m5, m15, q2020 ; 10 + vshufi32x4 m25, m5, m15, q3131 ; 14 + vshufi32x4 m15, m21, m4, q3131 ; 12 + vshufi32x4 m21, m21, m4, q2020 ; 8 + mov r4, rsp + call .main_part1_fast + mova m0, m17 + mova m3, m28 + call .main_part1_fast + mova m0, m14 + mova m3, m19 + call .main_part1_fast + mova m0, m27 + mova m3, m20 + call .main_part1_fast + call .main_part2 + mova m0, m16 + mova m1, m21 + mova m14, m26 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [cq+64*14], m21 + mova [cq+64* 0], m14 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64* 4], m16 + mova [cq+64* 2], m15 + mova [cq+64*12], m20 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 +.end: + lea r4, [strideq*3] + vpbroadcastd m12, [o(pw_2048)] + movshdup m13, [o(permD)] + lea r5, [r4+strideq] ; stride*4 + lea r3, [dstq+r4*8] + lea r6, [strideq+r5*8] ; stride*33 + lea r8, [r4+r5*8] ; stride*35 + add r3, r5 ; dst+stride*28 + lea r7, [r6+strideq] ; stride*34 +%macro IDCT_32x64_END 6 ; src, mem, stride[1-4] +%if %2 < 8 + paddsw m10, m%2, m%1 + psubsw m11, m%2, m%1 +%else + mova m11, [cq+64*(%2*2-16)] + paddsw m10, m11, m%1 + psubsw m11, m%1 +%endif + mova m9, [rsp+64*(31-%2)] + mova m%1, [rsp+64*%2] 
+ paddsw m8, m10, m9 + psubsw m10, m9 + paddsw m9, m11, m%1 + pmovzxbw m0, [dstq+%3] + psubsw m11, m%1 + pmovzxbw m%1, [r3 +%4] + REPX {pmulhrsw x, m12}, m8, m10, m9, m11 + paddw m8, m0 + pmovzxbw m0, [r3 +%5] + paddw m10, m%1 + pmovzxbw m%1, [dstq+%6] + paddw m9, m0 + paddw m11, m%1 +%if %2 >= 8 +%if %2 == 8 + pxor m1, m1 +%endif + mova [cq+64*(%2*2-16)], m1 + mova [cq+64*(%2*2-15)], m1 +%endif + packuswb m8, m10 + packuswb m9, m11 + vpermq m8, m13, m8 + vpermq m9, m13, m9 + mova [dstq+%3], ym8 + vextracti32x8 [r3 +%4], m8, 1 + mova [r3 +%5], ym9 + vextracti32x8 [dstq+%6], m9, 1 +%if %2 == 3 || %2 == 7 || %2 == 11 + add dstq, r5 + sub r3, r5 +%endif +%endmacro + IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8 + IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8 + IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8 + IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 +ALIGN function_align ; bottom three-quarters are zero +.main_part1_fast: + vpbroadcastd m1, [o(idct64_mul+4*0)] + vpbroadcastd m8, [o(idct64_mul+4*1)] + vpbroadcastd m2, [o(idct64_mul+4*6)] + vpbroadcastd m9, [o(idct64_mul+4*7)] + pmulhrsw m1, m0 ; t63a + pmulhrsw m0, m8 ; t32a + pmulhrsw m2, m3 ; t60a + pmulhrsw m3, m9 ; t35a + mova m8, m0 + mova m7, m1 + mova m6, m3 + mova m5, m2 + jmp .main_part1b +.main_part1: + ; idct64 steps 1-5: + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vpbroadcastd m7, [o(idct64_mul+4*0)] + vpbroadcastd m8, [o(idct64_mul+4*1)] + vpbroadcastd m6, [o(idct64_mul+4*2)] + vpbroadcastd m9, [o(idct64_mul+4*3)] + pmulhrsw m7, m0 ; t63a + vpbroadcastd m5, [o(idct64_mul+4*4)] + pmulhrsw m0, m8 ; t32a + vpbroadcastd m8, [o(idct64_mul+4*5)] + pmulhrsw m6, m1 ; t62a + vpbroadcastd m4, [o(idct64_mul+4*6)] + pmulhrsw m1, m9 ; t33a + vpbroadcastd m9, [o(idct64_mul+4*7)] + pmulhrsw m5, m2 ; t61a + pmulhrsw m2, m8 ; t34a + pmulhrsw m4, m3 ; t60a + pmulhrsw m3, m9 ; t35a + psubsw m8, m0, m1 ; t33 + paddsw m0, m1 ; t32 + psubsw m1, m7, m6 ; t62 + paddsw m7, m6 ; t63 + psubsw m6, m3, m2 ; t34 + paddsw m3, m2 ; t35 + psubsw m2, m4, m5 ; t61 + paddsw m5, m4 ; t60 +.main_part1b: + vpbroadcastd m11, [o(idct64_mul+4*8)] + vpbroadcastd m12, [o(idct64_mul+4*9)] + ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a + vpbroadcastd m11, [o(idct64_mul+4*10)] + ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a + vpbroadcastd m11, [o(idct64_mul+4*11)] + vpbroadcastd m12, [o(idct64_mul+4*12)] + psubsw m4, m0, m3 ; t35a + paddsw m0, m3 ; t32a + psubsw m3, m7, m5 ; t60a + paddsw m7, m5 ; t63a + psubsw m5, m1, m2 ; t34 + paddsw m1, m2 ; t33 + psubsw m2, m8, m6 ; t61 + paddsw m6, 
m8 ; t62 + add r5, 4*13 + ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60 + ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a + mova [r4+64*0], m0 + mova [r4+64*7], m7 + mova [r4+64*1], m1 + mova [r4+64*6], m6 + mova [r4+64*3], m3 + mova [r4+64*4], m4 + mova [r4+64*2], m2 + mova [r4+64*5], m5 + add r4, 64*8 + ret +.main_part2: + vpbroadcastd m11, [o(pw_1567_3784 -16*13)] + vpbroadcastd m12, [o(pw_m3784_1567 -16*13)] + lea r6, [r4+64*7] + vpbroadcastd m17, [o(pw_m1567_m3784-16*13)] + vpbroadcastd m18, [o(pw_2896_2896 -16*13)] + vpbroadcastd m19, [o(pw_m2896_2896 -16*13)] + sub r5, 16*13 +.main_part2_loop: + mova m0, [r4-64*32] ; t32a + mova m1, [r6-64*24] ; t39a + mova m2, [r6-64*32] ; t63a + mova m3, [r4-64*24] ; t56a + mova m4, [r4-64*16] ; t40a + mova m5, [r6-64* 8] ; t47a + mova m6, [r6-64*16] ; t55a + mova m7, [r4-64* 8] ; t48a + psubsw m8, m0, m1 ; t39 + paddsw m0, m1 ; t32 + psubsw m1, m2, m3 ; t56 + paddsw m2, m3 ; t63 + psubsw m3, m5, m4 ; t40 + paddsw m5, m4 ; t47 + psubsw m4, m7, m6 ; t55 + paddsw m7, m6 ; t48 + ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a + ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a + psubsw m6, m2, m7 ; t48a + paddsw m2, m7 ; t63a + psubsw m7, m0, m5 ; t47a + paddsw m0, m5 ; t32a + psubsw m5, m8, m3 ; t55 + paddsw m8, m3 ; t56 + psubsw m3, m1, m4 ; t40 + paddsw m1, m4 ; t39 + ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48 + ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a + mova [r6-64* 8], m2 + mova [r4-64*32], m0 + mova [r4-64* 8], m8 + mova [r6-64*32], m1 + mova [r6-64*24], m6 + mova [r4-64*16], m7 + mova [r4-64*24], m5 + mova [r6-64*16], m3 + add r4, 64 + sub r6, 64 + cmp r4, r6 + jb .main_part2_loop + ret + +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob + vpbroadcastd m23, [o(pw_2896x8)] +%undef cmp + cmp eobd, 136 + jb .fast + pmulhrsw m0, m23, [cq+64* 1] + pmulhrsw m1, m23, [cq+64*31] + pmulhrsw m2, m23, [cq+64*17] + pmulhrsw m3, m23, [cq+64*15] + vpbroadcastd m10, [o(pd_2048)] + mov r4, rsp + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + pmulhrsw m0, m23, [cq+64* 7] + pmulhrsw m1, m23, [cq+64*25] + pmulhrsw m2, m23, [cq+64*23] + pmulhrsw m3, m23, [cq+64* 9] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + pmulhrsw m0, m23, [cq+64* 5] + pmulhrsw m1, m23, [cq+64*27] + pmulhrsw m2, m23, [cq+64*21] + pmulhrsw m3, m23, [cq+64*11] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + pmulhrsw m0, m23, [cq+64* 3] + pmulhrsw m1, m23, [cq+64*29] + pmulhrsw m2, m23, [cq+64*19] + pmulhrsw m3, m23, [cq+64*13] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + pmulhrsw m3, m23, [cq+64*24] + pmulhrsw m1, m23, [cq+64* 8] + pmulhrsw m2, m23, [cq+64*16] + pmulhrsw m0, m23, [cq+64* 0] + pmulhrsw m14, m23, [cq+64* 4] + pmulhrsw m17, m23, [cq+64*28] + pmulhrsw m16, m23, [cq+64*20] + pmulhrsw m15, m23, [cq+64*12] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + pmulhrsw m22, m23, [cq+64* 2] + pmulhrsw m29, m23, [cq+64*30] + pmulhrsw m26, m23, [cq+64*18] + pmulhrsw m25, m23, [cq+64*14] + pmulhrsw m24, m23, [cq+64*10] + pmulhrsw m27, m23, [cq+64*22] + pmulhrsw m28, m23, [cq+64*26] + pmulhrsw m23, [cq+64* 6] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + 
vpbroadcastd m13, [o(pw_16384)] + call .pass1_end_part1 + mova [cq+64*16], m1 + mova [cq+64*17], m3 + mova [cq+64*18], m5 + mova [cq+64*19], m7 + mova [cq+64*24], m23 + mova [cq+64*25], m25 + mova [cq+64*26], m27 + mova [cq+64*27], m29 + pmulhrsw m23, m13, m0 ; a0 + pmulhrsw m25, m13, m2 ; a2 + pmulhrsw m27, m13, m4 ; a4 + pmulhrsw m29, m13, m6 ; a6 + REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6 + call .pass1_end_part2 + mova [cq+64*20], m15 + mova [cq+64*21], m17 + mova [cq+64*22], m19 + mova [cq+64*23], m21 + mova [cq+64*28], m1 + mova [cq+64*29], m3 + mova [cq+64*30], m5 + mova [cq+64*31], m7 + REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 + REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 + vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 + vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 + vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 + vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 + vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 + vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 + vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 + vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43 + vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 + vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 + vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 + vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 + vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 + vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 + vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 + vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 + vshufi32x4 m2, m3, m15, q3131 ; 8 + vshufi32x4 m0, m3, m15, q2020 ; 0 + vshufi32x4 m6, m23, m22, q3131 ; 24 + vshufi32x4 m4, m23, m22, q2020 ; 16 + vshufi32x4 m3, m1, m18, q3131 ; 12 + vshufi32x4 m1, m18, q2020 ; 4 + vshufi32x4 m7, m27, m26, q3131 ; 28 + vshufi32x4 m5, m27, m26, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + vshufi32x4 m16, m14, m17, q3131 ; 10 + vshufi32x4 m14, m17, q2020 ; 2 + vshufi32x4 m17, m19, m20, q3131 ; 14 + vshufi32x4 m15, m19, m20, q2020 ; 6 + vshufi32x4 m20, m25, m24, q3131 ; 26 + vshufi32x4 m18, m25, m24, q2020 ; 18 + vshufi32x4 m21, m29, m28, q3131 ; 30 + vshufi32x4 m19, m29, m28, q2020 ; 22 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + pmulhrsw m22, m13, [cq+64*16] ; a1 + pmulhrsw m23, m13, [cq+64*20] ; c1 + pmulhrsw m24, m13, [cq+64*24] ; e1 + pmulhrsw m25, m13, [cq+64*28] ; g1 + pmulhrsw m26, m13, [cq+64*17] ; a3 + pmulhrsw m27, m13, [cq+64*21] ; c3 + pmulhrsw m28, m13, [cq+64*25] ; e3 + pmulhrsw m29, m13, [cq+64*29] ; g3 + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + pmulhrsw m14, m13, [cq+64*18] ; a5 + pmulhrsw m15, m13, [cq+64*22] ; c5 + pmulhrsw m16, m13, [cq+64*26] ; e5 + pmulhrsw m17, m13, [cq+64*30] ; g5 + pmulhrsw m18, m13, [cq+64*19] ; a7 + pmulhrsw m19, m13, [cq+64*23] ; c7 + pmulhrsw m20, m13, [cq+64*27] ; e7 + pmulhrsw m21, m13, [cq+64*31] ; g7 + vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 + vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 + vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 + vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 + vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 + vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 + vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 + vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 + mova [cq+64* 0], m0 + mova [cq+64* 1], m1 + mova [cq+64* 2], m2 + mova [cq+64* 3], m3 + mova [cq+64* 4], m4 + mova [cq+64* 5], m5 + mova [cq+64* 6], m6 + mova [cq+64* 7], m7 + 
vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 + vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 + vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 + vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 + vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 + vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 + vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71 + vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 + vshufi32x4 m27, m23, m11, q3131 ; 11 m27 + vshufi32x4 m23, m11, q2020 ; 3 m23 + vshufi32x4 m19, m26, m28, q3131 ; 27 m19 + vshufi32x4 m15, m26, m28, q2020 ; 19 m15 + vshufi32x4 m29, m25, m17, q3131 ; 15 m29 + vshufi32x4 m25, m17, q2020 ; 7 m25 + vshufi32x4 m21, m18, m20, q3131 ; 31 m21 + vshufi32x4 m17, m18, m20, q2020 ; 23 m17 + vshufi32x4 m20, m14, m16, q3131 ; 29 m20 + vshufi32x4 m16, m14, m16, q2020 ; 21 m16 + vshufi32x4 m18, m22, m24, q3131 ; 25 m18 + vshufi32x4 m14, m22, m24, q2020 ; 17 m14 + vshufi32x4 m26, m8, m9, q3131 ; 9 m26 + vshufi32x4 m22, m8, m9, q2020 ; 1 m22 + vshufi32x4 m28, m12, m13, q3131 ; 13 m28 + vshufi32x4 m24, m12, m13, q2020 ; 5 m24 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + vpbroadcastd m13, [o(pw_16384)] + pmulhrsw m0, m13, [r4-64*21] + pmulhrsw m1, m13, [r4-64*22] + pmulhrsw m2, m13, [r4-64*23] + pmulhrsw m3, m13, [r4-64*24] + pmulhrsw m4, m13, [r4-64*25] + pmulhrsw m5, m13, [r4-64*26] + pmulhrsw m6, m13, [r4-64*27] + pmulhrsw m7, m13, [r4-64*28] + mova [cq+64*16], m14 + mova [cq+64*17], m15 + mova [cq+64*18], m16 + mova [cq+64*19], m17 + mova [cq+64*20], m18 + mova [cq+64*21], m19 + mova [cq+64*22], m20 + mova [cq+64*23], m21 + pmulhrsw m14, m13, [r4-64*12] + pmulhrsw m15, m13, [r4-64*11] + pmulhrsw m16, m13, [r4-64*10] + pmulhrsw m17, m13, [r4-64* 9] + pmulhrsw m18, m13, [r4-64* 8] + pmulhrsw m19, m13, [r4-64* 7] + pmulhrsw m20, m13, [r4-64* 6] + pmulhrsw m21, m13, [r4-64* 5] + mova [cq+64*24], m22 + mova [cq+64*25], m23 + mova [cq+64*26], m24 + mova [cq+64*27], m25 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call .transpose_2x8x8_lo + mova [r4-64*12], m1 + mova [r4-64*11], m3 + mova [r4-64*10], m5 + mova [r4-64* 9], m7 + mova [r4-64* 8], m15 + mova [r4-64* 7], m17 + mova [r4-64* 6], m19 + mova [r4-64* 5], m21 + vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 + vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 + vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 + vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 + vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 + vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 + vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 + vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 + pmulhrsw m0, m13, [r4-64*20] + pmulhrsw m1, m13, [r4-64*19] + pmulhrsw m2, m13, [r4-64*18] + pmulhrsw m3, m13, [r4-64*17] + pmulhrsw m4, m13, [r4-64*16] + pmulhrsw m5, m13, [r4-64*15] + pmulhrsw m6, m13, [r4-64*14] + pmulhrsw m7, m13, [r4-64*13] + pmulhrsw m14, m13, [r4-64*29] + pmulhrsw m15, m13, [r4-64*30] + pmulhrsw m16, m13, [r4-64*31] + pmulhrsw m17, m13, [r4-64*32] + pmulhrsw m18, m13, [r4-64*33] + pmulhrsw m19, m13, [r4-64*34] + pmulhrsw m20, m13, [r4-64*35] + pmulhrsw m21, m13, [r4-64*36] + call .transpose_2x8x8_lo + mova [r4-64*20], m1 + mova [r4-64*19], m3 + mova [r4-64*18], m5 + mova [r4-64*17], m7 + mova [r4-64*16], m15 + mova [r4-64*15], m17 + mova [r4-64*14], m19 + mova [r4-64*13], m21 + vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 + vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 + vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 + vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 + vinserti32x8 
m14, m2, ym16, 1 ; b20 b21 d20 d21 + vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 + vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 + vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 + vshufi32x4 m2, m0, m22, q3131 ; 8 + vshufi32x4 m0, m22, q2020 ; 0 + vshufi32x4 m3, m1, m26, q3131 ; 12 + vshufi32x4 m1, m26, q2020 ; 4 + vshufi32x4 m6, m4, m23, q3131 ; 24 + vshufi32x4 m4, m23, q2020 ; 16 + vshufi32x4 m7, m5, m27, q3131 ; 28 + vshufi32x4 m5, m27, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + vshufi32x4 m16, m14, m24, q3131 ; 10 + vshufi32x4 m14, m24, q2020 ; 2 + vshufi32x4 m17, m15, m28, q3131 ; 14 + vshufi32x4 m15, m28, q2020 ; 6 + vshufi32x4 m20, m18, m25, q3131 ; 26 + vshufi32x4 m18, m25, q2020 ; 18 + vshufi32x4 m21, m19, m29, q3131 ; 30 + vshufi32x4 m19, m29, q2020 ; 22 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova m22, [r4-64*20] + mova m26, [r4-64*16] + mova m23, [r4-64*19] + mova m27, [r4-64*15] + mova m24, [r4-64*18] + mova m28, [r4-64*14] + mova m25, [r4-64*17] + mova m29, [r4-64*13] + mova [r4-64*20], m14 + mova [r4-64*19], m15 + mova [r4-64*18], m16 + mova [r4-64*17], m17 + mova [r4-64*16], m18 + mova [r4-64*15], m19 + mova [r4-64*14], m20 + mova [r4-64*13], m21 + mova m19, [r4-64*12] + mova m11, [r4-64* 8] + mova m20, [r4-64*11] + mova m12, [r4-64* 7] + mova m21, [r4-64*10] + mova m8, [r4-64* 6] + mova m9, [r4-64* 9] + mova m18, [r4-64* 5] + vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 + vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 + vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 + vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 + vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 + vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 + vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 + vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 + vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 + vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 + vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 + vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 + vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 + vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53 + vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 + vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 + vshufi32x4 m26, m22, m27, q3131 ; 9 + vshufi32x4 m22, m27, q2020 ; 1 + vshufi32x4 m27, m23, m28, q3131 ; 11 + vshufi32x4 m23, m28, q2020 ; 3 + vshufi32x4 m28, m24, m29, q3131 ; 13 + vshufi32x4 m24, m29, q2020 ; 5 + vshufi32x4 m29, m25, m8, q3131 ; 15 + vshufi32x4 m25, m8, q2020 ; 7 + vshufi32x4 m18, m14, m19, q3131 ; 25 + vshufi32x4 m14, m19, q2020 ; 17 + vshufi32x4 m19, m15, m20, q3131 ; 27 + vshufi32x4 m15, m20, q2020 ; 19 + vshufi32x4 m20, m16, m21, q3131 ; 29 + vshufi32x4 m16, m21, q2020 ; 21 + vshufi32x4 m21, m17, m9, q3131 ; 31 + vshufi32x4 m17, m9, q2020 ; 23 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + jmp .end +.fast: ; bottom/right halves are zero + {evex}vpmulhrsw ym8, ym23, [cq+64* 4] + {evex}vpmulhrsw xm1, xm23, [cq+64*12] + mova m28, [o(dup16_perm)] + {evex}vpmulhrsw ym7, ym23, [cq+64* 8] + vpmulhrsw ym22, ym23, [cq+64* 0] + vpermb m8, m28, m8 + vpermb ym1, ym28, ym1 + vpermb m7, m28, m7 + pmovzxwd m9, ym22 + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + {evex}vpmulhrsw ym21, ym23, [cq+64* 2] + {evex}vpmulhrsw xm15, xm23, [cq+64*14] + {evex}vpmulhrsw xm18, xm23, [cq+64*10] + {evex}vpmulhrsw ym14, ym23, [cq+64* 6] + vpermb m21, m28, m21 + punpcklwd xm15, xm15 + vpermb ym18, ym28, ym18 + vpermb m14, m28, m14 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vpmulhrsw ym22, ym23, [cq+64* 1] + 
{evex}vpmulhrsw xm29, xm23, [cq+64*15] + {evex}vpmulhrsw xm26, xm23, [cq+64* 9] + {evex}vpmulhrsw ym25, ym23, [cq+64* 7] + {evex}vpmulhrsw ym24, ym23, [cq+64* 5] + {evex}vpmulhrsw xm27, xm23, [cq+64*11] + {evex}vpmulhrsw xm8, xm23, [cq+64*13] + {evex}vpmulhrsw ym23, [cq+64* 3] + vpermb m22, m28, m22 + punpcklwd xm29, xm29 + vpermb ym26, ym28, ym26 + vpermb m25, m28, m25 + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + REPX {vpermb x, m28, x}, m24, m27, m23 + punpcklwd xm28, xm8, xm8 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + mov r4, rsp + vpbroadcastd m13, [o(pw_16384)] + mova [r4+64*16], m4 + mova [r4+64*17], m5 + mova [r4+64*18], m6 + mova [r4+64*19], m7 + mova [r4+64*28], m26 + mova [r4+64*29], m27 + mova [r4+64*30], m28 + mova [r4+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end + mova [r4+64*20], m22 + mova [r4+64*21], m23 + mova [r4+64*22], m24 + mova [r4+64*23], m25 + mova [r4+64*24], m26 + mova [r4+64*25], m27 + mova [r4+64*26], m28 + mova [r4+64*27], m29 + call .pass2_fast + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + mova [cq+64* 0], m0 + mova [cq+64* 1], m1 + mova [cq+64* 2], m2 + mova [cq+64* 3], m3 + mova [cq+64* 4], m4 + mova [cq+64* 5], m5 + mova [cq+64* 6], m6 + mova [cq+64* 7], m7 + pmulhrsw m0, m13, [r4+64*16] + pmulhrsw m1, m13, [r4+64*17] + pmulhrsw m2, m13, [r4+64*18] + pmulhrsw m3, m13, [r4+64*19] + pmulhrsw m4, m13, [r4+64*20] + pmulhrsw m5, m13, [r4+64*21] + pmulhrsw m6, m13, [r4+64*22] + pmulhrsw m7, m13, [r4+64*23] + mova [cq+64*16], m14 + mova [cq+64*17], m15 + mova [cq+64*18], m16 + mova [cq+64*19], m17 + mova [cq+64*20], m18 + mova [cq+64*21], m19 + mova [cq+64*22], m20 + mova [cq+64*23], m21 + pmulhrsw m14, m13, [r4+64*24] + pmulhrsw m15, m13, [r4+64*25] + pmulhrsw m16, m13, [r4+64*26] + pmulhrsw m17, m13, [r4+64*27] + pmulhrsw m18, m13, [r4+64*28] + pmulhrsw m19, m13, [r4+64*29] + pmulhrsw m20, m13, [r4+64*30] + pmulhrsw m21, m13, [r4+64*31] + mova [cq+64*24], m22 + mova [cq+64*25], m23 + mova [cq+64*26], m24 + mova [cq+64*27], m25 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round + call .pass2_fast + mova [r4+64*16], m14 + mova [r4+64*17], m15 + mova [r4+64*18], m16 + mova [r4+64*19], m17 + mova [r4+64*20], m18 + mova [r4+64*21], m19 + mova [r4+64*22], m20 + mova [r4+64*23], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast +.end: + vpbroadcastd m13, [o(pw_2048)] + lea r5, [strideq*3] + pxor m12, m12 + lea r3, [dstq+r5*8] + lea r6, [strideq+r5] ; stride*4 + add r3, r6 ; dst+stride*28 +%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi + mova m11, [cq+64*( %3)] ; 0 + mova m9, [cq+64*(31-%3)] ; 31 +%if %3 >= 8 + mova m%1, [rsp+64*(%1+16)] +%endif + mova m10, [dstq+%4] + paddsw m8, m11, m9 + psubsw m11, m9 + paddsw m9, m%1, m%2 + psubsw m%1, m%2 + punpcklbw m%2, m10, m12 + punpckhbw m10, m12 + pmulhrsw m8, m13 + pmulhrsw m9, m13 + paddw m8, m%2 + paddw m9, m10 + mova m10, [r3+%5] + pmulhrsw m11, m13 + pmulhrsw m%1, m13 + mova [cq+64*( %3)], m12 + mova [cq+64*(31-%3)], m12 + punpcklbw m%2, m10, m12 + punpckhbw m10, m12 + packuswb m8, m9 + paddw m11, m%2 + paddw m%1, m10 + 
packuswb m11, m%1 + mova [dstq+%4], m8 + mova [r3 +%5], m11 +%if %3 == 3 || %3 == 7 || %3 == 11 + add dstq, r6 + sub r3, r6 +%endif +%endmacro + IDCT_64x32_END 0, 29, 0, strideq*0, r5 + IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 + IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 + IDCT_64x32_END 3, 26, 3, r5 , strideq*0 + IDCT_64x32_END 4, 25, 4, strideq*0, r5 + IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2 + IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1 + IDCT_64x32_END 7, 22, 7, r5 , strideq*0 + IDCT_64x32_END 0, 21, 8, strideq*0, r5 + IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2 + IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1 + IDCT_64x32_END 3, 18, 11, r5 , strideq*0 + IDCT_64x32_END 4, 17, 12, strideq*0, r5 + IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2 + IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1 + IDCT_64x32_END 7, 14, 15, r5 , strideq*0 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2 +ALIGN function_align +.pass1_end_part1: +%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64 +%if %1 != %3 + mova m%1, [cq+64*%1] +%endif + mova m9, [r4+64*(%3-36)] ; idct64 32+n + mova m11, [r4+64*(-5-%3)] ; idct64 63-n + psubsw m8, m%1, m%2 ; idct32 31-n + paddsw m%1, m%2 ; idct32 0+n +%if %1 == %3 + psubsw m%2, m8, m9 ; out 32+n e + paddsw m8, m9 ; out 31-n d + psubsw m9, m%1, m11 ; out 63-n h + paddsw m%1, m11 ; out 0+n a +%else + paddsw m%2, m8, m9 ; out 23-n c + psubsw m8, m9 ; out 40+n f + paddsw m9, m%1, m11 ; out 8+n b + psubsw m%1, m11 ; out 55-n g +%endif + mova [r4+64*(%3-36)], m8 + mova [r4+64*(-5-%3)], m9 +%endmacro + IDCT_64x32_PASS1_END 0, 29, 0 + IDCT_64x32_PASS1_END 1, 28, 1 + IDCT_64x32_PASS1_END 2, 27, 2 + IDCT_64x32_PASS1_END 3, 26, 3 + IDCT_64x32_PASS1_END 4, 25, 4 + IDCT_64x32_PASS1_END 5, 24, 5 + IDCT_64x32_PASS1_END 6, 23, 6 + IDCT_64x32_PASS1_END 7, 22, 7 +.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted) + punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3 + punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3 + punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5 + punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckhqdq m23, m22, m27 ; 1 23 + punpcklqdq m22, m27 ; 0 22 + punpckhqdq m27, m26, m28 ; 5 27 + punpcklqdq m26, m28 ; 4 26 + punpcklqdq m28, m29, m25 ; 6 28 + punpckhqdq m29, m25 ; 7 29 + punpckhqdq m25, m24, m8 ; 3 25 + punpcklqdq m24, m8 ; 2 24 +.transpose_8x8: + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + 
punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret +.pass1_end_part2: + IDCT_64x32_PASS1_END 0, 21, 8 + IDCT_64x32_PASS1_END 1, 20, 9 + IDCT_64x32_PASS1_END 2, 19, 10 + IDCT_64x32_PASS1_END 3, 18, 11 + IDCT_64x32_PASS1_END 4, 17, 12 + IDCT_64x32_PASS1_END 5, 16, 13 + IDCT_64x32_PASS1_END 6, 15, 14 + IDCT_64x32_PASS1_END 7, 14, 15 +.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 + punpcklwd m8, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m7, m6 + punpckhwd m7, m6 + punpcklwd m6, m5, m4 + punpckhwd m5, m4 + punpckldq m4, m7, m5 + punpckhdq m7, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m6 + punpckldq m0, m6 + punpckldq m6, m3, m1 + punpckhdq m3, m1 + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m5, m4, m6 + punpcklqdq m4, m6 + punpcklqdq m6, m7, m3 + punpckhqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + punpckhwd m8, m18, m19 + punpcklwd m18, m19 + punpckhwd m19, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m20, m21 + punpcklwd m20, m21 + punpckhwd m21, m16, m17 + punpcklwd m16, m17 + punpckhdq m17, m14, m16 + punpckldq m14, m16 + punpckldq m16, m18, m20 + punpckhdq m18, m20 + punpckhdq m20, m19, m21 + punpckldq m19, m21 + punpckldq m21, m8, m15 + punpckhdq m8, m15 + punpckhqdq m15, m14, m16 + punpcklqdq m14, m16 + punpcklqdq m16, m17, m18 + punpckhqdq m17, m18 + punpcklqdq m18, m19, m21 + punpckhqdq m19, m21 + punpckhqdq m21, m20, m8 + punpcklqdq m20, m8 + ret +.pass2_fast: + vshufi32x4 m24, m9, m15, q3131 ; 5 + vshufi32x4 m22, m9, m15, q2020 ; 1 + vshufi32x4 m15, m1, m16, q3131 ; 6 + vshufi32x4 m14, m1, m16, q2020 ; 2 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m8, m2, q3131 ; 12 + vshufi32x4 m2, m8, m2, q2020 ; 8 + vshufi32x4 m25, m11, m17, q3131 ; 7 + vshufi32x4 m23, m11, m17, q2020 ; 3 + vshufi32x4 m17, m5, m19, q3131 ; 14 + vshufi32x4 m16, m5, m19, q2020 ; 10 + vshufi32x4 m29, m6, m20, q3131 ; 15 + vshufi32x4 m27, m6, m20, q2020 ; 11 + vshufi32x4 m28, m4, m18, q3131 ; 13 + vshufi32x4 m26, m4, m18, q2020 ; 9 + jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob +%undef cmp + cmp eobd, 136 + jb .fast + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + vpbroadcastd m10, [o(pd_2048)] + mov r4, rsp + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + mova m14, [cq+64* 4] + mova m15, [cq+64*12] + mova m16, [cq+64*20] + mova m17, [cq+64*28] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova m22, [cq+64* 2] + mova m29, [cq+64*30] + mova m26, [cq+64*18] + mova m25, [cq+64*14] + mova m24, [cq+64*10] + mova m27, [cq+64*22] + mova m28, [cq+64*26] + mova m23, [cq+64* 6] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], 
m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 + mova [r4+64*36], m1 + mova [r4+64*37], m3 + mova [r4+64*38], m5 + mova [r4+64*39], m7 + mova [r4+64*44], m23 + mova [r4+64*45], m25 + mova [r4+64*46], m27 + mova [r4+64*47], m29 + pmulhrsw m23, m13, m0 ; a0 + pmulhrsw m25, m13, m2 ; a2 + pmulhrsw m27, m13, m4 ; a4 + pmulhrsw m29, m13, m6 ; a6 + call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 + lea r6, [r4-64*4] + add r4, 64*28 + call .pass2_end + mov r4, rsp + mova m0, [r4+64*23] + mova m1, [r4+64*22] + mova m2, [r4+64*21] + mova m3, [r4+64*20] + mova m4, [r4+64*19] + mova m5, [r4+64*18] + mova m6, [r4+64*17] + mova m7, [r4+64*16] + mova m22, [r4+64*15] + mova m23, [r4+64*14] + mova m24, [r4+64*13] + mova m25, [r4+64*12] + mova m26, [r4+64*11] + mova m27, [r4+64*10] + mova m28, [r4+64* 9] + mova m29, [r4+64* 8] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi + vpbroadcastd m13, [o(pw_8192)] + mova [r4+64* 8], m1 + mova [r4+64* 9], m3 + mova [r4+64*10], m5 + mova [r4+64*11], m7 + mova [r4+64*16], m23 + mova [r4+64*17], m25 + mova [r4+64*18], m27 + mova [r4+64*19], m29 + pmulhrsw m23, m13, m0 ; b0 + pmulhrsw m25, m13, m2 ; b2 + pmulhrsw m27, m13, m4 ; b4 + pmulhrsw m29, m13, m6 ; b6 + mova m0, [r4+64*31] + mova m1, [r4+64*30] + mova m2, [r4+64*29] + mova m3, [r4+64*28] + mova m4, [r4+64*27] + mova m5, [r4+64*26] + mova m6, [r4+64*25] + mova m7, [r4+64*24] + mova m14, [r4+64* 7] + mova m15, [r4+64* 6] + mova m16, [r4+64* 5] + mova m17, [r4+64* 4] + mova m18, [r4+64* 3] + mova m19, [r4+64* 2] + mova m20, [r4+64* 1] + mova m21, [r4+64* 0] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo + mov r6, cq + call .pass2_end + jmp .end +.fast: ; bottom/right halves are zero + mova m28, [o(dup16_perm)] + pmovzxwd m9, [cq+64* 0] + vpermb m8, m28, [cq+64* 4] + vpermb ym1, ym28, [cq+64*12] + vpermb m7, m28, [cq+64* 8] + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + vpermb m21, m28, [cq+64* 2] + vpermb ym15, ym28, [cq+64*14] + vpermb ym18, ym28, [cq+64*10] + vpermb m14, m28, [cq+64* 6] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vpermb m22, m28, [cq+64* 1] + vpermb ym29, ym28, [cq+64*15] + vpermb ym26, ym28, [cq+64* 9] + vpermb m25, m28, [cq+64* 7] + vpermb m24, m28, [cq+64* 5] + vpermb ym27, ym28, [cq+64*11] + vpermb m23, m28, [cq+64* 3] + vpermb ym28, ym28, [cq+64*13] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + mova [cq+64*16], m4 + mova [cq+64*17], m5 + mova [cq+64*18], m6 + mova [cq+64*19], m7 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end + mova [cq+64*20], m22 + mova [cq+64*21], m23 + mova [cq+64*22], m24 + mova [cq+64*23], m25 + mova [cq+64*24], m26 + mova [cq+64*25], m27 + mova [cq+64*26], m28 + mova [cq+64*27], m29 + lea r4, [rsp+64*64] + lea r3, [rsp+64*32] + call .pass2_fast + pmulhrsw m0, m13, [cq+64*16] + pmulhrsw m1, m13, [cq+64*17] + pmulhrsw m2, m13, [cq+64*18] + pmulhrsw m3, m13, [cq+64*19] + pmulhrsw m4, m13, [cq+64*20] + pmulhrsw m5, m13, [cq+64*21] + pmulhrsw m6, m13, [cq+64*22] + pmulhrsw m7, m13, [cq+64*23] + pmulhrsw m14, 
m13, [cq+64*24] + pmulhrsw m15, m13, [cq+64*25] + pmulhrsw m16, m13, [cq+64*26] + pmulhrsw m17, m13, [cq+64*27] + pmulhrsw m18, m13, [cq+64*28] + pmulhrsw m19, m13, [cq+64*29] + pmulhrsw m20, m13, [cq+64*30] + pmulhrsw m21, m13, [cq+64*31] + call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round + mov r4, rsp + mov r3, cq + call .pass2_fast +.end: + vpbroadcastd m17, [o(pw_2048)] + lea r5, [strideq*8] + mov r3, dstq + pxor m16, m16 + sub r4, 64*5 ; rsp+64*31 + mov r6, rsp +.end_loop: + mova m2, [r6+64*32] ; idct16 0+n lo + mova m7, [r6+64*48] ; idct32 31-n lo + mova m6, [cq+64* 0] ; idct16 0+n hi + mova m0, [cq+64*16] ; idct32 31-n hi + mova m4, [r4+64*64] ; idct64 63-n lo + mova m1, [r4+64* 0] ; idct64 63-n hi + mova m5, [r6+64*64] ; idct64 32+n lo + mova m8, [r6+64* 0] ; idct64 32+n hi + sub r3, strideq + paddsw m3, m2, m7 ; idct32 0+n lo + mova m12, [dstq+r5*0] + psubsw m2, m7 ; idct32 31-n lo + mova m15, [r3 +r5*8] + paddsw m7, m6, m0 ; idct32 0+n hi + mova m13, [r3 +r5*4] + psubsw m6, m0 ; idct32 31-n hi + mova m14, [dstq+r5*4] + paddsw m0, m3, m4 ; out 0+n lo + add r6, 64 + psubsw m3, m4 ; out 63-n lo + sub r4, 64 + paddsw m4, m7, m1 ; out 0+n hi + mova [cq+64* 0], m16 + psubsw m7, m1 ; out 63-n hi + mova [cq+64*16], m16 + paddsw m1, m2, m5 ; out 31-n lo + add cq, 64 + psubsw m2, m5 ; out 32+n lo + paddsw m5, m6, m8 ; out 31-n hi + psubsw m6, m8 ; out 32+n hi + pmulhrsw m0, m17 + punpcklbw m8, m12, m16 + pmulhrsw m4, m17 + punpckhbw m12, m16 + pmulhrsw m3, m17 + punpcklbw m11, m15, m16 + pmulhrsw m7, m17 + punpckhbw m15, m16 + pmulhrsw m1, m17 + punpcklbw m9, m13, m16 + pmulhrsw m5, m17 + punpckhbw m13, m16 + pmulhrsw m2, m17 + punpcklbw m10, m14, m16 + pmulhrsw m6, m17 + punpckhbw m14, m16 + paddw m0, m8 + paddw m4, m12 + packuswb m0, m4 + paddw m3, m11 + paddw m7, m15 + packuswb m3, m7 + paddw m1, m9 + paddw m5, m13 + packuswb m1, m5 + paddw m2, m10 + paddw m6, m14 + packuswb m2, m6 + mova [dstq+r5*0], m0 + mova [r3 +r5*8], m3 + mova [r3 +r5*4], m1 + mova [dstq+r5*4], m2 + add dstq, strideq + cmp r6, r4 + jb .end_loop + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly +ALIGN function_align +.pass2_end: + REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6 + mova [r4+64*20], m1 + mova [r4+64*21], m3 + mova [r4+64*22], m5 + mova [r4+64*23], m7 + vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01 + vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03 + vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01 + vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03 + mova [r4+64*12], m15 + mova [r4+64*13], m17 + mova [r4+64*14], m19 + mova [r4+64*15], m21 + vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 + vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 + vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 + vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 + vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 + vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 + vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 + vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 + vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 + vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 + vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 + vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 + vshufi32x4 m0, m1, m5, q2020 ; 0 + vshufi32x4 m1, m5, q3131 ; 8 + vshufi32x4 m2, m3, m14, q2020 ; 16 + vshufi32x4 m3, m14, q3131 ; 24 + vshufi32x4 m14, m15, m18, q2020 ; 4 + vshufi32x4 m15, m18, q3131 ; 12 + vshufi32x4 m16, m17, m19, q2020 ; 20 + vshufi32x4 m17, m19, 
q3131 ; 28 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + vshufi32x4 m24, m22, m25, q3131 ; 10 + vshufi32x4 m22, m25, q2020 ; 2 + vshufi32x4 m25, m23, m28, q3131 ; 14 + vshufi32x4 m23, m28, q2020 ; 6 + vshufi32x4 m28, m26, m27, q3131 ; 26 + vshufi32x4 m26, m27, q2020 ; 18 + vshufi32x4 m27, m29, m13, q2020 ; 22 + vshufi32x4 m29, m13, q3131 ; 30 + mova [r6+64* 0], m0 + mova [r6+64* 1], m1 + mova [r6+64* 2], m2 + mova [r6+64* 3], m3 + mova [r6+64* 4], m4 + mova [r6+64* 5], m5 + mova [r6+64* 6], m6 + mova [r6+64* 7], m7 + mova [r6+64* 8], m14 + mova [r6+64* 9], m15 + mova [r6+64*10], m16 + mova [r6+64*11], m17 + mova [r6+64*12], m18 + mova [r6+64*13], m19 + mova [r6+64*14], m20 + mova [r6+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + mova [r6+64*16], m29 + mova [r6+64*17], m28 + mova [r6+64*18], m27 + mova [r6+64*19], m26 + mova [r6+64*20], m25 + mova [r6+64*21], m24 + mova [r6+64*22], m23 + mova [r6+64*23], m22 + mova [r6+64*24], m21 + mova [r6+64*25], m20 + mova [r6+64*26], m19 + mova [r6+64*27], m18 + mova [r6+64*28], m17 + mova [r6+64*29], m16 + mova [r6+64*30], m15 + mova [r6+64*31], m14 + pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 + pmulhrsw m16, m13, [r4+64*12] + pmulhrsw m17, m13, [r4+64*16] + pmulhrsw m18, m13, [r4+64*20] + pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 + pmulhrsw m20, m13, [r4+64*15] + pmulhrsw m21, m13, [r4+64*19] + pmulhrsw m22, m13, [r4+64*23] + vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 + vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 + vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 + vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 + pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 + pmulhrsw m24, m13, [r4+64*14] + pmulhrsw m25, m13, [r4+64*18] + pmulhrsw m26, m13, [r4+64*22] + vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 + vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 + vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 + vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 + pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 + pmulhrsw m28, m13, [r4+64*13] + pmulhrsw m29, m13, [r4+64*17] + pmulhrsw m13, [r4+64*21] + vshufi32x4 m0, m14, m16, q2020 ; 1 + vshufi32x4 m1, m19, m21, q3131 ; 31 + vshufi32x4 m2, m15, m17, q2020 ; 17 + vshufi32x4 m3, m18, m20, q3131 ; 15 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vshufi32x4 m0, m18, m20, q2020 ; 7 + vshufi32x4 m1, m15, m17, q3131 ; 25 + vshufi32x4 m2, m19, m21, q2020 ; 23 + vshufi32x4 m3, m14, m16, q3131 ; 9 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 + vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 + vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13 + vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29 + vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11 + vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27 + vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11 + vshufi32x4 m29, m13, q3232 ; e19 e17 g19 g27 + vshufi32x4 m0, m22, m24, q2020 ; 5 + vshufi32x4 m1, m27, m29, q3131 ; 27 + vshufi32x4 m2, m23, m25, q2020 ; 21 + vshufi32x4 m3, m26, m28, q3131 ; 11 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vshufi32x4 m0, m26, m28, q2020 ; 3 + vshufi32x4 m1, m23, m25, q3131 ; 29 + vshufi32x4 m2, m27, m29, q2020 ; 19 + vshufi32x4 m3, m22, m24, q3131 ; 13 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 +ALIGN function_align +.pass2_fast: + vshufi32x4 m23, m1, m16, q3131 ; 6 + vshufi32x4 m22, m1, m16, q2020 ; 2 + vshufi32x4 m14, m0, m3, q3131 ; 4 + vshufi32x4 m26, m0, m3, q2020 ; 0 + 
vshufi32x4 m28, m9, m15, q3131 ; 5 + vshufi32x4 m0, m9, m15, q2020 ; 1 + vshufi32x4 m16, m11, m17, q3131 ; 7 + vshufi32x4 m29, m11, m17, q2020 ; 3 + vshufi32x4 m15, m8, m2, q3131 ; 12 + vshufi32x4 m27, m8, m2, q2020 ; 8 + vshufi32x4 m25, m5, m19, q3131 ; 14 + vshufi32x4 m24, m5, m19, q2020 ; 10 + vshufi32x4 m3, m6, m20, q3131 ; 15 + vshufi32x4 m19, m6, m20, q2020 ; 11 + vshufi32x4 m17, m4, m18, q3131 ; 13 + vshufi32x4 m18, m4, m18, q2020 ; 9 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m16 + mova m3, m18 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m28 + mova m3, m19 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m29 + mova m3, m17 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + mova m0, m26 + mova m1, m27 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [r3+64* 0], m0 + mova [r3+64* 1], m1 + mova [r3+64* 2], m2 + mova [r3+64* 3], m3 + mova [r3+64* 4], m4 + mova [r3+64* 5], m5 + mova [r3+64* 6], m6 + mova [r3+64* 7], m7 + mova [r3+64* 8], m14 + mova [r3+64* 9], m15 + mova [r3+64*10], m16 + mova [r3+64*11], m17 + mova [r3+64*12], m18 + mova [r3+64*13], m19 + mova [r3+64*14], m20 + mova [r3+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 + mova [r3+64*16], m29 + mova [r3+64*17], m28 + mova [r3+64*18], m27 + mova [r3+64*19], m26 + mova [r3+64*20], m25 + mova [r3+64*21], m24 + mova [r3+64*22], m23 + mova [r3+64*23], m22 + mova [r3+64*24], m21 + mova [r3+64*25], m20 + mova [r3+64*26], m19 + mova [r3+64*27], m18 + mova [r3+64*28], m17 + mova [r3+64*29], m16 + mova [r3+64*30], m15 + mova [r3+64*31], m14 + ret + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/itx_init_tmpl.c dav1d-1.0.0/src/x86/itx_init_tmpl.c --- dav1d-0.9.2/src/x86/itx_init_tmpl.c 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/itx_init_tmpl.c 2022-03-18 14:31:56.018356000 +0000 @@ -28,6 +28,8 @@ #include "src/cpu.h" #include "src/itx.h" +#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix + #define decl_itx2_fns(w, h, opt) \ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) @@ -77,10 +79,60 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext)) + +#define decl_itx2_bpc_fns(w, h, bpc, opt) \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt)) + +#define decl_itx12_bpc_fns(w, h, bpc, opt) \ +decl_itx2_bpc_fns(w, h, bpc, opt); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt)) + +#define decl_itx16_bpc_fns(w, h, bpc, opt) \ +decl_itx12_bpc_fns(w, h, bpc, opt); \ 
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt)) + +#define decl_itx_bpc_fns(bpc, ext) \ +decl_itx16_bpc_fns( 4, 4, bpc, ext); \ +decl_itx16_bpc_fns( 4, 8, bpc, ext); \ +decl_itx16_bpc_fns( 4, 16, bpc, ext); \ +decl_itx16_bpc_fns( 8, 4, bpc, ext); \ +decl_itx16_bpc_fns( 8, 8, bpc, ext); \ +decl_itx16_bpc_fns( 8, 16, bpc, ext); \ +decl_itx2_bpc_fns ( 8, 32, bpc, ext); \ +decl_itx16_bpc_fns(16, 4, bpc, ext); \ +decl_itx16_bpc_fns(16, 8, bpc, ext); \ +decl_itx12_bpc_fns(16, 16, bpc, ext); \ +decl_itx2_bpc_fns (16, 32, bpc, ext); \ +decl_itx2_bpc_fns (32, 8, bpc, ext); \ +decl_itx2_bpc_fns (32, 16, bpc, ext); \ +decl_itx2_bpc_fns (32, 32, bpc, ext); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext)) + +decl_itx_fns(avx512icl); decl_itx_fns(avx2); +decl_itx_bpc_fns(10, avx2); +decl_itx_bpc_fns(12, avx2); decl_itx_fns(sse4); decl_itx_fns(ssse3); -decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_sse2); +decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, const int bpc) @@ -120,18 +172,48 @@ assign_itx16_fn(pfx, w, h, ext); \ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + +#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) + +#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext) + +#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext) + +#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext) + +#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext) + const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; -#if BITDEPTH == 16 assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, 
sse2); -#endif if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; #if BITDEPTH == 8 - assign_itx17_fn(, 4, 4, ssse3); + assign_itx16_fn(, 4, 4, ssse3); assign_itx16_fn(R, 4, 8, ssse3); assign_itx16_fn(R, 8, 4, ssse3); assign_itx16_fn(, 8, 8, ssse3); @@ -178,16 +260,26 @@ } #endif +#if ARCH_X86_64 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if ARCH_X86_64 assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); +#if BITDEPTH == 16 + assign_itx16_bpc_fn( , 4, 4, 12, avx2); + assign_itx16_bpc_fn(R, 4, 8, 12, avx2); + assign_itx16_bpc_fn(R, 4, 16, 12, avx2); + assign_itx16_bpc_fn(R, 8, 4, 12, avx2); + assign_itx16_bpc_fn( , 8, 8, 12, avx2); + assign_itx16_bpc_fn(R, 8, 16, 12, avx2); + assign_itx16_bpc_fn(R, 16, 4, 12, avx2); + assign_itx16_bpc_fn(R, 16, 8, 12, avx2); + assign_itx12_bpc_fn( , 16, 16, 12, avx2); #endif if (bpc > 10) return; -#if ARCH_X86_64 - assign_itx17_fn( , 4, 4, avx2); +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx2); assign_itx16_fn(R, 4, 8, avx2); assign_itx16_fn(R, 4, 16, avx2); assign_itx16_fn(R, 8, 4, avx2); @@ -206,5 +298,50 @@ assign_itx1_fn (R, 64, 16, avx2); assign_itx1_fn (R, 64, 32, avx2); assign_itx1_fn ( , 64, 64, avx2); +#elif BITDEPTH == 16 + assign_itx16_bpc_fn( , 4, 4, 10, avx2); + assign_itx16_bpc_fn(R, 4, 8, 10, avx2); + assign_itx16_bpc_fn(R, 4, 16, 10, avx2); + assign_itx16_bpc_fn(R, 8, 4, 10, avx2); + assign_itx16_bpc_fn( , 8, 8, 10, avx2); + assign_itx16_bpc_fn(R, 8, 16, 10, avx2); + assign_itx2_bpc_fn (R, 8, 32, 10, avx2); + assign_itx16_bpc_fn(R, 16, 4, 10, avx2); + assign_itx16_bpc_fn(R, 16, 8, 10, avx2); + assign_itx12_bpc_fn( , 16, 16, 10, avx2); + assign_itx2_bpc_fn (R, 16, 32, 10, avx2); + assign_itx1_bpc_fn (R, 16, 64, 10, avx2); + assign_itx2_bpc_fn (R, 32, 8, 10, avx2); + assign_itx2_bpc_fn (R, 32, 16, 10, avx2); + assign_itx2_bpc_fn ( , 32, 32, 10, avx2); + assign_itx1_bpc_fn (R, 32, 64, 10, avx2); + assign_itx1_bpc_fn (R, 64, 16, 10, avx2); + assign_itx1_bpc_fn (R, 64, 32, 10, avx2); + assign_itx1_bpc_fn ( , 64, 64, 10, avx2); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx512icl); // no wht + assign_itx16_fn(R, 4, 8, avx512icl); + assign_itx16_fn(R, 4, 16, avx512icl); + assign_itx16_fn(R, 8, 4, avx512icl); + assign_itx16_fn( , 8, 8, avx512icl); + assign_itx16_fn(R, 8, 16, avx512icl); + assign_itx2_fn (R, 8, 32, avx512icl); + assign_itx16_fn(R, 16, 4, avx512icl); + assign_itx16_fn(R, 16, 8, avx512icl); + assign_itx12_fn( , 16, 16, avx512icl); + assign_itx2_fn (R, 16, 32, avx512icl); + assign_itx1_fn (R, 16, 64, avx512icl); + assign_itx2_fn (R, 32, 8, avx512icl); + assign_itx2_fn (R, 32, 16, avx512icl); + assign_itx2_fn ( , 32, 32, avx512icl); + assign_itx1_fn (R, 32, 64, avx512icl); + assign_itx1_fn (R, 64, 16, avx512icl); + assign_itx1_fn (R, 64, 32, avx512icl); + assign_itx1_fn ( , 64, 64, avx512icl); +#endif #endif } diff -Nru dav1d-0.9.2/src/x86/itx_sse.asm dav1d-1.0.0/src/x86/itx_sse.asm --- dav1d-0.9.2/src/x86/itx_sse.asm 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/itx_sse.asm 2022-03-18 14:31:56.022356000 +0000 @@ -142,14 +142,6 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %if ARCH_X86_64 @@ -430,6 +422,7 @@ paddw m2, m1 ;low: out3 %endmacro +INIT_XMM sse2 cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] @@ -438,20 +431,15 @@ mova [coeffq+16*1], m2 psraw m0, 2 psraw 
m1, 2 - IWHT4_1D_PACKED - punpckhwd m0, m1 punpcklwd m3, m1, m2 punpckhdq m1, m0, m3 punpckldq m0, m3 - IWHT4_1D_PACKED - shufpd m0, m2, 0x01 ITX4_END 0, 3, 2, 1, 0 - %macro IDCT8_1D_PACKED 0 mova m6, [o(pd_2048)] punpckhwd m4, m0, m3 ;unpacked in1 in7 @@ -566,6 +554,7 @@ %endif %endmacro +INIT_XMM ssse3 INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst @@ -2391,7 +2380,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 32, 1 mov r3, tx2q - lea tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 @@ -2403,7 +2392,7 @@ jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass2: - lea tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] .end: mova [rsp+gprsize+16*0], m7 @@ -2459,7 +2448,7 @@ LOAD_8ROWS coeffq+16*1, 32, 1 call .main mov r3, tx2q - lea tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: @@ -2470,7 +2459,7 @@ jmp m(idct_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(idct_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).pass2_main @@ -2598,7 +2587,7 @@ call .main call .main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(iadst_8x8_internal_8bpc).pass1_end .pass1_end: @@ -2609,7 +2598,7 @@ jmp m(iadst_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iadst_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).pass2_main @@ -2883,7 +2872,7 @@ LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass1_end: @@ -2894,7 +2883,7 @@ jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass2: - lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x8_internal_8bpc).pass2_main @@ -2917,7 +2906,7 @@ mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mov r3, tx2q - lea tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] .pass1: mova m0, [o(pw_2896x8)] @@ -2975,7 +2964,7 @@ jmp .pass1 .pass2: - lea tx2q, [o(m(iidentity_16x8_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iidentity_8x8_internal_8bpc).end @@ -3013,7 +3002,7 @@ LOAD_8ROWS coeffq+16*3, 64 call m(idct_16x8_internal_8bpc).main mov r3, tx2q - lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 @@ -3021,7 +3010,7 @@ SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 @@ -3032,7 +3021,7 @@ SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 @@ -3045,13 +3034,13 @@ jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(idct_16x16_internal_8bpc).end)] + lea tx2q, [o(.end)] jmp m(idct_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, 
[o(m(idct_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).end @@ -3139,7 +3128,7 @@ call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 @@ -3147,7 +3136,7 @@ SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 @@ -3157,7 +3146,7 @@ call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end - lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 @@ -3170,13 +3159,13 @@ jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(iadst_16x16_internal_8bpc).end)] + lea tx2q, [o(.end)] jmp m(iadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).end @@ -3214,7 +3203,7 @@ call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 @@ -3222,7 +3211,7 @@ SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 @@ -3236,7 +3225,7 @@ SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 @@ -3249,14 +3238,14 @@ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass2: - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)] + lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] lea dstq, [dstq+strideq*2] jmp m(iflipadst_8x8_internal_8bpc).end @@ -3279,7 +3268,7 @@ mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 - lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)] + lea tx2q, [o(.end2)] mov dstq, r3 jmp m(iflipadst_8x16_internal_8bpc).pass2_main @@ -3303,7 +3292,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*17 mov r3, tx2q - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] .pass1: mova m6, [o(pw_1697x16)] @@ -3324,13 +3313,13 @@ .pass1_end: SAVE_8ROWS coeffq, 32 sub coeffq, 16 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp .pass1 .pass1_end1: SAVE_8ROWS coeffq, 32 sub coeffq, 15*16 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp .pass1 .pass1_end2: @@ -3341,7 +3330,7 @@ .pass2: lea r3, [dstq+8] - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] .end: mova [rsp+gprsize+16*0], m7 @@ -3364,7 +3353,7 @@ .end1: LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)] + lea 
tx2q, [o(.end2)] lea dstq, [dstq+strideq*2] jmp .end @@ -3374,7 +3363,7 @@ add coeffq, 32*8 LOAD_8ROWS coeffq, 32 - lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)] + lea tx2q, [o(.end3)] mov dstq, r3 jmp .end @@ -3406,7 +3395,7 @@ pshuflw m0, m0, q0000 punpcklwd m0, m0 mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: @@ -3415,14 +3404,13 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp cmp eobd, 106 jle .fast LOAD_8ROWS coeffq+16*3, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1)] + lea tx2q, [o(.pass1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1: @@ -3437,7 +3425,7 @@ LOAD_8ROWS coeffq+16*2, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)] + lea tx2q, [o(.pass1_1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_1: @@ -3454,7 +3442,7 @@ LOAD_8ROWS coeffq+16*1, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: @@ -3469,7 +3457,7 @@ LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: @@ -3517,11 +3505,11 @@ call .main .pass2: - lea r3, [o(m(idct_8x32_internal_8bpc).end6)] + lea r3, [o(.end6)] .end: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_8x32_internal_8bpc).end2)] + lea tx2q, [o(.end2)] .end1: pxor m7, m7 @@ -3533,21 +3521,21 @@ jmp tx2q .end2: - lea tx2q, [o(m(idct_8x32_internal_8bpc).end3)] + lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).end .end3: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal_8bpc).end4)] + lea tx2q, [o(.end4)] jmp m(idct_8x8_internal_8bpc).end .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal_8bpc).end5)] + lea tx2q, [o(.end5)] jmp m(idct_8x8_internal_8bpc).end .end5: @@ -3886,7 +3874,7 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + lea tx2q, [o(.end)] .body: pmulhrsw m0, m2 @@ -3922,7 +3910,6 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 @@ -3961,55 +3948,55 @@ .pass2: mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end)] + lea tx2q, [o(.end)] jmp m(idct_8x32_internal_8bpc).end1 .end: mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end1)] + lea tx2q, [o(.end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end1: lea r3, [dstq+8] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end2)] + lea tx2q, [o(.end2)] jmp m(idct_8x8_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end3)] + lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end3: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end4)] + lea tx2q, [o(.end4)] jmp m(idct_8x8_internal_8bpc).pass2_main .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, 
[o(m(idct_32x8_internal_8bpc).end5)] + lea tx2q, [o(.end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end5: mov dstq, r3 add r3, 8 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end6)] + lea tx2q, [o(.end6)] jmp m(idct_8x8_internal_8bpc).pass2_main .end6: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal_8bpc).end7)] + lea tx2q, [o(.end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end7: mov dstq, r3 - lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + lea tx2q, [o(.end8)] jmp m(idct_8x8_internal_8bpc).pass2_main .end8: @@ -4088,6 +4075,7 @@ test eobd, eobd jz .dconly call m(idct_16x32_internal_8bpc) +.end: RET .dconly: @@ -4097,28 +4085,24 @@ mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly -.end: - RET cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - LOAD_8ROWS coeffq+16*1, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*5, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: @@ -4135,14 +4119,14 @@ SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: @@ -4185,14 +4169,14 @@ SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*6, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: @@ -4210,14 +4194,14 @@ SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*7, 128, 1 call m(idct_16x8_internal_8bpc).main - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: @@ -4249,7 +4233,7 @@ mov [rsp+gprsize*1+16*35], eobd lea r3, [dstq+8] mov [rsp+gprsize*2+16*35], r3 - lea r3, [o(m(idct_16x32_internal_8bpc).end)] + lea r3, [o(.end)] jmp m(idct_8x32_internal_8bpc).end .end: @@ -4299,7 +4283,7 @@ SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast - jmp .end1 + jmp m(idct_8x32_internal_8bpc).pass2 .full1: mova m4, [coeffq+16*2 ] ;in16 @@ -4340,12 +4324,9 @@ mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main - -.end1: jmp m(idct_8x32_internal_8bpc).pass2 
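; The recurring substitution in these itx_sse.asm hunks replaces fully
; qualified tail-call targets such as [o(m(idct_32x8_internal_8bpc).end2)]
; with NASM local labels such as [o(.end2)]; a local label binds to the
; enclosing cglobal symbol, so both spellings resolve to the same address,
; and the stale per-function "%undef cmp" lines are dropped alongside.
; A minimal sketch of the local-label idiom, assuming the usual x86inc.asm
; cglobal/m()/o() helpers; the function and label names are hypothetical
; and only illustrate the substitution, they are not part of the diff:
;
;   cglobal example_internal_8bpc, 0, 1, 0
;       lea  r0, [o(.done)]   ; same target as [o(m(example_internal_8bpc).done)]
;       jmp  r0
;   .done:
;       RET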
- cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ @@ -4393,10 +4374,8 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - add coeffq, 16 - lea r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)] + lea r3, [o(.pass1_end1)] .pass1: LOAD_8ROWS coeffq+16*0, 128, 1 call m(idct_8x8_internal_8bpc).main @@ -4437,28 +4416,28 @@ SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+16*32, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*48, 32 sub coeffq, 16 - lea r3, [o(m(idct_32x16_internal_8bpc).end)] + lea r3, [o(.end)] jmp .pass1 .end: @@ -4466,8 +4445,6 @@ cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, eobd cmp eobd, 43 ;if (eob > 43) sbb r3d, r3d ; iteration_count++ @@ -4531,8 +4508,6 @@ cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 12 ;0100b mov r5d, 136 ;1000 1000b cmp eobd, 44 ;if (eob > 43) @@ -4611,8 +4586,6 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*35], eobd @@ -4687,7 +4660,7 @@ .pass1_end: mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: @@ -4695,7 +4668,7 @@ LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: @@ -4703,7 +4676,7 @@ LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: @@ -4711,7 +4684,7 @@ LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: @@ -4725,7 +4698,7 @@ .pass2: mov coeffq, [rsp+gprsize*2+16*35] mov r3d, 4 - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)] .pass2_loop: mov [rsp+gprsize*3+16*35], r3d @@ -4821,11 +4794,11 @@ jmp tx2q .pass2_end: - lea r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)] + lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end .pass2_end1: - lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] @@ -4836,8 +4809,6 @@ cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 cmp eobd, 136 mov r3d, 4 @@ -4898,8 +4869,8 @@ %endif test eobd, eobd jz .dconly - call 
m(idct_16x64_internal_8bpc) +.end: RET .dconly: @@ -4908,16 +4879,11 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly -.end: - RET - cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 151 mov [rsp+gprsize*1+16*67], eobd @@ -4937,7 +4903,7 @@ LOAD_8ROWS coeffq+64*1, 64*2 call m(idct_16x8_internal_8bpc).main mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: @@ -4945,7 +4911,7 @@ LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: @@ -4959,7 +4925,7 @@ mov r3d, 2 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + lea r4, [o(.end1)] .pass2_loop: mov [rsp+gprsize*3+16*67], r3d @@ -5085,23 +5051,47 @@ .end1: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] - add rsp, 16*32 - lea r3, [o(m(idct_16x64_internal_8bpc).end2)] - jmp m(idct_8x32_internal_8bpc).end - -.end2: - add coeffq, 16*32 - sub rsp, 16*32 - + lea r3, [rsp+16*32+gprsize] + call .write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + lea r4, [o(.end1)] dec r3d jg .pass2_loop ret +.write: + mova [r3+16*0], m7 + mov r4, -16*32 + pxor m7, m7 + sub coeffq, r4 +.zero_loop: + mova [coeffq+r4+16*0], m7 + mova [coeffq+r4+16*1], m7 + add r4, 16*2 + jl .zero_loop + call .write_main2 + LOAD_8ROWS r3+16*11, 16 + call .write_main + LOAD_8ROWS r3+16*19, 16 + call .write_main + LOAD_8ROWS r3+16*27, 16 +.write_main: + mova [r3+16*0], m7 +.write_main2: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [r3+16*0] + mova [r3+16*2], m5 + mova [r3+16*1], m6 + mova [r3+16*0], m7 + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7 + lea dstq, [dstq+strideq*2] + ret ALIGN function_align @@ -5768,7 +5758,7 @@ movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)] + lea tx2q, [o(.end)] .body: pmulhrsw m0, m2 @@ -5898,7 +5888,7 @@ LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: @@ -5906,7 +5896,7 @@ LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: @@ -5914,7 +5904,7 @@ LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: @@ -5922,7 +5912,7 @@ LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: @@ -5930,7 +5920,7 @@ LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, 
[o(m(idct_64x16_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: @@ -5938,7 +5928,7 @@ LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: @@ -5946,7 +5936,7 @@ LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: @@ -5954,7 +5944,7 @@ LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: @@ -5982,14 +5972,14 @@ call m(idct_16x8_internal_8bpc).main mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal_8bpc).end)] + lea tx2q, [o(.end)] lea dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal_8bpc).end1)] + lea tx2q, [o(.end1)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end @@ -6019,14 +6009,14 @@ call m(idct_16x8_internal_8bpc).main mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal_8bpc).end2)] + lea tx2q, [o(.end2)] lea dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal_8bpc).end3)] + lea tx2q, [o(.end3)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end @@ -6048,8 +6038,8 @@ %endif test eobd, eobd jz .dconly - call m(idct_32x64_internal_8bpc) +.end: RET .dconly: @@ -6059,16 +6049,11 @@ mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body -.end: - RET - cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*67], eobd @@ -6136,28 +6121,28 @@ .pass1_end: mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: @@ -6182,8 +6167,8 @@ %endif test eobd, eobd jz .dconly - call m(idct_64x32_internal_8bpc) +.end: RET .dconly: @@ -6193,15 +6178,11 @@ pmulhrsw m0, m1 mov [coeffq], eobd mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body -.end: - RET cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*67], eobd @@ -6269,56 +6250,56 @@ LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, 
[o(m(idct_64x32_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: @@ -6335,17 +6316,17 @@ mov eobd, [rsp+gprsize*1+16*67] lea dstq, [dstq+32] mov [rsp+gprsize*1+16*35], eobd - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)] mov r3d, 4 jmp m(idct_32x32_internal_8bpc).pass2_loop .pass2_end: mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)] + lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end2 .pass2_end1: - lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] + lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] @@ -6380,8 +6361,6 @@ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - mov r5d, 4 mov r4d, 2 sub eobd, 136 @@ -6451,7 +6430,7 @@ LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)] + lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: @@ -6459,7 +6438,7 @@ LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)] + lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: @@ -6467,7 +6446,7 @@ LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)] + lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: @@ -6475,7 +6454,7 @@ LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)] + lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: @@ -6483,7 +6462,7 @@ LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea 
tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)] + lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: @@ -6491,7 +6470,7 @@ LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)] + lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: @@ -6499,7 +6478,7 @@ LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)] + lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: @@ -6507,7 +6486,7 @@ LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)] + lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: @@ -6525,26 +6504,20 @@ mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + lea r4, [o(.pass2_end)] jmp m(idct_16x64_internal_8bpc).pass2_loop .pass2_end: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] - add rsp, 16*32 + lea r3, [rsp+16*32+gprsize] mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)] - jmp m(idct_8x32_internal_8bpc).end2 - -.pass2_end1: - add coeffq, 16*32 - sub rsp, 16*32 - + call m(idct_16x64_internal_8bpc).write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + lea r4, [o(.pass2_end)] dec r3d jg m(idct_16x64_internal_8bpc).pass2_loop diff -Nru dav1d-0.9.2/src/x86/loopfilter16_avx2.asm dav1d-1.0.0/src/x86/loopfilter16_avx2.asm --- dav1d-0.9.2/src/x86/loopfilter16_avx2.asm 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/loopfilter16_avx2.asm 2022-03-18 14:31:56.022356000 +0000 @@ -49,14 +49,6 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - ; in: out: ; mm%1 a b c d a e i m ; mm%2 e f g h b f j n diff -Nru dav1d-0.9.2/src/x86/loopfilter16_sse.asm dav1d-1.0.0/src/x86/loopfilter16_sse.asm --- dav1d-0.9.2/src/x86/loopfilter16_sse.asm 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/loopfilter16_sse.asm 2022-03-18 14:31:56.022356000 +0000 @@ -106,14 +106,6 @@ %endif %endmacro -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %macro SPLATD 2 movd %1, %2 pshufd %1, %1, q0000 diff -Nru dav1d-0.9.2/src/x86/loopfilter_avx512.asm dav1d-1.0.0/src/x86/loopfilter_avx512.asm --- dav1d-0.9.2/src/x86/loopfilter_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/loopfilter_avx512.asm 2022-03-18 14:31:56.022356000 +0000 @@ -0,0 +1,1562 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 + +pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080 + dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 + +hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60 +hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51 +hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49 +hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + +pb_1: times 4 db 1 +pb_2: times 4 db 2 +pb_3: times 4 db 3 +pb_4: times 4 db 4 +pb_16: times 4 db 16 +pb_63: times 4 db 63 +pb_64: times 4 db 64 +pb_128: times 4 db 0x80 +pb_240: times 4 db 0xf0 +pb_248: times 4 db 0xf8 +pb_254: times 4 db 0xfe +pb_2_1: times 2 db 2, 1 +pb_3_1: times 2 db 3, 1 +pb_7_1: times 2 db 7, 1 +pb_m1_0: times 2 db -1, 0 +pb_m1_1: times 2 db -1, 1 +pb_m1_2: times 2 db -1, 2 +pw_2048: times 2 dw 2048 +pw_4096: times 2 dw 4096 + +SECTION .text + +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 + psubusb %4, %3, %2 + por %1, %4 +%endmacro + +%macro TRANSPOSE_16x4_AND_WRITE_4x32 5 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + kmovw k1, k6 + lea t0, [dstq+strideq*4] + vpscatterdd [dstq+m29-2]{k1}, m%4 + kmovw k1, k6 + lea t1, [dstq+strideq*8] + vpscatterdd [t0 +m29-2]{k1}, m%5 + kmovw k1, k6 + lea t2, [t0 +strideq*8] + vpscatterdd [t1 +m29-2]{k1}, m%2 + kmovw k1, k6 + vpscatterdd [t2 +m29-2]{k1}, m%1 +%endmacro + +%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem +%if %1 == 0 + SWAP m16, m15 +%endif + ; input in m0-15 + punpcklbw m15, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklbw m7, m8, m9 + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m12, m13 + punpckhbw m12, m13 +%if %1 == 0 + SWAP m13, m16 +%else + mova m13, %3 +%endif + SWAP m16, m12 + punpcklbw m12, m14, m13 + punpckhbw m13, m14, m13 + ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 + punpcklwd m14, m15, m1 + punpckhwd m15, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd m10, m11, m12 + punpckhwd m11, m12 + SWAP m12, m16, m11 + punpcklwd m11, m12, m13 + punpckhwd m12, m13 + ; interleaved in 
m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 + punpckldq m13, m14, m2 + punpckhdq m14, m2 + punpckldq m2, m15, m3 + punpckhdq m15, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m6, m10 + punpckhdq m6, m10 + punpckldq m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m12 + punpckhdq m8, m12 + SWAP m12, m16, m8 + punpckldq m8, m7, m12 + punpckhdq m7, m12 + ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 + punpcklqdq m12, m13, m4 + punpckhqdq m13, m4 + punpcklqdq m4, m14, m6 + punpckhqdq m14, m6 + punpcklqdq m6, m2, m8 + punpckhqdq m2, m8 + punpcklqdq m8, m15, m7 + punpckhqdq m15, m7 + punpcklqdq m7, m3, m10 + punpckhqdq m3, m10 + punpcklqdq m10, m1, m9 + punpckhqdq m1, m9 + punpcklqdq m9, m5, m11 + punpckhqdq m5, m11 + SWAP m11, m16 +%if %2 == 0 + SWAP m16, m12 +%else + mova %3, m12 +%endif + punpcklqdq m12, m0, m11 + punpckhqdq m0, m11 +%if %2 == 0 + SWAP m11, m16 +%endif + ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 + SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 + SWAP 3, 14, 12, 9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%define is_h 0 +%if %1 == 4 + lea t0, [dstq+mstrideq*2] + mova m3, [t0 +strideq*0] ; p1 + mova m4, [t0 +strideq*1] ; p0 + mova m5, [t0 +strideq*2] ; q0 + mova m6, [t0 +stride3q ] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline +%if %1 == 16 + lea t0, [dstq+mstrideq*8] + mova m16, [t0 +strideq*1] + mova m17, [t0 +strideq*2] + mova m18, [t0 +stride3q ] +%endif + lea t0, [dstq+mstrideq*4] +%if %1 != 6 + mova m12, [t0 +strideq*0] +%endif + mova m13, [t0 +strideq*1] + mova m3, [t0 +strideq*2] + mova m4, [t0 +stride3q ] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q ] +%endif +%if %1 == 16 + lea t0, [dstq+strideq*4] + mova m19, [t0 +strideq*0] + mova m20, [t0 +strideq*1] + mova m21, [t0 +strideq*2] +%endif +%endif +%else ; h +%define is_h 1 + ; load lines +%if %1 == 4 + vbroadcasti32x4 m0, [hshuf4] + kmovw k1, k6 + lea t0, [dstq+strideq*4] + vpgatherdd m3{k1}, [dstq+m29-2] + kmovw k1, k6 + lea t1, [dstq+strideq*8] + vpgatherdd m4{k1}, [t0 +m29-2] + kmovw k1, k6 + lea t2, [t0 +strideq*8] + vpgatherdd m5{k1}, [t1 +m29-2] + kmovw k1, k6 + vpgatherdd m6{k1}, [t2 +m29-2] + pshufb m3, m0 + pshufb m4, m0 + pshufb m5, m0 + pshufb m6, m0 + punpckldq m7, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpcklqdq m6, m7, m4 + punpckhqdq m7, m4 + punpcklqdq m4, m3, m5 + punpckhqdq m3, m5 + SWAP 3, 6 + SWAP 5, 4, 7 + ; 6,7,4,3 -> 3,4,5,6 +%elif %1 == 6 || %1 == 8 + kmovb k1, k7 + lea t0, [dstq+strideq*1] + vpgatherdq m3{k1}, [dstq+ym31-%1/2] + kmovb k1, k7 + lea t1, [dstq+strideq*2] + vpgatherdq m4{k1}, [t0 +ym31-%1/2] + kmovb k1, k7 + lea t2, [dstq+stride3q ] + vpgatherdq m5{k1}, [t1 +ym31-%1/2] + kmovb k1, k7 + vextracti32x8 ym0, m31, 1 + vpgatherdq m6{k1}, [t2 +ym31-%1/2] + kmovb k1, k7 + vpgatherdq m12{k1}, [dstq+ym0 -%1/2] + kmovb k1, k7 + vpgatherdq m13{k1}, [t0 +ym0 -%1/2] + kmovb k1, k7 + vpgatherdq m14{k1}, [t1 +ym0 -%1/2] + kmovb k1, k7 + vpgatherdq m15{k1}, [t2 +ym0 -%1/2] + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm12: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + punpcklbw m6, m12, m13 + punpckhbw m12, m13 + punpcklbw m13, m14, m15 + punpckhbw m14, m15 + ; xm7: 
A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m15, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m13 + punpckhwd m6, m13 + punpcklwd m13, m12, m14 + punpckhwd m12, m14 + ; xm15: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm13: A12-15,B12-15,C12-15,D12-15 + ; xm12: E12-15,F12-15,G12-15,H12-15 + punpckldq m14, m15, m5 + punpckhdq m15, m5 + punpckldq m5, m7, m6 + %if %1 != 6 + punpckhdq m7, m6 + %endif + punpckldq m6, m4, m13 + punpckhdq m4, m13 + punpckldq m13, m3, m12 + %if %1 != 6 + punpckhdq m12, m3, m12 + %endif + ; xm14: A0-7,B0-7 + ; xm15: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm13: E8-15,F8-15 + ; xm12: G8-15,H8-15 + punpcklqdq m3, m14, m6 + punpckhqdq m14, m6 + punpckhqdq m6, m15, m4 + punpcklqdq m15, m4 + punpcklqdq m4, m5, m13 + punpckhqdq m13, m5, m13 + %if %1 == 8 + punpcklqdq m5, m7, m12 + punpckhqdq m12, m7, m12 + ; xm3: A0-15 + ; xm14: B0-15 + ; xm15: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm13: F0-15 + ; xm5: G0-15 + ; xm12: H0-15 + SWAP 12, 3, 15 + SWAP 13, 14, 5, 4, 6 + ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 + %else + SWAP 13, 3, 14 + SWAP 6, 4, 15, 5 + ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 + %endif +%else ; 16, h + ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose + movu xm0, [dstq+strideq*0-8] + movu xm1, [dstq+strideq*1-8] + movu xm2, [dstq+strideq*2-8] + movu xm3, [dstq+stride3q -8] + lea t0, [dstq+strideq*4] + movu xm4, [t0 +strideq*0-8] + movu xm5, [t0 +strideq*1-8] + movu xm6, [t0 +strideq*2-8] + movu xm7, [t0 +stride3q -8] + lea t0, [t0 +strideq*4] + movu xm8, [t0 +strideq*0-8] + movu xm9, [t0 +strideq*1-8] + movu xm10, [t0 +strideq*2-8] + movu xm11, [t0 +stride3q -8] + lea t0, [t0 +strideq*4] + movu xm12, [t0 +strideq*0-8] + movu xm13, [t0 +strideq*1-8] + movu xm14, [t0 +strideq*2-8] + movu xm15, [t0 +stride3q -8] + lea t0, [t0 +strideq*4] + vinserti32x4 ym0, [t0 +strideq*0-8], 1 + vinserti32x4 ym1, [t0 +strideq*1-8], 1 + vinserti32x4 ym2, [t0 +strideq*2-8], 1 + vinserti32x4 ym3, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 ym4, [t0 +strideq*0-8], 1 + vinserti32x4 ym5, [t0 +strideq*1-8], 1 + vinserti32x4 ym6, [t0 +strideq*2-8], 1 + vinserti32x4 ym7, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 ym8, [t0 +strideq*0-8], 1 + vinserti32x4 ym9, [t0 +strideq*1-8], 1 + vinserti32x4 ym10, [t0 +strideq*2-8], 1 + vinserti32x4 ym11, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 ym12, [t0 +strideq*0-8], 1 + vinserti32x4 ym13, [t0 +strideq*1-8], 1 + vinserti32x4 ym14, [t0 +strideq*2-8], 1 + vinserti32x4 ym15, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 m0, [t0 +strideq*0-8], 2 + vinserti32x4 m1, [t0 +strideq*1-8], 2 + vinserti32x4 m2, [t0 +strideq*2-8], 2 + vinserti32x4 m3, [t0 +stride3q -8], 2 + lea t0, [t0 +strideq*4] + vinserti32x4 m4, [t0 +strideq*0-8], 2 + vinserti32x4 m5, [t0 +strideq*1-8], 2 + vinserti32x4 m6, [t0 +strideq*2-8], 2 + vinserti32x4 m7, [t0 +stride3q -8], 2 + lea t0, [t0 +strideq*4] + vinserti32x4 m8, [t0 +strideq*0-8], 2 + vinserti32x4 m9, [t0 +strideq*1-8], 2 + vinserti32x4 m10, [t0 +strideq*2-8], 2 + vinserti32x4 m11, [t0 +stride3q -8], 2 + lea t0, [t0 +strideq*4] + vinserti32x4 m12, [t0 +strideq*0-8], 2 + vinserti32x4 m13, [t0 +strideq*1-8], 2 + vinserti32x4 m14, [t0 +strideq*2-8], 2 + vinserti32x4 m15, [t0 +stride3q -8], 2 + lea t0, [t0 +strideq*4] + vinserti32x4 m0, [t0 +strideq*0-8], 3 + vinserti32x4 m1, [t0 +strideq*1-8], 3 + vinserti32x4 m2, [t0 +strideq*2-8], 3 + vinserti32x4 m3, [t0 +stride3q -8], 3 + lea t0, [t0 +strideq*4] + vinserti32x4 m4, [t0 +strideq*0-8], 3 + vinserti32x4 m5, [t0 +strideq*1-8], 3 + vinserti32x4 m6, [t0 +strideq*2-8], 3 + vinserti32x4 m7, [t0 +stride3q -8], 3 + lea t0, [t0 +strideq*4] + vinserti32x4 m8, [t0 +strideq*0-8], 3 + vinserti32x4 m9, [t0 +strideq*1-8], 3 + vinserti32x4 m10, [t0 +strideq*2-8], 3 + vinserti32x4 m11, [t0 +stride3q -8], 3 + lea t0, [t0 +strideq*4] + vinserti32x4 m12, [t0 +strideq*0-8], 3 + vinserti32x4 m13, [t0 +strideq*1-8], 3 + vinserti32x4 m14, [t0 +strideq*2-8], 3 + vinserti32x4 m15, [t0 +stride3q -8], 3 + ; + TRANSPOSE_16X16B 0, 1, [rsp+0*64] + SWAP m16, m1 + SWAP m17, m2 + SWAP m18, m3 + SWAP m19, m12 + SWAP m20, m13 + SWAP m21, m14 + mova [rsp+4*64], m15 + ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 + SWAP 12, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 15 +%endif +%endif + + ; load L/E/I/H +%if is_uv + SWAP m22, m15 +%endif + vpbroadcastd m22, [pb_1] +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + kmovw k1, k6 + vpgatherdd m0{k1}, [lq+m30+4] + kmovw k1, k6 + vpgatherdd m1{k1}, [lq+m30+0] +%endif + pxor m2, m2 + pcmpeqb k1, m0, m2 + vmovdqu8 m0{k1}, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, pbshuf ; l[x][0] + vpcmpub k3, m0, m2, 4 ; neq ; L + psrlq m2, m0, [lutq+128] + pand m2, [pb_63]{bcstd} + vpbroadcastb m1, [lutq+136] + pminub m2, m1 + pmaxub m2, m22 ; I + pand m1, m0, [pb_240]{bcstd} + psrlq m1, 4 ; H + paddd m0, [pb_2]{bcstd} + paddb m0, m0 + paddb m0, m2 ; E + + ABSSUB m8, m3, m4, m9 ; abs(p1-p0) + ABSSUB m9, m5, m6, m10 ; abs(q1-q0) + pmaxub m8, m9 + vpcmpub k1, m8, m1, 6 ; gt ; hev +%if %1 != 4 + %if %1 == 6 + ABSSUB m9, m13, m4, m10 ; abs(p2-p0) + pmaxub m9, m8 + %else + ABSSUB m9, m12, m4, m10 ; abs(p3-p0) + pmaxub m9, m8 + ABSSUB m10, m13, m4, m11 ; abs(p2-p0) + pmaxub m9, m10 + %endif + ABSSUB m10, m5, m14, m11 ; abs(q2-q0) + pmaxub m9, m10 + %if %1 != 6 + ABSSUB m10, m5, m15, m11 ; abs(q3-q0) + pmaxub m9, m10 + %endif + vpcmpub k2{k3}, m9, m22, 2 ; le ; flat8in + %if %1 == 6 + ABSSUB m10, m13, m3, m1 ; abs(p2-p1) + %else + ABSSUB m10, m12, m13, m11 ; abs(p3-p2) + ABSSUB m11, m13, m3, m1 ; abs(p2-p1) + pmaxub m10, m11 + ABSSUB m11, m14, m15, m1 ; abs(q3-q2) + pmaxub m10, m11 + %endif + ABSSUB m11, m14, m6, m1 ; abs(q2-q1) + pmaxub m10, m11 + %if %1 == 16 + vpbroadcastd m11, [maskq+8] + por m11, [maskq+4]{bcstd} + pand m11, pbmask + %else + %if !is_h || %1 == 6 + pand m11, pbmask, [maskq+4]{bcstd} + %else + vpbroadcastd m11, [maskq+4] + pand m11, pbmask + %endif + %endif + pcmpeqd k4, m11, pbmask + vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks + pmaxub m8, m10 +%endif + vpcmpub k3{k3}, m8, m2, 2 ; le + ABSSUB m10, m3, m6, m11 ; abs(p1-q1) + ABSSUB m11, m4, m5, m2 ; abs(p0-q0) + paddusb m11, m11 + pand m10, [pb_254]{bcstd} + psrlq m10, 1 + paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E + +%if %1 == 16 + ABSSUB m1, m16, m4, m2 + ABSSUB m2, m17, m4, m10 + pmaxub m1, m2 + ABSSUB m2, m18, m4, m10 + pmaxub m1, m2 + ABSSUB m2, m19, m5, m10 + pmaxub m1, m2 + ABSSUB m2, m20, m5, m10 + pmaxub m1, m2 + ABSSUB m2, m21, m5, m10 + pmaxub m1, m2 + ; + vpcmpub k4, m1, m22, 2 ; flat8out + kandq k4, k4, k2 ; flat8in & flat8out + + vpbroadcastd m2, [maskq+8] + pand m10, m2, pbmask + pcmpeqd k5, m10, pbmask + vpmovm2d m7, k5 + vpmovb2m k5, m7 + kandq k4, k4, k5 ; flat16 + kandq k4, k3, k4 ; flat16 & fm + por m10, m2, [maskq+4]{bcstd} + pand m2, m10, pbmask + pcmpeqd k5, m2, pbmask + vpmovm2d m7, k5 + vpmovb2m k5, m7 + kandq k2, k2, k5 ; flat8in + kandq k2, k3, k2 + por m2, m10, [maskq+0]{bcstd} + pand m2, pbmask + pcmpeqd k5, m2, pbmask + vpmovm2d m7, k5 + vpmovb2m k5, m7 + kandq k3, k3, k5 + kandnq k3, k2, k3 ; fm & !flat8 & !flat16 + kandnq k2, k4, k2 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + pand m2, m0, pbmask + pcmpeqd k4, m2, pbmask + vpmovm2d m7, k4 + vpmovb2m k4, m7 + kandq k2, k2, k4 + kandq k2, k2, k3 ; flat8 & fm + por m0, [maskq+0]{bcstd} + pand m0, pbmask + pcmpeqd k4, m0, pbmask + vpmovm2d m7, k4 + vpmovb2m k4, m7 + kandq k3, k3, k4 + kandnq k3, k2, k3 ; fm & !flat8 +%else + %ifidn %2, v + pand m0, pbmask, [maskq+0]{bcstd} + %else + vpbroadcastd m0, [maskq+0] + pand m0, pbmask + %endif + pcmpeqd k4, m0, pbmask + vpmovm2d m7, k4 + vpmovb2m k4, m7 + kandq k3, k3, k4 ; fm +%endif + + ; short filter +%if is_uv + SWAP m23, m22 + SWAP m24, m0 + SWAP m25, m12 + SWAP m26, m1 +%endif + vpbroadcastd m23, [pb_3] + vpbroadcastd m24, [pb_4] + vpbroadcastd m25, [pb_16] + vpbroadcastd m26, [pb_64] + pxor m3, pb128 + pxor m6, pb128 + psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev + pxor m4, pb128 + pxor m5, pb128 + psubsb m11, m5, m4 + paddsb 
m10, m11 + paddsb m10, m11 + paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm + paddsb m8, m10, m23 + paddsb m10, m24 + pand m8, [pb_248]{bcstd} + pand m10, [pb_248]{bcstd} + psrlq m8, 3 + psrlq m10, 3 + pxor m8, m25 + pxor m10, m25 + psubb m8, m25 ; f2 + psubb m10, m25 ; f1 + paddsb m4, m8 + psubsb m5, m10 + pxor m4, pb128 + pxor m5, pb128 + ; + pxor m10, pb128 + pxor m8, m8 + pavgb m8, m10 ; f=(f1+1)>>1 + psubb m8, m26 + knotq k1, k1 + paddsb m3{k1}, m3, m8 + psubsb m6{k1}, m6, m8 + pxor m3, pb128 + pxor m6, pb128 + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea t0, [dstq+mstrideq*8] +%endif + SWAP m0, m16, m14 + SWAP m2, m17, m15 + SWAP m7, m18 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + vpbroadcastd m26, [pb_7_1] + vpbroadcastd m25, [pb_2] + punpcklbw m14, m0, m12 + punpckhbw m15, m0, m12 + pmaddubsw m10, m14, m26 + pmaddubsw m11, m15, m26 ; p6*7+p3 + punpcklbw m8, m2, m7 + punpckhbw m9, m2, m7 + pmaddubsw m8, m25 + pmaddubsw m9, m25 + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3 +%ifidn %2, h + vpbroadcastd m27, [pw_2048] + vpbroadcastd m26, [pb_m1_1] + %define pw2048 m27 + %define pbm1_1 m26 +%endif + punpcklbw m8, m13, m3 + punpckhbw m9, m13, m3 + pmaddubsw m8, m22 + pmaddubsw m9, m22 + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m8, m4, m5 + punpckhbw m9, m4, m5 + pmaddubsw m8, m22 + pmaddubsw m9, m22 + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5 +%else + vpblendmb m8{k4}, m2, m8 + mova [rsp+1*64], m8 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, pbm1_1 + pmaddubsw m15, pbm1_1 + paddw m10, m14 + paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m8, m0, m6 + punpckhbw m9, m0, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + SWAP m18, m8 + SWAP m22, m9 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+stride3q]{k4}, m8 ; p4 +%else + vpblendmb m8{k4}, m7, m8 + mova [rsp+2*64], m8 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + SWAP m14, m16 + punpcklbw m8, m0, m13 + punpckhbw m9, m0, m13 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m8, m2, m14 + punpckhbw m2, m14 + pmaddubsw m8, pbm1_1 + pmaddubsw m2, pbm1_1 + paddw m10, m8 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + SWAP m16, m8 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3 +%else + vpblendmb m8{k4}, m12, m8 + mova [rsp+3*64], m8 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + SWAP m15, m17 + punpcklbw m8, m0, m3 + punpckhbw m9, m0, m3 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m8, m7, m15 + punpckhbw m7, m15 + pmaddubsw m8, pbm1_1 + pmaddubsw m7, pbm1_1 + paddw m10, m8 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + SWAP m17, m8 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 + vpblendmb m23{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 +%ifidn %2, v + lea t0, [dstq+strideq*4] +%endif + punpcklbw m8, 
m0, m4 + punpckhbw m9, m0, m4 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 + punpcklbw m8, m12, m19 + punpckhbw m9, m12, m19 + SWAP m1, m19 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + SWAP m19, m8 + SWAP m24, m9 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 + vpblendmb m25{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, h + SWAP m28, m0 + punpcklbw m8, m28, m5 + punpckhbw m0, m28, m5 +%else + punpcklbw m8, m0, m5 + punpckhbw m0, m5 +%endif + pmaddubsw m8, pbm1_1 + pmaddubsw m0, pbm1_1 + paddw m10, m8 + paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m0, m13, m20 + punpckhbw m9, m13, m20 +%ifidn %2, h + SWAP m27, m20 +%endif + SWAP m13, m23 + pmaddubsw m0, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m0 + paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + SWAP m20, m0 + SWAP m23, m9 +%ifidn %2, h + SWAP m9, m0 + %define pw2048 m9 +%endif + pmulhrsw m0, m10, pw2048 + pmulhrsw m8, m11, pw2048 + paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + paddw m11, m22 + packuswb m0, m8 + punpcklbw m8, m3, m21 + pmaddubsw m8, pbm1_1 + paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + SWAP m18, m8 + pmulhrsw m8, m10, pw2048 + paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 +%ifidn %2, h + SWAP m16, m9 + %define pw2048 m16 +%endif + punpckhbw m9, m3, m21 + SWAP m3, m25 + pmaddubsw m9, pbm1_1 + paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + SWAP m22, m9 + pmulhrsw m9, m11, pw2048 + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 +%ifidn %2, h + SWAP m2, m26 + %define pbm1_1 m2 +%endif + vpblendmb m26{k4}, m4, m0 ; don't clobber p0/m4 since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 + SWAP m0, m21 ; q6 + packuswb m8, m9 +%ifidn %2, h + SWAP m21, m2 + %define pbm1_1 m21 +%endif + vpblendmb m25{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + punpcklbw m8, m4, m0 + punpckhbw m2, m4, m0 + SWAP m4, m26 + pmaddubsw m8, pbm1_1 + pmaddubsw m2, pbm1_1 + paddw m10, m8 + paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 + pmulhrsw m2, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m2, m9 + vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + paddw m11, m7 + punpcklbw m8, m5, m0 + punpckhbw m9, m5, m0 + SWAP m5, m25 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 + pmulhrsw m7, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m7, m9 + vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K + + ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J + ; write +3 + paddw m10, m19 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + paddw m11, m24 + punpcklbw m8, m6, m0 + punpckhbw m9, m6, m0 + SWAP 2, 6 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+mstrideq]{k4}, m8 +%else + SWAP m19, m16 + %define pw2048 m19 
+ vpblendmb m16{k4}, m15, m8 +%endif + + ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K + ; write +4 + paddw m10, m20 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + paddw m11, m23 +%ifidn %2, h + SWAP m23, m8 +%endif + punpcklbw m8, m14, m0 + punpckhbw m9, m14, m0 + SWAP 14, 7 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4 +%else + vpblendmb m17{k4}, m1, m8 +%endif + + ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L + ; write +5 + paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + paddw m11, m22 + punpcklbw m8, m15, m0 + punpckhbw m9, m15, m0 + SWAP m20, m0 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m10, pw2048 + pmulhrsw m11, pw2048 + packuswb m10, m11 +%ifidn %2, v + vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5 +%else + vmovdqu8 m27{k4}, m10 +%endif + +%ifidn %2, v + lea t0, [dstq+mstrideq*4] +%endif +%endif + +%if %1 >= 8 + ; flat8 filter + vpbroadcastd m9, [pb_3_1] + vpbroadcastd m10, [pb_2_1] +%if %1 == 16 + vpbroadcastd m22, [pb_1] + vpbroadcastd m24, [pb_4] +%elifidn %2, h + vpbroadcastd m21, [pb_m1_1] + %define pbm1_1 m21 +%endif + punpcklbw m0, m12, m3 + punpckhbw m1, m12, m3 + pmaddubsw m2, m0, m9 + pmaddubsw m7, m1, m9 ; 3 * p3 + p1 + punpcklbw m8, m13, m4 + punpckhbw m11, m13, m4 + pmaddubsw m8, m10 + pmaddubsw m11, m10 + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + punpcklbw m8, m5, m24 + punpckhbw m11, m5, m24 + pmaddubsw m8, m22 + pmaddubsw m11, m22 + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 +%if is_h || %1 == 16 + vpblendmb m10{k2}, m13, m8 ; p2 +%endif +%ifidn %2, v + %if %1 == 8 + vmovdqu8 [t0+strideq*1]{k2}, m8 + %else + mova [t0+strideq*1], m10 + %endif +%endif + + pmaddubsw m8, m0, pbm1_1 + pmaddubsw m11, m1, pbm1_1 + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m11, pbm1_1 + paddw m2, m8 + paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendmb m8{k2}, m3, m8 ; p1 +%ifidn %2, v + mova [t0+strideq*2], m8 +%else + SWAP m18, m8 +%endif + + pmaddubsw m0, m22 + pmaddubsw m1, m22 + psubw m2, m0 + psubw m7, m1 + punpcklbw m8, m4, m14 + punpckhbw m11, m4, m14 + pmaddubsw m8, m22 + pmaddubsw m11, m22 + paddw m2, m8 + paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendmb m8{k2}, m4, m8 ; p0 +%ifidn %2, v + mova [t0+stride3q], m8 +%else + SWAP m19, m8 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m8, m0, m22 + pmaddubsw m11, m1, m22 + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m4, m12 + punpckhbw m11, m4, m12 + pmaddubsw m8, m22 + pmaddubsw m11, m22 + psubw m2, m8 + psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendmb m11{k2}, m5, m8 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m11 +%endif + + pmaddubsw m0, pbm1_1 + pmaddubsw m1, pbm1_1 + paddw m2, m0 + paddw m7, m1 + punpcklbw m8, m13, m6 + punpckhbw m13, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m13, pbm1_1 + paddw m2, m8 + paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 + psrlw m8, m2, 3 + psrlw m13, m7, 3 + packuswb m8, m13 + vpblendmb m13{k2}, m6, 
m8 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m13 +%endif + + punpcklbw m0, m3, m6 + punpckhbw m1, m3, m6 + pmaddubsw m0, m22 + pmaddubsw m1, m22 + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, m14, m15 + punpckhbw m1, m14, m15 + pmaddubsw m0, m22 + pmaddubsw m1, m22 + paddw m2, m0 + paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 + psrlw m2, 3 + psrlw m7, 3 + packuswb m2, m7 +%if is_h || %1 == 16 + vpblendmb m2{k2}, m14, m2 ; q2 +%endif +%ifidn %2, v + %if %1 == 8 + vmovdqu8 [dstq+strideq*2]{k2}, m2 + %else + mova [dstq+strideq*2], m2 + %endif +%endif + +%ifidn %2, h + SWAP m0, m18 + SWAP m1, m19 +%if %1 == 8 + ; 16x8 transpose + punpcklbw m3, m12, m10 + punpckhbw m12, m10 + punpcklbw m10, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m11, m13 + punpckhbw m11, m13 + punpcklbw m13, m2, m15 + punpckhbw m2, m15 + ; + punpcklwd m15, m3, m10 + punpckhwd m3, m10 + punpcklwd m10, m12, m0 + punpckhwd m12, m0 + punpcklwd m0, m1, m13 + punpckhwd m1, m13 + punpcklwd m13, m11, m2 + punpckhwd m11, m2 + ; + punpckldq m2, m15, m0 + punpckhdq m15, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + punpckldq m1, m10, m13 + punpckhdq m10, m13 + punpckldq m13, m12, m11 + punpckhdq m12, m11 + ; write 8x32 + vpbroadcastd ym16, strided + pmulld ym16, [hmulD] + lea t1, [dstq+strideq*2] + lea t2, [dstq+strideq*4] + lea t3, [t1 +strideq*4] + lea t0, [dstq+strideq*8] + kmovb k1, k6 + kmovb k2, k6 + kmovb k3, k6 + kmovb k4, k6 + vpscatterdq [dstq+ym16-4]{k1}, m2 + vpscatterdq [t1 +ym16-4]{k2}, m15 + vpscatterdq [t2 +ym16-4]{k3}, m0 + vpscatterdq [t3 +ym16-4]{k4}, m3 + lea t1, [t0+strideq*2] + lea t2, [t0+strideq*4] + lea t3, [t1+strideq*4] + kmovb k1, k6 + kmovb k2, k6 + kmovb k3, k6 + kmovb k4, k6 + vpscatterdq [t0+ym16-4]{k1}, m1 + vpscatterdq [t1+ym16-4]{k2}, m10 + vpscatterdq [t2+ym16-4]{k3}, m13 + vpscatterdq [t3+ym16-4]{k4}, m12 +%else + ; 16x16 transpose and store + SWAP 5, 10, 2 + SWAP 6, 0 + SWAP 7, 1 + SWAP 8, 11 + SWAP 9, 13 + mova m0, [rsp+0*64] + SWAP m1, m28 + mova m2, [rsp+1*64] + mova m3, [rsp+2*64] + mova m4, [rsp+3*64] + SWAP m11, m16 + SWAP m12, m17 + SWAP m13, m27 + SWAP m14, m20 + TRANSPOSE_16X16B 1, 0, [rsp+4*64] + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea t0, [dstq+strideq*4] + movu [t0+strideq*0-8], xm4 + movu [t0+strideq*1-8], xm5 + movu [t0+strideq*2-8], xm6 + movu [t0+stride3q -8], xm7 + lea t0, [t0+strideq*4] + movu [t0+strideq*0-8], xm8 + movu [t0+strideq*1-8], xm9 + movu [t0+strideq*2-8], xm10 + movu [t0+stride3q -8], xm11 + lea t0, [t0+strideq*4] + movu [t0+strideq*0-8], xm12 + movu [t0+strideq*1-8], xm13 + movu [t0+strideq*2-8], xm14 + movu [t0+stride3q -8], xm15 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym0, 1 + vextracti128 [t0+strideq*1-8], ym1, 1 + vextracti128 [t0+strideq*2-8], ym2, 1 + vextracti128 [t0+stride3q -8], ym3, 1 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym4, 1 + vextracti128 [t0+strideq*1-8], ym5, 1 + vextracti128 [t0+strideq*2-8], ym6, 1 + vextracti128 [t0+stride3q -8], ym7, 1 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym8, 1 + vextracti128 [t0+strideq*1-8], ym9, 1 + vextracti128 [t0+strideq*2-8], ym10, 1 + vextracti128 [t0+stride3q -8], ym11, 1 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym12, 1 + vextracti128 [t0+strideq*1-8], ym13, 1 + vextracti128 [t0+strideq*2-8], ym14, 1 + vextracti128 [t0+stride3q -8], ym15, 1 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m0, 2 + vextracti32x4 [t0+strideq*1-8], m1, 2 + 
vextracti32x4 [t0+strideq*2-8], m2, 2 + vextracti32x4 [t0+stride3q -8], m3, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m4, 2 + vextracti32x4 [t0+strideq*1-8], m5, 2 + vextracti32x4 [t0+strideq*2-8], m6, 2 + vextracti32x4 [t0+stride3q -8], m7, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m8, 2 + vextracti32x4 [t0+strideq*1-8], m9, 2 + vextracti32x4 [t0+strideq*2-8], m10, 2 + vextracti32x4 [t0+stride3q -8], m11, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m12, 2 + vextracti32x4 [t0+strideq*1-8], m13, 2 + vextracti32x4 [t0+strideq*2-8], m14, 2 + vextracti32x4 [t0+stride3q -8], m15, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m0, 3 + vextracti32x4 [t0+strideq*1-8], m1, 3 + vextracti32x4 [t0+strideq*2-8], m2, 3 + vextracti32x4 [t0+stride3q -8], m3, 3 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m4, 3 + vextracti32x4 [t0+strideq*1-8], m5, 3 + vextracti32x4 [t0+strideq*2-8], m6, 3 + vextracti32x4 [t0+stride3q -8], m7, 3 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m8, 3 + vextracti32x4 [t0+strideq*1-8], m9, 3 + vextracti32x4 [t0+strideq*2-8], m10, 3 + vextracti32x4 [t0+stride3q -8], m11, 3 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m12, 3 + vextracti32x4 [t0+strideq*1-8], m13, 3 + vextracti32x4 [t0+strideq*2-8], m14, 3 + vextracti32x4 [t0+stride3q -8], m15, 3 +%endif +%endif + +%elif %1 == 6 + ; flat6 filter + SWAP m15, m23 + SWAP m0, m24 + SWAP m12, m25 + SWAP m1, m26 + vpbroadcastd m15, [pb_3_1] + vpbroadcastd m12, [pb_2] + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, m15 + pmaddubsw m1, m11, m15 + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + pmaddubsw m2, m7, m12 + pmaddubsw m12, m10, m12 +%ifidn %2, h + vpbroadcastd m15, [pb_m1_1] + %define pbm1_1 m15 +%endif + paddw m0, m2 + paddw m1, m12 + pmulhrsw m2, m0, m16 + pmulhrsw m12, m1, m16 + packuswb m2, m12 + vpblendmb m2{k2}, m3, m2 ; p1 +%ifidn %2, v + mova [t0+strideq*2], m2 +%endif + + pmaddubsw m8, pbm1_1 + pmaddubsw m11, pbm1_1 + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m11, pbm1_1 + paddw m0, m8 + paddw m1, m11 + pmulhrsw m12, m0, m16 + pmulhrsw m13, m1, m16 + packuswb m12, m13 + vpblendmb m12{k2}, m4, m12 ; p0 +%ifidn %2, v + mova [t0+stride3q], m12 +%endif + + vpbroadcastd m9, [pb_m1_2] + vpbroadcastd m4, [pb_m1_0] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 + pmaddubsw m14, m8, pbm1_1 + pmaddubsw m13, m11, pbm1_1 + paddw m0, m14 + paddw m1, m13 + pmulhrsw m14, m0, m16 + pmulhrsw m13, m1, m16 + packuswb m14, m13 + vpblendmb m14{k2}, m5, m14 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m14 +%endif + + pmaddubsw m8, m9 + pmaddubsw m11, m9 + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, m4 + pmaddubsw m10, m4 + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, m16 + pmulhrsw m1, m16 + packuswb m0, m1 + vpblendmb m0{k2}, m6, m0 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m0 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 +%endif +%else ; %1 == 4 +%ifidn %2, v + mova [t0+strideq*0], m3 ; p1 + mova [t0+strideq*1], m4 ; p0 + mova [t0+strideq*2], m5 ; q0 + mova [t0+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 +%endif +%endif +%endmacro + +%define k7 k6 + +INIT_ZMM avx512icl +cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride + DECLARE_REG_TMP 9 + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea 
stride3q, [strideq*3] + mova m31, [pb_4x0_4x4_4x8_4x12] + mova m30, [pb_mask] + vpbroadcastd m29, [pb_128] + vpbroadcastd m28, [pb_m1_1] + vpbroadcastd m27, [pw_2048] + %define pbshuf m31 + %define pbmask m30 + %define pb128 m29 + %define pbm1_1 m28 + %define pw2048 m27 + %define is_uv 0 + +.loop: + cmp word [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + cmp word [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + cmp word [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, v + +.end: + add lq, 64 + add dstq, 64 + add maskq, 2 + sub wd, 16 + jg .loop + RET + +cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ + lut, h, stride3, stride8 + DECLARE_REG_TMP 9, 10, 11, 12 + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea stride8q, [strideq*8] + kxnorw k6, k6, k6 + vpbroadcastd m29, strided + vpbroadcastd m30, l_strided + pmulld m31, m29, [hmulA] + pmulld m30, m30, [hmulB] + pmulld m29, m29, [hmulC] + %define pbshuf [pb_4x0_4x4_4x8_4x12] + %define pbmask [pb_mask] + %define pb128 [pb_128]{bcstd} + shl l_strideq, 1 + +.loop: + cmp word [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + cmp word [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + cmp word [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, h + +.end: + lea lq, [lq+l_strideq*8] + lea dstq, [dstq+stride8q*8] + add maskq, 2 + sub hd, 16 + jg .loop + RET +RESET_MM_PERMUTATION + +cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride + DECLARE_REG_TMP 9 + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mova m20, [pb_4x0_4x4_4x8_4x12] + mova m19, [pb_mask] + vpbroadcastd m18, [pb_128] + vpbroadcastd m17, [pb_m1_1] + vpbroadcastd m16, [pw_4096] + %define pbshuf m20 + %define pbmask m19 + %define pb128 m18 + %define pbm1_1 m17 + %define is_uv 1 + +.loop: + cmp word [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + cmp word [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, v + +.end: + add lq, 64 + add dstq, 64 + add maskq, 2 + sub wd, 16 + jg .loop + RET + +%undef k7 +cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \ + lut, h, stride3, stride8 + DECLARE_REG_TMP 9, 10, 11 + mov r7d, 0xffff + movzx r8d, r7b + cmp hd, 9 + cmovb r7d, r8d + kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff + shl l_strideq, 2 + sub lq, 4 + kshiftrw k7, k6, 4 ; h > 8 ? 
0xff : 0xf0 + lea stride3q, [strideq*3] + lea stride8q, [strideq*8] + vpbroadcastd m19, strided + vpbroadcastd m20, l_strided + pmulld m21, m19, [hmulA] + pmulld m20, m20, [hmulB] + pmulld m19, m19, [hmulC] + mova m18, [pb_mask] + vpbroadcastd m17, [pb_128] + vpbroadcastd m16, [pw_4096] + %define pbshuf [pb_4x0_4x4_4x8_4x12] + %define pbmask m18 + %define pb128 m17 + %xdefine m31 m21 + %xdefine m30 m20 + %xdefine m29 m19 + add l_strideq, l_strideq + +.loop: + cmp word [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + cmp word [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, h + +.end: + lea lq, [lq+l_strideq*8] + lea dstq, [dstq+stride8q*8] + add maskq, 2 + sub hd, 16 + jg .loop + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/loopfilter_init_tmpl.c dav1d-1.0.0/src/x86/loopfilter_init_tmpl.c --- dav1d-0.9.2/src/x86/loopfilter_init_tmpl.c 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/loopfilter_init_tmpl.c 2022-03-18 14:31:56.022356000 +0000 @@ -36,6 +36,7 @@ decl_loopfilter_sb_fns(ssse3); decl_loopfilter_sb_fns(avx2); +decl_loopfilter_sb_fns(avx512icl); COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); @@ -47,12 +48,21 @@ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); +#if ARCH_X86_64 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if ARCH_X86_64 c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2); c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2); c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2); c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl); +#endif #endif } diff -Nru dav1d-0.9.2/src/x86/looprestoration16_avx2.asm dav1d-1.0.0/src/x86/looprestoration16_avx2.asm --- dav1d-0.9.2/src/x86/looprestoration16_avx2.asm 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/looprestoration16_avx2.asm 2022-03-18 14:31:56.022356000 +0000 @@ -66,25 +66,17 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers INIT_YMM avx2 -cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h +cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt %define base t4-wiener_hshift - mov fltq, fltmp - mov edged, r8m + mov fltq, r6mp movifnidn wd, wm - mov hd, r6m - mov t3d, r9m ; pixel_max + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max vbroadcasti128 m6, [wiener_shufA] vpbroadcastd m12, [fltq+ 0] ; x0 x1 lea t4, [wiener_hshift] @@ -92,13 +84,13 @@ add wd, wd vpbroadcastd m13, [fltq+ 4] ; x2 x3 shr t3d, 11 - vbroadcasti128 m8, [wiener_shufC] - add lpfq, wq - vbroadcasti128 m9, [wiener_shufD] - lea t1, [rsp+wq+16] vpbroadcastd m14, [fltq+16] ; y0 y1 - add dstq, wq + add lpfq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 + add dstq, wq + vbroadcasti128 m8, [wiener_shufC] + lea t1, [rsp+wq+16] + vbroadcasti128 m9, [wiener_shufD] neg wq vpbroadcastd m0, [base+wiener_hshift+t3*4] 
vpbroadcastd m10, [base+wiener_round+t3*4] @@ -108,30 +100,29 @@ test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 - mov [rsp+8*1], lpf_strideq - add r7, lpf_strideq - mov [rsp+8*0], r7 ; below + add r10, strideq + mov [rsp], r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -144,19 +135,18 @@ jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 - mov lpfq, [rsp+8*0] + mov lpfq, [rsp] call .hv_bottom - add lpfq, [rsp+8*1] + add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - lea r7, [r7+lpf_strideq*2] - mov [rsp+8*0], r7 + lea r10, [r10+strideq*2] + mov [rsp], r10 call .h mov t6, t1 mov t5, t1 @@ -165,13 +155,13 @@ mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -267,7 +257,7 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq + add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left @@ -352,7 +342,7 @@ mov t2, t1 mov t1, t0 mov t0, t6 - add dstq, dst_strideq + add dstq, strideq ret .v: mov r10, wq @@ -387,16 +377,17 @@ mov t4, t3 mov t3, t2 mov t2, t1 - add dstq, dst_strideq + add dstq, strideq ret -cglobal wiener_filter5_16bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h + +cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt %define base t4-wiener_hshift - mov fltq, fltmp - mov edged, r8m + mov fltq, r6mp movifnidn wd, wm - mov hd, r6m - mov t3d, r9m ; pixel_max + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max vbroadcasti128 m5, [wiener_shufE] vpbroadcastw m11, [fltq+ 2] ; x1 vbroadcasti128 m6, [wiener_shufB] @@ -407,10 +398,10 @@ shr t3d, 11 vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) add lpfq, wq - lea t1, [rsp+wq+16] vpbroadcastw m13, [fltq+18] ; y1 add dstq, wq vpbroadcastd m14, [fltq+20] ; y2 y3 + lea t1, [rsp+wq+16] neg wq vpbroadcastd m0, [base+wiener_hshift+t3*4] vpbroadcastd m9, [base+wiener_round+t3*4] @@ -422,22 +413,21 @@ test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 - mov [rsp+8*1], lpf_strideq - add r7, lpf_strideq - mov [rsp+8*0], r7 ; below + add r10, strideq + mov [rsp], r10 ; below call .h mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -450,25 +440,24 @@ jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 - mov lpfq, [rsp+8*0] + mov lpfq, [rsp] call .hv_bottom - add lpfq, [rsp+8*1] + add lpfq, strideq call .hv_bottom .end: RET .no_top: - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - lea r7, [r7+lpf_strideq*2] - mov [rsp+8*0], r7 + lea r10, [r10+strideq*2] + mov [rsp], r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -486,7 +475,7 @@ mov t4, t3 mov t3, t2 
mov t2, t1 - add dstq, dst_strideq + add dstq, strideq .v1: call .v jmp .end @@ -557,7 +546,7 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq + add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left @@ -632,7 +621,7 @@ mov t2, t1 mov t1, t0 mov t0, t4 - add dstq, dst_strideq + add dstq, strideq ret .v: mov r10, wq @@ -662,61 +651,59 @@ jl .v_loop ret -cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \ + w, h, edge, params movifnidn wd, wm - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x_avx2+256*4] - mov edged, r8m - mov hd, r6m + movifnidn hd, hm + mov edged, r7m add wd, wd vpbroadcastw m7, [paramsq+8] ; w0 add lpfq, wq vpbroadcastd m8, [pd_8] - lea t1, [rsp+wq+20] - vpbroadcastd m9, [pd_25] add dstq, wq - vpbroadcastd m10, [paramsq+0] ; s0 + vpbroadcastd m9, [pd_25] lea t3, [rsp+wq*2+400*12+16] - vpbroadcastd m11, [pd_0xf00800a4] + vpbroadcastd m10, [paramsq+0] ; s0 lea t4, [rsp+wq+400*20+16] - vpbroadcastd m12, [pw_256] + vpbroadcastd m11, [pd_0xf00800a4] + lea t1, [rsp+wq+20] + mova xm12, [sgr_lshuf5] neg wq vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15) pxor m6, m6 vpbroadcastd m14, [pw_1023] psllw m7, 4 - mova xm15, [sgr_lshuf5] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t2, t1 call .top_fixup add t1, 400*6 call .h_top - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - add r10, lpf_strideq - mov [rsp+8*0], r10 ; below + add r10, strideq + mov [rsp], r10 ; below mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: - add lpfq, dst_strideq + add lpfq, strideq call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: - add lpfq, dst_strideq + add lpfq, strideq test hd, hd jz .odd_height call .h - add lpfq, dst_strideq + add lpfq, strideq call .hv call .n0 call .n1 @@ -724,9 +711,9 @@ jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom - mov lpfq, [rsp+8*0] + mov lpfq, [rsp] call .h_top - add lpfq, [rsp+8*1] + add lpfq, strideq call .hv_bottom .end: call .n0 @@ -749,11 +736,10 @@ call .v jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - lea r10, [r10+lpf_strideq*2] - mov [rsp+8*0], r10 + lea r10, [r10+strideq*2] + mov [rsp], r10 call .h lea t2, [t1+400*6] call .top_fixup @@ -786,7 +772,7 @@ jmp .h_main .h_extend_left: mova xm4, [lpfq+wq] - pshufb xm4, xm15 + pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+10], 1 jmp .h_main .h_top: @@ -867,7 +853,7 @@ jmp .hv_main .hv_extend_left: mova xm4, [lpfq+wq] - pshufb xm4, xm15 + pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+10], 1 jmp .hv_main .hv_bottom: @@ -945,13 +931,12 @@ paddusw m4, m11 paddusw m5, m11 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 - psubw m2, m12, m2 ; a paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+r10+4], m2 @@ -968,12 +953,12 @@ mov t0, t2 ret .hv_last_row: ; esoteric edge case for odd heights - mova [t1+r10+400*0], m1 - paddw m1, m0 - mova [t1+r10+400*2], m4 - paddd m4, m2 - mova [t1+r10+400*4], m5 - paddd m5, m3 + mova [t1+r10+400*0], m1 + paddw m1, m0 + mova [t1+r10+400*2], m4 + paddd m4, m2 + mova [t1+r10+400*4], m5 + paddd m5, m3 jmp .hv_main2 .v: ; 
vertical boxsum + ab lea r10, [wq-4] @@ -1015,13 +1000,12 @@ paddusw m4, m11 paddusw m5, m11 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 - psubw m2, m12, m2 ; a paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+r10+4], m2 @@ -1098,21 +1082,19 @@ pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 - paddd m2, m1 ; a * src + b + (1 << 8) - paddd m3, m4 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m4, m3 + psrad m1, 9 + psrad m4, 9 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+r10], m0 add r10, 32 jl .n0_loop - add dstq, dst_strideq + add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) @@ -1130,73 +1112,70 @@ pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 - paddd m2, m1 ; a * src + b + (1 <<7) - paddd m3, m4 - psrld m2, 8 - psrld m3, 8 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m1, m2 ; b - a * src + (1 << 7) + psubd m4, m3 + psrad m1, 8 + psrad m4, 8 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+r10], m0 add r10, 32 jl .n1_loop - add dstq, dst_strideq + add dstq, strideq ret -cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \ + w, h, edge, params movifnidn wd, wm - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x_avx2+256*4] - mov edged, r8m add wd, wd - mov hd, r6m + movifnidn hd, hm + mov edged, r7m add lpfq, wq vpbroadcastw m7, [paramsq+10] ; w1 - lea t1, [rsp+wq+12] - vpbroadcastd m8, [pd_8] add dstq, wq vpbroadcastd m9, [paramsq+ 4] ; s1 lea t3, [rsp+wq*2+400*12+8] - vpbroadcastd m10, [pd_0xf00801c7] + vpbroadcastd m8, [pd_8] lea t4, [rsp+wq+400*32+8] + vpbroadcastd m10, [pd_0xf00801c7] + lea t1, [rsp+wq+12] vpbroadcastd m11, [pd_34816] neg wq - vpbroadcastd m12, [pw_256] + mova xm12, [sgr_lshuf3] pxor m6, m6 vpbroadcastd m13, [pw_1023] psllw m7, 4 - mova xm14, [sgr_lshuf3] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t2, t1 add t1, 400*6 call .h_top - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - add r10, lpf_strideq + add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 - add lpfq, dst_strideq + add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: - add lpfq, dst_strideq + add lpfq, strideq call .hv0 test hd, hd jz .odd_height - add lpfq, dst_strideq + add lpfq, strideq call .hv1 call .n0 call .n1 @@ -1206,7 +1185,7 @@ jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom - add lpfq, lpf_strideq + add lpfq, strideq call .hv1_bottom .end: call .n0 @@ -1231,9 +1210,9 @@ call .v1 jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea r10, [wq-4] @@ -1268,7 +1247,7 @@ jmp .h_main .h_extend_left: mova xm4, [lpfq+wq] - pshufb xm4, xm14 + pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+12], 1 jmp .h_main .h_top: @@ -1318,7 +1297,7 @@ jmp .hv0_main .hv0_extend_left: mova xm4, [lpfq+wq] - pshufb xm4, xm14 + pshufb 
xm4, xm12 vinserti128 m4, [lpfq+wq+12], 1 jmp .hv0_main .hv0_bottom: @@ -1388,7 +1367,7 @@ paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 @@ -1396,7 +1375,6 @@ packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - psubw m2, m12, m2 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m2 @@ -1420,7 +1398,7 @@ jmp .hv1_main .hv1_extend_left: mova xm4, [lpfq+wq] - pshufb xm4, xm14 + pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+12], 1 jmp .hv1_main .hv1_bottom: @@ -1484,7 +1462,7 @@ paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 @@ -1492,7 +1470,6 @@ packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - psubw m2, m12, m2 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*2 +4], m2 @@ -1548,7 +1525,7 @@ paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 @@ -1556,7 +1533,6 @@ packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - psubw m2, m12, m2 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m2 @@ -1606,7 +1582,7 @@ paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 @@ -1614,7 +1590,6 @@ packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - psubw m2, m12, m2 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*2+ 4], m2 @@ -1700,21 +1675,19 @@ pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 - paddd m2, m1 ; a * src + b + (1 << 8) - paddd m3, m4 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m4, m3 + psrad m1, 9 + psrad m4, 9 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+r10], m0 add r10, 32 jl .n0_loop - add dstq, dst_strideq + add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) @@ -1756,73 +1729,71 @@ pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 - paddd m2, m1 ; a * src + b + (1 << 8) - paddd m3, m4 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m4, m3 + psrad m1, 9 + psrad m4, 9 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+r10], m0 add r10, 32 jl .n1_loop - add dstq, dst_strideq + add dstq, strideq ret -cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \ + w, h, edge, params movifnidn wd, wm - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x_avx2+256*4] - mov edged, r8m add wd, wd - mov hd, r6m + movifnidn hd, hm + mov edged, r7m add lpfq, wq - vpbroadcastd m9, [pd_8] - lea t1, [rsp+wq+12] - vpbroadcastd m10, [pd_34816] + vpbroadcastd m15, [paramsq+8] ; w0 w1 add dstq, wq - vpbroadcastd m11, [pw_256] + vpbroadcastd m13, [paramsq+0] ; s0 lea t3, [rsp+wq*2+400*24+8] - 
vpbroadcastd m12, [pd_0xf00801c7] + vpbroadcastd m14, [paramsq+4] ; s1 lea t4, [rsp+wq+400*52+8] - vpbroadcastd m15, [paramsq+8] ; w0 w1 + vpbroadcastd m9, [pd_8] + lea t1, [rsp+wq+12] + vpbroadcastd m10, [pd_34816] neg wq - vpbroadcastd m13, [paramsq+0] ; s0 + vpbroadcastd m11, [pd_4096] pxor m7, m7 - vpbroadcastd m14, [paramsq+4] ; s1 + vpbroadcastd m12, [pd_0xf00801c7] psllw m15, 2 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t2, t1 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup add t1, 400*12 call .h_top - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - add r10, lpf_strideq + add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 - add lpfq, dst_strideq + add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: - add lpfq, dst_strideq + add lpfq, strideq call .hv0 test hd, hd jz .odd_height - add lpfq, dst_strideq + add lpfq, strideq call .hv1 call .n0 call .n1 @@ -1832,7 +1803,7 @@ jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom - add lpfq, lpf_strideq + add lpfq, strideq call .hv1_bottom .end: call .n0 @@ -1857,9 +1828,9 @@ call .v1 jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea r10, [wq-4] @@ -2048,7 +2019,7 @@ paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z3, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 @@ -2056,7 +2027,6 @@ packssdw m2, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - psubw m2, m11, m2 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*2+ 4], m2 @@ -2154,7 +2124,7 @@ paddusw m2, m12 paddusw m3, m12 psrad m7, m2, 20 ; min(z3, 255) - 256 - vpgatherdd m6, [r13+m7*4], m2 + vpgatherdd m6, [r13+m7*4], m2 ; x3 psrad m2, m3, 20 vpgatherdd m7, [r13+m2*4], m3 pmulld m0, m6 @@ -2162,7 +2132,6 @@ pmulld m7, m1 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m7, m10 - psubw m6, m11, m6 psrld m0, 12 psrld m7, 12 paddw m1, m8, [t2+r10+400*0] @@ -2207,7 +2176,7 @@ paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 - vpgatherdd m4, [r13+m5*4], m2 + vpgatherdd m4, [r13+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r13+m2*4], m3 pmulld m0, m4 @@ -2215,7 +2184,6 @@ packssdw m4, m5 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 - psubw m4, m11, m4 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m4 @@ -2271,7 +2239,7 @@ paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z3, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 @@ -2279,7 +2247,6 @@ packssdw m2, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - psubw m2, m11, m2 psrld m0, 12 psrld m1, 12 mova m3, [t1+r10+400*0] @@ -2341,7 +2308,7 @@ paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z3, 255) - 256 - vpgatherdd m2, [r13+m3*4], m4 + vpgatherdd m2, [r13+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 @@ -2349,7 +2316,6 @@ packssdw m2, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - psubw m2, m11, m2 psrld m0, 12 psrld m8, m1, 12 mova [t4+r10*1+400*4+4], m2 @@ -2396,7 +2362,7 @@ paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 - vpgatherdd m4, [r13+m5*4], m2 + vpgatherdd m4, [r13+m5*4], 
m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r13+m2*4], m3 pmulld m0, m4 @@ -2404,7 +2370,6 @@ packssdw m4, m5 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 - psubw m4, m11, m4 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m4 @@ -2508,16 +2473,13 @@ pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src pslld m4, 13 - psubd m0, m4 - psubd m1, m4 - paddd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) - paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 9 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 - vpbroadcastd m1, [pd_4096] - paddd m4, m1 + paddd m4, m11 paddd m0, m4 psrad m0, 7 vextracti128 xm1, m0, 1 @@ -2526,7 +2488,7 @@ mova [dstq+r10], xm0 add r10, 16 jl .n0_loop - add dstq, dst_strideq + add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) @@ -2551,22 +2513,19 @@ mova [t3+r10*2+400*20], m5 mova [t3+r10*2+400*24], m4 pmovzxwd m4, [dstq+r10] - pmovzxwd m0, [t4+r10*1+400* 6] + pmovzxwd m2, [t4+r10*1+400* 6] pmovzxwd m3, xm3 - pmaddwd m0, m4 ; a5 * src + mova m0, [t3+r10*2+400*12] + pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src - pslld m4, 12 - psubd m2, m4, [t3+r10*2+400*12] - paddd m4, m4 - psubd m1, m4 - psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) - paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + pslld m4, 13 + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 8 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 - vpbroadcastd m1, [pd_4096] - paddd m4, m1 + paddd m4, m11 paddd m0, m4 psrad m0, 7 vextracti128 xm1, m0, 1 @@ -2575,7 +2534,7 @@ mova [dstq+r10], xm0 add r10, 16 jl .n1_loop - add dstq, dst_strideq + add dstq, strideq ret %endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/looprestoration16_avx512.asm dav1d-1.0.0/src/x86/looprestoration16_avx512.asm --- dav1d-0.9.2/src/x86/looprestoration16_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/looprestoration16_avx512.asm 2022-03-18 14:31:56.022356000 +0000 @@ -0,0 +1,2524 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +r_ext_mask: times 72 db -1 + times 8 db 0 +wiener_hshift: dw 4, 4, 1, 1 +wiener_vshift: dw 1024, 1024, 4096, 4096 +wiener_round: dd 1049600, 1048832 + +pw_164_455: dw 164, 455 +pw_1023: times 2 dw 1023 +pw_61448: times 2 dw 61448 +pd_m262128: dd -262128 +pd_m34816: dd -34816 +pd_m25: dd -25 +pd_m9: dd -9 +pd_8: dd 8 +pd_2147483648: dd 2147483648 + +cextern sgr_x_by_x + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers + +INIT_ZMM avx512icl +cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt +%define base t4-wiener_hshift + mov fltq, r6mp + movifnidn wd, wm + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastd m12, [fltq+ 0] ; x0 x1 + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufB] + add wd, wd + vpbroadcastd m13, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m14, [fltq+16] ; y0 y1 + add lpfq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + add dstq, wq + vbroadcasti128 m8, [wiener_shufC] + lea t1, [rsp+wq+16] + vbroadcasti128 m9, [wiener_shufD] + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + mov r10d, 0xfe + vpbroadcastd m10, [base+wiener_round+t3*4] + kmovb k1, r10d + vpbroadcastd m11, [base+wiener_vshift+t3*4] + pmullw m12, m0 ; upshift filter coefs to make the + vpbroadcastd m16, [pd_m262128] + pmullw m13, m0 ; horizontal downshift constant + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm3, [leftq] + vmovdqu64 m3{k1}, [lpfq+r10-8] + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m4, [lpfq+r10+0] + vpbroadcastw xm3, xm4 + vmovdqu64 m3{k1}, [lpfq+r10-8] + jmp .h_main2 +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-8] +.h_main: + mova m4, [lpfq+r10+0] +.h_main2: + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -68 + jl .h_have_right + push r0 + lea 
r0, [r_ext_mask+66] + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b + vpternlogd m4, m0, [r0+r10+ 8], 0xe4 + vpternlogd m5, m0, [r0+r10+16], 0xe4 + pop r0 +.h_have_right: + pshufb m2, m3, m6 + pshufb m1, m4, m7 + paddw m2, m1 + pshufb m3, m8 + mova m0, m16 + vpdpwssd m0, m2, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + vpdpwssd m0, m3, m13 + pshufb m2, m5, m7 + paddw m2, m1 + mova m1, m16 + pshufb m4, m8 + vpdpwssd m1, m2, m12 + pshufb m5, m9 + paddw m4, m5 + vpdpwssd m1, m4, m13 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq xm3, [leftq] + vmovdqu64 m3{k1}, [lpfq+r10-8] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m4, [lpfq+r10+0] + vpbroadcastw xm3, xm4 + vmovdqu64 m3{k1}, [lpfq+r10-8] + jmp .hv_main2 +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-8] +.hv_main: + mova m4, [lpfq+r10+0] +.hv_main2: + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -68 + jl .hv_have_right + push r0 + lea r0, [r_ext_mask+66] + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r0+r10+ 0], 0xe4 + vpternlogd m4, m0, [r0+r10+ 8], 0xe4 + vpternlogd m5, m0, [r0+r10+16], 0xe4 + pop r0 +.hv_have_right: + pshufb m2, m3, m6 + pshufb m1, m4, m7 + paddw m2, m1 + pshufb m3, m8 + mova m0, m16 + vpdpwssd m0, m2, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + vpdpwssd m0, m3, m13 + pshufb m2, m5, m7 + paddw m2, m1 + pshufb m4, m8 + mova m1, m16 + vpdpwssd m1, m2, m12 + pshufb m5, m9 + paddw m4, m5 + vpdpwssd m1, m4, m13 + mova m2, [t4+r10] + paddw m2, [t2+r10] + mova m5, [t3+r10] + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova m4, [t5+r10] + paddw m4, [t1+r10] + psraw m0, 1 + paddw m3, m0, [t6+r10] + mova [t0+r10], m0 + punpcklwd m1, m2, m5 + mova m0, m10 + vpdpwssd m0, m1, m15 + punpckhwd m2, m5 + mova m1, m10 + vpdpwssd m1, m2, m15 + punpcklwd m2, m3, m4 + vpdpwssd m0, m2, m14 + punpckhwd m3, m4 + vpdpwssd m1, m3, m14 + psrad m0, 5 + psrad m1, 5 + packusdw m0, m1 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 64 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m2, [t4+r10] + paddw m2, [t2+r10] + mova m3, [t3+r10] + punpcklwd m1, m2, m3 + mova m0, m10 + vpdpwssd m0, m1, m15 + punpckhwd m2, m3 + mova m1, m10 + vpdpwssd m1, m2, m15 + mova m4, [t1+r10] + paddw m3, m4, [t6+r10] + paddw m4, [t5+r10] + punpcklwd m2, m3, m4 + vpdpwssd m0, m2, m14 + punpckhwd m3, m4 + vpdpwssd m1, m3, m14 + psrad m0, 5 + psrad m1, 5 + packusdw m0, m1 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 64 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt +%define base r13-r_ext_mask-70 + mov fltq, r6mp + movifnidn wd, wm + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + vbroadcasti128 m5, [wiener_shufE] + vpbroadcastw m11, [fltq+ 2] ; x1 + vbroadcasti128 m6, [wiener_shufB] + lea r13, [r_ext_mask+70] + vbroadcasti128 m7, [wiener_shufD] + add wd, wd + vpbroadcastd m12, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + vpbroadcastw m13, [fltq+18] ; y1 + add 
dstq, wq + vpbroadcastd m14, [fltq+20] ; y2 y3 + lea t1, [rsp+wq+16] + vpbroadcastd m0, [base+wiener_hshift+t3*4] + neg wq + vpbroadcastd m9, [base+wiener_round+t3*4] + mov r10d, 0xfffe + vpbroadcastd m10, [base+wiener_vshift+t3*4] + kmovw k1, r10d + pmullw m11, m0 + pmullw m12, m0 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq +.v1: + call .v + jmp .end +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm3, [leftq+4] + vmovdqu32 m3{k1}, [lpfq+r10-4] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm3, [lpfq+r10] + vmovdqu32 m3{k1}, [lpfq+r10-4] + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-4] +.h_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -66 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? 
a : b + vpternlogd m4, m0, [r13+r10+8], 0xe4 +.h_have_right: + pshufb m1, m3, m5 + mova m0, m8 + vpdpwssd m0, m1, m11 + pshufb m2, m4, m5 + mova m1, m8 + vpdpwssd m1, m2, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + vpdpwssd m0, m2, m12 + pshufb m4, m7 + paddw m3, m4 + vpdpwssd m1, m3, m12 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm3, [leftq+4] + vmovdqu32 m3{k1}, [lpfq+r10-4] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + vpbroadcastw xm3, [lpfq+r10] + vmovdqu32 m3{k1}, [lpfq+r10-4] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-4] +.hv_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -66 + jl .hv_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r13+r10+0], 0xe4 + vpternlogd m4, m0, [r13+r10+8], 0xe4 +.hv_have_right: + pshufb m1, m3, m5 + mova m0, m8 + vpdpwssd m0, m1, m11 + pshufb m2, m4, m5 + mova m1, m8 + vpdpwssd m1, m2, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + vpdpwssd m0, m2, m12 + pshufb m4, m7 + paddw m4, m3 + vpdpwssd m1, m4, m12 + mova m2, [t3+r10] + paddw m2, [t1+r10] + mova m3, [t2+r10] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + mova m3, m9 + vpdpwssd m3, m2, m14 + mova m2, m9 + vpdpwssd m2, m4, m14 + mova m4, [t4+r10] + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t0+r10], m0 + punpcklwd m1, m0, m4 + vpdpwssd m2, m1, m13 + punpckhwd m0, m4 + vpdpwssd m3, m0, m13 + psrad m2, 5 + psrad m3, 5 + packusdw m2, m3 + pmulhuw m2, m10 + mova [dstq+r10], m2 + add r10, 64 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m0, [t1+r10] + paddw m2, m0, [t3+r10] + mova m1, [t2+r10] + mova m4, [t4+r10] + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 64 + jl .v_loop + ret + +cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \ + w, h, edge, params +%define base r13-r_ext_mask-72 + movifnidn wd, wm + mov paramsq, r6mp + lea r13, [r_ext_mask+72] + mov edged, r7m + movifnidn hd, hm + pxor m6, m6 + vpbroadcastw m7, [paramsq+8] ; w0 + add wd, wd + vpbroadcastd m8, [base+pd_8] + add lpfq, wq + vpbroadcastd m9, [base+pd_m25] + add dstq, wq + vpsubd m10, m6, [paramsq+0] {1to16} ; -s0 + lea t3, [rsp+wq*2+416*12+8] + vpbroadcastd m11, [base+pw_164_455] + lea t4, [rsp+wq+416*20+8] + vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3) + lea t1, [rsp+wq+12] + vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15)) + neg wq + vpbroadcastd m14, [base+pw_1023] + psllw m7, 4 + mova m18, [sgr_x_by_x+64*0] + mov r10d, 0xfffffff8 + mova m19, [sgr_x_by_x+64*1] + kmovd k1, r10d + mova m20, [sgr_x_by_x+64*2] + mov r10, 0x3333333333333333 + mova m21, [sgr_x_by_x+64*3] + kmovq k2, r10 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 416*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + mov t0, 
t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz .odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10- 2] +.h_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -68 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.h_have_right: + palignr m2, m17, m16, 2 + paddw m0, m16, m2 + palignr m3, m17, m16, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + shufpd m17, m16, m17, 0x55 + paddw m0, m17 + punpcklwd m3, m16, m17 + vpdpwssd m1, m3, m3 + punpckhwd m3, m16, m17 + vpdpwssd m2, m3, m3 + shufps m16, m17, q2121 + paddw m0, m16 ; sum + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10+416*0] + paddd m1, [t1+r10+416*2] + paddd m2, [t1+r10+416*4] +.h_loop_end: + punpcklwd m17, m16, m6 + vpdpwssd m1, m17, m17 ; sumsq + punpckhwd m16, m6 + vpdpwssd m2, m16, m16 + mova [t1+r10+416*0], m0 + mova [t1+r10+416*2], m1 + mova [t1+r10+416*4], m2 + add r10, 64 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-4] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10+416*0] + mova m1, [t1+r10+416*2] + mova m2, [t1+r10+416*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m1 + mova [t2+r10+416*4], m2 + add r10, 64 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .hv_main +.hv_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m16, [lpfq+r10- 2] +.hv_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -68 + jl .hv_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv_have_right: + palignr m3, m17, m16, 2 + paddw m0, m16, m3 + palignr m1, m17, m16, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + shufpd m17, m16, m17, 0x55 + paddw m0, m17 + punpcklwd 
m1, m16, m17 + vpdpwssd m2, m1, m1 + punpckhwd m1, m16, m17 + vpdpwssd m3, m1, m1 + shufps m16, m17, q2121 + paddw m0, m16 ; h sum + punpcklwd m17, m16, m6 + vpdpwssd m2, m17, m17 ; h sumsq + punpckhwd m16, m6 + vpdpwssd m3, m16, m16 + paddw m1, m0, [t1+r10+416*0] + paddd m16, m2, [t1+r10+416*2] + paddd m17, m3, [t1+r10+416*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10+416*0] ; hv sum + paddd m16, [t2+r10+416*2] ; hv sumsq + paddd m17, [t2+r10+416*4] + mova [t0+r10+416*0], m0 + mova [t0+r10+416*2], m2 + mova [t0+r10+416*4], m3 + psrlw m3, m1, 1 + paddd m16, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m17, m8 + psrld m16, 4 ; (a + 8) >> 4 + psrld m17, 4 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + punpcklwd m2, m3, m6 + vpdpwssd m16, m2, m2 ; -p + punpckhwd m3, m6 + vpdpwssd m17, m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m16, m10 ; p * s + pmulld m17, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + pmaxsw m17, m6 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + packssdw m16, m17 + psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + psubd m1, m13 + mova [t4+r10+4], m16 + psrld m16, m0, 12 ; b + psrld m17, m1, 12 + mova [t3+r10*2+ 8], xm16 + mova [t3+r10*2+ 24], xm17 + vextracti128 [t3+r10*2+ 40], ym16, 1 + vextracti128 [t3+r10*2+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+ 72], m16, 2 + vextracti32x4 [t3+r10*2+ 88], m17, 2 + vextracti32x4 [t3+r10*2+104], m16, 3 + vextracti32x4 [t3+r10*2+120], m17, 3 + add r10, 64 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10+416*0], m1 + paddw m1, m0 + mova [t1+r10+416*2], m16 + paddd m16, m2 + mova [t1+r10+416*4], m17 + paddd m17, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-4] +.v_loop: + mova m2, [t1+r10+416*2] + mova m3, [t1+r10+416*4] + mova m0, [t1+r10+416*0] + paddd m16, m2, [t2+r10+416*2] + paddd m17, m3, [t2+r10+416*4] + paddw m1, m0, [t2+r10+416*0] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2 ; hv sumsq + paddd m17, m3 + paddd m16, m8 + paddd m17, m8 + psrld m16, 4 ; (a + 8) >> 4 + psrld m17, 4 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + paddw m0, m0 + paddw m1, m0 ; hv sum + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m16, m2, m2 ; -p + punpckhwd m3, m6 + vpdpwssd m17, m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m16, m10 ; p * s + pmulld m17, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + pmaxsw m17, m6 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + packssdw m16, m17 + psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + psubd m1, m13 + mova [t4+r10+4], m16 + psrld m16, m0, 12 ; b + psrld m17, m1, 12 + mova [t3+r10*2+ 8], xm16 + mova [t3+r10*2+ 24], xm17 + vextracti128 [t3+r10*2+ 40], ym16, 1 + vextracti128 [t3+r10*2+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+ 72], m16, 2 + vextracti32x4 [t3+r10*2+ 88], m17, 2 + vextracti32x4 [t3+r10*2+104], m16, 3 + vextracti32x4 [t3+r10*2+120], m17, 3 + add r10, 64 + jl .v_loop + ret 
+.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+68] + paddw m3, m0, [t4+r10*1+ 0] + paddd m16, m1, [t3+r10*2+ 0] + paddd m17, m2, [t3+r10*2+64] + paddw m3, [t4+r10*1+ 4] + paddd m16, [t3+r10*2+ 8] + paddd m17, [t3+r10*2+72] + paddw m0, m3 + psllw m3, 2 + paddd m1, m16 + pslld m16, 2 + paddd m2, m17 + pslld m17, 2 + paddw m0, m3 ; a 565 + paddd m1, m16 ; b 565 + paddd m2, m17 + mova [t4+r10*1+416*2+ 0], m0 + mova [t3+r10*2+416*4+ 0], m1 + mova [t3+r10*2+416*4+64], m2 + add r10, 64 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+68] + paddw m3, m0, [t4+r10*1+ 0] + paddd m16, m1, [t3+r10*2+ 0] + paddd m17, m2, [t3+r10*2+64] + paddw m3, [t4+r10*1+ 4] + paddd m16, [t3+r10*2+ 8] + paddd m17, [t3+r10*2+72] + paddw m0, m3 + psllw m3, 2 + paddd m1, m16 + pslld m16, 2 + paddd m2, m17 + pslld m17, 2 + paddw m0, m3 ; a 565 + paddd m1, m16 ; b 565 + paddd m2, m17 + paddw m3, m0, [t4+r10*1+416*2+ 0] + paddd m16, m1, [t3+r10*2+416*4+ 0] + paddd m17, m2, [t3+r10*2+416*4+64] + mova [t4+r10*1+416*2+ 0], m0 + mova [t3+r10*2+416*4+ 0], m1 + mova [t3+r10*2+416*4+64], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m16, m3 + psrad m1, 9 + psrad m16, 9 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m0, [dstq+r10] + mova m3, [t4+r10*1+416*2+ 0] + mova m16, [t3+r10*2+416*4+ 0] + mova m17, [t3+r10*2+416*4+64] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 7) + psubd m16, m3 + psrad m1, 8 + psrad m16, 8 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \ + w, h, edge, params + movifnidn wd, wm + mov paramsq, r6mp + lea r13, [r_ext_mask+72] + mov edged, r7m + movifnidn hd, hm + pxor m6, m6 + vpbroadcastw m7, [paramsq+10] ; w1 + add wd, wd + vpbroadcastd m8, [base+pd_8] + add lpfq, wq + vpbroadcastd m9, [base+pd_m9] + add dstq, wq + vpsubd m10, m6, [paramsq+4] {1to16} ; -s1 + lea t3, [rsp+wq*2+416*12+8] + vpbroadcastd m11, [base+pw_164_455] + lea t4, [rsp+wq+416*32+8] + vpbroadcastd m12, [base+pw_61448] + lea t1, [rsp+wq+12] + vpbroadcastd m13, [base+pd_m34816] + neg wq + vpbroadcastd m14, [base+pw_1023] + psllw m7, 4 + mova m18, [sgr_x_by_x+64*0] + mov r10d, 0xfffffffc + mova m19, [sgr_x_by_x+64*1] + kmovd k1, r10d + mova m20, [sgr_x_by_x+64*2] + mov r10, 0x3333333333333333 + mova m21, [sgr_x_by_x+64*3] + kmovq k2, r10 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 416*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call 
.prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+416*6] +.top_fixup_loop: + mova m0, [t1+r10+416*0] + mova m1, [t1+r10+416*2] + mova m2, [t1+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m1 + mova [t2+r10+416*4], m2 + add r10, 64 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm16, [leftq+4] + vmovdqu16 m16{k1}, [lpfq+wq-4] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10+ 0] +.h_main: + movu m17, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -66 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.h_have_right: + palignr m0, m17, m16, 2 + paddw m1, m16, m0 + punpcklwd m2, m16, m0 + pmaddwd m2, m2 + punpckhwd m3, m16, m0 + pmaddwd m3, m3 + palignr m17, m16, 4 + paddw m1, m17 ; sum + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; sumsq + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + mova [t1+r10+416*0], m1 + mova [t1+r10+416*2], m2 + mova [t1+r10+416*4], m3 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movd xm16, [leftq+4] + vmovdqu16 m16{k1}, [lpfq+wq-4] + add leftq, 8 + jmp .hv0_main +.hv0_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-4] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m16, [lpfq+r10+ 0] +.hv0_main: + movu m17, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -66 + jl .hv0_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv0_have_right: + palignr m0, m17, m16, 2 + paddw m1, m16, m0 + punpcklwd m2, m16, m0 + pmaddwd m2, m2 + punpckhwd m3, m16, m0 + pmaddwd m3, m3 + palignr m17, m16, 4 + paddw m1, m17 ; sum + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; sumsq + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + paddw m0, m1, [t1+r10+416*0] + paddd m16, m2, [t1+r10+416*2] + paddd m17, m3, [t1+r10+416*4] + mova [t1+r10+416*0], m1 + mova [t1+r10+416*2], m2 + mova [t1+r10+416*4], m3 + paddw m1, m0, [t2+r10+416*0] + paddd m2, m16, [t2+r10+416*2] + paddd m3, m17, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m16 + mova [t2+r10+416*4], m17 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m17, m1, 1 + pavgw m17, m6 ; (b + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, 
m16 ; -p + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m6, m1 ; b + punpckhwd m17, m6, m1 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m10 ; p * s + pmulld m3, m10 + pmaddwd m16, m11 ; b * 455 + pmaddwd m17, m11 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m12 + psraw m3, 4 ; min(z, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x + pandn m2, m13, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m17, m13 + mova [t4+r10*1+416*0+4], m2 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movd xm16, [leftq+4] + vmovdqu16 m16{k1}, [lpfq+wq-4] + add leftq, 8 + jmp .hv1_main +.hv1_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-4] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m16, [lpfq+r10+ 0] +.hv1_main: + movu m17, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -66 + jl .hv1_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv1_have_right: + palignr m1, m17, m16, 2 + paddw m0, m16, m1 + punpcklwd m2, m16, m1 + pmaddwd m2, m2 + punpckhwd m3, m16, m1 + pmaddwd m3, m3 + palignr m17, m16, 4 + paddw m0, m17 ; h sum + punpcklwd m1, m17, m6 + vpdpwssd m2, m1, m1 ; h sumsq + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + paddw m1, m0, [t2+r10+416*0] + paddd m16, m2, [t2+r10+416*2] + paddd m17, m3, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m2 + mova [t2+r10+416*4], m3 + paddd m16, m8 + paddd m17, m8 + psrld m16, 4 ; (a + 8) >> 4 + psrld m17, 4 + pmulld m16, m9 ; -((a + 8) >> 4) * 9 + pmulld m17, m9 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m16, m2, m2 ; -p + punpckhwd m3, m6 + vpdpwssd m17, m3, m3 + punpcklwd m0, m6, m1 ; b + punpckhwd m1, m6, m1 + pminsd m16, m6 + pminsd m17, m6 + pmulld m16, m10 ; p * s + pmulld m17, m10 + pmaddwd m0, m11 ; b * 455 + pmaddwd m1, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + packssdw m16, m17 + psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m1, m13 + mova [t4+r10*1+416*2+4], m16 + psrld m16, m0, 12 + psrld m17, m1, 12 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + 
ret +.v0: ; vertical boxsums + ab (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+416*0] + mova m16, [t1+r10+416*2] + mova m17, [t1+r10+416*4] + paddw m0, m0 + paddd m16, m16 + paddd m17, m17 + paddw m1, m0, [t2+r10+416*0] + paddd m2, m16, [t2+r10+416*2] + paddd m3, m17, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m16 + mova [t2+r10+416*4], m17 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m17, m1, 1 + pavgw m17, m6 ; (b + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; -p + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m6, m1 ; b + punpckhwd m17, m6, m1 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m10 ; p * s + pmulld m3, m10 + pmaddwd m16, m11 ; b * 455 + pmaddwd m17, m11 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m12 + psraw m3, 4 ; min(z, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x + pandn m2, m13, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m17, m13 + mova [t4+r10*1+416*0+4], m2 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m0, [t1+r10+416*0] + mova m16, [t1+r10+416*2] + mova m17, [t1+r10+416*4] + paddw m1, m0, [t2+r10+416*0] + paddd m2, m16, [t2+r10+416*2] + paddd m3, m17, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m16 + mova [t2+r10+416*4], m17 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m17, m1, 1 + pavgw m17, m6 ; (b + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; -p + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m6, m1 ; b + punpckhwd m17, m6, m1 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m10 ; p * s + pmulld m3, m10 + pmaddwd m16, m11 ; b * 455 + pmaddwd m17, m11 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m12 + psraw m3, 4 ; min(z, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x + pandn m2, m13, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m17, m13 + mova [t4+r10*1+416*2+4], m2 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + mova ym16, [t4+r10*1+416*0+0] + paddw ym16, [t4+r10*1+416*0+4] + paddw ym17, ym16, [t4+r10*1+416*0+2] + mova m0, [t3+r10*2+416*0+0] + paddd m0, [t3+r10*2+416*0+8] + paddd m1, m0, [t3+r10*2+416*0+4] 
+ psllw ym17, 2 ; a[-1] 444 + pslld m1, 2 ; b[-1] 444 + psubw ym17, ym16 ; a[-1] 343 + psubd m1, m0 ; b[-1] 343 + vmovdqa32 [t4+r10*1+416* 4], ym17 + vmovdqa32 [t3+r10*2+416* 8], m1 + mova ym16, [t4+r10*1+416*2+0] + paddw ym16, [t4+r10*1+416*2+4] + paddw ym17, ym16, [t4+r10*1+416*2+2] + mova m0, [t3+r10*2+416*4+0] + paddd m0, [t3+r10*2+416*4+8] + paddd m1, m0, [t3+r10*2+416*4+4] + psllw ym17, 2 ; a[ 0] 444 + pslld m1, 2 ; b[ 0] 444 + vmovdqa32 [t4+r10*1+416* 6], ym17 + vmovdqa32 [t3+r10*2+416*12], m1 + psubw ym17, ym16 ; a[ 0] 343 + psubd m1, m0 ; b[ 0] 343 + vmovdqa32 [t4+r10*1+416* 8], ym17 + vmovdqa32 [t3+r10*2+416*16], m1 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + mova m3, [t4+r10*1+416*0+0] + paddw m3, [t4+r10*1+416*0+4] + paddw m1, m3, [t4+r10*1+416*0+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+416*4] + paddw m3, [t4+r10*1+416*6] + mova [t4+r10*1+416*4], m2 + mova [t4+r10*1+416*6], m1 + mova m16, [t3+r10*2+416*0+0] + paddd m16, [t3+r10*2+416*0+8] + paddd m1, m16, [t3+r10*2+416*0+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m16 ; b[ 1] 343 + paddd m16, m2, [t3+r10*2+416* 8+ 0] + paddd m16, [t3+r10*2+416*12+ 0] + mova [t3+r10*2+416* 8+ 0], m2 + mova [t3+r10*2+416*12+ 0], m1 + mova m17, [t3+r10*2+416*0+64] + paddd m17, [t3+r10*2+416*0+72] + paddd m1, m17, [t3+r10*2+416*0+68] + pslld m1, 2 + psubd m2, m1, m17 + paddd m17, m2, [t3+r10*2+416* 8+64] + paddd m17, [t3+r10*2+416*12+64] + mova [t3+r10*2+416* 8+64], m2 + mova [t3+r10*2+416*12+64], m1 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m16, m3 + psrad m1, 9 + psrad m16, 9 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t4+r10*1+416*2+0] + paddw m3, [t4+r10*1+416*2+4] + paddw m1, m3, [t4+r10*1+416*2+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+416*6] + paddw m3, [t4+r10*1+416*8] + mova [t4+r10*1+416*6], m1 + mova [t4+r10*1+416*8], m2 + mova m16, [t3+r10*2+416*4+0] + paddd m16, [t3+r10*2+416*4+8] + paddd m1, m16, [t3+r10*2+416*4+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m16 ; b[ 1] 343 + paddd m16, m2, [t3+r10*2+416*12+ 0] + paddd m16, [t3+r10*2+416*16+ 0] + mova [t3+r10*2+416*12+ 0], m1 + mova [t3+r10*2+416*16+ 0], m2 + mova m17, [t3+r10*2+416*4+64] + paddd m17, [t3+r10*2+416*4+72] + paddd m1, m17, [t3+r10*2+416*4+68] + pslld m1, 2 + psubd m2, m1, m17 + paddd m17, m2, [t3+r10*2+416*12+64] + paddd m17, [t3+r10*2+416*16+64] + mova [t3+r10*2+416*12+64], m1 + mova [t3+r10*2+416*16+64], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m16, m3 + psrad m1, 9 + psrad m16, 9 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \ + w, h, edge, params + movifnidn wd, 
wm + mov paramsq, r6mp + lea r13, [r_ext_mask+72] + mov edged, r7m + movifnidn hd, hm + vpbroadcastd m7, [paramsq+8] ; w0 w1 + pxor m6, m6 + vpbroadcastd m8, [base+pd_8] + add wd, wd + vpbroadcastd m9, [base+pd_m9] + add lpfq, wq + vpbroadcastd m10, [base+pd_m25] + add dstq, wq + vpsubd m11, m6, [paramsq+0] {1to16} ; -s0 + lea t3, [rsp+wq*2+416*24+8] + vpsubd m12, m6, [paramsq+4] {1to16} ; -s1 + lea t4, [rsp+wq+416*52+8] + vpbroadcastd m13, [base+pw_164_455] + lea t1, [rsp+wq+12] + vpbroadcastd m14, [base+pw_61448] + neg wq + vpbroadcastd m15, [base+pd_m34816] + psllw m7, 2 + vpbroadcastd m22, [base+pd_2147483648] + mov r10d, 0xfffffff8 + mova m18, [sgr_x_by_x+64*0] + kmovd k1, r10d + mova m19, [sgr_x_by_x+64*1] + mov r10, 0x3333333333333333 + mova m20, [sgr_x_by_x+64*2] + kmovq k2, r10 + mova m21, [sgr_x_by_x+64*3] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup + add t1, 416*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+416*12] +.top_fixup_loop: + mova m0, [t1+r10+416* 0] + mova m1, [t1+r10+416* 2] + mova m2, [t1+r10+416* 4] + paddw m0, m0 + mova m3, [t1+r10+416* 6] + paddd m1, m1 + mova m4, [t1+r10+416* 8] + paddd m2, m2 + mova m5, [t1+r10+416*10] + mova [t2+r10+416* 0], m0 + mova [t2+r10+416* 2], m1 + mova [t2+r10+416* 4], m2 + mova [t2+r10+416* 6], m3 + mova [t2+r10+416* 8], m4 + mova [t2+r10+416*10], m5 + add r10, 64 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10- 2] +.h_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -68 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.h_have_right: + palignr m3, m17, m16, 2 + palignr m0, m17, m16, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m17, m16, 6 + paddw m1, m0 ; sum3 + punpcklwd m4, m0, m6 + vpdpwssd m2, m4, m4 ; sumsq3 + punpckhwd m0, m6 + vpdpwssd m3, m0, m0 + shufpd m4, m16, m17, 0x55 + punpcklwd m17, m4, m16 + paddw m0, m16, m4 + punpckhwd m4, m16 + mova [t1+r10+416* 6], m1 + mova [t1+r10+416* 8], m2 + mova [t1+r10+416*10], m3 + paddw m1, m0 ; sum5 + vpdpwssd m2, m17, m17 ; sumsq5 + vpdpwssd m3, m4, 
m4 + mova [t1+r10+416* 0], m1 + mova [t1+r10+416* 2], m2 + mova [t1+r10+416* 4], m3 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .hv0_main +.hv0_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m16, [lpfq+r10- 2] +.hv0_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -68 + jl .hv0_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv0_have_right: + palignr m3, m17, m16, 2 + palignr m0, m17, m16, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m17, m16, 6 + paddw m1, m0 ; h sum3 + punpcklwd m4, m0, m6 + vpdpwssd m2, m4, m4 ; h sumsq3 + punpckhwd m0, m6 + vpdpwssd m3, m0, m0 + shufpd m17, m16, m17, 0x55 + paddw m4, m1, [t1+r10+416* 6] + paddd m5, m2, [t1+r10+416* 8] + mova [t1+r10+416* 6], m1 + mova [t1+r10+416* 8], m2 + paddw m1, m16 + paddw m1, m17 ; h sum5 + punpcklwd m0, m17, m16 + vpdpwssd m2, m0, m0 ; h sumsq5 + paddd m0, m3, [t1+r10+416*10] + mova [t1+r10+416*10], m3 + punpckhwd m17, m16 + vpdpwssd m3, m17, m17 + mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row + mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd + mova [t3+r10*2+416*0+72], m3 + paddw m1, [t1+r10+416* 0] + paddd m2, [t1+r10+416* 2] + paddd m3, [t1+r10+416* 4] + mova [t1+r10+416* 0], m1 + mova [t1+r10+416* 2], m2 + mova [t1+r10+416* 4], m3 + paddw m17, m4, [t2+r10+416* 6] + paddd m2, m5, [t2+r10+416* 8] + paddd m3, m0, [t2+r10+416*10] + mova [t2+r10+416* 6], m4 + mova [t2+r10+416* 8], m5 + mova [t2+r10+416*10], m0 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m5, m17, 1 + pavgw m5, m6 ; (b3 + 2) >> 2 + punpcklwd m4, m5, m6 + vpdpwssd m2, m4, m4 ; -p3 + punpckhwd m5, m6 + vpdpwssd m3, m5, m5 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m12 ; p3 * s1 + pmulld m3, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m14 + psraw m3, 4 ; min(z3, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x3 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*2+4], m2 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .hv1_main +.hv1_extend_left: + vpbroadcastw xm16, [lpfq+wq] 
+ vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m16, [lpfq+r10- 2] +.hv1_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -68 + jl .hv1_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv1_have_right: + palignr m1, m17, m16, 2 + palignr m3, m17, m16, 4 + paddw m2, m1, m3 + punpcklwd m0, m1, m3 + pmaddwd m0, m0 + punpckhwd m1, m3 + pmaddwd m1, m1 + palignr m3, m17, m16, 6 + paddw m2, m3 ; h sum3 + punpcklwd m5, m3, m6 + vpdpwssd m0, m5, m5 ; h sumsq3 + punpckhwd m3, m6 + vpdpwssd m1, m3, m3 + shufpd m3, m16, m17, 0x55 + punpcklwd m5, m16, m3 + paddw m4, m16, m3 + punpckhwd m16, m3 + paddw m17, m2, [t2+r10+416* 6] + mova [t2+r10+416* 6], m2 + paddw m4, m2 ; h sum5 + paddd m2, m0, [t2+r10+416* 8] + paddd m3, m1, [t2+r10+416*10] + mova [t2+r10+416* 8], m0 + mova [t2+r10+416*10], m1 + vpdpwssd m0, m5, m5 ; h sumsq5 + vpdpwssd m1, m16, m16 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m16, m17, 1 + pavgw m16, m6 ; (b3 + 2) >> 2 + punpcklwd m5, m16, m6 + vpdpwssd m2, m5, m5 ; -p3 + punpckhwd m16, m6 + vpdpwssd m3, m16, m16 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m12 ; p3 * s1 + pmulld m3, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m14 + psraw m3, 4 ; min(z3, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x3 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*4+4], m2 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + paddw m5, m4, [t2+r10+416*0] + paddd m2, m0, [t2+r10+416*2] + paddd m3, m1, [t2+r10+416*4] + paddw m5, [t1+r10+416*0] + paddd m2, [t1+r10+416*2] + paddd m3, [t1+r10+416*4] + mova [t2+r10+416*0], m4 + mova [t2+r10+416*2], m0 + mova [t2+r10+416*4], m1 + mova [t3+r10*2+416*8+ 8], xm16 + mova [t3+r10*2+416*8+ 24], xm17 + vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*8+104], m16, 3 + vextracti32x4 [t3+r10*2+416*8+120], m17, 3 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 + pmulld m3, m10 + psrlw m17, m5, 1 + pavgw m17, m6 ; (b5 + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; -p5 + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m5, m6 ; b5 + punpckhwd m17, m5, m6 + pmulld m2, m11 ; p5 * s0 + pmulld m3, m11 + pmaddwd m16, m13 ; b5 * 164 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + pmaxsw m3, m6 + paddusw m3, m14 + psraw m3, 4 ; min(z5, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x5 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*0+4], m2 + psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + 
vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-4] +.v0_loop: + mova m16, [t1+r10+416* 6] + mova m2, [t1+r10+416* 8] + mova m3, [t1+r10+416*10] + paddw m16, m16 + paddd m2, m2 + paddd m3, m3 + paddw m17, m16, [t2+r10+416* 6] + paddd m4, m2, [t2+r10+416* 8] + paddd m5, m3, [t2+r10+416*10] + mova [t2+r10+416* 6], m16 + mova [t2+r10+416* 8], m2 + mova [t2+r10+416*10], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a3 + 8) >> 4 + psrld m5, 4 + pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m5, m9 + psrlw m3, m17, 1 + pavgw m3, m6 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m4, m2, m2 ; -p3 + punpckhwd m3, m6 + vpdpwssd m5, m3, m3 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m4, m6 + pminsd m5, m6 + pmulld m4, m12 ; p3 * s1 + pmulld m5, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m5{k2}, m4, m4, 2 + mova m4, m20 + paddusw m5, m14 + psraw m5, 4 ; min(z3, 255) - 256 + vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m5 + vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m5{k3}, m4 ; x3 + pandn m4, m15, m5 + psrld m5, 16 + pmulld m16, m4 + pmulld m17, m5 + packssdw m4, m5 + mova [t4+r10*1+416*2+4], m4 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova m3, [t1+r10+416*0] + mova m4, [t1+r10+416*2] + mova m5, [t1+r10+416*4] + mova [t3+r10*2+416*8+ 8], m3 + mova [t3+r10*2+416*0+ 8], m4 + mova [t3+r10*2+416*0+72], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+r10+416*0], m3 + mova [t1+r10+416*2], m4 + mova [t1+r10+416*4], m5 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m16, [t1+r10+416* 6] + mova m2, [t1+r10+416* 8] + mova m3, [t1+r10+416*10] + paddw m17, m16, [t2+r10+416* 6] + paddd m4, m2, [t2+r10+416* 8] + paddd m5, m3, [t2+r10+416*10] + mova [t2+r10+416* 6], m16 + mova [t2+r10+416* 8], m2 + mova [t2+r10+416*10], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a3 + 8) >> 4 + psrld m5, 4 + pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m5, m9 + psrlw m3, m17, 1 + pavgw m3, m6 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m4, m2, m2 ; -p3 + punpckhwd m3, m6 + vpdpwssd m5, m3, m3 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m4, m6 + pminsd m5, m6 + pmulld m4, m12 ; p3 * s1 + pmulld m5, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m5{k2}, m4, m4, 2 + mova m4, m20 + paddusw m5, m14 + psraw m5, 4 ; min(z3, 255) - 256 + vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m5 + vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m5{k3}, m4 ; x3 + pandn m4, m15, m5 + psrld m5, 16 + pmulld m16, m4 + pmulld m17, m5 + packssdw m4, m5 + mova [t4+r10*1+416*4+4], m4 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova m0, 
[t3+r10*2+416*8+ 8] + mova m4, [t3+r10*2+416*0+ 8] + mova m5, [t3+r10*2+416*0+72] + paddw m1, m0, [t2+r10+416*0] + paddd m2, m4, [t2+r10+416*2] + paddd m3, m5, [t2+r10+416*4] + paddw m1, [t1+r10+416*0] + paddd m2, [t1+r10+416*2] + paddd m3, [t1+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m4 + mova [t2+r10+416*4], m5 + mova [t3+r10*2+416*8+ 8], xm16 + mova [t3+r10*2+416*8+ 24], xm17 + vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*8+104], m16, 3 + vextracti32x4 [t3+r10*2+416*8+120], m17, 3 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 + pmulld m3, m10 + psrlw m5, m1, 1 + pavgw m5, m6 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m6 + vpdpwssd m2, m4, m4 ; -p5 + punpckhwd m5, m6 + vpdpwssd m3, m5, m5 + punpcklwd m16, m1, m6 ; b5 + punpckhwd m17, m1, m6 + pmulld m2, m11 ; p5 * s0 + pmulld m3, m11 + pmaddwd m16, m13 ; b5 * 164 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + pmaxsw m3, m6 + paddusw m3, m14 + psraw m3, 4 ; min(z5, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x5 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*0+4], m2 + psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu ym0, [t4+r10*1+416*0+2] + paddw ym2, ym0, [t4+r10*1+416*0+0] + paddw ym2, [t4+r10*1+416*0+4] + movu m1, [t3+r10*2+416*0+4] + paddd m3, m1, [t3+r10*2+416*0+0] + paddd m3, [t3+r10*2+416*0+8] + paddw ym0, ym2 + paddd m1, m3 + psllw ym2, 2 + pslld m3, 2 + paddw ym0, ym2 ; a5 565 + paddd m1, m3 ; b5 565 + mova [t4+r10*1+416* 6], ym0 + mova [t3+r10*2+416*12], m1 + mova ym0, [t4+r10*1+416*2+0] + paddw ym0, [t4+r10*1+416*2+4] + paddw ym2, ym0, [t4+r10*1+416*2+2] + mova m1, [t3+r10*2+416*4+0] + paddd m1, [t3+r10*2+416*4+8] + paddd m3, m1, [t3+r10*2+416*4+4] + psllw ym2, 2 ; a3[-1] 444 + pslld m3, 2 ; b3[-1] 444 + psubw ym2, ym0 ; a3[-1] 343 + psubd m3, m1 ; b3[-1] 343 + mova [t4+r10*1+416* 8], ym2 + mova [t3+r10*2+416*16], m3 + mova ym0, [t4+r10*1+416*4+0] + paddw ym0, [t4+r10*1+416*4+4] + paddw ym2, ym0, [t4+r10*1+416*4+2] + mova m1, [t3+r10*2+416*8+0] + paddd m1, [t3+r10*2+416*8+8] + paddd m3, m1, [t3+r10*2+416*8+4] + psllw ym2, 2 ; a3[ 0] 444 + pslld m3, 2 ; b3[ 0] 444 + mova [t4+r10*1+416*10], ym2 + mova [t3+r10*2+416*20], m3 + psubw ym2, ym0 ; a3[ 0] 343 + psubd m3, m1 ; b3[ 0] 343 + mova [t4+r10*1+416*12], ym2 + mova [t3+r10*2+416*24], m3 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu ym2, [t4+r10*1+2] + paddw ym0, ym2, [t4+r10*1+0] + paddw ym0, [t4+r10*1+4] + paddw ym2, ym0 + psllw ym0, 2 + paddw ym0, ym2 ; a5 + movu m1, [t3+r10*2+4] + paddd m4, m1, [t3+r10*2+0] + paddd m4, [t3+r10*2+8] + paddd m1, m4 + 
pslld m4, 2 + paddd m4, m1 ; b5 + paddw ym2, ym0, [t4+r10*1+416* 6] + mova [t4+r10*1+416* 6], ym0 + paddd m0, m4, [t3+r10*2+416*12] + mova [t3+r10*2+416*12], m4 + mova ym3, [t4+r10*1+416*2+0] + paddw ym3, [t4+r10*1+416*2+4] + paddw ym5, ym3, [t4+r10*1+416*2+2] + psllw ym5, 2 ; a3[ 1] 444 + psubw ym4, ym5, ym3 ; a3[ 1] 343 + paddw ym3, ym4, [t4+r10*1+416* 8] + paddw ym3, [t4+r10*1+416*10] + mova [t4+r10*1+416* 8], ym4 + mova [t4+r10*1+416*10], ym5 + mova m1, [t3+r10*2+416*4+0] + paddd m1, [t3+r10*2+416*4+8] + paddd m5, m1, [t3+r10*2+416*4+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+416*16] + paddd m1, [t3+r10*2+416*20] + mova [t3+r10*2+416*16], m4 + mova [t3+r10*2+416*20], m5 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, ym2 ; a5 + pmovzxwd m3, ym3 ; a3 + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + vpshldd m4, m22, 13 + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) + psrld m0, 9 + pslld m1, 7 + vpblendmb m0{k2}, m1, m0 + vpdpwssd m4, m0, m7 + psrad m4, 7 + pmaxsd m4, m6 + vpmovusdw ym16, m4 ; clip + psrlw ym16, 6 + mova [dstq+r10], ym16 + add r10, 32 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova ym3, [t4+r10*1+416*4+0] + paddw ym3, [t4+r10*1+416*4+4] + paddw ym5, ym3, [t4+r10*1+416*4+2] + psllw ym5, 2 ; a3[ 1] 444 + psubw ym4, ym5, ym3 ; a3[ 1] 343 + paddw ym3, ym4, [t4+r10*1+416*12] + paddw ym3, [t4+r10*1+416*10] + mova [t4+r10*1+416*10], ym5 + mova [t4+r10*1+416*12], ym4 + mova m0, [t3+r10*2+416*8+0] + paddd m0, [t3+r10*2+416*8+8] + paddd m5, m0, [t3+r10*2+416*8+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m0 ; b3[ 1] 343 + paddd m0, m4, [t3+r10*2+416*24] + paddd m0, [t3+r10*2+416*20] + mova [t3+r10*2+416*20], m5 + mova [t3+r10*2+416*24], m4 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, [t4+r10*1+416* 6] + pmovzxwd m3, ym3 + mova m1, [t3+r10*2+416*12] + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + vpshldd m4, m22, 13 + psubd m1, m2 ; b5 - a5 * src + (1 << 8) + psubd m0, m3 ; b3 - a3 * src + (1 << 8) + pslld m0, 7 + vpalignr m0{k2}, m1, m1, 1 + vpdpwssd m4, m0, m7 + psrad m4, 7 + pmaxsd m4, m6 + vpmovusdw ym16, m4 ; clip + psrlw ym16, 6 + mova [dstq+r10], ym16 + add r10, 32 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/looprestoration16_sse.asm dav1d-1.0.0/src/x86/looprestoration16_sse.asm --- dav1d-0.9.2/src/x86/looprestoration16_sse.asm 2021-09-03 15:51:24.421037200 +0000 +++ dav1d-1.0.0/src/x86/looprestoration16_sse.asm 2022-03-18 14:31:56.026356000 +0000 @@ -77,24 +77,23 @@ INIT_XMM ssse3 %if ARCH_X86_32 -DECLARE_REG_TMP 4, 6 +DECLARE_REG_TMP 5, 6 %if STACK_ALIGNMENT < 16 - %assign extra_stack 14*16 + %assign extra_stack 13*16 %else %assign extra_stack 12*16 %endif -cglobal wiener_filter7_16bpc, 5, 7, 8, -384*12-16-extra_stack, \ - dst, dst_stride, left, lpf, lpf_stride, w, flt +cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \ + dst, stride, left, lpf, w, flt %if STACK_ALIGNMENT < 16 %define lpfm dword [esp+calloff+16*12+ 0] - %define lpf_stridem dword [esp+calloff+16*12+ 4] - %define wm dword [esp+calloff+16*12+ 8] - %define hd dword [esp+calloff+16*12+12] - %define edgeb byte [esp+calloff+16*12+16] - %define edged dword [esp+calloff+16*12+16] + %define wm dword [esp+calloff+16*12+ 4] + %define hd dword [esp+calloff+16*12+ 8] + %define edgeb byte [esp+calloff+16*12+12] + %define edged dword [esp+calloff+16*12+12] %else - %define hd dword r6m - 
%define edgeb byte r8m + %define hd dword r5m + %define edgeb byte r7m %endif %define PICmem dword [esp+calloff+4*0] %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers @@ -117,47 +116,46 @@ %define m13 [esp+calloff+16*7] %define m14 [esp+calloff+16*8] %define m15 [esp+calloff+16*9] - %define r10 r5 + %define r10 r4 %define base t0-wiener_shifts %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov wd, [rstk+stack_offset+24] - mov lpf_stridem, lpf_strideq + mov wd, [rstk+stack_offset+20] mov wm, wd - mov r4, [rstk+stack_offset+28] - mov hd, r4 - mov r4, [rstk+stack_offset+36] - mov edged, r4 ; edge + mov r5, [rstk+stack_offset+24] + mov hd, r5 + mov r5, [rstk+stack_offset+32] + mov edged, r5 ; edge %endif %else -DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers -cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers +cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt %define base %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 - mov fltq, fltmp - mov edged, r8m - mov hd, r6m - mov t3d, r9m ; pixel_max + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max movq m13, [fltq] movq m15, [fltq+16] %else %if STACK_ALIGNMENT < 16 - mov t0, [rstk+stack_offset+32] - mov t1, [rstk+stack_offset+40] ; pixel_max + mov t0, [rstk+stack_offset+28] + mov t1, [rstk+stack_offset+36] ; pixel_max movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts %else - LEA t0, wiener_shifts - mov fltq, r7m + mov fltq, r6m movq m1, [fltq] movq m3, [fltq+16] - mov t1, r9m ; pixel_max + LEA t0, wiener_shifts + mov t1, r8m ; pixel_max %endif mov PICmem, t0 %endif @@ -185,9 +183,8 @@ pshufd m11, m11, q1111 pmullw m12, m0 ; upshift filter coefs to make the pmullw m13, m0 ; horizontal downshift constant - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w - %define lpfm [rsp+0] - %define lpf_stridem [rsp+8] + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] %define base %define wiener_lshuf7_mem [wiener_lshuf7] %define pd_m262128_mem [pd_m262128] @@ -230,35 +227,29 @@ test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top -%if ARCH_X86_64 - add lpfq, lpf_strideq -%else - add lpfq, lpf_stridem -%endif + add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top - movif32 lpf_strideq, lpf_stridem - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 - movif64 lpf_stridem, lpf_strideq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -273,17 +264,15 @@ jz .v3 mov lpfq, lpfm call .hv_bottom - add lpfq, lpf_stridem + add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: - movif32 lpf_strideq, lpf_stridem - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - movif64 lpf_stridem, lpf_strideq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov lpfm, r10 call .h mov t6, t1 @@ -293,13 +282,13 @@ mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -344,7 +333,7 @@ %assign 
stack_offset stack_offset-4 %assign calloff 4 .h: - movif64 wq, r5 + movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -357,7 +346,7 @@ pshufb m3, wiener_lshuf7_mem ; before the start of the buffer jmp .h_main .h_top: - movif64 wq, r5 + movif64 wq, r4 test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: @@ -367,7 +356,7 @@ movu m5, [lpfq+wq+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right - cmp wd, -18 + cmp wd, -20 jl .h_have_right call .extend_right .h_have_right: @@ -403,8 +392,8 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq - movif64 wq, r5 + add lpfq, strideq + movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT @@ -418,7 +407,7 @@ pshufb m3, wiener_lshuf7_mem jmp .hv_main .hv_bottom: - movif64 wq, r5 + movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT @@ -430,7 +419,7 @@ movu m5, [lpfq+wq+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right - cmp wd, -18 + cmp wd, -20 jl .hv_have_right call .extend_right .hv_have_right: @@ -516,24 +505,24 @@ mov t1, t0 mov t0, t6 %else - mov r5, t5m + mov r4, t5m mov t1, t4m - mov t6m, r5 + mov t6m, r4 mov t5m, t1 - mov r5, t3m + mov r4, t3m mov t1, t2m - mov t4m, r5 + mov t4m, r4 mov t3m, t1 - mov r5, t1m + mov r4, t1m mov t1, t0 - mov t2m, r5 + mov t2m, r4 mov t0, t6m mov wq, wm %endif - add dstq, dst_strideq + add dstq, strideq ret .v: - movif64 wq, r5 + movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 .v_loop: @@ -588,17 +577,17 @@ %else mov t0, t5m mov t1, t4m - mov r5, t3m + mov r4, t3m mov t6m, t0 mov t5m, t1 - mov t4m, r5 - mov r5, t2m + mov t4m, r4 + mov r4, t2m mov t1, t1m mov t0, t0m - mov t3m, r5 + mov t3m, r4 mov t2m, t1 %endif - add dstq, dst_strideq + add dstq, strideq ret %if ARCH_X86_32 @@ -607,18 +596,17 @@ %else %assign stack_size 11*16+384*8 %endif -cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ - lpf, lpf_stride, w, flt +cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \ + lpf, w, flt %if STACK_ALIGNMENT < 16 %define lpfm dword [esp+calloff+4*6] - %define lpf_stridem dword [esp+calloff+4*7] - %define wm dword [esp+calloff+16*10+0] - %define hd dword [esp+calloff+16*10+4] - %define edgeb byte [esp+calloff+16*10+8] - %define edged dword [esp+calloff+16*10+8] + %define wm dword [esp+calloff+4*7] + %define hd dword [esp+calloff+16*10+0] + %define edgeb byte [esp+calloff+16*10+4] + %define edged dword [esp+calloff+16*10+4] %else - %define hd dword r6m - %define edgeb byte r8m + %define hd dword r5m + %define edgeb byte r7m %endif %define PICmem dword [esp+calloff+4*0] %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers @@ -640,42 +628,41 @@ %define base t0-wiener_shifts %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov wd, [rstk+stack_offset+24] - mov lpf_stridem, lpf_strideq + mov wd, [rstk+stack_offset+20] mov wm, wd - mov r4, [rstk+stack_offset+28] - mov hd, r4 - mov r4, [rstk+stack_offset+36] - mov edged, r4 ; edge + mov r5, [rstk+stack_offset+24] + mov hd, r5 + mov r5, [rstk+stack_offset+32] + mov edged, r5 ; edge %endif %else -cglobal wiener_filter5_16bpc, 5, 14, 16, 384*8+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h +cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt %define base %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 - mov fltq, fltmp - mov edged, r8m - mov hd, r6m - mov t3d, r9m ; pixel_max + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; 
pixel_max movq m12, [fltq] movq m14, [fltq+16] %else %if STACK_ALIGNMENT < 16 - mov t0, [rstk+stack_offset+32] - mov t1, [rstk+stack_offset+40] ; pixel_max + mov t0, [rstk+stack_offset+28] + mov t1, [rstk+stack_offset+36] ; pixel_max movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts %else - LEA t0, wiener_shifts - mov fltq, r7m + mov fltq, r6m movq m1, [fltq] movq m3, [fltq+16] - mov t1, r9m ; pixel_max + LEA t0, wiener_shifts + mov t1, r8m ; pixel_max %endif mov PICmem, t0 %endif @@ -706,9 +693,8 @@ mova m15, [wiener_lshuf5] pmullw m11, m0 pmullw m12, m0 - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w - %define lpfm [rsp+0] - %define lpf_stridem [rsp+8] + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] %define base %else add wd, wd @@ -749,27 +735,21 @@ test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top -%if ARCH_X86_64 - add lpfq, lpf_strideq -%else - add lpfq, lpf_stridem -%endif + add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top - movif32 lpf_strideq, lpf_stridem - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 - movif64 lpf_stridem, lpf_strideq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below call .h mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -784,16 +764,14 @@ jz .v2 mov lpfq, lpfm call .hv_bottom - add lpfq, lpf_stridem + add lpfq, strideq call .hv_bottom .end: RET .no_top: - movif32 lpf_strideq, lpf_stridem - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - movif64 lpf_stridem, lpf_strideq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov lpfm, r10 call .h mov t4, t1 @@ -801,7 +779,7 @@ mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -822,14 +800,14 @@ mov t2, t1 %else mov t0, t3m - mov r5, t2m + mov r4, t2m mov t1, t1m mov t4m, t0 - mov t3m, r5 + mov t3m, r4 mov t2m, t1 mov wq, wm %endif - add dstq, dst_strideq + add dstq, strideq .v1: call .v jmp .end @@ -853,7 +831,7 @@ %assign stack_offset stack_offset-4 %assign calloff 4 .h: - movif64 wq, r5 + movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -868,7 +846,7 @@ pshufb m3, m15 ; before the start of the buffer jmp .h_main .h_top: - movif64 wq, r5 + movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -909,8 +887,8 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq - movif64 wq, r5 + add lpfq, strideq + movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT @@ -926,7 +904,7 @@ pshufb m3, m15 jmp .hv_main .hv_bottom: - movif64 wq, r5 + movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT @@ -1010,20 +988,20 @@ mov t1, t0 mov t0, t4 %else - mov r5, t3m + mov r4, t3m mov t1, t2m - mov t4m, r5 + mov t4m, r4 mov t3m, t1 - mov r5, t1m + mov r4, t1m mov t1, t0 - mov t2m, r5 + mov t2m, r4 mov t0, t4m mov wq, wm %endif - add dstq, dst_strideq + add dstq, strideq ret .v: - movif64 wq, r5 + movif64 wq, r4 movif32 t1m, t1 .v_loop: %if ARCH_X86_64 @@ -1123,30 +1101,29 @@ %endmacro %if ARCH_X86_32 -DECLARE_REG_TMP 0, 1, 2, 3, 4 +DECLARE_REG_TMP 0, 1, 2, 3, 5 %if STACK_ALIGNMENT < 16 %assign extra_stack 5*16 %else %assign extra_stack 3*16 %endif cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ - dst, dst_stride, left, lpf, lpf_stride, w, params, h + dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword 
[esp+calloff+16*0+4*6] - %define dst_stridemp dword [esp+calloff+16*0+4*7] + %define stridemp dword [esp+calloff+16*0+4*7] %define leftm dword [esp+calloff+16*3+4*0] %define lpfm dword [esp+calloff+16*3+4*1] - %define lpf_stridem dword [esp+calloff+16*3+4*2] - %define w0m dword [esp+calloff+16*3+4*3] - %define hd dword [esp+calloff+16*3+4*4] - %define edgeb byte [esp+calloff+16*3+4*5] - %define edged dword [esp+calloff+16*3+4*5] + %define w0m dword [esp+calloff+16*3+4*2] + %define hd dword [esp+calloff+16*3+4*3] + %define edgeb byte [esp+calloff+16*3+4*4] + %define edged dword [esp+calloff+16*3+4*4] %define leftmp leftm %else %define w0m wm - %define hd dword r6m - %define edgeb byte r8m - %define edged dword r8m + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] @@ -1158,44 +1135,41 @@ %define m9 [base+pd_0xfffffff0] %define m10 [esp+calloff+16*2] %define m11 [base+pd_0xf00800a4] - %define m12 [base+pw_256] + %define m12 [base+sgr_lshuf5] %define m13 [base+pd_34816] %define m14 [base+pw_1023] - %define m15 [base+sgr_lshuf5] - %define r10 r5 + %define r10 r4 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov dst_strideq, [rstk+stack_offset+ 8] + mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] - mov lpf_strideq, [rstk+stack_offset+20] - mov wd, [rstk+stack_offset+24] + mov wd, [rstk+stack_offset+20] mov dstm, dstq - mov dst_stridemp, dst_strideq + mov stridemp, strideq mov leftm, leftq - mov r1, [rstk+stack_offset+28] - mov r2, [rstk+stack_offset+36] + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] mov lpfm, lpfq - mov lpf_stridem, lpf_strideq mov hd, r1 mov edged, r2 %endif %else -cglobal sgr_filter_5x5_16bpc, 5, 15, 16, -400*24-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \ + w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] - mov edged, r8m + movifnidn hd, hm add wd, wd - mov hd, r6m + mov edged, r7m movu m10, [paramsq] - mova m12, [pw_256] + mova m12, [sgr_lshuf5] add lpfq, wq mova m8, [pd_8] lea t1, [rsp+wq+20] @@ -1205,19 +1179,17 @@ mova m11, [pd_0xf00800a4] lea t4, [rsp+wq+400*20+16] pshufhw m7, m10, q0000 - pshufb m10, m12 ; s0 + pshufb m10, [pw_256] ; s0 punpckhqdq m7, m7 ; w0 neg wq mova m13, [pd_34816] ; (1 << 11) + (1 << 15) pxor m6, m6 mova m14, [pw_1023] psllw m7, 4 - mova m15, [sgr_lshuf5] - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w - %define lpfm [rsp+0] - %define lpf_stridem [rsp+8] + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] %else - mov r1, [rstk+stack_offset+32] ; params + mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd movu m1, [r1] @@ -1230,8 +1202,8 @@ mov t3m, t3 pshufhw m7, m1, q0000 mov t4m, t4 - pshufb m1, m12 ; s0 - punpckhqdq m7, m7 ; w0 + pshufb m1, [base+pw_256] ; s0 + punpckhqdq m7, m7 ; w0 psllw m7, 4 neg wq mova m10, m1 @@ -1239,22 +1211,22 @@ mov w1m, wd sub wd, 4 mov lpfq, lpfm - mov lpf_strideq, lpf_stridem mov w0m, wd + %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, stridemp movif32 t2m, t1 mov t2, t1 call .top_fixup add t1, 400*6 call .h_top - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, 
stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - movif64 lpf_stridem, lpf_strideq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below movif32 t0m, t2 mov t0, t2 @@ -1263,7 +1235,7 @@ or edged, 16 call .h .main: - add lpfq, dst_stridemp + add lpfq, stridemp movif32 t4, t4m call .hv call .prep_n @@ -1271,16 +1243,16 @@ jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp %if ARCH_X86_64 test hb, hb %else - mov r5, hd - test r5, r5 + mov r4, hd + test r4, r4 %endif jz .odd_height call .h - add lpfq, dst_stridemp + add lpfq, stridemp call .hv movif32 dstq, dstm call .n0 @@ -1292,7 +1264,7 @@ jz .extend_bottom mov lpfq, lpfm call .h_top - add lpfq, lpf_stridem + add lpfq, stridemp call .hv_bottom .end: movif32 dstq, dstm @@ -1319,10 +1291,10 @@ call .v jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - movif64 lpf_stridem, lpf_strideq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov lpfm, r10 call .h lea t2, [t1+400*6] @@ -1342,18 +1314,17 @@ call .prep_n jmp .odd_height_end .extend_right: -%assign stack_offset stack_offset+8 -%assign calloff 8 - movd m1, wd + movd m0, wd + movd m1, [lpfq-2] + mova m2, [base+pw_256] mova m3, [base+pb_m14_m13] + pshufb m0, m6 + pshufb m1, m2 + psubb m2, m0 + psubb m3, m0 mova m0, [base+pb_0to15] - pshufb m1, m6 - psubb m2, m12, m1 - psubb m3, m1 - movd m1, [lpfq-2] pcmpgtb m2, m0 pcmpgtb m3, m0 - pshufb m1, m12 pand m4, m2 pand m5, m3 pandn m2, m1 @@ -1361,13 +1332,13 @@ por m4, m2 por m5, m3 ret -%assign stack_offset stack_offset-4 +%assign stack_offset stack_offset+4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else - %define leftq r5 + %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -1381,11 +1352,11 @@ .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] - pshufb m4, m15 + pshufb m4, m12 jmp .h_main .h_top: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -1438,7 +1409,7 @@ ret .top_fixup: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wd, w0m %endif @@ -1458,7 +1429,7 @@ ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -1474,11 +1445,11 @@ .hv_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] - pshufb m4, m15 + pshufb m4, m12 jmp .hv_main .hv_bottom: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -1579,10 +1550,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 - psubw m2, m12, m3 ; a paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 - mova [t4+wq+4], m2 + mova [t4+wq+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*2+ 8], m0 @@ -1605,7 +1575,7 @@ jmp .hv_main2 .v: ; vertical boxsum + ab %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wd, w0m %endif @@ -1659,10 +1629,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 - psubw m2, m12, m3 ; a paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 - mova [t4+wq+4], m2 + mova [t4+wq+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*2+ 8], m0 @@ -1671,7 +1640,7 @@ jl .v_loop ret .prep_n: ; initial neighbor setup - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+ 2] @@ -1703,7 +1672,7 @@ ret ALIGN function_align .n0: ; neighbor + output (even rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m 
.n0_loop: movu m0, [t4+wq*1+ 2] @@ -1740,25 +1709,23 @@ punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 8) - paddd m3, m5 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+wq], m0 add wq, 16 jl .n0_loop - add dstq, dst_stridemp + add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n1_loop: mova m0, [dstq+wq] @@ -1771,21 +1738,19 @@ punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 7) - paddd m3, m5 - psrld m2, 8 - psrld m3, 8 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 7) + psubd m5, m3 + psrad m4, 8 + psrad m5, 8 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+wq], m0 add wq, 16 jl .n1_loop - add dstq, dst_stridemp + add dstq, stridemp movif32 dstm, dstq ret @@ -1796,23 +1761,22 @@ %assign extra_stack 2*16 %endif cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ - dst, dst_stride, left, lpf, lpf_stride, w, params, h + dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*2+4*0] - %define dst_stridemp dword [esp+calloff+16*2+4*1] + %define stridemp dword [esp+calloff+16*2+4*1] %define leftm dword [esp+calloff+16*2+4*2] %define lpfm dword [esp+calloff+16*2+4*3] - %define lpf_stridem dword [esp+calloff+16*2+4*4] - %define w0m dword [esp+calloff+16*2+4*5] - %define hd dword [esp+calloff+16*2+4*6] - %define edgeb byte [esp+calloff+16*2+4*7] - %define edged dword [esp+calloff+16*2+4*7] + %define w0m dword [esp+calloff+16*2+4*4] + %define hd dword [esp+calloff+16*2+4*5] + %define edgeb byte [esp+calloff+16*2+4*6] + %define edged dword [esp+calloff+16*2+4*6] %define leftmp leftm %else %define w0m wm - %define hd dword r6m - %define edgeb byte r8m - %define edged dword r8m + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] @@ -1822,43 +1786,39 @@ %define m9 [esp+calloff+16*1] %define m10 [base+pd_0xf00801c7] %define m11 [base+pd_34816] - %define m12 [base+pw_256] + %define m12 [base+sgr_lshuf3] %define m13 [base+pw_1023] - %define m14 [base+sgr_lshuf3] - %define m15 m6 + %define m14 m6 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov dst_strideq, [rstk+stack_offset+ 8] + mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] - mov lpf_strideq, [rstk+stack_offset+20] - mov wd, [rstk+stack_offset+24] + mov wd, [rstk+stack_offset+20] mov dstm, dstq - mov dst_stridemp, dst_strideq + mov stridemp, strideq mov leftm, leftq - mov r1, [rstk+stack_offset+28] - mov r2, [rstk+stack_offset+36] + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] mov lpfm, lpfq - mov lpf_stridem, lpf_strideq mov hd, r1 mov edged, r2 %endif %else -cglobal sgr_filter_3x3_16bpc, 5, 15, 16, -400*42-8, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \ + w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 - mov paramsq, paramsmp + mov paramsq, r6mp lea 
r13, [sgr_x_by_x-0xf03] - mov edged, r8m + movifnidn hd, hm add wd, wd - mov hd, r6m + mov edged, r7m movq m9, [paramsq+4] - mova m12, [pw_256] add lpfq, wq lea t1, [rsp+wq+12] mova m8, [pd_8] @@ -1868,17 +1828,17 @@ lea t4, [rsp+wq+400*32+8] mova m11, [pd_34816] pshuflw m7, m9, q3333 - pshufb m9, m12 ; s1 + pshufb m9, [pw_256] ; s1 punpcklqdq m7, m7 ; w1 neg wq pxor m6, m6 mova m13, [pw_1023] psllw m7, 4 - mova m14, [sgr_lshuf3] - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + mova m12, [sgr_lshuf3] + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else - mov r1, [rstk+stack_offset+32] ; params + mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd movq m1, [r1+4] @@ -1891,8 +1851,8 @@ mov t3m, t3 pshuflw m7, m1, q3333 mov t4m, t4 - pshufb m1, m12 ; s1 - punpcklqdq m7, m7 ; w1 + pshufb m1, [base+pw_256] ; s1 + punpcklqdq m7, m7 ; w1 psllw m7, 4 neg wq mova m9, m1 @@ -1900,19 +1860,20 @@ mov w1m, wd sub wd, 4 mov lpfq, lpfm - mov lpf_strideq, lpf_stridem mov w0m, wd + %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, stridemp mov t2, t1 add t1, 400*6 call .h_top - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 @@ -1920,24 +1881,24 @@ dec hd jz .height1 movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hb, hb %else - mov r5, hd - test r5, r5 + mov r4, hd + test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .n0 call .n1 @@ -1947,12 +1908,8 @@ jz .extend_bottom mov lpfq, lpfm call .hv0_bottom -%if ARCH_X86_64 - add lpfq, lpf_strideq -%else - mov lpfq, hvsrcm - add lpfq, lpf_stridem -%endif + movif32 lpfq, hvsrcm + add lpfq, stridemp call .hv1_bottom .end: call .n0 @@ -1977,13 +1934,14 @@ call .v1 jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wq, w0m mov hvsrcm, lpfq @@ -2003,27 +1961,25 @@ call .v0 jmp .main .extend_right: -%assign stack_offset stack_offset+8 -%assign calloff 8 movd m1, wd movd m5, [lpfq-2] + mova m2, [base+pw_256] mova m3, [base+pb_0to15] pshufb m1, m6 - pshufb m5, m12 - mova m2, m12 + pshufb m5, m2 psubb m2, m1 pcmpgtb m2, m3 pand m4, m2 pandn m2, m5 por m4, m2 ret -%assign stack_offset stack_offset-4 +%assign stack_offset stack_offset+4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else - %define leftq r5 + %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2037,11 +1993,11 @@ .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] - pshufb m4, m14 + pshufb m4, m12 jmp .h_main .h_top: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2079,7 +2035,7 @@ ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -2095,11 +2051,11 @@ .hv0_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] - pshufb m4, m14 + pshufb m4, m12 jmp 
.hv0_main .hv0_bottom: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -2163,12 +2119,12 @@ pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 - MAXSD m4, m2, m15 - MAXSD m5, m3, m15 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m15 ; p * s - MULLD m5, m9, m15 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2179,15 +2135,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m15 - MULLD m1, m5, m15 - psubw m2, m12, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq+4], m2 + mova [t4+wq+4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+ 8], m0 @@ -2198,7 +2153,7 @@ ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -2214,11 +2169,11 @@ .hv1_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] - pshufb m4, m14 + pshufb m4, m12 jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -2276,12 +2231,12 @@ pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 - MAXSD m4, m2, m15 - MAXSD m5, m3, m15 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m15 ; p * s - MULLD m5, m9, m15 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2292,15 +2247,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m15 - MULLD m1, m5, m15 - psubw m2, m12, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq*1+400*2 +4], m2 + mova [t4+wq*1+400*2 +4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 @@ -2313,7 +2267,7 @@ ret .v0: ; vertical boxsums + ab (even rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wd, w0m %endif @@ -2346,12 +2300,12 @@ pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 - MAXSD m4, m2, m15 - MAXSD m5, m3, m15 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m15 ; p * s - MULLD m5, m9, m15 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2361,15 +2315,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m15 - MULLD m1, m5, m15 - psubw m2, m12, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq*1+400*0+ 4], m2 + mova [t4+wq*1+400*0+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*0+ 8], m0 @@ -2379,7 +2332,7 @@ ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wd, w0m %endif @@ -2409,12 +2362,12 @@ pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 - MAXSD m4, m2, m15 - MAXSD m5, m3, m15 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m15 ; p * s - MULLD m5, m9, m15 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2424,15 +2377,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m15 - MULLD m1, m5, m15 - psubw m2, m12, 
m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq*1+400*2+ 4], m2 + mova [t4+wq*1+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 @@ -2444,7 +2396,7 @@ mov t1, r10 ret .prep_n: ; initial neighbor setup - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+400*0+ 4] @@ -2497,7 +2449,7 @@ ret ALIGN function_align .n0: ; neighbor + output (even rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m3, [t4+wq*1+400*0+4] @@ -2537,25 +2489,23 @@ punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 8) - paddd m3, m5 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+wq], m0 add wq, 16 jl .n0_loop - add dstq, dst_stridemp + add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*1+400*2+4] @@ -2595,21 +2545,19 @@ punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 8) - paddd m3, m5 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+wq], m0 add wq, 16 jl .n1_loop - add dstq, dst_stridemp + add dstq, stridemp movif32 dstm, dstq ret @@ -2620,23 +2568,22 @@ %assign extra_stack 8*16 %endif cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ - dst, dst_stride, left, lpf, lpf_stride, w, params, h + dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*8+4*0] - %define dst_stridemp dword [esp+calloff+16*8+4*1] + %define stridemp dword [esp+calloff+16*8+4*1] %define leftm dword [esp+calloff+16*8+4*2] %define lpfm dword [esp+calloff+16*8+4*3] - %define lpf_stridem dword [esp+calloff+16*8+4*4] - %define w0m dword [esp+calloff+16*8+4*5] - %define hd dword [esp+calloff+16*8+4*6] - %define edgeb byte [esp+calloff+16*8+4*7] - %define edged dword [esp+calloff+16*8+4*7] + %define w0m dword [esp+calloff+16*8+4*4] + %define hd dword [esp+calloff+16*8+4*5] + %define edgeb byte [esp+calloff+16*8+4*6] + %define edged dword [esp+calloff+16*8+4*6] %define leftmp leftm %else %define w0m wm - %define hd dword r6m - %define edgeb byte r8m - %define edged dword r8m + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] @@ -2646,7 +2593,7 @@ %define m9 [base+pd_8] %define m10 [base+pd_34816] %define m11 [base+pd_0xf00801c7] - %define m12 [base+pw_256] + %define m12 [base+pd_0xf00800a4] %define m13 [esp+calloff+16*4] %define m14 [esp+calloff+16*5] %define m15 [esp+calloff+16*6] @@ -2654,58 +2601,53 @@ %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov dst_strideq, [rstk+stack_offset+ 8] + mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] - mov lpf_strideq, [rstk+stack_offset+20] - mov wd, [rstk+stack_offset+24] + mov wd, [rstk+stack_offset+20] mov dstm, dstq - mov dst_stridemp, dst_strideq + mov stridemp, strideq 
mov leftm, leftq - mov r1, [rstk+stack_offset+28] - mov r2, [rstk+stack_offset+36] + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] mov lpfm, lpfq - mov lpf_stridem, lpf_strideq mov hd, r1 mov edged, r2 %endif %else -cglobal sgr_filter_mix_16bpc, 5, 15, 16, -400*66-40, dst, dst_stride, left, \ - lpf, lpf_stride, w, edge, \ - params, h +cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ + w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] - mov edged, r8m + movifnidn hd, hm add wd, wd - mov hd, r6m - mova m15, [paramsq] + mov edged, r7m + mova m14, [paramsq] add lpfq, wq mova m9, [pd_8] lea t1, [rsp+wq+44] mova m10, [pd_34816] add dstq, wq - mova m12, [pw_256] - lea t3, [rsp+wq*2+400*24+40] mova m11, [pd_0xf00801c7] + lea t3, [rsp+wq*2+400*24+40] + mova m12, [pd_0xf00800a4] lea t4, [rsp+wq+400*52+40] neg wq - pshuflw m13, m15, q0000 - pshuflw m14, m15, q2222 - pshufhw m15, m15, q1010 - punpcklqdq m13, m13 ; s0 - punpcklqdq m14, m14 ; s1 - punpckhqdq m15, m15 ; w0 w1 + pshufd m15, m14, q2222 ; w0 w1 + punpcklwd m14, m14 + pshufd m13, m14, q0000 ; s0 + pshufd m14, m14, q2222 ; s1 pxor m6, m6 psllw m15, 2 - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else - mov r1, [rstk+stack_offset+32] ; params + mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd mova m2, [r1] @@ -2733,13 +2675,13 @@ mova m15, m2 mova m6, m3 mov lpfq, lpfm - mov lpf_strideq, lpf_stridem mov w0m, wd + %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, stridemp mov t2, t1 %if ARCH_X86_64 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup @@ -2749,9 +2691,10 @@ %endif add t1, 400*12 call .h_top - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 @@ -2759,24 +2702,24 @@ dec hd jz .height1 movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hd, hd %else - mov r5, hd - test r5, r5 + mov r4, hd + test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .n0 call .n1 @@ -2786,12 +2729,8 @@ jz .extend_bottom mov lpfq, lpfm call .hv0_bottom -%if ARCH_X86_64 - add lpfq, lpf_strideq -%else - mov lpfq, hvsrcm - add lpfq, lpf_stridem -%endif + movif32 lpfq, hvsrcm + add lpfq, stridemp call .hv1_bottom .end: call .n0 @@ -2816,13 +2755,14 @@ call .v1 jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wq, w0m mov hvsrcm, lpfq @@ -2854,9 +2794,9 @@ %assign stack_offset stack_offset+4 %assign calloff 4 %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else - %define leftq r5 + %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2874,7 +2814,7 @@ jmp .h_main .h_top: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2928,7 
+2868,7 @@ ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -2948,7 +2888,7 @@ jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -3060,10 +3000,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 - psubw m2, m12, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*1+400*2+ 4], m2 + mova [t4+wq*1+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 @@ -3074,7 +3013,7 @@ ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -3094,7 +3033,7 @@ jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov hvsrcm, lpfq %endif @@ -3189,18 +3128,16 @@ punpckhwd m3, m8, m8 MULLD m0, m2, m7 MULLD m5, m3, m7 - psubw m7, m12, m8 -%if ARCH_X86_32 - mova m8, [esp+20] -%endif paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m5, m10 psrld m0, 12 psrld m5, 12 - mova [t4+wq*1+400*4+4], m7 + mova [t4+wq*1+400*4+4], m8 mova [t3+wq*2+400*8+ 8], m0 mova [t3+wq*2+400*8+24], m5 -%if ARCH_X86_64 +%if ARCH_X86_32 + mova m8, [esp+20] +%else SWAP m6, m8 pxor m6, m6 %endif @@ -3243,15 +3180,14 @@ %endif MAXSD m2, m4, m7 psubd m2, m4 ; p5 - mova m4, [base+pd_0xf00800a4] MAXSD m3, m1, m7 psubd m3, m1 MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 - pmaddwd m0, m4 ; b5 * 164 - pmaddwd m5, m4 - paddusw m2, m4 - paddusw m3, m4 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m5, m12 + paddusw m2, m12 + paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m1, m2, m3, r0, dstm @@ -3259,10 +3195,9 @@ punpckhwd m3, m1, m1 MULLD m0, m2, m7 MULLD m5, m3, m7 - psubw m4, m12, m1 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m5, m10 - mova [t4+wq*1+400*0+ 4], m4 + mova [t4+wq*1+400*0+ 4], m1 psrld m0, 12 psrld m5, 12 mova [t3+wq*2+400*0+ 8], m0 @@ -3275,7 +3210,7 @@ ret .v0: ; vertical boxsums + ab3 (even rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wd, w0m %endif @@ -3333,10 +3268,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 - psubw m2, m12, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*1+400*2+4], m2 + mova [t4+wq*1+400*2+4], m3 psrld m0, 12 psrld m1, 12 mova m3, [t1+wq+400*0] @@ -3358,7 +3292,7 @@ ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-4] + lea wq, [r4-4] %else mov wd, w0m %endif @@ -3413,10 +3347,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 - psubw m2, m12, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*1+400*4+4], m2 + mova [t4+wq*1+400*4+4], m3 psrld m0, 12 psrld m8, m1, 12 mova m4, [t3+wq*2+400*8+ 8] @@ -3463,15 +3396,14 @@ %endif MAXSD m2, m4, m7 psubd m2, m4 ; p5 - mova m4, [base+pd_0xf00800a4] MAXSD m3, m5, m7 psubd m3, m5 MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 - pmaddwd m0, m4 ; b5 * 164 - pmaddwd m1, m4 - paddusw m2, m4 - paddusw m3, m4 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m1, m12 + paddusw m2, m12 + paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m4, m2, m3, r0, dstm @@ -3479,10 +3411,9 @@ punpckhwd m3, m4, m4 MULLD m0, m2, m7 MULLD m1, m3, m7 - psubw m5, m12, m4 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*1+400*0+ 4], m5 + mova [t4+wq*1+400*0+ 4], m4 psrld m0, 12 psrld m1, 12 mova 
[t3+wq*2+400*0+ 8], m0 @@ -3494,7 +3425,7 @@ mov t1, r10 ret .prep_n: ; initial neighbor setup - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+400*0+ 2] @@ -3571,7 +3502,7 @@ ret ALIGN function_align .n0: ; neighbor + output (even rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*1+ 4] @@ -3647,66 +3578,64 @@ %endif mova [t3+wq*2+400*20+16], m8 mova [rsp+32+ARCH_X86_32*4], m7 - movu m4, [dstq+wq] + movu m5, [dstq+wq] + punpcklwd m4, m5, m6 punpcklwd m7, m2, m6 - punpckhwd m2, m6 + pmaddwd m7, m4 ; a5 * src punpcklwd m8, m3, m6 + pmaddwd m8, m4 ; a3 * src + punpckhwd m5, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 punpckhwd m3, m6 - punpcklwd m5, m4, m6 - punpckhwd m4, m6 - pmaddwd m7, m5 ; a5 * src - pmaddwd m8, m5 ; a3 * src - pmaddwd m2, m4 - pmaddwd m3, m4 - pslld m5, 13 + pmaddwd m3, m5 pslld m4, 13 - psubd m0, m5 - psubd m1, m5 - paddd m0, m7 ; a5 * src + b5 + (1 << 8) - (src << 13) - paddd m1, m8 ; a3 * src + b3 + (1 << 8) - (src << 13) + pslld m5, 13 + psubd m0, m7 ; b5 - a5 * src + (1 << 8) + psubd m1, m8 ; b3 - a3 * src + (1 << 8) mova m7, [base+pd_0xffff] psrld m0, 9 pslld m1, 7 pand m0, m7 pandn m8, m7, m1 por m0, m8 - psubd m1, m4, [rsp+16+ARCH_X86_32*4] - psubd m8, m4, [rsp+32+ARCH_X86_32*4] - psubd m2, m1 - psubd m3, m8 - mova m1, [base+pd_4096] - psrld m2, 9 - pslld m3, 7 - pand m2, m7 - pandn m7, m3 - por m2, m7 + mova m1, [rsp+16+ARCH_X86_32*4] + mova m8, [rsp+32+ARCH_X86_32*4] + psubd m1, m2 + psubd m8, m3 + mova m2, [base+pd_4096] + psrld m1, 9 + pslld m8, 7 + pand m1, m7 + pandn m7, m8 + por m1, m7 pmaddwd m0, m15 - pmaddwd m2, m15 + pmaddwd m1, m15 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif - paddd m5, m1 - paddd m4, m1 - paddd m0, m5 - paddd m2, m4 + paddd m4, m2 + paddd m5, m2 + paddd m0, m4 + paddd m1, m5 psrad m0, 8 - psrad m2, 8 - packssdw m0, m2 ; clip + psrad m1, 8 + packssdw m0, m1 ; clip pmaxsw m0, m7 psrlw m0, 5 mova [dstq+wq], m0 add wq, 16 jl .n0_loop - add dstq, dst_stridemp + add dstq, stridemp ret %if ARCH_X86_64 SWAP m6, m7 %endif ALIGN function_align .n1: ; neighbor + output (odd rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*1+400*4+4] @@ -3740,43 +3669,39 @@ mova [t3+wq*2+400*24+ 0], m4 mova [t3+wq*2+400*24+16], m0 mova m5, [dstq+wq] - mova m8, [t4+wq*1+400* 6] + mova m2, [t4+wq*1+400* 6] punpcklwd m4, m5, m6 + punpcklwd m8, m2, m6 + pmaddwd m8, m4 ; a5 * src + punpcklwd m0, m3, m6 + pmaddwd m0, m4 ; a3 * src punpckhwd m5, m6 - punpcklwd m0, m8, m6 - punpckhwd m8, m6 - punpcklwd m2, m3, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 punpckhwd m3, m6 - pmaddwd m0, m4 ; a5 * src - pmaddwd m2, m4 ; a3 * src - pmaddwd m8, m5 pmaddwd m3, m5 - paddd m1, m2 ; a3 * src + b3 + (1 << 8) - (src << 13) - pslld m4, 12 - pslld m5, 12 - psubd m2, m4, [t3+wq*2+400*12+ 0] - psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) - psubd m2, m5, [t3+wq*2+400*12+16] + psubd m1, m0 ; b3 - a3 * src + (1 << 8) + pslld m4, 13 + pslld m5, 13 + mova m0, [t3+wq*2+400*12+ 0] + psubd m0, m8 ; b5 - a5 * src + (1 << 8) + mova m8, [t3+wq*2+400*12+16] psubd m8, m2 - paddd m4, m4 - paddd m5, m5 - paddd m7, m3 + psubd m7, m3 mova m2, [base+pd_0xffff] - psubd m1, m4 - psubd m7, m5 + pslld m1, 7 psrld m0, 8 psrld m8, 8 - pslld m1, 7 pslld m7, 7 pand m0, m2 - pand m8, m2 pandn m3, m2, m1 - pandn m2, m7 por m0, m3 - por m8, m2 + pand m8, m2 + pandn m2, m7 + por m2, m8 mova m1, [base+pd_4096] pmaddwd m0, m15 - pmaddwd m8, m15 + pmaddwd m2, m15 %if ARCH_X86_64 SWAP m7, m6 %endif @@ -3784,15 
+3709,15 @@ paddd m4, m1 paddd m5, m1 paddd m0, m4 - paddd m8, m5 + paddd m2, m5 psrad m0, 8 - psrad m8, 8 - packssdw m0, m8 ; clip + psrad m2, 8 + packssdw m0, m2 ; clip pmaxsw m0, m7 psrlw m0, 5 mova [dstq+wq], m0 add wq, 16 jl .n1_loop - add dstq, dst_stridemp + add dstq, stridemp movif32 dstm, dstq ret diff -Nru dav1d-0.9.2/src/x86/looprestoration_avx2.asm dav1d-1.0.0/src/x86/looprestoration_avx2.asm --- dav1d-0.9.2/src/x86/looprestoration_avx2.asm 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/looprestoration_avx2.asm 2022-03-18 14:31:56.026356000 +0000 @@ -79,23 +79,15 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers INIT_YMM avx2 -cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h - mov fltq, fltmp - mov edged, r8m +cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m mov wd, wm - mov hd, r6m vbroadcasti128 m6, [wiener_shufA] vpbroadcastb m11, [fltq+ 0] ; x0 x0 vbroadcasti128 m7, [wiener_shufB] @@ -106,38 +98,39 @@ vbroadcasti128 m9, [sgr_shuf+6] add lpfq, wq vpbroadcastd m10, [pw_m16380] - lea t1, [rsp+wq*2+16] vpbroadcastd m14, [fltq+16] ; y0 y1 add dstq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 + lea t1, [rsp+wq*2+16] + psllw m14, 5 neg wq + psllw m15, 5 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 - mov [rsp+8*1], lpf_strideq - add r7, lpf_strideq - mov [rsp+8*0], r7 ; below + add r10, strideq + mov [rsp], r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -150,19 +143,18 @@ jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 - mov lpfq, [rsp+8*0] + mov lpfq, [rsp] call .hv_bottom - add lpfq, [rsp+8*1] + add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - lea r7, [r7+lpf_strideq*2] - mov [rsp+8*0], r7 + lea r10, [r10+strideq*2] + mov [rsp], r10 call .h mov t6, t1 mov t5, t1 @@ -171,13 +163,13 @@ mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -276,7 +268,7 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq + add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left @@ -357,9 +349,7 @@ mova m3, [t3+r10*2+32] mova m5, [t5+r10*2+32] paddw m5, [t1+r10*2+32] - psrad m0, 11 - psrad m4, 11 - packssdw m0, m4 + packuswb m0, m4 paddw m4, m1, [t6+r10*2+32] mova [t0+r10*2+32], m1 punpcklwd m1, m2, m3 @@ -372,9 +362,9 @@ pmaddwd m4, m14 paddd m1, m3 paddd m2, m4 - psrad m1, 11 - psrad m2, 11 - packssdw m1, m2 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 @@ -386,7 +376,7 @@ mov t2, t1 mov t1, t0 mov t0, t6 - add dstq, dst_strideq + add dstq, strideq ret .v: mov r10, wq @@ -423,9 +413,10 @@ paddd m2, m6 paddd m1, m5 paddd m3, m7 - REPX {psrad x, 11}, 
m0, m2, m1, m3 - packssdw m0, m2 - packssdw m1, m3 + packuswb m0, m2 + packuswb m1, m3 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 @@ -435,15 +426,15 @@ mov t4, t3 mov t3, t2 mov t2, t1 - add dstq, dst_strideq + add dstq, strideq ret -cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h - mov fltq, fltmp - mov edged, r8m +cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m mov wd, wm - mov hd, r6m vbroadcasti128 m6, [wiener_shufB] vpbroadcastd m12, [fltq+ 2] vbroadcasti128 m7, [wiener_shufC] @@ -453,31 +444,32 @@ add lpfq, wq vpbroadcastd m9, [pw_m16380] vpbroadcastd m10, [pw_2056] - lea t1, [rsp+wq*2+16] mova m11, [wiener_l_shuf] vpbroadcastd m14, [fltq+16] ; __ y1 add dstq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 + lea t1, [rsp+wq*2+16] + psllw m14, 5 neg wq + psllw m15, 5 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 - mov [rsp+8*1], lpf_strideq - add r7, lpf_strideq - mov [rsp+8*0], r7 ; below + add r10, strideq + mov [rsp], r10 ; below call .h mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -490,25 +482,24 @@ jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 - mov lpfq, [rsp+8*0] + mov lpfq, [rsp] call .hv_bottom - add lpfq, [rsp+8*1] + add lpfq, strideq call .hv_bottom .end: RET .no_top: - lea r7, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - lea r7, [r7+lpf_strideq*2] - mov [rsp+8*0], r7 + lea r10, [r10+strideq*2] + mov [rsp], r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -526,7 +517,7 @@ mov t4, t3 mov t3, t2 mov t2, t1 - add dstq, dst_strideq + add dstq, strideq .v1: call .v jmp .end @@ -591,7 +582,7 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq + add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left @@ -661,9 +652,7 @@ mova m2, [t3+r10*2+32] paddw m2, [t1+r10*2+32] mova m3, [t2+r10*2+32] - psrad m0, 11 - psrad m4, 11 - packssdw m0, m4 + packuswb m0, m4 paddw m4, m1, [t4+r10*2+32] mova [t0+r10*2+32], m1 punpcklwd m1, m2, m3 @@ -676,9 +665,9 @@ pmaddwd m4, m14 paddd m1, m3 paddd m2, m4 - psrad m1, 11 - psrad m2, 11 - packssdw m1, m2 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 @@ -688,7 +677,7 @@ mov t2, t1 mov t1, t0 mov t0, t4 - add dstq, dst_strideq + add dstq, strideq ret .v: mov r10, wq @@ -720,73 +709,69 @@ paddd m2, m6 paddd m1, m5 paddd m3, m7 - REPX {psrad x, 11}, m0, m2, m1, m3 - packssdw m0, m2 - packssdw m1, m3 + packuswb m0, m2 + packuswb m1, m3 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .v_loop ret -cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ + w, h, edge, params %define base r12-sgr_x_by_x_avx2-256*4 lea r12, [sgr_x_by_x_avx2+256*4] - mov paramsq, paramsmp + mov paramsq, r6mp mov wd, wm - mov edged, r8m - mov hd, r6m + movifnidn hd, hm + mov edged, r7m vbroadcasti128 m8, [base+sgr_shuf+0] - add lpfq, wq vbroadcasti128 m9, [base+sgr_shuf+8] - lea t1, 
[rsp+wq*2+20] + add lpfq, wq vbroadcasti128 m10, [base+sgr_shuf+2] add dstq, wq vbroadcasti128 m11, [base+sgr_shuf+6] lea t3, [rsp+wq*4+16+400*12] vpbroadcastd m12, [paramsq+0] ; s0 - neg wq - vpbroadcastd m13, [base+pd_0xf00800a4] pxor m6, m6 vpbroadcastw m7, [paramsq+8] ; w0 + lea t1, [rsp+wq*2+20] + vpbroadcastd m13, [base+pd_0xf00800a4] + neg wq vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) psllw m7, 4 vpbroadcastd m15, [base+pd_m4096] - lea r10, [lpfq+lpf_strideq*4] - mov [rsp+8*1], lpf_strideq - add r10, lpf_strideq - mov [rsp+8*0], r10 ; below test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t2, t1 call .top_fixup add t1, 400*6 call .h_top - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - add r10, lpf_strideq - mov [rsp+8*0], r10 ; below + add r10, strideq + mov [rsp], r10 ; below mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: - add lpfq, dst_strideq + add lpfq, strideq call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: - add lpfq, dst_strideq + add lpfq, strideq test hd, hd jz .odd_height call .h - add lpfq, dst_strideq + add lpfq, strideq call .hv call .n0 call .n1 @@ -794,9 +779,9 @@ jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom - mov lpfq, [rsp+8*0] + mov lpfq, [rsp] call .h_top - add lpfq, [rsp+8*1] + add lpfq, strideq call .hv_bottom .end: call .n0 @@ -819,11 +804,10 @@ call .v jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - lea r10, [r10+lpf_strideq*2] - mov [rsp+8*0], r10 + lea r10, [r10+strideq*2] + mov [rsp], r10 call .h lea t2, [t1+400*6] call .top_fixup @@ -1003,7 +987,7 @@ paddusw m4, m13 paddusw m5, m13 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r12+m3*4], m4 + vpgatherdd m2, [r12+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 @@ -1063,7 +1047,7 @@ paddusw m4, m13 paddusw m5, m13 psrad m3, m4, 20 ; min(z, 255) - 256 - vpgatherdd m2, [r12+m3*4], m4 + vpgatherdd m2, [r12+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 @@ -1096,12 +1080,9 @@ pslld m3, 2 paddd m2, m0 ; ab 565 paddd m3, m1 - ; a = 4096 - (ab & 4095) = -(ab | ~4095), so by - ; using OR instead of AND for the masking we get - ; the subtraction for free (with a negated result) - por m0, m15, m2 ; -a - psrld m2, 12 ; b - por m1, m15, m3 + pandn m0, m15, m2 ; a + psrld m2, 12 ; b + pandn m1, m15, m3 psrld m3, 12 mova [t3+r10*4+400*4+ 0], m0 mova [t3+r10*4+400*8+ 0], m2 @@ -1126,11 +1107,11 @@ pslld m3, 2 paddd m2, m0 paddd m3, m1 - por m0, m15, m2 + pandn m0, m15, m2 psrld m2, 12 - por m1, m15, m3 + pandn m1, m15, m3 psrld m3, 12 - paddd m4, m0, [t3+r10*4+400*4+ 0] ; -a + paddd m4, m0, [t3+r10*4+400*4+ 0] ; a paddd m5, m1, [t3+r10*4+400*4+32] mova [t3+r10*4+400*4+ 0], m0 mova [t3+r10*4+400*4+32], m1 @@ -1140,16 +1121,14 @@ mova [t3+r10*4+400*8+32], m3 pmovzxbd m2, [dstq+r10+0] pmovzxbd m3, [dstq+r10+8] - pmaddwd m4, m2 ; -a * src + pmaddwd m4, m2 ; a * src pmaddwd m5, m3 packssdw m2, m3 - psubd m0, m4 ; a * src + b + (1 << 8) + psubd m0, m4 ; b - a * src + (1 << 8) psubd m1, m5 - psrld m0, 9 - psrld m1, 9 + psrad m0, 9 + psrad m1, 9 packssdw m0, m1 - psllw m1, m2, 4 - psubw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 @@ -1158,7 +1137,7 @@ mova [dstq+r10], xm0 add r10, 16 jl .n0_loop - add dstq, dst_strideq + add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) @@ -1166,18 
+1145,16 @@ .n1_loop: pmovzxbd m2, [dstq+r10+0] pmovzxbd m3, [dstq+r10+8] - pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; -a * src + pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src pmaddwd m5, m3, [t3+r10*4+400*4+32] mova m0, [t3+r10*4+400*8+ 0] ; b mova m1, [t3+r10*4+400*8+32] packssdw m2, m3 - psubd m0, m4 ; a * src + b + (1 << 7) + psubd m0, m4 ; b - a * src + (1 << 7) psubd m1, m5 - psrld m0, 8 - psrld m1, 8 + psrad m0, 8 + psrad m1, 8 packssdw m0, m1 - psllw m1, m2, 4 - psubw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 @@ -1186,44 +1163,43 @@ mova [dstq+r10], xm0 add r10, 16 jl .n1_loop - add dstq, dst_strideq + add dstq, strideq ret -cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \ + w, h, edge, params %define base r14-sgr_x_by_x_avx2-256*4 - mov paramsq, paramsmp - mov edged, r8m + mov paramsq, r6mp mov wd, wm - mov hd, r6m + movifnidn hd, hm + mov edged, r7m lea r14, [sgr_x_by_x_avx2+256*4] vbroadcasti128 m8, [base+sgr_shuf+2] add lpfq, wq vbroadcasti128 m9, [base+sgr_shuf+4] - lea t1, [rsp+wq*2+20] - vbroadcasti128 m10, [base+sgr_shuf+6] add dstq, wq - vpbroadcastd m11, [paramsq+ 4] ; s1 + vbroadcasti128 m10, [base+sgr_shuf+6] lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m11, [paramsq+ 4] ; s1 + pxor m6, m6 + vpbroadcastw m7, [paramsq+10] ; w1 + lea t1, [rsp+wq*2+20] vpbroadcastd m12, [base+pd_0xf00801c7] neg wq - vpbroadcastw m7, [paramsq+10] ; w1 - pxor m6, m6 vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) psllw m7, 4 vpbroadcastd m14, [base+pd_m4096] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t2, t1 add t1, 400*6 call .h_top - lea t4, [lpfq+lpf_strideq*4] + lea t4, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - add t4, lpf_strideq - mov [rsp+8*0], t4 ; below + add t4, strideq + mov [rsp], t4 ; below mov t0, t2 call .hv .main: @@ -1231,23 +1207,23 @@ add t3, 400*4 dec hd jz .height1 - add lpfq, dst_strideq + add lpfq, strideq call .hv call .prep_n dec hd jz .extend_bottom .main_loop: - add lpfq, dst_strideq + add lpfq, strideq call .hv call .n dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom - mov lpfq, [rsp+8*0] + mov lpfq, [rsp] call .hv_bottom call .n - add lpfq, [rsp+8*1] + add lpfq, strideq call .hv_bottom .end: call .n @@ -1265,11 +1241,10 @@ call .v jmp .end .no_top: - lea t4, [lpfq+lpf_strideq*4] + lea t4, [lpfq+strideq*4] mov lpfq, dstq - mov [rsp+8*1], lpf_strideq - lea t4, [t4+lpf_strideq*2] - mov [rsp+8*0], t4 + lea t4, [t4+strideq*2] + mov [rsp], t4 call .h lea t0, [t1+400*6] mov t2, t1 @@ -1509,31 +1484,29 @@ paddd m5, m5 psubd m5, m4 mova [t5+r10*4+32], m5 - por m4, m14, m0 + pandn m4, m14, m0 psrld m0, 12 paddd m3, m5 - por m5, m14, m2 + pandn m5, m14, m2 psrld m2, 12 - paddd m4, m5 ; -a - por m5, m14, m1 + paddd m4, m5 ; a + pandn m5, m14, m1 psrld m1, 12 - paddd m0, m2 ; b + (1 << 8) - por m2, m14, m3 + paddd m0, m2 ; b + (1 << 8) + pandn m2, m14, m3 psrld m3, 12 paddd m5, m2 pmovzxbd m2, [dstq+r10+0] paddd m1, m3 pmovzxbd m3, [dstq+r10+8] - pmaddwd m4, m2 ; -a * src + pmaddwd m4, m2 ; a * src pmaddwd m5, m3 packssdw m2, m3 - psubd m0, m4 ; a * src + b + (1 << 8) + psubd m0, m4 ; b - a * src + (1 << 8) psubd m1, m5 - psrld m0, 9 - psrld m1, 9 + psrad m0, 9 + psrad m1, 9 packssdw m0, m1 - psllw m1, m2, 4 - psubw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 @@ -1545,58 +1518,58 @@ mov r10, t5 mov 
t5, t4 mov t4, r10 - add dstq, dst_strideq + add dstq, strideq ret -cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \ + w, h, edge, params %define base r12-sgr_x_by_x_avx2-256*4 lea r12, [sgr_x_by_x_avx2+256*4] - mov paramsq, paramsmp + mov paramsq, r6mp mov wd, wm - mov edged, r8m - mov hd, r6m + movifnidn hd, hm + mov edged, r7m vbroadcasti128 m9, [base+sgr_shuf+0] - add lpfq, wq vbroadcasti128 m10, [base+sgr_shuf+8] - lea t1, [rsp+wq*2+12] + add lpfq, wq vbroadcasti128 m11, [base+sgr_shuf+2] - add dstq, wq vbroadcasti128 m12, [base+sgr_shuf+6] - lea t3, [rsp+wq*4+400*24+8] + add dstq, wq vpbroadcastd m15, [paramsq+8] ; w0 w1 - neg wq + lea t3, [rsp+wq*4+400*24+8] vpbroadcastd m13, [paramsq+0] ; s0 pxor m7, m7 vpbroadcastd m14, [paramsq+4] ; s1 + lea t1, [rsp+wq*2+12] + neg wq psllw m15, 2 ; to reuse existing pd_m4096 register for rounding test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t2, t1 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup add t1, 400*12 call .h_top - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - add r10, lpf_strideq + add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 - add lpfq, dst_strideq + add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: - add lpfq, dst_strideq + add lpfq, strideq call .hv0 test hd, hd jz .odd_height - add lpfq, dst_strideq + add lpfq, strideq call .hv1 call .n0 call .n1 @@ -1606,7 +1579,7 @@ jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom - add lpfq, lpf_strideq + add lpfq, strideq call .hv1_bottom .end: call .n0 @@ -1631,9 +1604,9 @@ call .v1 jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + lea r10, [lpfq+strideq*4] mov lpfq, dstq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+400*12] @@ -1908,7 +1881,7 @@ vpgatherdd m2, [r12+m3*4], m6 psrad m6, m7, 20 vpgatherdd m3, [r12+m6*4], m7 - vpbroadcastd m6, [base+pd_34816] + vpbroadcastd m6, [base+pd_34816] ; x3 pmulld m0, m2 vpbroadcastd m7, [base+pd_m4096] pmulld m1, m3 @@ -1918,12 +1891,12 @@ pand m7, m1 por m0, m2 ; a3 | (b3 << 12) por m7, m3 - paddw m1, m8, [t2+r10*2+400*0] - paddd m2, m4, [t2+r10*2+400*2] - paddd m3, m5, [t2+r10*2+400*4] - paddw m1, [t1+r10*2+400*0] - paddd m2, [t1+r10*2+400*2] - paddd m3, [t1+r10*2+400*4] + paddw m1, m8, [t2+r10*2+400*0] + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] mova [t2+r10*2+400*0], m8 mova [t2+r10*2+400*2], m4 mova [t2+r10*2+400*4], m5 @@ -1949,7 +1922,7 @@ paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 - vpgatherdd m4, [r12+m5*4], m2 + vpgatherdd m4, [r12+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r12+m2*4], m3 pmulld m0, m4 @@ -2006,7 +1979,7 @@ paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 - vpgatherdd m2, [r12+m3*4], m4 + vpgatherdd m2, [r12+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 @@ -2023,7 +1996,7 @@ mova [t3+r10*4+400*8+ 8], m2 mova [t3+r10*4+400*0+ 8], m3 mova [t3+r10*4+400*0+40], m4 - paddw m2, m2 ; cc5 + paddw m2, m2 ; cc5 paddd m3, m3 paddd m4, m4 mova [t1+r10*2+400*0], m2 @@ -2066,7 +2039,7 @@ paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 - vpgatherdd m2, [r12+m3*4], m4 + vpgatherdd m2, 
[r12+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 vpbroadcastd m4, [base+pd_34816] @@ -2112,7 +2085,7 @@ paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 - vpgatherdd m4, [r12+m5*4], m2 + vpgatherdd m4, [r12+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r12+m2*4], m3 pmulld m0, m4 @@ -2154,7 +2127,7 @@ paddd m3, m3 ; ab3[ 0] 222 psubd m2, m4 ; ab3[-1] 343 mova [t3+r10*4+400*20], m3 - por m0, m6, m1 ; a5 565 + pandn m0, m6, m1 ; a5 565 mova [t3+r10*4+400*24], m2 psrld m1, 12 ; b5 565 mova [t3+r10*4+400*12], m0 @@ -2175,11 +2148,11 @@ paddd m0, m4 pslld m4, 2 paddd m4, m0 - por m0, m6, m4 + pandn m0, m6, m4 psrld m4, 12 - paddd m2, m0, [t3+r10*4+400*12] ; -a5 + paddd m2, m0, [t3+r10*4+400*12] ; a5 mova [t3+r10*4+400*12], m0 - paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) + paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) mova [t3+r10*4+400*16], m4 mova m3, [t3+r10*4+400*4+0] paddd m3, [t3+r10*4+400*4+8] @@ -2192,34 +2165,31 @@ psubd m5, m3 ; ab3[ 1] 343 mova [t3+r10*4+400*24], m5 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 - por m3, m6, m1 + pandn m3, m6, m1 psrld m1, 12 - por m5, m6, m4 + pandn m5, m6, m4 psrld m4, 12 - paddd m3, m5 ; -a3 - paddd m1, m4 ; b3 + (1 << 8) + paddd m3, m5 ; a3 + paddd m1, m4 ; b3 + (1 << 8) pmovzxbd m4, [dstq+r10] - pmaddwd m2, m4 ; -a5 * src - pmaddwd m3, m4 ; -a3 * src - pslld m4, 13 - psubd m0, m4 - psubd m1, m4 - psubd m0, m2 ; a5 * src + b5 + (1 << 8) - psubd m1, m3 ; a3 * src + b3 + (1 << 8) + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 9 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 - psubd m4, m6 - paddd m0, m4 + psubd m0, m6 psrad m0, 13 + paddd m0, m4 vextracti128 xm1, m0, 1 packssdw xm0, xm1 packuswb xm0, xm0 movq [dstq+r10], xm0 add r10, 8 jl .n0_loop - add dstq, dst_strideq + add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) @@ -2236,9 +2206,9 @@ psubd m5, m3 ; ab3[ 1] 343 mova [t3+r10*4+400*28], m5 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 - por m3, m6, m1 + pandn m3, m6, m1 psrld m1, 12 - por m5, m6, m4 + pandn m5, m6, m4 psrld m4, 12 paddd m3, m5 ; -a3 paddd m1, m4 ; b3 + (1 << 8) @@ -2246,25 +2216,22 @@ pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) pmaddwd m3, m4 ; -a3 * src - pslld m4, 12 - psubd m0, m4 - paddd m4, m4 - psubd m1, m4 psubd m0, m2 ; a5 * src + b5 + (1 << 7) psubd m1, m3 ; a3 * src + b3 + (1 << 8) psrld m0, 8 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 - psubd m4, m6 - paddd m0, m4 + psubd m0, m6 psrad m0, 13 + paddd m0, m4 vextracti128 xm1, m0, 1 packssdw xm0, xm1 packuswb xm0, xm0 movq [dstq+r10], xm0 add r10, 8 jl .n1_loop - add dstq, dst_strideq + add dstq, strideq ret + %endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/looprestoration_avx512.asm dav1d-1.0.0/src/x86/looprestoration_avx512.asm --- dav1d-0.9.2/src/x86/looprestoration_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/looprestoration_avx512.asm 2022-03-18 14:31:56.026356000 +0000 @@ -0,0 +1,2122 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12 +wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13 +wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 +wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47 + db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63 +sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8 + db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128 +sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 +r_ext_mask: times 68 db -1 + times 4 db 0 +wiener_x_shuf: db 0, 2, -1, 0 +wiener_x_add: db 0, 1,127, 0 + +pw_61448: times 2 dw 61448 +pw_164_455: dw 164, 455 +pd_m16380: dd -16380 +pd_m4096: dd -4096 +pd_m25 dd -25 +pd_m9: dd -9 +pd_34816: dd 34816 +pd_8421376: dd 8421376 + +cextern sgr_x_by_x + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers + +INIT_ZMM avx512icl +cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti32x4 m6, [wiener_shufA] + vbroadcasti32x4 m7, [wiener_shufB] + mov r10d, 0xfffe + vbroadcasti32x4 m8, [wiener_shufC] + vbroadcasti32x4 m9, [wiener_shufD] + kmovw k1, r10d + vpbroadcastd m0, [wiener_x_shuf] + vpbroadcastd m1, [wiener_x_add] + mov r10, 0xaaaaaaaaaaaaaaaa + vpbroadcastd m11, [fltq+ 0] + vpbroadcastd m12, [fltq+ 4] + kmovq k2, r10 + vpbroadcastd m10, [pd_m16380] + packsswb m11, m11 ; x0 x1 x0 x1 + vpbroadcastd m14, [fltq+16] + pshufb m12, m0 + vpbroadcastd m15, [fltq+20] + paddb m12, m1 ; x2 x3+1 x2 127 + vpbroadcastd m13, [pd_8421376] + psllw m14, 5 ; y0 y1 + psllw m15, 5 ; y2 y3 + cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32 + jle .w32 ; pixels, so we need a special case for small widths + lea t1, [rsp+wq*2+16] + add lpfq, wq + add dstq, wq + neg wq + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 
+.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm16, [leftq] + vmovdqu32 m16{k1}, [lpfq+r10-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception + vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10-4] +.h_main: + movu m17, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -66 + jl .h_have_right + push r0 + lea r0, [r_ext_mask+65] + vpbroadcastb m0, [lpfq-1] + vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b + vpternlogd m17, m0, [r0+r10+8], 0xe4 + pop r0 +.h_have_right: + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m2, m10 + vpdpbusd m2, m4, m11 + pshufb m4, m17, m6 + mova m1, m10 + vpdpbusd m1, m4, m11 + pshufb m4, m17, m7 + mova m3, m10 + vpdpbusd m3, m4, m11 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m2, m16, m12 + pshufb m4, m17, m8 + vpdpbusd m1, m4, m12 + pshufb m17, m9 + vpdpbusd m3, m17, m12 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 3 + psraw m1, 3 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+64], m1 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm16, [leftq] + vmovdqu32 m16{k1}, [lpfq+r10-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm16, [lpfq+r10] + vmovdqu32 m16{k1}, [lpfq+r10-4] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m16, [lpfq+r10-4] +.hv_main: + movu m17, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -66 + jl .hv_have_right + push r0 + lea r0, [r_ext_mask+65] + vpbroadcastb m0, [lpfq-1] + vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? 
a : b + vpternlogd m17, m0, [r0+r10+8], 0xe4 + pop r0 +.hv_have_right: + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m2, m10 + vpdpbusd m2, m4, m11 + pshufb m4, m17, m6 + mova m1, m10 + vpdpbusd m1, m4, m11 + pshufb m4, m17, m7 + mova m3, m10 + vpdpbusd m3, m4, m11 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m2, m16, m12 + pshufb m4, m17, m8 + vpdpbusd m1, m4, m12 + pshufb m17, m9 + vpdpbusd m3, m17, m12 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 3 + psraw m1, 3 + mova m16, [t4+r10*2] + paddw m16, [t2+r10*2] + mova m3, [t3+r10*2] + mova m17, [t4+r10*2+64] + paddw m17, [t2+r10*2+64] + mova m5, [t3+r10*2+64] + punpcklwd m4, m16, m3 + mova m2, m13 + vpdpwssd m2, m4, m15 + punpcklwd m18, m17, m5 + mova m4, m13 + vpdpwssd m4, m18, m15 + punpckhwd m16, m3 + mova m3, m13 + vpdpwssd m3, m16, m15 + punpckhwd m17, m5 + mova m5, m13 + vpdpwssd m5, m17, m15 + mova m17, [t5+r10*2] + paddw m17, [t1+r10*2] + paddw m16, m0, [t6+r10*2] + mova m19, [t5+r10*2+64] + paddw m19, [t1+r10*2+64] + paddw m18, m1, [t6+r10*2+64] + mova [t0+r10*2+ 0], m0 + mova [t0+r10*2+64], m1 + punpcklwd m0, m16, m17 + vpdpwssd m2, m0, m14 + punpcklwd m1, m18, m19 + vpdpwssd m4, m1, m14 + punpckhwd m16, m17 + vpdpwssd m3, m16, m14 + punpckhwd m18, m19 + vpdpwssd m5, m18, m14 + packuswb m2, m4 + psrlw m2, 8 + vpackuswb m2{k2}, m3, m5 + mova [dstq+r10], m2 + add r10, 64 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m4, [t4+r10*2+ 0] + paddw m4, [t2+r10*2+ 0] + mova m1, [t3+r10*2+ 0] + mova m5, [t4+r10*2+64] + paddw m5, [t2+r10*2+64] + mova m3, [t3+r10*2+64] + punpcklwd m6, m4, m1 + mova m0, m13 + vpdpwssd m0, m6, m15 + punpcklwd m6, m5, m3 + mova m2, m13 + vpdpwssd m2, m6, m15 + punpckhwd m4, m1 + mova m1, m13 + vpdpwssd m1, m4, m15 + punpckhwd m5, m3 + mova m3, m13 + vpdpwssd m3, m5, m15 + mova m5, [t1+r10*2+ 0] + paddw m4, m5, [t6+r10*2+ 0] + paddw m5, [t5+r10*2+ 0] + mova m7, [t1+r10*2+64] + paddw m6, m7, [t6+r10*2+64] + paddw m7, [t5+r10*2+64] + punpcklwd m8, m4, m5 + vpdpwssd m0, m8, m14 + punpcklwd m8, m6, m7 + vpdpwssd m2, m8, m14 + punpckhwd m4, m5 + vpdpwssd m1, m4, m14 + punpckhwd m6, m7 + vpdpwssd m3, m6, m14 + packuswb m0, m2 + psrlw m0, 8 + vpackuswb m0{k2}, m1, m3 + mova [dstq+r10], m0 + add r10, 64 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret +.w32: + lea r10, [r_ext_mask+73] + mova ym18, [wiener_perm32] + lea t1, [rsp+16] + sub r10, wq + test edgeb, 4 ; LR_HAVE_TOP + jz .w32_no_top + call .w32_h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 32*2 + call .w32_h_top + lea r9, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 32*2 + add r9, strideq + mov [rsp], r9 ; below + call .w32_h + mov t3, t1 + mov t2, t1 + dec hd + jz .w32_v1 + add lpfq, strideq + add t1, 32*2 + call .w32_h + mov t2, t1 + dec hd + jz .w32_v2 + add lpfq, strideq + add t1, 32*2 + call .w32_h + dec hd + jz .w32_v3 +.w32_main: + lea t0, [t1+32*2] +.w32_main_loop: + call .w32_hv + dec hd + jnz .w32_main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .w32_v3 + mov lpfq, [rsp] + call .w32_hv_bottom + add lpfq, strideq + call .w32_hv_bottom +.w32_v1: + call .w32_v + RET +.w32_no_top: + lea r9, [lpfq+strideq*4] + mov lpfq, dstq + lea r9, [r9+strideq*2] + mov [rsp], r9 + call .w32_h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .w32_v1 + add lpfq, strideq + 
add t1, 32*2 + call .w32_h + mov t2, t1 + dec hd + jz .w32_v2 + add lpfq, strideq + add t1, 32*2 + call .w32_h + dec hd + jz .w32_v3 + lea t0, [t1+32*2] + call .w32_hv + dec hd + jz .w32_v3 + add t0, 32*8 + call .w32_hv + dec hd + jnz .w32_main +.w32_v3: + call .w32_v +.w32_v2: + call .w32_v + jmp .w32_v1 +.w32_h: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_h_extend_left + movd xm16, [leftq] + vmovdqu32 ym16{k1}, [lpfq-4] + add leftq, 4 + jmp .w32_h_main +.w32_h_extend_left: + vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception + vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory + jmp .w32_h_main +.w32_h_top: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_h_extend_left + movu ym16, [lpfq-4] +.w32_h_main: + vinserti32x8 m16, [lpfq+4], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .w32_h_have_right + vpbroadcastb m0, [lpfq+wq-1] + movu ym17, [r10-8] + vinserti32x8 m17, [r10+0], 1 + vpternlogd m16, m0, m17, 0xe4 ; c ? a : b +.w32_h_have_right: + pshufb m2, m16, m6 + mova m0, m10 + vpdpbusd m0, m2, m11 + pshufb m2, m16, m7 + mova m1, m10 + vpdpbusd m1, m2, m11 + pshufb m2, m16, m8 + vpdpbusd m0, m2, m12 + pshufb m16, m9 + vpdpbusd m1, m16, m12 + packssdw m0, m1 + psraw m0, 3 + mova [t1], m0 + ret +.w32_hv: + add lpfq, strideq + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_hv_extend_left + movd xm16, [leftq] + vmovdqu32 ym16{k1}, [lpfq-4] + add leftq, 4 + jmp .w32_hv_main +.w32_hv_extend_left: + vpbroadcastb xm16, [lpfq] + vmovdqu32 ym16{k1}, [lpfq-4] + jmp .w32_hv_main +.w32_hv_bottom: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_hv_extend_left + movu ym16, [lpfq-4] +.w32_hv_main: + vinserti32x8 m16, [lpfq+4], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .w32_hv_have_right + vpbroadcastb m0, [lpfq+wq-1] + movu ym17, [r10-8] + vinserti32x8 m17, [r10+0], 1 + vpternlogd m16, m0, m17, 0xe4 +.w32_hv_have_right: + mova m3, [t4] + paddw m3, [t2] + mova m2, [t3] + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m5, m10 + vpdpbusd m5, m4, m11 + punpcklwd m4, m3, m2 + mova m1, m13 + vpdpwssd m1, m4, m15 + punpckhwd m3, m2 + mova m2, m13 + vpdpwssd m2, m3, m15 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m5, m16, m12 + packssdw m0, m5 + psraw m0, 3 + mova m4, [t5] + paddw m4, [t1] + paddw m3, m0, [t6] + mova [t0], m0 + punpcklwd m0, m3, m4 + vpdpwssd m1, m0, m14 + punpckhwd m3, m4 + vpdpwssd m2, m3, m14 + packuswb m1, m2 + vpermb m16, m18, m1 + mova [dstq], ym16 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.w32_v: + mova m2, [t4] + paddw m2, [t2] + mova m1, [t3] + mova m4, [t1] + paddw m3, m4, [t6] + paddw m4, [t5] + punpcklwd m5, m2, m1 + mova m0, m13 + vpdpwssd m0, m5, m15 + punpckhwd m2, m1 + mova m1, m13 + vpdpwssd m1, m2, m15 + punpcklwd m2, m3, m4 + vpdpwssd m0, m2, m14 + punpckhwd m3, m4 + vpdpwssd m1, m3, m14 + packuswb m0, m1 + vpermb m16, m18, m0 + mova [dstq], ym16 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + mov hd, hm + mov edged, r7m + vbroadcasti32x4 m5, [sgr_shuf+1] + add lpfq, wq + vbroadcasti32x4 m6, [sgr_shuf+9] + add dstq, wq + vbroadcasti32x4 m7, [sgr_shuf+3] + lea t3, [rsp+wq*4+16+416*12] + vbroadcasti32x4 m8, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m9, [pd_m25] + vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 + vpbroadcastw m15, [paramsq+8] ; w0 + lea t1, 
[rsp+wq*2+20] + vpbroadcastd m10, [pw_164_455] + neg wq + vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) + mov r10d, 0xfe + vpbroadcastd m13, [pd_m4096] + kmovb k1, r10d + vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) + mov r10, 0x3333333333333333 + mova m18, [sgr_x_by_x+64*0] + kmovq k2, r10 + mova m19, [sgr_x_by_x+64*1] + lea r12, [r_ext_mask+75] + mova m20, [sgr_x_by_x+64*2] + psllw m15, 4 + mova m21, [sgr_x_by_x+64*3] + lea r10, [lpfq+strideq*4] + mova ym22, [sgr_shuf] + add r10, strideq + mov [rsp], r10 ; below + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 416*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz .odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m3, m17, m5 + pmullw m2, m3, m3 + pshufb m1, m17, m6 + paddw m0, m3, m1 + shufps m3, m1, q2121 + paddw m0, m3 + punpcklwd m16, m3, m1 + punpckhwd m3, m1 + punpcklwd m1, m2, m4 + vpdpwssd m1, m16, m16 + punpckhwd m2, m4 + vpdpwssd m2, m3, m3 + pshufb m16, m17, m7 + paddw m0, m16 + pshufb m17, m8 + paddw m0, m17 ; sum + punpcklwd m3, m16, m17 + vpdpwssd m1, m3, m3 ; sumsq + punpckhwd m16, m17 + vpdpwssd m2, m16, m16 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10*2+416*0] + paddd m1, [t1+r10*2+416*2] + paddd m2, [t1+r10*2+416*4] +.h_loop_end: + mova [t1+r10*2+416*0], m0 + mova [t1+r10*2+416*2], m1 + mova [t1+r10*2+416*4], m2 + add r10, 32 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-2] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10*2+416*0] + mova m1, [t1+r10*2+416*2] + mova m2, [t1+r10*2+416*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10*2+416*0], m0 + mova [t2+r10*2+416*2], m1 + mova [t2+r10*2+416*4], m2 + add r10, 32 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz 
.hv_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu ym17, [lpfq+r10-2] +.hv_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv_have_right: + pshufb m1, m17, m5 + pmullw m3, m1, m1 + pshufb m2, m17, m6 + paddw m0, m1, m2 + shufps m1, m2, q2121 + paddw m0, m1 + punpcklwd m16, m1, m2 + punpckhwd m1, m2 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 + punpckhwd m3, m4 + vpdpwssd m3, m1, m1 + pshufb m16, m17, m7 + paddw m0, m16 + pshufb m17, m8 + paddw m0, m17 ; h sum + punpcklwd m1, m16, m17 + vpdpwssd m2, m1, m1 ; h sumsq + punpckhwd m16, m17 + vpdpwssd m3, m16, m16 + paddw m1, m0, [t1+r10*2+416*0] + paddd m16, m2, [t1+r10*2+416*2] + paddd m17, m3, [t1+r10*2+416*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddd m16, [t2+r10*2+416*2] ; hv sumsq + paddd m17, [t2+r10*2+416*4] + paddw m1, [t2+r10*2+416*0] ; hv sum + mova [t0+r10*2+416*2], m2 + mova [t0+r10*2+416*4], m3 + mova [t0+r10*2+416*0], m0 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + punpcklwd m0, m1, m4 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 164 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 ; The neighbor calculations requires + mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b. + vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but + mova [t3+r10*4+ 72], m17 ; that gets us most of the way. 
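The stores above keep a in the low 12 bits of each dword and the scaled b in the upper 20 bits, which is what the pd_m4096 mask (0xfffff000) selects in the vpternlogd. A minimal C sketch of that pack/unpack, with illustrative names that are not part of the dav1d sources:

    #include <stdint.h>

    /* Pack as in the vpternlogd above: the low 12 bits carry a, the upper 20
     * carry b (conceptually b << 12). Per the comment, the ideal widths would
     * be 13 + 21 bits, so 12 + 20 is a deliberate compromise. */
    static inline uint32_t pack_ab(uint32_t a, uint32_t b)
    {
        return (a & 0xfff) | (b << 12);
    }

    /* Unpack as done by the neighbor code later: pandn with pd_m4096
     * recovers a, psrld by 12 recovers b. */
    static inline uint32_t unpack_a(uint32_t ab) { return ab & 0xfff; }
    static inline uint32_t unpack_b(uint32_t ab) { return ab >> 12; }
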
+ vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10*2+416*0], m1 + paddw m1, m0 + mova [t1+r10*2+416*2], m16 + paddd m16, m2 + mova [t1+r10*2+416*4], m17 + paddd m17, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m2, [t1+r10*2+416*2] + paddd m16, m2, [t2+r10*2+416*2] + mova m3, [t1+r10*2+416*4] + paddd m17, m3, [t2+r10*2+416*4] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2 ; hv sumsq + paddd m17, m3 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + mova m0, [t1+r10*2+416*0] + paddw m1, m0, [t2+r10*2+416*0] + paddw m0, m0 + paddw m1, m0 ; hv sum + punpcklwd m0, m1, m4 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 164 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+68] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+64] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+72] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 ; ab 565 + paddd m3, m1 + pandn m0, m13, m2 ; a + psrld m2, 12 ; b + pandn m1, m13, m3 + psrld m3, 12 + mova [t3+r10*4+416*4+ 0], m0 + mova [t3+r10*4+416*8+ 0], m2 + mova [t3+r10*4+416*4+64], m1 + mova [t3+r10*4+416*8+64], m3 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m16, [t3+r10*4+ 4] + movu m17, [t3+r10*4+68] + paddd m0, m16, [t3+r10*4+ 0] + paddd m1, m17, [t3+r10*4+64] + paddd m0, [t3+r10*4+ 8] + paddd m1, [t3+r10*4+72] + paddd m16, m0 + pslld m0, 2 + paddd m17, m1 + pslld m1, 2 + paddd m0, m16 + paddd m1, m17 + pandn m16, m13, m0 + psrld m0, 12 + pandn m17, m13, m1 + psrld m1, 12 + paddd m2, m16, [t3+r10*4+416*4+ 0] ; a + paddd m3, m17, [t3+r10*4+416*4+64] + mova [t3+r10*4+416*4+ 0], m16 + mova [t3+r10*4+416*4+64], m17 + paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8) + paddd m17, m1, [t3+r10*4+416*8+64] + mova [t3+r10*4+416*8+ 0], m0 + mova [t3+r10*4+416*8+64], m1 + pmovzxbd m0, [dstq+r10+ 0] + pmovzxbd m1, [dstq+r10+16] + pmaddwd m2, m0 ; a * src + pmaddwd m3, m1 + packssdw m0, m1 + psubd m16, m2 ; b - a * src + (1 << 8) + psubd m17, m3 + psrad m16, 9 + psrad m17, 9 + packssdw m16, m17 + pmulhrsw m16, m15 + paddw m16, m0 + packuswb m16, m16 + vpermd m16, m22, m16 + mova [dstq+r10], ym16 + add r10, 32 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + pmovzxbd m0, [dstq+r10+ 0] + pmovzxbd m1, [dstq+r10+16] + pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src + pmaddwd m3, m1, [t3+r10*4+416*4+64] + mova m16, [t3+r10*4+416*8+ 0] ; b + 
(1 << 7) + mova m17, [t3+r10*4+416*8+64] + packssdw m0, m1 + psubd m16, m2 ; b - a * src + (1 << 7) + psubd m17, m3 + psrad m16, 8 + psrad m17, 8 + packssdw m16, m17 + pmulhrsw m16, m15 + paddw m16, m0 + packuswb m16, m16 + vpermd m16, m22, m16 + mova [dstq+r10], ym16 + add r10, 32 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti32x4 m5, [sgr_shuf+3] + add lpfq, wq + vbroadcasti32x4 m6, [sgr_shuf+5] + add dstq, wq + vbroadcasti32x4 m7, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m8, [pd_m9] + vpsubd m11, m4, [paramsq+4] {1to16} ; -s1 + vpbroadcastw m15, [paramsq+10] ; w1 + lea t1, [rsp+wq*2+20] + vpbroadcastd m10, [pw_164_455] + lea t3, [rsp+wq*4+16+416*12] + vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) + neg wq + vpbroadcastd m13, [pd_m4096] + mov r10d, 0xfe + vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) + kmovb k1, r10d + mova m18, [sgr_x_by_x+64*0] + mov r10, 0x3333333333333333 + mova m19, [sgr_x_by_x+64*1] + kmovq k2, r10 + mova m20, [sgr_x_by_x+64*2] + psllw m15, 4 + mova m21, [sgr_x_by_x+64*3] + lea r14, [r_ext_mask+75] + mova ym9, [sgr_shuf] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 416*6 + call .h_top + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + add t4, strideq + mov [rsp], t4 ; below + mov t0, t2 + call .hv +.main: + mov t5, t3 + add t3, 416*4 + dec hd + jz .height1 + add lpfq, strideq + call .hv + call .prep_n + dec hd + jz .extend_bottom +.main_loop: + add lpfq, strideq + call .hv + call .n + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv_bottom + call .n + add lpfq, strideq + call .hv_bottom +.end: + call .n + RET +.height1: + call .v + call .prep_n + mov t2, t1 + call .v + jmp .end +.extend_bottom: + call .v + call .n + mov t2, t1 + call .v + jmp .end +.no_top: + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + lea t4, [t4+strideq*2] + mov [rsp], t4 + call .h + lea t0, [t1+416*6] + mov t2, t1 + call .v + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r14+r10-8] + vinserti32x8 m16, [r14+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m0, m17, m5 + pmullw m2, m0, m0 + pshufb m16, m17, m6 + paddw m0, m16 + pshufb m17, m7 + paddw m0, m17 ; sum + punpcklwd m3, m16, m17 + punpcklwd m1, m2, m4 + vpdpwssd m1, m3, m3 ; sumsq + punpckhwd m16, m17 + punpckhwd m2, m4 + vpdpwssd m2, m16, m16 + mova [t1+r10*2+416*0], m0 + mova [t1+r10*2+416*2], m1 + mova [t1+r10*2+416*4], m2 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + 
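For reference, the output tails of .n0/.n1 in sgr_filter_5x5_8bpc above reduce, per pixel, to the following scalar arithmetic. This is a paraphrase of the asm comments, not the bit-exact C reference; names are illustrative:

    #include <stdint.h>

    /* Even-row tail (.n0): t uses a (1 << 8) rounding term and a shift of 9;
     * the odd-row tail (.n1) uses (1 << 7) and a shift of 8. w0 is the filter
     * weight loaded from params and pre-scaled by psllw 4 into m15. */
    static uint8_t sgr_5x5_blend(int32_t a, int32_t b, int32_t src, int32_t w0)
    {
        const int32_t t   = (b - a * src + (1 << 8)) >> 9;     /* psubd, psrad 9 */
        const int32_t d   = (t * (w0 << 4) + (1 << 14)) >> 15; /* pmulhrsw m15   */
        const int32_t out = src + d;                           /* paddw          */
        return out < 0 ? 0 : out > 255 ? 255 : (uint8_t)out;   /* packuswb       */
    }
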
test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu ym17, [lpfq+r10-2] +.hv_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r14+r10-8] + vinserti32x8 m16, [r14+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv_have_right: + pshufb m0, m17, m5 + pmullw m3, m0, m0 + pshufb m1, m17, m6 + paddw m0, m1 + pshufb m17, m7 + paddw m0, m17 ; h sum + punpcklwd m16, m17, m1 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 ; h sumsq + punpckhwd m17, m1 + punpckhwd m3, m4 + vpdpwssd m3, m17, m17 + paddw m1, m0, [t2+r10*2+416*0] + paddw m1, [t1+r10*2+416*0] ; hv sum + paddd m16, m2, [t2+r10*2+416*2] + paddd m17, m3, [t2+r10*2+416*4] + paddd m16, [t1+r10*2+416*2] ; hv sumsq + paddd m17, [t1+r10*2+416*4] + mova [t0+r10*2+416*0], m0 + mova [t0+r10*2+416*2], m2 + mova [t0+r10*2+416*4], m3 + pmulld m16, m8 ; -a * 9 + pmulld m17, m8 + punpcklwd m0, m4, m1 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m16, [t1+r10*2+416*2] + mova m17, [t1+r10*2+416*4] + paddd m16, m16 + paddd m17, m17 + paddd m16, [t2+r10*2+416*2] ; hv sumsq + paddd m17, [t2+r10*2+416*4] + pmulld m16, m8 ; -a * 9 + pmulld m17, m8 + mova m1, [t1+r10*2+416*0] + paddw m1, m1 + paddw m1, [t2+r10*2+416*0] ; hv sum + punpcklwd m0, m4, m1 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq + mov t4, t3 + add t3, 416*4 +.prep_n_loop: + mova m2, [t5+r10*4+0] + mova m3, [t4+r10*4+0] + paddd m2, [t5+r10*4+8] + paddd m3, [t4+r10*4+8] + paddd m0, m2, [t5+r10*4+4] + paddd m1, m3, [t4+r10*4+4] + pslld m0, 2 + paddd m1, m1 ; ab[ 0] 222 + psubd m0, m2 ; ab[-1] 343 + mova [t3+r10*4+416*4], m1 + paddd m1, m1 + mova [t5+r10*4], m0 + psubd m1, m3 ; ab[ 0] 343 + mova [t4+r10*4], m1 + 
add r10, 16 + jl .prep_n_loop + ret +; a+b are packed together in a single dword, but we can't do the +; full neighbor calculations before splitting them since we don't +; have sufficient precision. The solution is to do the calculations +; in two equal halves and split a and b before doing the final sum. +ALIGN function_align +.n: ; neighbor + output + mov r10, wq +.n_loop: + mova m16, [t3+r10*4+ 0] + paddd m16, [t3+r10*4+ 8] + paddd m17, m16, [t3+r10*4+ 4] + paddd m17, m17 ; ab[+1] 222 + mova m2, [t3+r10*4+416*4+ 0] + paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 + mova m3, [t3+r10*4+416*4+64] + paddd m1, m3, [t5+r10*4+64] + mova [t3+r10*4+416*4+ 0], m17 + paddd m17, m17 + psubd m17, m16 ; ab[+1] 343 + mova [t5+r10*4+ 0], m17 + paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343 + mova m16, [t3+r10*4+64] + paddd m16, [t3+r10*4+72] + paddd m17, m16, [t3+r10*4+68] + paddd m17, m17 + mova [t3+r10*4+416*4+64], m17 + paddd m17, m17 + psubd m17, m16 + mova [t5+r10*4+64], m17 + pandn m16, m13, m0 + psrld m0, 12 + paddd m3, m17 + pandn m17, m13, m2 + psrld m2, 12 + paddd m16, m17 ; a + pandn m17, m13, m1 + psrld m1, 12 + paddd m0, m2 ; b + (1 << 8) + pandn m2, m13, m3 + psrld m3, 12 + paddd m17, m2 + pmovzxbd m2, [dstq+r10+ 0] + paddd m1, m3 + pmovzxbd m3, [dstq+r10+16] + pmaddwd m16, m2 ; a * src + pmaddwd m17, m3 + packssdw m2, m3 + psubd m0, m16 ; b - a * src + (1 << 8) + psubd m1, m17 + psrad m0, 9 + psrad m1, 9 + packssdw m0, m1 + pmulhrsw m0, m15 + paddw m0, m2 + packuswb m0, m0 + vpermd m16, m9, m0 + mova [dstq+r10], ym16 + add r10, 32 + jl .n_loop + mov r10, t5 + mov t5, t4 + mov t4, r10 + add dstq, strideq + ret + +cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti128 m5, [sgr_shuf+1] + add lpfq, wq + vbroadcasti128 m6, [sgr_shuf+9] + add dstq, wq + vbroadcasti128 m7, [sgr_shuf+3] + lea t3, [rsp+wq*4+416*24+8] + vbroadcasti128 m8, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m9, [pd_m9] + vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 + vpbroadcastd m14, [pw_61448] + vpsubd m12, m4, [paramsq+4] {1to16} ; -s1 + vpbroadcastd m26, [paramsq+8] ; w0 w1 + lea t1, [rsp+wq*2+12] + vpbroadcastd m10, [pd_m25] + neg wq + vpbroadcastd m13, [pw_164_455] + mov r10d, 0xfe + vpbroadcastd m15, [pd_34816] + kmovb k1, r10d + mova m20, [sgr_x_by_x+64*0] + mov r10, 0x3333333333333333 + mova m21, [sgr_x_by_x+64*1] + kmovq k2, r10 + mova m22, [sgr_x_by_x+64*2] + lea r12, [r_ext_mask+75] + mova m23, [sgr_x_by_x+64*3] + vpbroadcastd m24, [pd_m4096] + vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____ + psllw m26, 5 + mova xm27, [sgr_mix_perm] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup + add t1, 416*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + 
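The comment before the .n routine of sgr_filter_3x3_8bpc above explains why the neighbor sums are split: adding all of the weighted rows while a and b are still packed in one dword could overflow the 12-bit a field. Roughly, in C, with illustrative names that are not from the dav1d sources:

    #include <stdint.h>

    /* Two-half neighbor sum over packed a | (b << 12) dwords, mirroring .n:
     * ab_343_prev/ab_343_next are the 3-4-3 weighted rows and ab_222_cur is
     * the 2-2-2 weighted current row (which therefore contributes twice).
     * Each half stays within the packed fields; a and b are split out of the
     * two halves before the final addition. */
    static void neighbor_sum_3x3(uint32_t ab_343_prev, uint32_t ab_222_cur,
                                 uint32_t ab_343_next,
                                 uint32_t *a, uint32_t *b)
    {
        const uint32_t h0 = ab_222_cur + ab_343_prev;
        const uint32_t h1 = ab_222_cur + ab_343_next;
        *a = (h0 & 0xfff) + (h1 & 0xfff);
        *b = (h0 >> 12) + (h1 >> 12);
    }
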
call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*12] + lea r10, [wq-2] +.top_fixup_loop: + mova m0, [t1+r10*2+416* 0] + mova m1, [t1+r10*2+416* 2] + mova m2, [t1+r10*2+416* 4] + paddw m0, m0 + mova m3, [t1+r10*2+416* 6] + paddd m1, m1 + mova m16, [t1+r10*2+416* 8] + paddd m2, m2 + mova m17, [t1+r10*2+416*10] + mova [t2+r10*2+416* 0], m0 + mova [t2+r10*2+416* 2], m1 + mova [t2+r10*2+416* 4], m2 + mova [t2+r10*2+416* 6], m3 + mova [t2+r10*2+416* 8], m16 + mova [t2+r10*2+416*10], m17 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsums + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m3, m17, m5 + pshufb m18, m17, m6 + shufps m0, m3, m18, q2121 + pmullw m2, m0, m0 + pshufb m19, m17, m7 + paddw m0, m19 + pshufb m17, m8 + paddw m0, m17 ; sum3 + punpcklwd m16, m19, m17 + punpcklwd m1, m2, m4 + vpdpwssd m1, m16, m16 ; sumsq3 + punpckhwd m19, m17 + punpckhwd m2, m4 + vpdpwssd m2, m19, m19 + mova [t1+r10*2+416* 6], m0 + mova [t1+r10*2+416* 8], m1 + mova [t1+r10*2+416*10], m2 + punpcklwd m19, m3, m18 + paddw m0, m3 + vpdpwssd m1, m19, m19 ; sumsq5 + punpckhwd m3, m18 + paddw m0, m18 ; sum5 + vpdpwssd m2, m3, m3 + mova [t1+r10*2+416* 0], m0 + mova [t1+r10*2+416* 2], m1 + mova [t1+r10*2+416* 4], m2 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv0_main +.hv0_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu ym17, [lpfq+r10-2] +.hv0_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -34 + jl .hv0_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv0_have_right: + pshufb m18, m17, m5 + pshufb m19, m17, m6 + shufps m1, m18, m19, q2121 + pmullw m3, m1, m1 + pshufb m0, m17, m7 + paddw m1, m0 + pshufb m17, m8 + paddw m1, m17 ; sum3 + punpcklwd m16, m0, m17 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 ; sumsq3 + punpckhwd m0, m17 + punpckhwd m3, m4 + vpdpwssd m3, m0, m0 + paddw m0, m1, [t1+r10*2+416* 6] + paddd m16, m2, [t1+r10*2+416* 8] + paddd m17, m3, [t1+r10*2+416*10] + mova [t1+r10*2+416* 6], m1 + mova [t1+r10*2+416* 8], m2 + mova [t1+r10*2+416*10], m3 + paddw m1, m18 + paddw m1, m19 ; sum5 + mova [t3+r10*4+416*8+ 8], m1 + paddw m1, [t1+r10*2+416* 0] + mova [t1+r10*2+416* 0], m1 + punpcklwd m1, m18, m19 + vpdpwssd m2, m1, m1 ; sumsq5 + punpckhwd m18, m19 + vpdpwssd m3, m18, m18 + mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of 
the last row + mova [t3+r10*4+416*0+72], m3 ; in case height is odd + paddd m2, [t1+r10*2+416* 2] + paddd m3, [t1+r10*2+416* 4] + mova [t1+r10*2+416* 2], m2 + mova [t1+r10*2+416* 4], m3 + paddw m1, m0, [t2+r10*2+416* 6] + paddd m2, m16, [t2+r10*2+416* 8] + paddd m3, m17, [t2+r10*2+416*10] + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416* 8], m16 + mova [t2+r10*2+416*10], m17 + pmulld m16, m2, m9 ; -a3 * 9 + pmulld m17, m3, m9 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m22 + paddusw m17, m14 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*4+ 8], m16 + mova [t3+r10*4+416*4+ 24], xm17 + vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 + mova [t3+r10*4+416*4+ 72], m17 + vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*4+104], m16, 3 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv1_main +.hv1_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu ym17, [lpfq+r10-2] +.hv1_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -34 + jl .hv1_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv1_have_right: + pshufb m3, m17, m5 + pshufb m19, m17, m6 + shufps m2, m3, m19, q2121 + pmullw m1, m2, m2 + pshufb m18, m17, m7 + paddw m2, m18 + pshufb m17, m8 + paddw m2, m17 ; sum3 + punpcklwd m16, m17, m18 + punpcklwd m0, m1, m4 + vpdpwssd m0, m16, m16 ; sumsq3 + punpckhwd m17, m18 + punpckhwd m1, m4 + vpdpwssd m1, m17, m17 + paddd m16, m0, [t2+r10*2+416* 8] + paddd m17, m1, [t2+r10*2+416*10] + mova [t2+r10*2+416* 8], m0 + mova [t2+r10*2+416*10], m1 + punpcklwd m18, m3, m19 + vpdpwssd m0, m18, m18 ; sumsq5 + punpckhwd m18, m3, m19 + vpdpwssd m1, m18, m18 + paddw m3, m19 + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + paddd m18, m0, [t2+r10*2+416*2] + paddd m19, m1, [t2+r10*2+416*4] + paddd m18, [t1+r10*2+416*2] + paddd m19, [t1+r10*2+416*4] + mova [t2+r10*2+416*2], m0 + mova [t2+r10*2+416*4], m1 + pmulld m18, m10 ; -a5 * 25 + pmulld m19, m10 + paddw m1, m2, [t2+r10*2+416* 6] + mova [t2+r10*2+416* 6], m2 + paddw m2, m3 ; sum5 + paddw m3, m2, [t2+r10*2+416*0] + paddw m3, [t1+r10*2+416*0] + mova [t2+r10*2+416*0], m2 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + punpcklwd m2, m3, m4 ; b5 + vpdpwssd m18, m2, m2 ; -p5 + punpckhwd m3, m4 + vpdpwssd m19, m3, m3 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmulld m18, m11 ; p5 * s0 + pmulld m19, m11 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + pmaddwd m2, m13 ; b5 * 164 + pmaddwd m3, m13 + vpalignr m17{k2}, m16, m16, 2 + 
vpalignr m19{k2}, m18, m18, 2 + paddusw m17, m14 + mova m16, m22 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + paddusw m19, m14 + mova m18, m22 + psraw m19, 4 ; min(z5, 255) - 256 + vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] + vpmovb2m k4, m19 + vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + vmovdqu8 m19{k4}, m18 ; x5 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + pandn m18, m24, m19 + psrld m19, 16 + pmulld m2, m18 + pmulld m3, m19 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*8+ 8], m16 + mova [t3+r10*4+416*8+ 24], xm17 + vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 + paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m3, m15 + mova [t3+r10*4+416*8+ 72], m17 + vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*8+104], m16, 3 + vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12) + vpternlogd m19, m3, m24, 0xd8 + mova [t3+r10*4+416*0+ 8], m18 + mova [t3+r10*4+416*0+ 24], xm19 + vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 + mova [t3+r10*4+416*0+ 72], m19 + vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 + vextracti32x4 [t3+r10*4+416*0+104], m18, 3 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-2] +.v0_loop: + mova m2, [t1+r10*2+416* 8] + mova m3, [t1+r10*2+416*10] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2, [t2+r10*2+416* 8] + paddd m17, m3, [t2+r10*2+416*10] + mova m0, [t1+r10*2+416* 6] + paddw m0, m0 + paddw m1, m0, [t2+r10*2+416* 6] + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416* 8], m2 + mova [t2+r10*2+416*10], m3 + mova m2, [t1+r10*2+416*0] + mova m3, [t1+r10*2+416*2] + mova m18, [t1+r10*2+416*4] + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + mova [t3+r10*4+416*8+ 8], m2 + mova [t3+r10*4+416*0+ 8], m3 + mova [t3+r10*4+416*0+72], m18 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m22 + paddusw m17, m14 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddw m2, m2 ; cc5 + paddd m3, m3 + paddd m18, m18 + mova [t1+r10*2+416*0], m2 + mova [t1+r10*2+416*2], m3 + mova [t1+r10*2+416*4], m18 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*4+ 8], m16 + mova [t3+r10*4+416*4+ 24], xm17 + vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 + mova [t3+r10*4+416*4+ 72], m17 + vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*4+104], m16, 3 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-2] +.v1_loop: + mova m0, [t1+r10*2+416* 8] + paddd m16, m0, [t2+r10*2+416* 8] + mova m1, [t1+r10*2+416*10] + paddd m17, m1, [t2+r10*2+416*10] + mova m2, [t3+r10*4+416*0+ 8] + paddd m18, m2, [t2+r10*2+416* 2] + mova m3, [t3+r10*4+416*0+72] + paddd m19, m3, [t2+r10*2+416* 4] + paddd m18, [t1+r10*2+416* 2] + paddd m19, [t1+r10*2+416* 4] + mova 
[t2+r10*2+416* 8], m0 + mova [t2+r10*2+416*10], m1 + mova [t2+r10*2+416* 2], m2 + mova [t2+r10*2+416* 4], m3 + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + pmulld m18, m10 ; -a5 * 25 + pmulld m19, m10 + mova m0, [t1+r10*2+416* 6] + paddw m1, m0, [t2+r10*2+416* 6] + mova m2, [t3+r10*4+416*8+ 8] + paddw m3, m2, [t2+r10*2+416*0] + paddw m3, [t1+r10*2+416*0] + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416*0], m2 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + punpcklwd m2, m3, m4 ; b5 + vpdpwssd m18, m2, m2 ; -p5 + punpckhwd m3, m4 + vpdpwssd m19, m3, m3 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmulld m18, m11 ; p5 * s0 + pmulld m19, m11 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + pmaddwd m2, m13 ; b5 * 164 + pmaddwd m3, m13 + vpalignr m17{k2}, m16, m16, 2 + vpalignr m19{k2}, m18, m18, 2 + paddusw m17, m14 + mova m16, m22 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + paddusw m19, m14 + mova m18, m22 + psraw m19, 4 ; min(z5, 255) - 256 + vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] + vpmovb2m k4, m19 + vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + vmovdqu8 m19{k4}, m18 ; x5 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + pandn m18, m24, m19 + psrld m19, m19, 16 + pmulld m2, m18 + pmulld m3, m19 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*8+ 8], m16 + mova [t3+r10*4+416*8+ 24], xm17 + vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 + paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m3, m15 + mova [t3+r10*4+416*8+ 72], m17 + vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*8+104], m16, 3 + vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12) + vpternlogd m19, m3, m24, 0xd8 + mova [t3+r10*4+416*0+ 8], m18 + mova [t3+r10*4+416*0+ 24], xm19 + vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 + mova [t3+r10*4+416*0+ 72], m19 + vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 + vextracti32x4 [t3+r10*4+416*0+104], m18, 3 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+416*0+4] + paddd m1, m0, [t3+r10*4+416*0+0] + mova m16, [t3+r10*4+416*4+0] + paddd m1, [t3+r10*4+416*0+8] + mova m17, [t3+r10*4+416*8+0] + paddd m16, [t3+r10*4+416*4+8] + paddd m17, [t3+r10*4+416*8+8] + paddd m2, m16, [t3+r10*4+416*4+4] + paddd m3, m17, [t3+r10*4+416*8+4] + paddd m0, m1 + pslld m1, 2 + pslld m2, 2 + paddd m1, m0 ; ab5 565 + paddd m3, m3 ; ab3[ 0] 222 + psubd m2, m16 ; ab3[-1] 343 + mova [t3+r10*4+416*20], m3 + pandn m0, m24, m1 ; a5 565 + mova [t3+r10*4+416*24], m2 + psrld m1, 12 ; b5 565 + mova [t3+r10*4+416*12], m0 + paddd m3, m3 + mova [t3+r10*4+416*16], m1 + psubd m3, m17 ; ab3[ 0] 343 + mova [t3+r10*4+416*28], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m2, [t3+r10*4+4] + paddd m3, m2, [t3+r10*4+0] + paddd m3, [t3+r10*4+8] + mova m1, [t3+r10*4+416*4+0] + paddd m2, m3 + pslld m3, 2 + paddd m1, [t3+r10*4+416*4+8] + paddd m3, m2 + pandn m2, m24, m3 + psrld m3, 12 + paddd m0, m2, [t3+r10*4+416*12] ; a5 + paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8) + mova [t3+r10*4+416*12], m2 + mova [t3+r10*4+416*16], m3 + paddd m2, m1, [t3+r10*4+416*4+4] + paddd 
m2, m2 ; ab3[ 1] 222 + mova m3, [t3+r10*4+416*20] + paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+416*20], m2 + paddd m2, m2 + psubd m2, m1 ; ab3[ 1] 343 + mova [t3+r10*4+416*24], m2 + paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m1, m24, m17 + psrld m17, 12 + pandn m3, m24, m2 + psrld m2, 12 + paddd m1, m3 ; a3 + pmovzxbd m3, [dstq+r10] + paddd m17, m2 ; b3 + (1 << 8) + pmaddwd m0, m3 ; a5 * src + pmaddwd m1, m3 ; a3 * src + vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) + psubd m16, m0 ; b5 - a5 * src + (1 << 8) + psubd m17, m1 ; b3 - a3 * src + (1 << 8) + psrld m16, 9 + pslld m17, 7 + vmovdqu8 m17{k2}, m16 + vpdpwssd m3, m17, m26 + packuswb m3, m2 + vpermb m16, m27, m3 + mova [dstq+r10], xm16 + add r10, 16 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m1, [t3+r10*4+416*8+0] + paddd m1, [t3+r10*4+416*8+8] + paddd m2, m1, [t3+r10*4+416*8+4] + paddd m2, m2 ; ab3[ 1] 222 + mova m0, [t3+r10*4+416*20] + paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343 + pmovzxbd m3, [dstq+r10] + mova [t3+r10*4+416*20], m2 + paddd m2, m2 + psubd m2, m1 ; ab3[ 1] 343 + mova [t3+r10*4+416*28], m2 + paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m1, m24, m17 + psrld m17, 12 + pandn m2, m24, m0 + psrld m0, 12 + paddd m1, m2 ; a3 + paddd m17, m0 ; b3 + (1 << 8) + mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7) + pmaddwd m1, m3 ; a3 * src + pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src + vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) + psubd m17, m1 ; b3 - a3 * src + (1 << 8) + psubd m16, m0 ; b5 - a5 * src + (1 << 7) + pslld m17, 7 + palignr m17{k2}, m16, m16, 1 + vpdpwssd m3, m17, m26 + packuswb m3, m3 + vpermb m16, m27, m3 + mova [dstq+r10], xm16 + add r10, 16 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/looprestoration_init_tmpl.c dav1d-1.0.0/src/x86/looprestoration_init_tmpl.c --- dav1d-0.9.2/src/x86/looprestoration_init_tmpl.c 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/looprestoration_init_tmpl.c 2022-03-18 14:31:56.026356000 +0000 @@ -42,8 +42,10 @@ decl_wiener_filter_fns(sse2); decl_wiener_filter_fns(ssse3); decl_wiener_filter_fns(avx2); +decl_wiener_filter_fns(avx512icl); decl_sgr_filter_fns(ssse3); decl_sgr_filter_fns(avx2); +decl_sgr_filter_fns(avx512icl); COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c, const int bpc) @@ -75,5 +77,20 @@ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl); +#if BITDEPTH == 8 + /* With VNNI we don't need a 5-tap version. 
*/ + c->wiener[1] = c->wiener[0]; +#else + c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl); +#endif + if (bpc <= 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl); + } #endif } diff -Nru dav1d-0.9.2/src/x86/looprestoration_sse.asm dav1d-1.0.0/src/x86/looprestoration_sse.asm --- dav1d-0.9.2/src/x86/looprestoration_sse.asm 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/looprestoration_sse.asm 2022-03-18 14:31:56.026356000 +0000 @@ -100,20 +100,21 @@ %macro WIENER 0 %if ARCH_X86_64 -DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers -cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h, x +DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers +cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt, x + %define tmpstrideq strideq %define base 0 - mov fltq, fltmp - mov edged, r8m + mov fltq, r6mp mov wd, wm - mov hd, r6m + movifnidn hd, hm + mov edged, r7m movq m14, [fltq] add lpfq, wq + movq m7, [fltq+16] + add dstq, wq lea t1, [rsp+wq*2+16] mova m15, [pw_2056] - add dstq, wq - movq m7, [fltq+16] neg wq %if cpuflag(ssse3) pshufb m14, [wiener_init] @@ -143,7 +144,7 @@ %define m11 [stk+96] %define stk_off 112 %endif -cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride +cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride %define base r6-pb_right_ext_mask-21 %define stk esp %define dstq leftq @@ -152,7 +153,7 @@ %define dstmp [stk+12] %define hd dword [stk+16] %define wq [stk+20] - %define dst_strideq [stk+24] + %define strideq [stk+24] %define leftmp [stk+28] %define t2 [stk+32] %define t4 [stk+36] @@ -164,28 +165,28 @@ %define m13 [stk+64] %define m14 [stk+80] %define m15 [base+pw_2056] - mov r1, r7m ; flt + mov r1, r6m ; flt mov r0, r0m ; dst - mov r5, r5m ; w + mov r4, r4m ; w mov lpfq, lpfm - mov r2, r8m ; edge - mov r4, r6m ; h + mov r2, r7m ; edge + mov r5, r5m ; h movq m3, [r1+ 0] movq m7, [r1+16] - add r0, r5 - mov r1, r1m ; dst_stride - add lpfq, r5 + add r0, r4 + mov r1, r1m ; stride + add lpfq, r4 mov edged, r2 mov r2, r2m ; left mov dstmp, r0 - lea t1, [rsp+r5*2+stk_off] - mov hd, r4 - neg r5 - mov lpf_strideq, lpf_stridem + lea t1, [rsp+r4*2+stk_off] + mov hd, r5 + neg r4 LEA r6, pb_right_ext_mask+21 - mov wq, r5 - mov dst_strideq, r1 + mov wq, r4 + mov strideq, r1 mov leftmp, r2 + mov r4, r1 %if cpuflag(ssse3) pshufb m3, [base+wiener_init] pshufd m1, m3, q2222 @@ -203,21 +204,21 @@ mova m13, m2 mova m14, m3 %endif + psllw m7, 5 pshufd m6, m7, q0000 ; y0 y1 pshufd m7, m7, q1111 ; y2 y3 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top - lea t3, [lpfq+lpf_strideq*4] + lea t3, [lpfq+tmpstrideq*4] mov lpfq, dstmp - mov [rsp+gprsize*1], lpf_strideq - add t3, lpf_strideq - mov [rsp+gprsize*0], t3 ; below + add t3, tmpstrideq + mov [rsp], t3 ; below mov t4, t1 add t1, 384*2 call .h @@ -225,13 +226,13 @@ mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -244,19 +245,18 @@ jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 - mov lpfq, [rsp+gprsize*0] + mov lpfq, [rsp] call .hv_bottom - add lpfq, [rsp+gprsize*1] + add lpfq, strideq call .hv_bottom .v1: call 
mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v RET .no_top: - lea t3, [lpfq+lpf_strideq*4] + lea t3, [lpfq+tmpstrideq*4] mov lpfq, dstmp - mov [rsp+gprsize*1], lpf_strideq - lea t3, [t3+lpf_strideq*2] - mov [rsp+gprsize*0], t3 + lea t3, [t3+tmpstrideq*2] + mov [rsp], t3 call .h mov t6, t1 mov t5, t1 @@ -265,13 +265,13 @@ mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -455,7 +455,7 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq + add lpfq, strideq mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left @@ -547,9 +547,7 @@ mov r2, t6 %endif paddw m5, [t1+xq*2+16] - psrad m0, 11 - psrad m4, 11 - packssdw m0, m4 + packuswb m0, m4 %if ARCH_X86_64 paddw m4, m1, [t6+xq*2+16] %else @@ -567,14 +565,14 @@ pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 - psrad m1, 11 - psrad m2, 11 - packssdw m1, m2 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 jl .hv_loop - add dstq, dst_strideq + add dstq, strideq %if ARCH_X86_64 mov t6, t5 mov t5, t4 @@ -651,9 +649,7 @@ paddw m5, [r2+xq*2+16] movifnidn dstq, dstmp %endif - psrad m0, 11 - psrad m1, 11 - packssdw m0, m1 + packuswb m0, m1 punpcklwd m1, m2, m3 pmaddwd m1, m7 punpckhwd m2, m3 @@ -664,14 +660,14 @@ pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 - psrad m1, 11 - psrad m2, 11 - packssdw m1, m2 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 jl .v_loop - add dstq, dst_strideq + add dstq, strideq %if ARCH_X86_64 mov t6, t5 mov t5, t4 @@ -689,19 +685,19 @@ %endif %if ARCH_X86_64 -cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h, x - mov fltq, fltmp - mov edged, r8m +cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt, x + mov fltq, r6mp mov wd, wm - mov hd, r6m + movifnidn hd, hm + mov edged, r7m movq m14, [fltq] add lpfq, wq + movq m7, [fltq+16] + add dstq, wq mova m8, [pw_m16380] lea t1, [rsp+wq*2+16] mova m15, [pw_2056] - add dstq, wq - movq m7, [fltq+16] neg wq %if cpuflag(ssse3) pshufb m14, [wiener_init] @@ -724,34 +720,34 @@ %define m11 [stk+80] %define stk_off 96 %endif -cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride +cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride %define stk esp %define leftmp [stk+28] %define m8 [base+pw_m16380] %define m12 [base+wiener_l_shuf] %define m14 [stk+48] - mov r1, r7m ; flt + mov r1, r6m ; flt mov r0, r0m ; dst - mov r5, r5m ; w + mov r4, r4m ; w mov lpfq, lpfm - mov r2, r8m ; edge - mov r4, r6m ; h + mov r2, r7m ; edge + mov r5, r5m ; h movq m2, [r1+ 0] movq m7, [r1+16] - add r0, r5 - mov r1, r1m ; dst_stride - add lpfq, r5 + add r0, r4 + mov r1, r1m ; stride + add lpfq, r4 mov edged, r2 mov r2, r2m ; left mov dstmp, r0 - lea t1, [rsp+r5*2+stk_off] - mov hd, r4 - neg r5 - mov lpf_strideq, lpf_stridem + lea t1, [rsp+r4*2+stk_off] + mov hd, r5 + neg r4 LEA r6, pb_right_ext_mask+21 - mov wq, r5 - mov dst_strideq, r1 + mov wq, r4 + mov strideq, r1 mov leftmp, r2 + mov r4, r1 %if cpuflag(ssse3) pshufb m2, [base+wiener_init] pshufd m1, m2, q3333 @@ -766,27 +762,27 @@ mova m13, m1 mova m14, m2 %endif + psllw m7, 5 pshufd m6, m7, q0000 ; __ y1 pshufd m7, m7, q1111 ; y2 y3 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top - lea xq, 
[lpfq+lpf_strideq*4] + lea xq, [lpfq+tmpstrideq*4] mov lpfq, dstmp mov t3, t1 add t1, 384*2 - mov [rsp+gprsize*1], lpf_strideq - add xq, lpf_strideq - mov [rsp+gprsize*0], xq ; below + add xq, tmpstrideq + mov [rsp], xq ; below call .h mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -799,25 +795,24 @@ jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 - mov lpfq, [rsp+gprsize*0] + mov lpfq, [rsp] call .hv_bottom - add lpfq, [rsp+gprsize*1] + add lpfq, strideq call .hv_bottom .end: RET .no_top: - lea t3, [lpfq+lpf_strideq*4] + lea t3, [lpfq+tmpstrideq*4] mov lpfq, dstmp - mov [rsp+gprsize*1], lpf_strideq - lea t3, [t3+lpf_strideq*2] - mov [rsp+gprsize*0], t3 + lea t3, [t3+tmpstrideq*2] + mov [rsp], t3 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 - add lpfq, dst_strideq + add lpfq, strideq add t1, 384*2 call .h dec hd @@ -832,7 +827,7 @@ jnz .main .v2: call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v - add dstq, dst_strideq + add dstq, strideq mov t4, t3 mov t3, t2 mov t2, t1 @@ -959,7 +954,7 @@ ret ALIGN function_align .hv: - add lpfq, dst_strideq + add lpfq, strideq mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left @@ -1026,9 +1021,7 @@ paddd m4, m2 mova m2, [t3+xq*2+16] paddw m2, [t1+xq*2+16] - psrad m0, 11 - psrad m4, 11 - packssdw m0, m4 + packuswb m0, m4 %if ARCH_X86_64 mova m3, [t2+xq*2+16] paddw m4, m1, [t4+xq*2+16] @@ -1049,14 +1042,14 @@ pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 - psrad m1, 11 - psrad m2, 11 - packssdw m1, m2 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 jl .hv_loop - add dstq, dst_strideq + add dstq, strideq mov t4, t3 mov t3, t2 mov t2, t1 @@ -1100,9 +1093,7 @@ mova m3, [r2+xq*2+16] mov dstq, dstmp %endif - psrad m0, 11 - psrad m1, 11 - packssdw m0, m1 + packuswb m0, m1 punpcklwd m1, m2, m3 pmaddwd m1, m7 punpckhwd m2, m3 @@ -1113,9 +1104,9 @@ pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 - psrad m1, 11 - psrad m2, 11 - packssdw m1, m2 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 @@ -1177,30 +1168,29 @@ %endmacro %if ARCH_X86_32 -DECLARE_REG_TMP 0, 1, 2, 3, 4 +DECLARE_REG_TMP 0, 1, 2, 3, 5 %if STACK_ALIGNMENT < 16 %assign extra_stack 5*16 %else %assign extra_stack 3*16 %endif cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \ - dst, dst_stride, left, lpf, lpf_stride, w, params, h + dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*0+4*6] - %define dst_stridemp dword [esp+calloff+16*0+4*7] + %define stridemp dword [esp+calloff+16*0+4*7] %define leftm dword [esp+calloff+16*3+4*0] %define lpfm dword [esp+calloff+16*3+4*1] - %define lpf_stridem dword [esp+calloff+16*3+4*2] - %define w0m dword [esp+calloff+16*3+4*3] - %define hd dword [esp+calloff+16*3+4*4] - %define edgeb byte [esp+calloff+16*3+4*5] - %define edged dword [esp+calloff+16*3+4*5] + %define w0m dword [esp+calloff+16*3+4*2] + %define hd dword [esp+calloff+16*3+4*3] + %define edgeb byte [esp+calloff+16*3+4*4] + %define edged dword [esp+calloff+16*3+4*4] %define leftmp leftm %else %define w0m wm - %define hd dword r6m - %define edgeb byte r8m - %define edged dword r8m + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] @@ -1211,44 +1201,40 @@ %define m8 [base+pb_1] %define m9 [esp+calloff+16*2] %define m10 [base+pd_0xf00800a4] - %define m11 [base+pw_256] + %define m11 [base+sgr_lshuf5] 
%define m12 [base+pd_34816] %define m13 [base+pb_0to15] - %define m14 [base+sgr_lshuf5] - %define r10 r5 + %define r10 r4 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov dst_strideq, [rstk+stack_offset+ 8] + mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] - mov lpf_strideq, [rstk+stack_offset+20] - mov wd, [rstk+stack_offset+24] + mov wd, [rstk+stack_offset+20] mov dstm, dstq - mov dst_stridemp, dst_strideq + mov stridemp, strideq mov leftm, leftq - mov r1, [rstk+stack_offset+28] - mov r2, [rstk+stack_offset+36] + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] mov lpfm, lpfq - mov lpf_stridem, lpf_strideq mov hd, r1 mov edged, r2 %endif %else -DECLARE_REG_TMP 4, 9, 7, 11, 12 -cglobal sgr_filter_5x5_8bpc, 5, 15, 15, -400*24-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +DECLARE_REG_TMP 8, 7, 9, 11, 12 +cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \ + w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 mov wd, wm %endif %if ARCH_X86_64 - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] - mov edged, r8m - mov hd, r6m + movifnidn hd, hm + mov edged, r7m movu m9, [paramsq] - mova m11, [pw_256] add lpfq, wq mova m8, [pb_1] lea t1, [rsp+wq*2+20] @@ -1258,18 +1244,17 @@ mova m12, [pd_34816] ; (1 << 11) + (1 << 15) lea t4, [rsp+wq*2+400*20+16] pshufhw m7, m9, q0000 - pshufb m9, m11 ; s0 + pshufb m9, [pw_256] ; s0 punpckhqdq m7, m7 ; w0 neg wq mova m13, [pb_0to15] pxor m6, m6 - mova m14, [sgr_lshuf5] + mova m11, [sgr_lshuf5] psllw m7, 4 - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w - %define lpfm [rsp+0] - %define lpf_stridem [rsp+8] + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] %else - mov r1, [rstk+stack_offset+32] ; params + mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ movu m1, [r1] add lpfm, wq @@ -1281,8 +1266,8 @@ mov t3m, t3 pshufhw m7, m1, q0000 mov t4m, t4 - pshufb m1, m11 ; s0 - punpckhqdq m7, m7 ; w0 + pshufb m1, [base+pw_256] ; s0 + punpckhqdq m7, m7 ; w0 psllw m7, 4 neg wq mova m9, m1 @@ -1290,22 +1275,22 @@ mov w1m, wd sub wd, 2 mov lpfq, lpfm - mov lpf_strideq, lpf_stridem mov w0m, wd + %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, stridemp movif32 t2m, t1 mov t2, t1 call .top_fixup add t1, 400*6 call .h_top - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - movif64 lpf_stridem, lpf_strideq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below movif32 t0m, t2 mov t0, t2 @@ -1314,7 +1299,7 @@ or edged, 16 call .h .main: - add lpfq, dst_stridemp + add lpfq, stridemp movif32 t4, t4m call .hv call .prep_n @@ -1322,16 +1307,16 @@ jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp %if ARCH_X86_64 test hb, hb %else - mov r5, hd - test r5, r5 + mov r4, hd + test r4, r4 %endif jz .odd_height call .h - add lpfq, dst_stridemp + add lpfq, stridemp call .hv movif32 dstq, dstm call .n0 @@ -1343,7 +1328,7 @@ jz .extend_bottom mov lpfq, lpfm call .h_top - add lpfq, lpf_stridem + add lpfq, stridemp call .hv_bottom .end: movif32 dstq, dstm @@ -1370,10 +1355,10 @@ call .v jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - movif64 lpf_stridem, lpf_strideq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] 
mov lpfm, r10 call .h lea t2, [t1+400*6] @@ -1409,9 +1394,9 @@ %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else - %define leftq r5 + %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -1425,11 +1410,11 @@ .h_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] - pshufb m5, m14 + pshufb m5, m11 jmp .h_main .h_top: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -1483,7 +1468,7 @@ ret .top_fixup: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wd, w0m %endif @@ -1503,7 +1488,7 @@ ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -1519,11 +1504,11 @@ .hv_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] - pshufb m5, m14 + pshufb m5, m11 jmp .hv_main .hv_bottom: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -1615,10 +1600,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 - psubw m2, m11, m3 ; a paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m12 - mova [t4+wq*2+4], m2 + mova [t4+wq*2+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*4+ 8], m0 @@ -1641,7 +1625,7 @@ jmp .hv_main2 .v: ; vertical boxsum + ab %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wd, w0m %endif @@ -1685,10 +1669,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 - psubw m2, m11, m3 ; a paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m12 - mova [t4+wq*2+4], m2 + mova [t4+wq*2+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*4+ 8], m0 @@ -1697,7 +1680,7 @@ jl .v_loop ret .prep_n: ; initial neighbor setup - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*2+ 2] @@ -1729,7 +1712,7 @@ ret ALIGN function_align .n0: ; neighbor + output (even rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*2+ 2] @@ -1767,24 +1750,22 @@ punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 8) - paddd m3, m5 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n0_loop - add dstq, dst_stridemp + add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n1_loop: movq m0, [dstq+wq] @@ -1794,24 +1775,22 @@ punpcklbw m0, m6 punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a - pmaddwd m2, m1 + pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 7) - paddd m3, m5 - psrld m2, 8 - psrld m3, 8 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 7) + psubd m5, m3 + psrad m4, 8 + psrad m5, 8 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n1_loop - add dstq, dst_stridemp + add dstq, stridemp movif32 dstm, dstq ret @@ -1822,23 +1801,22 @@ %assign extra_stack 2*16 %endif cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \ - dst, dst_stride, left, lpf, lpf_stride, w, params, h + dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*2+4*0] - %define dst_stridemp dword [esp+calloff+16*2+4*1] + %define 
stridemp dword [esp+calloff+16*2+4*1] %define leftm dword [esp+calloff+16*2+4*2] %define lpfm dword [esp+calloff+16*2+4*3] - %define lpf_stridem dword [esp+calloff+16*2+4*4] - %define w0m dword [esp+calloff+16*2+4*5] - %define hd dword [esp+calloff+16*2+4*6] - %define edgeb byte [esp+calloff+16*2+4*7] - %define edged dword [esp+calloff+16*2+4*7] + %define w0m dword [esp+calloff+16*2+4*4] + %define hd dword [esp+calloff+16*2+4*5] + %define edgeb byte [esp+calloff+16*2+4*6] + %define edged dword [esp+calloff+16*2+4*6] %define leftmp leftm %else %define w0m wm - %define hd dword r6m - %define edgeb byte r8m - %define edged dword r8m + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] @@ -1848,41 +1826,37 @@ %define m9 [esp+calloff+16*1] %define m10 [base+pd_0xf00801c7] %define m11 [base+pd_34816] - %define m12 [base+pw_256] + %define m12 m6 %define m13 [base+sgr_lshuf3] - %define m14 m6 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov dst_strideq, [rstk+stack_offset+ 8] + mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] - mov lpf_strideq, [rstk+stack_offset+20] - mov wd, [rstk+stack_offset+24] + mov wd, [rstk+stack_offset+20] mov dstm, dstq - mov dst_stridemp, dst_strideq + mov stridemp, strideq mov leftm, leftq - mov r1, [rstk+stack_offset+28] - mov r2, [rstk+stack_offset+36] + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] mov lpfm, lpfq - mov lpf_stridem, lpf_strideq mov hd, r1 mov edged, r2 %endif %else -cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*42-8, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \ + w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 mov wd, wm %endif %if ARCH_X86_64 - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] - mov edged, r8m - mov hd, r6m + mov hd, hm + mov edged, r7m movq m9, [paramsq+4] - mova m12, [pw_256] add lpfq, wq lea t1, [rsp+wq*2+12] mova m8, [pb_0to15] @@ -1892,16 +1866,16 @@ lea t4, [rsp+wq*2+400*32+8] mova m11, [pd_34816] pshuflw m7, m9, q3333 - pshufb m9, m12 ; s1 + pshufb m9, [pw_256] ; s1 punpcklqdq m7, m7 ; w1 neg wq pxor m6, m6 mova m13, [sgr_lshuf3] psllw m7, 4 - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else - mov r1, [rstk+stack_offset+32] ; params + mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ movq m1, [r1+4] add lpfm, wq @@ -1913,8 +1887,8 @@ mov t3m, t3 pshuflw m7, m1, q3333 mov t4m, t4 - pshufb m1, m12 ; s1 - punpcklqdq m7, m7 ; w1 + pshufb m1, [base+pw_256] ; s1 + punpcklqdq m7, m7 ; w1 psllw m7, 4 neg wq mova m9, m1 @@ -1922,19 +1896,20 @@ mov w1m, wd sub wd, 2 mov lpfq, lpfm - mov lpf_strideq, lpf_stridem mov w0m, wd + %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, stridemp mov t2, t1 add t1, 400*6 call .h_top - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 @@ -1942,24 +1917,24 @@ dec hd jz .height1 movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp 
call .hv0 %if ARCH_X86_64 test hb, hb %else - mov r5, hd - test r5, r5 + mov r4, hd + test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .n0 call .n1 @@ -1969,12 +1944,8 @@ jz .extend_bottom mov lpfq, lpfm call .hv0_bottom -%if ARCH_X86_64 - add lpfq, lpf_strideq -%else - mov lpfq, hvsrcm - add lpfq, lpf_stridem -%endif + movif32 lpfq, hvsrcm + add lpfq, stridemp call .hv1_bottom .end: call .n0 @@ -1999,13 +1970,14 @@ call .v1 jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - lea r10, [r10+lpf_strideq*2] + lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wq, w0m mov hvsrcm, lpfq @@ -2043,9 +2015,9 @@ %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else - %define leftq r5 + %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2063,7 +2035,7 @@ jmp .h_main .h_top: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2102,7 +2074,7 @@ ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -2122,7 +2094,7 @@ jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -2181,8 +2153,8 @@ pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m14 ; p * s - MULLD m5, m9, m14 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2193,15 +2165,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m14 - MULLD m1, m5, m14 - psubw m2, m12, m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq*2+4], m2 + mova [t4+wq*2+4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+ 8], m0 @@ -2212,7 +2183,7 @@ ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -2232,7 +2203,7 @@ jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -2285,8 +2256,8 @@ pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m14 ; p * s - MULLD m5, m9, m14 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2297,15 +2268,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m14 - MULLD m1, m5, m14 - psubw m2, m12, m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq*2+400*2 +4], m2 + mova [t4+wq*2+400*2 +4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+400*4+ 8], m0 @@ -2318,7 +2288,7 @@ ret .v0: ; vertical boxsums + ab (even rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wd, w0m %endif @@ -2345,8 +2315,8 @@ pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m14 ; p * s - MULLD m5, m9, m14 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2356,15 +2326,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m14 - MULLD m1, m5, m14 - psubw m2, m12, 
m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq*2+4], m2 + mova [t4+wq*2+4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+ 8], m0 @@ -2374,7 +2343,7 @@ ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wd, w0m %endif @@ -2398,8 +2367,8 @@ pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 - MULLD m4, m9, m14 ; p * s - MULLD m5, m9, m14 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 @@ -2409,15 +2378,14 @@ GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 - MULLD m0, m4, m14 - MULLD m1, m5, m14 - psubw m2, m12, m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 - mova [t4+wq*2+400*2+ 4], m2 + mova [t4+wq*2+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+400*4+ 8], m0 @@ -2429,7 +2397,7 @@ mov t1, r10 ret .prep_n: ; initial neighbor setup - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*2+400*0+ 4] @@ -2482,7 +2450,7 @@ ret ALIGN function_align .n0: ; neighbor + output (even rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m3, [t4+wq*2+400*0+4] @@ -2523,24 +2491,22 @@ punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 8) - paddd m3, m5 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n0_loop - add dstq, dst_stridemp + add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*2+400*2+4] @@ -2581,20 +2547,18 @@ punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 - paddd m2, m4 ; a * src + b + (1 << 8) - paddd m3, m5 - psrld m2, 9 - psrld m3, 9 - packssdw m2, m3 - psllw m1, m0, 4 - psubw m2, m1 - pmulhrsw m2, m7 - paddw m0, m2 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n1_loop - add dstq, dst_stridemp + add dstq, stridemp movif32 dstm, dstq ret @@ -2605,23 +2569,22 @@ %assign extra_stack 8*16 %endif cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \ - dst, dst_stride, left, lpf, lpf_stride, w, params, h + dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*8+4*0] - %define dst_stridemp dword [esp+calloff+16*8+4*1] + %define stridemp dword [esp+calloff+16*8+4*1] %define leftm dword [esp+calloff+16*8+4*2] %define lpfm dword [esp+calloff+16*8+4*3] - %define lpf_stridem dword [esp+calloff+16*8+4*4] - %define w0m dword [esp+calloff+16*8+4*5] - %define hd dword [esp+calloff+16*8+4*6] - %define edgeb byte [esp+calloff+16*8+4*7] - %define edged dword [esp+calloff+16*8+4*7] + %define w0m dword [esp+calloff+16*8+4*4] + %define hd dword [esp+calloff+16*8+4*5] + %define edgeb byte [esp+calloff+16*8+4*6] + %define edged dword [esp+calloff+16*8+4*6] %define leftmp leftm %else %define w0m wm - %define hd dword r6m - %define edgeb byte r8m - %define edged dword r8m + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m %endif %define hvsrcm 
dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] @@ -2631,7 +2594,7 @@ %define m9 [base+pd_0xffff] %define m10 [base+pd_34816] %define m11 [base+pd_0xf00801c7] - %define m12 [base+pw_256] + %define m12 [base+pd_0xf00800a4] %define m13 [esp+calloff+16*4] %define m14 [esp+calloff+16*5] %define m15 [esp+calloff+16*6] @@ -2639,44 +2602,41 @@ %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 - mov dst_strideq, [rstk+stack_offset+ 8] + mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] - mov lpf_strideq, [rstk+stack_offset+20] - mov wd, [rstk+stack_offset+24] + mov wd, [rstk+stack_offset+20] mov dstm, dstq - mov dst_stridemp, dst_strideq + mov stridemp, strideq mov leftm, leftq - mov r1, [rstk+stack_offset+28] - mov r2, [rstk+stack_offset+36] + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] mov lpfm, lpfq - mov lpf_stridem, lpf_strideq mov hd, r1 mov edged, r2 %endif %else -cglobal sgr_filter_mix_8bpc, 5, 15, 16, -400*66-40, dst, dst_stride, left, \ - lpf, lpf_stride, w, edge, \ - params, h +cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ + w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 mov wd, wm %endif %if ARCH_X86_64 - mov paramsq, paramsmp + mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] - mov edged, r8m - mov hd, r6m + movifnidn hd, hm + mov edged, r7m mova m15, [paramsq] add lpfq, wq mova m9, [pd_0xffff] lea t1, [rsp+wq*2+44] mova m10, [pd_34816] add dstq, wq - mova m12, [pw_256] lea t3, [rsp+wq*4+400*24+40] mova m11, [pd_0xf00801c7] lea t4, [rsp+wq*2+400*52+40] + mova m12, [base+pd_0xf00800a4] neg wq pshuflw m13, m15, q0000 pshuflw m14, m15, q2222 @@ -2686,10 +2646,10 @@ punpckhqdq m15, m15 ; w0 w1 pxor m6, m6 psllw m15, 2 - DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else - mov r1, [rstk+stack_offset+32] ; params + mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ mova m2, [r1] add lpfm, wq @@ -2716,13 +2676,13 @@ mova m15, m2 mova m6, m3 mov lpfq, lpfm - mov lpf_strideq, lpf_stridem mov w0m, wd + %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top - add lpfq, lpf_strideq + add lpfq, stridemp mov t2, t1 %if ARCH_X86_64 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup @@ -2732,9 +2692,10 @@ %endif add t1, 400*12 call .h_top - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - add r10, lpf_strideq + add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 @@ -2742,24 +2703,24 @@ dec hd jz .height1 movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hd, hd %else - mov r5, hd - test r5, r5 + mov r4, hd + test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm - add lpfq, dst_stridemp + add lpfq, stridemp call .hv1 call .n0 call .n1 @@ -2769,12 +2730,8 @@ jz .extend_bottom mov lpfq, lpfm call .hv0_bottom -%if ARCH_X86_64 - add lpfq, lpf_strideq -%else - mov lpfq, hvsrcm - add lpfq, lpf_stridem -%endif + movif32 lpfq, hvsrcm + add lpfq, stridemp call .hv1_bottom .end: call .n0 @@ -2799,13 +2756,14 @@ call .v1 jmp .end .no_top: - lea r10, [lpfq+lpf_strideq*4] + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] mov lpfq, dstq - lea r10, [r10+lpf_strideq*2] + lea r10, 
[r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wq, w0m mov hvsrcm, lpfq @@ -2856,9 +2814,9 @@ %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else - %define leftq r5 + %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2876,7 +2834,7 @@ jmp .h_main .h_top: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left @@ -2936,7 +2894,7 @@ ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -2956,7 +2914,7 @@ jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -3064,10 +3022,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 - psubw m2, m12, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*2+400*2+ 4], m2 + mova [t4+wq*2+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+400*4+ 8], m0 @@ -3078,7 +3035,7 @@ ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -3098,7 +3055,7 @@ jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov hvsrcm, lpfq %endif @@ -3189,18 +3146,16 @@ punpckhwd m3, m8, m8 MULLD m0, m2, m7 MULLD m5, m3, m7 - psubw m7, m12, m8 -%if ARCH_X86_32 - mova m8, [esp+20] -%endif paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m5, m10 psrld m0, 12 psrld m5, 12 - mova [t4+wq*2+400*4+ 4], m7 + mova [t4+wq*2+400*4+ 4], m8 mova [t3+wq*4+400*8+ 8], m0 mova [t3+wq*4+400*8+24], m5 -%if ARCH_X86_64 +%if ARCH_X86_32 + mova m8, [esp+20] +%else SWAP m6, m8 pxor m6, m6 %endif @@ -3234,14 +3189,13 @@ SWAP m7, m6 %endif psubd m2, m4 ; p5 - mova m4, [base+pd_0xf00800a4] psubd m3, m1 MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 - pmaddwd m0, m4 ; b5 * 164 - pmaddwd m5, m4 - paddusw m2, m4 - paddusw m3, m4 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m5, m12 + paddusw m2, m12 + paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m1, m2, m3, r0, dstm @@ -3249,10 +3203,9 @@ punpckhwd m3, m1, m1 MULLD m0, m2, m7 MULLD m5, m3, m7 - psubw m4, m12, m1 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m5, m10 - mova [t4+wq*2+4], m4 + mova [t4+wq*2+4], m1 psrld m0, 12 psrld m5, 12 mova [t3+wq*4+ 8], m0 @@ -3265,7 +3218,7 @@ ret .v0: ; vertical boxsums + ab3 (even rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wd, w0m %endif @@ -3313,10 +3266,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 - psubw m2, m12, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*2+400*2+4], m2 + mova [t4+wq*2+400*2+4], m3 psrld m0, 12 psrld m1, 12 mova m3, [t1+wq*2+400*0] @@ -3338,7 +3290,7 @@ ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 - lea wq, [r5-2] + lea wq, [r4-2] %else mov wd, w0m %endif @@ -3383,10 +3335,9 @@ punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 - psubw m2, m12, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*2+400*4+4], m2 + mova [t4+wq*2+400*4+4], m3 psrld m0, 12 psrld m8, m1, 12 mova m4, [t3+wq*4+400*8+ 8] @@ -3421,17 +3372,16 @@ punpckhwd m1, m7 pmaddwd m5, m1, m1 psubd m2, m4 ; p5 - mova m4, [base+pd_0xf00800a4] psubd m3, m5 %if ARCH_X86_64 SWAP m7, m6 %endif MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 - pmaddwd m0, m4 ; b5 * 
164 - pmaddwd m1, m4 - paddusw m2, m4 - paddusw m3, m4 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m1, m12 + paddusw m2, m12 + paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m4, m2, m3, r0, dstm @@ -3439,10 +3389,9 @@ punpckhwd m3, m4, m4 MULLD m0, m2, m7 MULLD m1, m3, m7 - psubw m5, m12, m4 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 - mova [t4+wq*2+4], m5 + mova [t4+wq*2+4], m4 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+ 8], m0 @@ -3454,7 +3403,7 @@ mov t1, r10 ret .prep_n: ; initial neighbor setup - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*2+400*0+ 2] @@ -3531,7 +3480,7 @@ ret ALIGN function_align .n0: ; neighbor + output (even rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*2+ 4] @@ -3609,55 +3558,50 @@ mova [rsp+32+ARCH_X86_32*4], m7 movq m4, [dstq+wq] punpcklbw m4, m6 - punpcklwd m7, m2, m6 - punpckhwd m2, m6 - punpcklwd m8, m3, m6 - punpckhwd m3, m6 punpcklwd m5, m4, m6 - punpckhwd m4, m6 + punpcklwd m7, m2, m6 pmaddwd m7, m5 ; a5 * src + punpcklwd m8, m3, m6 pmaddwd m8, m5 ; a3 * src - pmaddwd m2, m4 - pmaddwd m3, m4 - pslld m5, 13 - pslld m4, 13 - psubd m0, m5 - psubd m1, m5 - paddd m0, m7 ; a5 * src + b5 + (1 << 8) - (src << 13) - paddd m1, m8 ; a3 * src + b3 + (1 << 8) - (src << 13) + punpckhwd m5, m4, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 + punpckhwd m3, m6 + pmaddwd m3, m5 + psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13) + psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13) psrld m0, 9 pslld m1, 7 pand m0, m9 pandn m8, m9, m1 por m0, m8 - psubd m1, m4, [rsp+16+ARCH_X86_32*4] - psubd m8, m4, [rsp+32+ARCH_X86_32*4] - psubd m2, m1 - psubd m3, m8 - mova m1, [base+pd_4096] - psrld m2, 9 - pslld m3, 7 - pand m2, m9 - pandn m8, m9, m3 - por m2, m8 + mova m1, [rsp+16+ARCH_X86_32*4] + psubd m1, m2 + mova m2, [rsp+32+ARCH_X86_32*4] + psubd m2, m3 + mova m3, [base+pd_4096] + psrld m1, 9 + pslld m2, 7 + pand m1, m9 + pandn m5, m9, m2 + por m1, m5 pmaddwd m0, m15 - pmaddwd m2, m15 - paddd m5, m1 - paddd m4, m1 - paddd m0, m5 - paddd m2, m4 + pmaddwd m1, m15 + paddd m0, m3 + paddd m1, m3 psrad m0, 13 - psrad m2, 13 - packssdw m0, m2 ; clip + psrad m1, 13 + packssdw m0, m1 + paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n0_loop - add dstq, dst_stridemp + add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) - movif64 wq, r5 + movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*2+400*4+4] @@ -3691,54 +3635,47 @@ mova [t3+wq*4+400*24+ 0], m4 mova [t3+wq*4+400*24+16], m0 movq m5, [dstq+wq] - mova m8, [t4+wq*2+400* 6] + mova m2, [t4+wq*2+400* 6] punpcklbw m5, m6 punpcklwd m4, m5, m6 - punpckhwd m5, m6 - punpcklwd m0, m8, m6 - punpckhwd m8, m6 - punpcklwd m2, m3, m6 + punpcklwd m8, m2, m6 + pmaddwd m8, m4 ; a5 * src + punpcklwd m0, m3, m6 + pmaddwd m0, m4 ; a3 * src + punpckhwd m4, m5, m6 + punpckhwd m2, m6 + pmaddwd m2, m4 punpckhwd m3, m6 - pmaddwd m0, m4 ; a5 * src - pmaddwd m2, m4 ; a3 * src - pmaddwd m8, m5 - pmaddwd m3, m5 - paddd m1, m2 ; a3 * src + b3 + (1 << 8) - (src << 13) - pslld m4, 12 - pslld m5, 12 - psubd m2, m4, [t3+wq*4+400*12+ 0] - psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) - psubd m2, m5, [t3+wq*4+400*12+16] - psubd m8, m2 - paddd m4, m4 - paddd m5, m5 - paddd m7, m3 - psubd m1, m4 - psubd m7, m5 - psrld m0, 8 - psrld m8, 8 + pmaddwd m3, m4 + psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13) + mova m0, [t3+wq*4+400*12+ 0] + psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13) + mova m4, 
[t3+wq*4+400*12+16] + psubd m4, m2 + psubd m7, m3 pslld m1, 7 + psrld m0, 8 + psrld m4, 8 pslld m7, 7 - pand m0, m9 - pand m8, m9 pandn m3, m9, m1 - pandn m2, m9, m7 + pand m0, m9 por m0, m3 - por m8, m2 + pand m4, m9 + pandn m2, m9, m7 + por m2, m4 mova m1, [base+pd_4096] pmaddwd m0, m15 - pmaddwd m8, m15 - paddd m4, m1 - paddd m5, m1 - paddd m0, m4 - paddd m8, m5 + pmaddwd m2, m15 + paddd m0, m1 + paddd m2, m1 psrad m0, 13 - psrad m8, 13 - packssdw m0, m8 ; clip + psrad m2, 13 + packssdw m0, m2 + paddw m0, m5 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n1_loop - add dstq, dst_stridemp + add dstq, stridemp movif32 dstm, dstq ret diff -Nru dav1d-0.9.2/src/x86/mc16_avx2.asm dav1d-1.0.0/src/x86/mc16_avx2.asm --- dav1d-0.9.2/src/x86/mc16_avx2.asm 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/mc16_avx2.asm 2022-03-18 14:31:56.026356000 +0000 @@ -28,29 +28,43 @@ %if ARCH_X86_64 -SECTION_RODATA 32 +SECTION_RODATA 64 ; dav1d_obmc_masks[] * -512 -obmc_masks: dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 +const obmc_masks_avx2 + dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0 dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120 dw -4096, -3072, -2048, -1536, 0, 0, 0, 0 dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240 dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608 dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024 + dw 0, 0, 0, 0, 0, 0, 0, 0 -blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7 resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +bdct_lb_q: times 8 db 0 + times 8 db 4 + times 8 db 8 + times 8 db 12 -put_bilin_h_rnd: dw 8, 8, 10, 10 -prep_mul: dw 16, 16, 4, 4 +prep_mul: dw 16, 16, 4, 4 +put_bilin_h_rnd: dw 8, 8, 10, 10 put_8tap_h_rnd: dd 34, 40 +s_8tap_h_rnd: dd 2, 8 +s_8tap_h_sh: dd 2, 4 +put_s_8tap_v_rnd: dd 512, 128 +put_s_8tap_v_sh: dd 10, 8 prep_8tap_1d_rnd: dd 8 - (8192 << 4) prep_8tap_2d_rnd: dd 32 - (8192 << 5) warp8x8t_rnd: dd 16384 - (8192 << 15) @@ -60,20 +74,24 @@ bidir_mul: dw 2048, 2048, 8192, 8192 %define pw_16 prep_mul +%define pd_512 put_s_8tap_v_rnd -pw_2: times 2 dw 2 -pw_64: times 2 dw 64 -pw_2048: times 2 dw 2048 -pw_8192: times 2 dw 8192 -pw_27615: times 2 dw 27615 -pw_32766: times 2 dw 32766 -pw_m512: times 2 dw -512 -pd_32: dd 32 -pd_63: dd 63 -pd_64: dd 64 -pd_512: dd 512 -pd_32768: dd 32768 -pd_65538: dd 65538 +pw_2: times 2 dw 2 +pw_64: times 2 dw 64 +pw_2048: times 2 dw 2048 +pw_8192: times 2 dw 8192 +pw_27615: times 2 dw 27615 +pw_32766: times 2 dw 32766 +pw_m512: times 2 dw -512 +pd_32: dd 32 +pd_63: dd 63 +pd_64: dd 64 +pd_32768: dd 32768 +pd_65538: dd 65538 +pd_m524256: dd -524256 ; -8192 << 6 + 32 +pd_0x3ff: dd 0x3ff +pq_0x40000000: dq 0x40000000 + dd 0 %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) @@ -147,6 +165,33 @@ HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 
32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +%macro SCALED_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) +%%table: + %rep %0 - 2 + dw %%base %+ .w%3 - %%base + %rotate 1 + %endrep + %rotate 2 + %%dy_1024: + %xdefine %1_%2_dy1_table (%%dy_1024 - %3) + %rep %0 - 2 + dw %%base %+ .dy1_w%3 - %%base + %rotate 1 + %endrep + %rotate 2 + %%dy_2048: + %xdefine %1_%2_dy2_table (%%dy_2048 - %3) + %rep %0 - 2 + dw %%base %+ .dy2_w%3 - %%base + %rotate 1 + %endrep +%endmacro + +SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 + %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX cextern mc_subpel_filters @@ -1177,8 +1222,8 @@ %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v -cglobal %1_8tap_%2_16bpc +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d @@ -1186,7 +1231,7 @@ mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) + jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) %endif %endmacro @@ -1196,15 +1241,16 @@ DECLARE_REG_TMP 7, 8 %endif -MC_8TAP_FN put, sharp, SHARP, SHARP -MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH -MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP -MC_8TAP_FN put, smooth, SMOOTH, SMOOTH -MC_8TAP_FN put, sharp_regular, SHARP, REGULAR -MC_8TAP_FN put, regular_sharp, REGULAR, SHARP -MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR -MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH -MC_8TAP_FN put, regular, REGULAR, REGULAR +%define PUT_8TAP_FN FN put_8tap, +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx2 @@ -1915,15 +1961,16 @@ DECLARE_REG_TMP 6, 7 %endif -MC_8TAP_FN prep, sharp, SHARP, SHARP -MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH -MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP -MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH -MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR -MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP -MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR -MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH -MC_8TAP_FN prep, regular, REGULAR, REGULAR +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx2 @@ -2511,6 +2558,1724 @@ %endif RET +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro REMAP_REG 2 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %xdefine r14_save r14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + 
REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + %xdefine r14 r14_save + %undef r14_save + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd + movu xm%1, [srcq+ r4*2] + movu xm%2, [srcq+ r6*2] + movu xm%3, [srcq+ r7*2] + movu xm%4, [srcq+ r9*2] + vinserti128 m%1, [srcq+r10*2], 1 + vinserti128 m%2, [srcq+r11*2], 1 + vinserti128 m%3, [srcq+r13*2], 1 + vinserti128 m%4, [srcq+ rX*2], 1 + add srcq, ssq + movu xm%5, [srcq+ r4*2] + movu xm%6, [srcq+ r6*2] + movu xm%7, [srcq+ r7*2] + movu xm%8, [srcq+ r9*2] + vinserti128 m%5, [srcq+r10*2], 1 + vinserti128 m%6, [srcq+r11*2], 1 + vinserti128 m%7, [srcq+r13*2], 1 + vinserti128 m%8, [srcq+ rX*2], 1 + add srcq, ssq + pmaddwd m%1, m12 + pmaddwd m%2, m13 + pmaddwd m%3, m14 + pmaddwd m%4, m15 + pmaddwd m%5, m12 + pmaddwd m%6, m13 + pmaddwd m%7, m14 + pmaddwd m%8, m15 + phaddd m%1, m%2 + %if %9 + mova m10, [rsp+0x00] + %endif + phaddd m%3, m%4 + phaddd m%5, m%6 + phaddd m%7, m%8 + phaddd m%1, m%3 + phaddd m%5, m%7 + paddd m%1, m10 + paddd m%5, m10 + psrad m%1, xm11 + psrad m%5, xm11 + packssdw m%1, m%5 +%endmacro + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isput 1 + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_16bpc, 4, 15, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %xdefine base_reg r12 + mov r7d, pxmaxm +%else + %assign isput 0 + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_16bpc, 4, 15, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %define tmp_stridem qword [rsp+0xd0] + %endif + %xdefine base_reg r11 +%endif + lea base_reg, [%1_8tap_scaled_16bpc_avx2] +%define base base_reg-%1_8tap_scaled_16bpc_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm +%if isprep && UNIX64 + movd xm10, mxd + vpbroadcastd m10, xm10 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 + mov r6d, pxmaxm +%else + vpbroadcastd m10, mxm + %if isput + vpbroadcastw m11, pxmaxm + %else + mov r6d, pxmaxm + %endif +%endif + mov dyd, dym +%if isput + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x98] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+0x98] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + shr r7d, 11 + vpbroadcastd m6, [base+pd_0x3ff] + vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4] + movd xm7, [base+s_8tap_h_sh+r7*4] +%if isput + vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4] + pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2 +%else + vpbroadcastd 
m13, [base+pd_m524256] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.w2: + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m10, m8 ; mx+dx*[0,1] + vpbroadcastd xm14, [base+pq_0x40000000+2] + vpbroadcastd xm15, xm15 + pand xm8, xm10, xm6 + psrld xm8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_q] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd xm4, [base+subpel_filters+r6*8+2] + pcmpeqd xm8, xm9 + psrld m10, 10 + paddd m10, m10 + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + movu xm2, [srcq+ssq*2] + movu xm3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m10, m5 + paddb m10, m6 + vpblendd xm15, xm4, 0xa + pblendvb xm15, xm14, xm8 + pmovsxbw m15, xm15 + vinserti128 m0, [srcq+ssq*0], 1 ; 0 4 + vinserti128 m1, [srcq+ssq*1], 1 ; 1 5 + vinserti128 m2, [srcq+ssq*2], 1 ; 2 6 + vinserti128 m3, [srcq+ss3q ], 1 ; 3 7 + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m10}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + phaddd m0, m1 + phaddd m2, m3 + paddd m0, m12 + paddd m2, m12 + psrad m0, xm7 + psrad m2, xm7 + packssdw m0, m2 ; 0 1 2 3 4 5 6 7 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 ; 1 2 3 4 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + pshufd xm4, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm4 ; 45 56 + punpckhwd xm4, xm1, xm4 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm14, r6q + pmovsxbw xm14, xm14 + pshufd xm8, xm14, q0000 + pshufd xm9, xm14, q1111 + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pshufd xm8, xm14, q2222 + pshufd xm14, xm14, q3333 + paddd xm5, xm6 + pmaddwd xm6, xm2, xm8 + pmaddwd xm8, xm4, xm14 + psrldq xm9, xm7, 8 + paddd xm5, xm6 + paddd xm5, xm13 + paddd xm5, xm8 + psrad xm5, xm9 + packusdw xm5, xm5 + pminsw xm5, xm11 + movd [dstq], xm5 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movu xm5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps xm3, xm0, q1032 ; 01 12 + shufps xm0, xm2, q1032 ; 23 34 + shufps xm2, xm4, q1032 ; 45 56 + pshufb xm5, xm10 + pmaddwd xm5, xm15 + phaddd xm5, xm5 + paddd xm5, xm12 + psrad xm5, xm7 + packssdw xm5, xm5 + palignr xm1, xm5, xm1, 12 + punpcklqdq xm1, xm1 ; 6 7 6 7 + punpcklwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movu xm6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xm3, xm0 ; 01 12 + mova xm0, xm2 ; 23 34 + pshufb xm5, xm10 + pshufb xm6, xm10 + pmaddwd xm5, xm15 + pmaddwd xm6, xm15 + phaddd xm5, xm6 + paddd xm5, xm12 + psrad xm5, xm7 + packssdw xm5, xm5 ; 6 7 6 7 + palignr xm1, xm5, xm1, 8 ; 4 5 6 7 + pshufd xm5, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm5 ; 45 56 + punpckhwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +%endif +.w4: + mov myd, mym + mova [rsp+0x00], m12 +%if isput + mova [rsp+0x20], xm13 +%else + SWAP m11, m13 +%endif + mova [rsp+0x30], xm7 + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastq m2, [base+pq_0x40000000+1] + vpbroadcastd xm15, xm15 + SWAP m13, m10 + paddd m13, m8 ; mx+dx*[0-3] + pand m6, m13 + psrld m6, 6 + paddd xm15, xm6 + movd r4d, xm15 + pextrd r6d, xm15, 
1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + vbroadcasti128 m5, [base+bdct_lb_q+ 0] + vbroadcasti128 m1, [base+bdct_lb_q+16] + vbroadcasti128 m0, [base+subpel_s_shuf2] + vpbroadcastd xm14, [base+subpel_filters+r4*8+2] + vpbroadcastd xm7, [base+subpel_filters+r6*8+2] + vpbroadcastd xm15, [base+subpel_filters+r11*8+2] + vpbroadcastd xm8, [base+subpel_filters+r13*8+2] + pcmpeqd m6, m9 + punpckldq m10, m6, m6 + punpckhdq m6, m6 + psrld m13, 10 + paddd m13, m13 + vpblendd xm14, xm7, 0xa + vpblendd xm15, xm8, 0xa + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + pblendvb m14, m2, m10 + pblendvb m15, m2, m6 + pextrd r4, xm13, 2 + pshufb m12, m13, m5 + pshufb m13, m1 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu xm7, [srcq+ssq*0] + movu xm9, [srcq+ssq*1] + movu xm8, [srcq+ssq*2] + movu xm10, [srcq+ss3q ] + movu xm1, [srcq+r4 ] + movu xm3, [srcq+r6 ] + movu xm2, [srcq+r11 ] + movu xm4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + vinserti128 m7, [srcq+ssq*0], 1 + vinserti128 m9, [srcq+ssq*1], 1 + vinserti128 m8, [srcq+ssq*2], 1 + vinserti128 m10, [srcq+ss3q ], 1 + vinserti128 m1, [srcq+r4 ], 1 + vinserti128 m3, [srcq+r6 ], 1 + vinserti128 m2, [srcq+r11 ], 1 + vinserti128 m4, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + vpbroadcastb m5, xm13 + psubb m13, m5 + paddb m12, m0 + paddb m13, m0 + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m14}, m7, m9, m8, m10 + REPX {pshufb x, m13}, m1, m2, m3, m4 + REPX {pmaddwd x, m15}, m1, m2, m3, m4 + mova m5, [rsp+0x00] + movd xm6, [rsp+0x30] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 4 5 + packssdw m8, m10 ; 2 3 6 7 + vextracti128 xm9, m7, 1 ; 4 5 + vextracti128 xm3, m8, 1 ; 6 7 + shufps xm4, xm7, xm8, q1032 ; 1 2 + shufps xm5, xm8, xm9, q1032 ; 3 4 + shufps xm6, xm9, xm3, q1032 ; 5 6 + psrldq xm10, xm3, 8 ; 7 _ + punpcklwd xm0, xm7, xm4 ; 01 + punpckhwd xm7, xm4 ; 12 + punpcklwd xm1, xm8, xm5 ; 23 + punpckhwd xm8, xm5 ; 34 + punpcklwd xm2, xm9, xm6 ; 45 + punpckhwd xm9, xm6 ; 56 + punpcklwd xm3, xm10 ; 67 + mova [rsp+0x40], xm7 + mova [rsp+0x50], xm8 + mova [rsp+0x60], xm9 +.w4_loop: + and myd, 0x3ff + mov r11d, 64 << 24 + mov r13d, myd + shr r13d, 6 + lea r13d, [t1+r13] + cmovnz r11q, [base+subpel_filters+r13*8] + movq xm9, r11q + pmovsxbw xm9, xm9 + pshufd xm7, xm9, q0000 + pshufd xm8, xm9, q1111 + pmaddwd xm4, xm0, xm7 + pmaddwd xm5, xm1, xm8 + pshufd xm7, xm9, q2222 + pshufd xm9, xm9, q3333 + pmaddwd xm6, xm2, xm7 + pmaddwd xm8, xm3, xm9 +%if isput + mova xm7, [rsp+0x20] + movd xm9, [rsp+0x38] +%else + SWAP m7, m11 +%endif + paddd xm4, xm5 + paddd xm6, xm8 + paddd xm4, xm6 + paddd xm4, xm7 +%if isput + psrad xm4, xm9 + packusdw xm4, xm4 + pminuw xm4, xm11 + movq [dstq], xm4 + add dstq, dsq +%else + SWAP m11, m7 + psrad xm4, 6 + packssdw xm4, xm4 + movq [tmpq], xm4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + mova xm8, [rsp+0x00] + movd xm9, [rsp+0x30] + movu xm4, [srcq] + movu xm5, [srcq+r4] + test myd, 0x400 + jz .w4_skip_line + mova xm0, [rsp+0x40] + mova [rsp+0x40], xm1 + mova xm1, [rsp+0x50] + mova [rsp+0x50], xm2 + mova xm2, [rsp+0x60] + mova [rsp+0x60], xm3 + pshufb xm4, xm12 + pshufb xm5, xm13 + pmaddwd xm4, xm14 + pmaddwd xm5, xm15 + phaddd xm4, xm5 + paddd xm4, xm8 + psrad xm4, xm9 + packssdw xm4, xm4 + punpcklwd xm3, xm10, xm4 + mova xm10, xm4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu xm6, [srcq+ssq*1] + movu xm7, [srcq+r6] + movu m0, 
[rsp+0x50] + pshufb xm4, xm12 + pshufb xm6, xm12 + pshufb xm5, xm13 + pshufb xm7, xm13 + pmaddwd xm4, xm14 + pmaddwd xm6, xm14 + pmaddwd xm5, xm15 + pmaddwd xm7, xm15 + mova [rsp+0x40], m0 + phaddd xm4, xm5 + phaddd xm6, xm7 + paddd xm4, xm8 + paddd xm6, xm8 + psrad xm4, xm9 + psrad xm6, xm9 + packssdw xm4, xm6 + punpcklwd xm9, xm10, xm4 + mova [rsp+0x60], xm9 + psrldq xm10, xm4, 8 + mova xm0, xm1 + mova xm1, xm2 + mova xm2, xm3 + punpcklwd xm3, xm4, xm10 + lea srcq, [srcq+ssq*2] + jmp .w4_loop + SWAP m10, m13 +%if isprep + SWAP m13, m11 +%endif +.w8: + mov dword [rsp+0x80], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [rsp+0x80], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+0x80], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+0x80], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+0x80], 16 + movifprep tmp_stridem, 256 +.w_start: + SWAP m10, m12, m1 + SWAP m11, m7 + ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free +%if isput + movifnidn dsm, dsq + mova [rsp+0xb0], xm7 +%endif + mova [rsp+0x00], m10 + mova [rsp+0x20], m13 + shr t0d, 16 + sub srcq, 6 + pmaddwd m8, [base+rescale_mul2] + movd xm15, t0d + mov [rsp+0x84], t0d + mov [rsp+0x88], srcq + mov [rsp+0x90], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m1, m8 ; mx+dx*[0-7] + jmp .hloop +.hloop_prep: + dec dword [rsp+0x80] + jz .ret + add qword [rsp+0x90], 16 + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m6, [base+pd_0x3ff] + paddd m1, m8, [rsp+0x40] + vpbroadcastd m15, [rsp+0x84] + pxor m9, m9 + mov srcq, [rsp+0x88] + mov r0q, [rsp+0x90] ; dstq / tmpq +.hloop: + vpbroadcastq xm2, [base+pq_0x40000000] + pand m5, m1, m6 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + vextracti128 xm7, m15, 1 + movq r6, xm15 + pextrq r9, xm15, 1 + movq r11, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r7d, r9d + shr r9, 32 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mova [rsp+0x40], m1 + movq xm12, [base+subpel_filters+ r4*8] + movq xm13, [base+subpel_filters+ r6*8] + movhps xm12, [base+subpel_filters+ r7*8] + movhps xm13, [base+subpel_filters+ r9*8] + movq xm14, [base+subpel_filters+r10*8] + movq xm15, [base+subpel_filters+r11*8] + movhps xm14, [base+subpel_filters+r13*8] + movhps xm15, [base+subpel_filters+ rX*8] + psrld m1, 10 + vextracti128 xm7, m1, 1 + vextracti128 xm6, m5, 1 + movq [rsp+0xa0], xm1 + movq [rsp+0xa8], xm7 + movq r6, xm1 + pextrq r11, xm1, 1 + movq r9, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r10d, r11d + shr r11, 32 + mov r7d, r9d + shr r9, 32 + mov r13d, rXd + shr rX, 32 + pshufd xm4, xm5, q2200 + pshufd xm5, xm5, q3311 + pshufd xm7, xm6, q2200 + pshufd xm6, xm6, q3311 + pblendvb xm12, xm2, xm4 + pblendvb xm13, xm2, xm5 + pblendvb xm14, xm2, xm7 + pblendvb xm15, xm2, xm6 + pmovsxbw m12, xm12 + pmovsxbw m13, xm13 + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + mova [rsp+0x60], m0 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b + mova m0, [rsp+0x60] + vbroadcasti128 m9, [base+subpel_s_shuf8] + mov myd, mym + mov dyd, dym + pshufb m0, m9 ; 01a 01b + pshufb m1, m9 ; 23a 23b + pshufb m2, m9 ; 45a 45b + pshufb m3, m9 ; 67a 67b +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq 
xm9, r6q + punpcklqdq xm9, xm9 + pmovsxbw m9, xm9 + pshufd m8, m9, q0000 + pshufd m7, m9, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m7 + pshufd m8, m9, q2222 + pshufd m9, m9, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m9 +%if isput + psrldq xm8, xm11, 8 +%endif + paddd m4, [rsp+0x20] + paddd m6, m7 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, xm8 + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0xb0] + mova [dstq], xm4 + add dstq, dsm +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+0x60], myd + mov r4d, [rsp+0xa0] + mov r6d, [rsp+0xa4] + mov r7d, [rsp+0xa8] + mov r9d, [rsp+0xac] + jz .skip_line + vbroadcasti128 m9, [base+wswap] + movu xm4, [srcq+ r4*2] + movu xm5, [srcq+ r6*2] + movu xm6, [srcq+ r7*2] + movu xm7, [srcq+ r9*2] + vinserti128 m4, [srcq+r10*2], 1 + vinserti128 m5, [srcq+r11*2], 1 + vinserti128 m6, [srcq+r13*2], 1 + vinserti128 m7, [srcq+ rX*2], 1 + add srcq, ssq + mov myd, [rsp+0x60] + mov dyd, dym + pshufb m0, m9 + pshufb m1, m9 + pshufb m2, m9 + pshufb m3, m9 + pmaddwd m4, m12 + pmaddwd m5, m13 + pmaddwd m6, m14 + pmaddwd m7, m15 + phaddd m4, m5 + phaddd m6, m7 + phaddd m4, m6 + paddd m4, m10 + psrad m4, xm11 + pslld m4, 16 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .vloop +.skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1 + vbroadcasti128 m9, [base+subpel_s_shuf8] + mov myd, [rsp+0x60] + mov dyd, dym + pshufb m3, m9 + jmp .vloop + SWAP m1, m12, m10 + SWAP m7, m11 +.dy1: + movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy1_w2: + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m10, m8 ; mx+dx*[0-1] + vpbroadcastd xm14, [base+pq_0x40000000+2] + vpbroadcastd xm15, xm15 + pand xm8, xm10, xm6 + psrld xm8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_q] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m4, [base+subpel_filters+r6*8+2] + pcmpeqd xm8, xm9 + psrld m10, 10 + paddd m10, m10 + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + movu xm2, [srcq+ssq*2] + movu xm3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m10, m5 + paddb m10, m6 + vpblendd xm15, xm4, 0xa + pblendvb xm15, xm14, xm8 + pmovsxbw m15, xm15 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*1], 1 + vinserti128 m2, [srcq+ssq*2], 1 + add srcq, ss3q + movq xm6, r4q + pmovsxbw xm6, xm6 + pshufd xm8, xm6, q0000 + pshufd xm9, xm6, q1111 + pshufd xm14, xm6, q2222 + pshufd xm6, xm6, q3333 + REPX {pshufb x, m10}, m0, m1, m2 + pshufb xm3, xm10 + REPX {pmaddwd x, m15}, m0, m1, m2 + pmaddwd xm3, xm15 + phaddd m0, m1 + phaddd m2, m3 + paddd m0, m12 + paddd m2, m12 + psrad m0, xm7 + psrad m2, xm7 + packssdw m0, m2 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movu xm1, [srcq+ssq*0] + movu xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm1, xm10 + pshufb xm5, xm10 + pmaddwd xm1, xm15 + pmaddwd xm5, xm15 + phaddd xm1, xm5 + pmaddwd xm5, xm3, xm8 + mova xm3, xm0 + pmaddwd xm0, 
xm9 + paddd xm1, xm12 + psrad xm1, xm7 + packssdw xm1, xm1 + paddd xm5, xm0 + mova xm0, xm2 + pmaddwd xm2, xm14 + paddd xm5, xm2 + palignr xm2, xm1, xm4, 12 + punpcklwd xm2, xm1 ; 67 78 + pmaddwd xm4, xm2, xm6 + paddd xm5, xm13 + paddd xm5, xm4 + mova xm4, xm1 + psrldq xm1, xm7, 8 + psrad xm5, xm1 + packusdw xm5, xm5 + pminsw xm5, xm11 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +.dy1_w4: + mov myd, mym +%if isput + mova [rsp+0x50], xm11 +%endif + mova [rsp+0x00], m12 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm7 + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastq m2, [base+pq_0x40000000+1] + vpbroadcastd xm15, xm15 + SWAP m13, m10 + paddd m13, m8 ; mx+dx*[0-3] + pand m6, m13 + psrld m6, 6 + paddd xm15, xm6 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + vbroadcasti128 m5, [base+bdct_lb_q+ 0] + vbroadcasti128 m1, [base+bdct_lb_q+16] + vbroadcasti128 m4, [base+subpel_s_shuf2] + vpbroadcastd xm14, [base+subpel_filters+r4*8+2] + vpbroadcastd xm7, [base+subpel_filters+r6*8+2] + vpbroadcastd xm15, [base+subpel_filters+r11*8+2] + vpbroadcastd xm8, [base+subpel_filters+r13*8+2] + pcmpeqd m6, m9 + punpckldq m10, m6, m6 + punpckhdq m6, m6 + psrld m13, 10 + paddd m13, m13 + vpblendd xm14, xm7, 0xa + vpblendd xm15, xm8, 0xa + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + pblendvb m14, m2, m10 + pblendvb m15, m2, m6 + pextrd r4, xm13, 2 + pshufb m12, m13, m5 + pshufb m13, m1 + lea r6, [r4+ssq*2] + lea r11, [r4+ssq*1] + lea r13, [r4+ss3q ] + movu xm0, [srcq+ssq*0] + movu xm7, [srcq+r4 ] + movu xm1, [srcq+ssq*2] + movu xm8, [srcq+r6 ] + vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 + vinserti128 m7, [srcq+r11 ], 1 + vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 + vinserti128 m8, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + movu xm2, [srcq+ssq*0] + movu xm9, [srcq+r4 ] + movu xm3, [srcq+ssq*2] ; 6 _ + movu xm10, [srcq+r6 ] + vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 + vinserti128 m9, [srcq+r11 ], 1 + lea srcq, [srcq+ss3q ] + vpbroadcastb m5, xm13 + psubb m13, m5 + paddb m12, m4 + paddb m13, m4 + mova m5, [rsp+0x00] + movd xm6, [rsp+0x40] + pshufb m0, m12 + pshufb m1, m12 + pmaddwd m0, m14 + pmaddwd m1, m14 + pshufb m7, m13 + pshufb m8, m13 + pmaddwd m7, m15 + pmaddwd m8, m15 + pshufb m2, m12 + pshufb xm3, xm12 + pmaddwd m2, m14 + pmaddwd xm3, xm14 + pshufb m9, m13 + pshufb xm10, xm13 + pmaddwd m9, m15 + pmaddwd xm10, xm15 + phaddd m0, m7 + phaddd m1, m8 + phaddd m2, m9 + phaddd xm3, xm10 + paddd m0, m5 + paddd m1, m5 + paddd m2, m5 + paddd xm3, xm5 + psrad m0, xm6 + psrad m1, xm6 + psrad m2, xm6 + psrad xm3, xm6 + vperm2i128 m4, m0, m1, 0x21 ; 1 2 + vperm2i128 m5, m1, m2, 0x21 ; 3 4 + vperm2i128 m6, m2, m3, 0x21 ; 5 6 + shr myd, 6 + mov r13d, 64 << 24 + lea myd, [t1+myq] + cmovnz r13q, [base+subpel_filters+myq*8] + pslld m4, 16 + pslld m5, 16 + pslld m6, 16 + pblendw m0, m4, 0xaa ; 01 12 + pblendw m1, m5, 0xaa ; 23 34 + pblendw m2, m6, 0xaa ; 45 56 + movq xm10, r13q + punpcklqdq xm10, xm10 + pmovsxbw m10, xm10 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + movu xm6, [srcq+r4 ] + vinserti128 m11, [srcq+ssq*1], 1 + vinserti128 m6, [srcq+r11 ], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pshufb m11, m12 + pshufb m6, m13 + pmaddwd m11, m14 + pmaddwd m6, m15 + paddd m4, [rsp+0x20] + phaddd m11, m6 + pmaddwd m6, m2, m9 + paddd m11, 
[rsp+0x00] + psrad m11, [rsp+0x40] + mova m0, m1 + mova m1, m2 + paddd m5, m6 + paddd m4, m5 + vinserti128 m2, m3, xm11, 1 + pslld m3, m11, 16 + pblendw m2, m3, 0xaa ; 67 78 + pmaddwd m5, m2, m10 + vextracti128 xm3, m11, 1 + paddd m4, m5 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0x50] + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET + SWAP m10, m13 +.dy1_w8: + mov dword [rsp+0xa0], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+0xa0], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+0xa0], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+0xa0], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+0xa0], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + SWAP m10, m12, m1 + SWAP m11, m7 + ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free + mov myd, mym +%if isput + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0xb8] + %endif + movifnidn dsm, dsq + mova [rsp+0xc0], xm7 +%else + %if UNIX64 + %define hm [rsp+0xb8] + %endif +%endif + mova [rsp+0x00], m10 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm11 + shr t0d, 16 + sub srcq, 6 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul2] + movd xm15, t0d + mov [rsp+0xa4], t0d + mov [rsp+0xa8], srcq + mov [rsp+0xb0], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m1, m8 ; mx+dx*[0-7] + movq xm0, r4q + pmovsxbw xm0, xm0 + mova [rsp+0x50], xm0 + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+0xa0] + jz .ret + add qword [rsp+0xb0], 16 + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m6, [base+pd_0x3ff] + paddd m1, m8, [rsp+0x60] + vpbroadcastd m15, [rsp+0xa4] + pxor m9, m9 + mov srcq, [rsp+0xa8] + mov r0q, [rsp+0xb0] ; dstq / tmpq + mova m10, [rsp+0x00] + mova xm11, [rsp+0x40] +.dy1_hloop: + vpbroadcastq xm2, [base+pq_0x40000000] + pand m5, m1, m6 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + vextracti128 xm7, m15, 1 + movq r6, xm15 + pextrq r9, xm15, 1 + movq r11, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r7d, r9d + shr r9, 32 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mova [rsp+0x60], m1 + movq xm12, [base+subpel_filters+ r4*8] + movq xm13, [base+subpel_filters+ r6*8] + movhps xm12, [base+subpel_filters+ r7*8] + movhps xm13, [base+subpel_filters+ r9*8] + movq xm14, [base+subpel_filters+r10*8] + movq xm15, [base+subpel_filters+r11*8] + movhps xm14, [base+subpel_filters+r13*8] + movhps xm15, [base+subpel_filters+ rX*8] + psrld m1, 10 + vextracti128 xm7, m1, 1 + vextracti128 xm6, m5, 1 + movq r6, xm1 + pextrq r11, xm1, 1 + movq r9, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r10d, r11d + shr r11, 32 + mov r7d, r9d + shr r9, 32 + mov r13d, rXd + shr rX, 32 + pshufd xm4, xm5, q2200 + pshufd xm5, xm5, q3311 + pshufd xm7, xm6, q2200 + pshufd xm6, xm6, q3311 + pblendvb xm12, xm2, xm4 + pblendvb xm13, xm2, xm5 + pblendvb xm14, xm2, xm7 + pblendvb xm15, xm2, xm6 + pmovsxbw m12, xm12 + pmovsxbw m13, xm13 + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + mova [rsp+0x80], m0 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 
5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b + mova m0, [rsp+0x80] + vbroadcasti128 m7, [base+subpel_s_shuf8] + vpbroadcastd m8, [rsp+0x50] + vpbroadcastd m9, [rsp+0x54] + vpbroadcastd m10, [rsp+0x58] + vpbroadcastd m11, [rsp+0x5c] + pshufb m0, m7 ; 01a 01b + pshufb m1, m7 ; 23a 23b + pshufb m2, m7 ; 45a 45b + pshufb m3, m7 ; 67a 67b +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, [rsp+0x20] + paddd m6, m7 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0xc0] + mova [dstq], xm4 + add dstq, dsm +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + vbroadcasti128 m7, [base+wswap] + pshufb m0, m7 + pshufb m1, m7 + pshufb m2, m7 + pshufb m3, m7 + movu xm4, [srcq+ r4*2] + movu xm5, [srcq+ r6*2] + movu xm6, [srcq+ r7*2] + movu xm7, [srcq+ r9*2] + vinserti128 m4, [srcq+r10*2], 1 + vinserti128 m5, [srcq+r11*2], 1 + vinserti128 m6, [srcq+r13*2], 1 + vinserti128 m7, [srcq+ rX*2], 1 + add srcq, ssq + pmaddwd m4, m12 + pmaddwd m5, m13 + pmaddwd m6, m14 + pmaddwd m7, m15 + phaddd m4, m5 + phaddd m6, m7 + phaddd m4, m6 + paddd m4, [rsp+0x00] + psrad m4, [rsp+0x40] + pslld m4, 16 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop + SWAP m1, m12, m10 + SWAP m7, m11 +.dy2: + movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy2_w2: + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m10, m8 ; mx+dx*[0-1] + vpbroadcastd xm14, [base+pq_0x40000000+2] + vpbroadcastd xm15, xm15 + pand xm8, xm10, xm6 + psrld xm8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_q] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd xm4, [base+subpel_filters+r6*8+2] + pcmpeqd xm8, xm9 + psrld m10, 10 + paddd m10, m10 + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*2] + movu xm2, [srcq+ssq*4] + pshufb m10, m5 + paddb m10, m6 + vpblendd xm15, xm4, 0xa + pblendvb xm15, xm14, xm8 + pmovsxbw m15, xm15 + vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 + vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 + lea srcq, [srcq+ssq*4] + vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m0, m10 + pshufb m1, m10 + pshufb m2, m10 + pmaddwd m0, m15 + pmaddwd m1, m15 + pmaddwd m2, m15 + movq xm6, r4q + pmovsxbw xm6, xm6 + phaddd m0, m1 + phaddd m1, m2 + paddd m0, m12 + paddd m1, m12 + psrad m0, xm7 + psrad m1, xm7 + packssdw m0, m1 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m0, 1 + pshufd xm8, xm6, q0000 + pshufd xm9, xm6, q1111 + pshufd xm14, xm6, q2222 + pshufd xm6, xm6, q3333 + punpcklwd xm2, xm0, xm1 ; 01 23 + punpckhwd xm1, xm0, xm1 ; 23 45 +.dy2_w2_loop: + movu xm3, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m3, [srcq+ssq*1], 1 ; 6 7 + vinserti128 m5, [srcq+ss3q ], 1 ; 8 9 + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm2, xm8 + pmaddwd xm1, xm9 + pshufb m3, m10 + pshufb m5, m10 + pmaddwd m3, m15 + pmaddwd m5, m15 + phaddd m3, m5 + paddd xm4, xm1 + paddd m3, m12 + psrad m3, xm7 + packssdw m3, m3 + pshufd m3, m3, q2100 + palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9 + vextracti128 xm1, m0, 1 + punpcklwd xm2, xm0, xm1 ; 45 67 + punpckhwd 
xm1, xm0, xm1 ; 67 89 + pmaddwd xm3, xm2, xm14 + pmaddwd xm5, xm1, xm6 + paddd xm4, xm13 + paddd xm4, xm3 + psrldq xm3, xm7, 8 + paddd xm4, xm5 + psrad xm4, xm3 + packusdw xm4, xm4 + pminsw xm4, xm11 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +.dy2_w4: + mov myd, mym +%if isput + mova [rsp+0x50], xm11 +%endif + mova [rsp+0x00], m12 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm7 + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastq m2, [base+pq_0x40000000+1] + vpbroadcastd xm15, xm15 + SWAP m13, m10 + paddd m13, m8 ; mx+dx*[0-3] + pand m6, m13 + psrld m6, 6 + paddd xm15, xm6 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + vbroadcasti128 m5, [base+bdct_lb_q+ 0] + vbroadcasti128 m1, [base+bdct_lb_q+16] + vbroadcasti128 m4, [base+subpel_s_shuf2] + vpbroadcastd xm14, [base+subpel_filters+r4*8+2] + vpbroadcastd xm7, [base+subpel_filters+r6*8+2] + vpbroadcastd xm15, [base+subpel_filters+r11*8+2] + vpbroadcastd xm8, [base+subpel_filters+r13*8+2] + shr myd, 6 + mov r13d, 64 << 24 + lea myd, [t1+myq] + cmovnz r13q, [base+subpel_filters+myq*8] + pcmpeqd m6, m9 + punpckldq m11, m6, m6 + punpckhdq m6, m6 + psrld m13, 10 + paddd m13, m13 + vpblendd xm14, xm7, 0xa + vpblendd xm15, xm8, 0xa + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + movq xm10, r13q + pblendvb m14, m2, m11 + pblendvb m15, m2, m6 + pextrd r4, xm13, 2 + pshufb m12, m13, m5 + pshufb m13, m1 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu xm0, [srcq+ssq*0] + movu xm7, [srcq+r4 ] + movu xm1, [srcq+ssq*1] + movu xm8, [srcq+r6 ] + vinserti128 m0, [srcq+ssq*2], 1 ; 0 2 + vinserti128 m7, [srcq+r11 ], 1 + vinserti128 m1, [srcq+ss3q ], 1 ; 1 3 + vinserti128 m8, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + movu xm2, [srcq+ssq*0] + movu xm9, [srcq+r4 ] + vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 + vinserti128 m9, [srcq+r6 ], 1 + lea srcq, [srcq+ssq*2] + vpbroadcastb m5, xm13 + psubb m13, m5 + paddb m12, m4 + paddb m13, m4 + mova m5, [rsp+0x00] + movd xm6, [rsp+0x40] + pshufb m0, m12 + pshufb m1, m12 + pshufb m2, m12 + pmaddwd m0, m14 + pmaddwd m1, m14 + pmaddwd m2, m14 + pshufb m7, m13 + pshufb m8, m13 + pshufb m9, m13 + pmaddwd m7, m15 + pmaddwd m8, m15 + pmaddwd m9, m15 + punpcklqdq xm10, xm10 + pmovsxbw m10, xm10 + phaddd m0, m7 + phaddd m1, m8 + phaddd m2, m9 + paddd m0, m5 + paddd m1, m5 + paddd m2, m5 + psrad m0, xm6 + psrad m1, xm6 + psrad m2, xm6 + vperm2i128 m3, m0, m2, 0x21 ; 2 4 + vperm2i128 m2, m1, 0x13 ; 3 5 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + packssdw m0, m3 ; 0 2 2 4 + packssdw m1, m2 ; 1 3 3 5 + punpckhwd m2, m0, m1 ; 23 45 + punpcklwd m0, m1 ; 01 23 +.dy2_w4_loop: + movu xm1, [srcq+ssq*0] + movu xm6, [srcq+r4 ] + movu xm3, [srcq+ssq*1] + movu xm11, [srcq+r6 ] + vinserti128 m1, [srcq+ssq*2], 1 ; 6 8 + vinserti128 m6, [srcq+r11 ], 1 + vinserti128 m3, [srcq+ss3q ], 1 ; 7 9 + vinserti128 m11, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m7 + pmaddwd m5, m2, m8 + pshufb m1, m12 + pshufb m3, m12 + pmaddwd m1, m14 + pmaddwd m3, m14 + mova m0, [rsp+0x00] + pshufb m6, m13 + pshufb m11, m13 + pmaddwd m6, m15 + pmaddwd m11, m15 + paddd m4, m5 + movd xm5, [rsp+0x40] + phaddd m1, m6 + phaddd m3, m11 + paddd m1, m0 + paddd m3, m0 + psrad m1, xm5 + psrad m3, xm5 + pslld m3, 16 + pblendw m1, m3, 0xaa ; 67 89 + vperm2i128 m0, m2, m1, 0x21 ; 45 67 + paddd m4, [rsp+0x20] + mova 
m2, m1 + pmaddwd m5, m0, m9 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0x50] + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET + SWAP m10, m13 +.dy2_w8: + mov dword [rsp+0xa0], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+0xa0], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+0xa0], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+0xa0], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+0xa0], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + SWAP m10, m12, m1 + SWAP m11, m7 + ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free + mov myd, mym +%if isput + movifnidn dsm, dsq + mova [rsp+0xc0], xm7 +%endif + mova [rsp+0x00], m10 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm11 + shr t0d, 16 + sub srcq, 6 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul2] + movd xm15, t0d + mov [rsp+0xa4], t0d + mov [rsp+0xa8], srcq + mov [rsp+0xb0], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m1, m8 ; mx+dx*[0-7] + movq xm0, r4q + pmovsxbw xm0, xm0 + mova [rsp+0x50], xm0 + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+0xa0] + jz .ret + add qword [rsp+0xb0], 16 + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m6, [base+pd_0x3ff] + paddd m1, m8, [rsp+0x60] + vpbroadcastd m15, [rsp+0xa4] + pxor m9, m9 + mov srcq, [rsp+0xa8] + mov r0q, [rsp+0xb0] ; dstq / tmpq + mova m10, [rsp+0x00] + mova xm11, [rsp+0x40] +.dy2_hloop: + vpbroadcastq xm2, [base+pq_0x40000000] + pand m5, m1, m6 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + vextracti128 xm7, m15, 1 + movq r6, xm15 + pextrq r9, xm15, 1 + movq r11, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r7d, r9d + shr r9, 32 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mova [rsp+0x60], m1 + movq xm12, [base+subpel_filters+ r4*8] + movq xm13, [base+subpel_filters+ r6*8] + movhps xm12, [base+subpel_filters+ r7*8] + movhps xm13, [base+subpel_filters+ r9*8] + movq xm14, [base+subpel_filters+r10*8] + movq xm15, [base+subpel_filters+r11*8] + movhps xm14, [base+subpel_filters+r13*8] + movhps xm15, [base+subpel_filters+ rX*8] + psrld m1, 10 + vextracti128 xm7, m1, 1 + vextracti128 xm6, m5, 1 + movq r6, xm1 + pextrq r11, xm1, 1 + movq r9, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r10d, r11d + shr r11, 32 + mov r7d, r9d + shr r9, 32 + mov r13d, rXd + shr rX, 32 + pshufd xm4, xm5, q2200 + pshufd xm5, xm5, q3311 + pshufd xm7, xm6, q2200 + pshufd xm6, xm6, q3311 + pblendvb xm12, xm2, xm4 + pblendvb xm13, xm2, xm5 + pblendvb xm14, xm2, xm7 + pblendvb xm15, xm2, xm6 + pmovsxbw m12, xm12 + pmovsxbw m13, xm13 + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + mova [rsp+0x80], m0 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b + mova m0, [rsp+0x80] + vbroadcasti128 m7, [base+subpel_s_shuf8] + vpbroadcastd m8, [rsp+0x50] + vpbroadcastd m9, [rsp+0x54] + vpbroadcastd m10, [rsp+0x58] + vpbroadcastd m11, [rsp+0x5c] + pshufb m0, m7 ; 01a 
01b + pshufb m1, m7 ; 23a 23b + pshufb m2, m7 ; 45a 45b + pshufb m3, m7 ; 67a 67b +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, [rsp+0x20] + paddd m6, m7 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0xc0] + mova [dstq], xm4 + add dstq, dsm +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m0, m1 + mova m1, m2 + mova m2, m3 + movu xm3, [srcq+ r4*2] + movu xm4, [srcq+ r6*2] + movu xm5, [srcq+ r7*2] + movu xm6, [srcq+ r9*2] + vinserti128 m3, [srcq+r10*2], 1 + vinserti128 m4, [srcq+r11*2], 1 + vinserti128 m5, [srcq+r13*2], 1 + vinserti128 m6, [srcq+ rX*2], 1 + add srcq, ssq + pmaddwd m3, m12 + pmaddwd m4, m13 + pmaddwd m5, m14 + pmaddwd m6, m15 + phaddd m3, m4 + phaddd m5, m6 + phaddd m3, m5 + movu xm4, [srcq+ r4*2] + movu xm5, [srcq+ r6*2] + movu xm6, [srcq+ r7*2] + movu xm7, [srcq+ r9*2] + vinserti128 m4, [srcq+r10*2], 1 + vinserti128 m5, [srcq+r11*2], 1 + vinserti128 m6, [srcq+r13*2], 1 + vinserti128 m7, [srcq+ rX*2], 1 + add srcq, ssq + pmaddwd m4, m12 + pmaddwd m5, m13 + pmaddwd m6, m14 + pmaddwd m7, m15 + phaddd m4, m5 + phaddd m6, m7 + mova m5, [rsp+0x00] + movd xm7, [rsp+0x40] + phaddd m4, m6 + paddd m3, m5 + paddd m4, m5 + psrad m3, xm7 + psrad m4, xm7 + pslld m4, 16 + pblendw m3, m4, 0xaa + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%undef isput +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled_16bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, t0d + jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif + +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, +BILIN_SCALED_FN put +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, +BILIN_SCALED_FN prep +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + %macro WARP_V 5 ; dst, 01, 23, 45, 67 lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] @@ -3595,7 +5360,7 @@ add wq, r5 jmp wq .w2: - vpbroadcastd m2, [base+obmc_masks+2*2] + vpbroadcastd m2, [base+obmc_masks_avx2+2*2] .w2_loop: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 @@ -3611,7 +5376,7 @@ jg .w2_loop RET .w4: - vpbroadcastq m2, [base+obmc_masks+4*2] + vpbroadcastq m2, [base+obmc_masks_avx2+4*2] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] @@ -3627,7 +5392,7 @@ RET INIT_YMM avx2 .w8: - vbroadcasti128 m2, [base+obmc_masks+8*2] + vbroadcasti128 m2, [base+obmc_masks_avx2+8*2] .w8_loop: mova 
xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 @@ -3642,7 +5407,7 @@ jg .w8_loop RET .w16: - mova m4, [base+obmc_masks+16*2] + mova m4, [base+obmc_masks_avx2+16*2] .w16_loop: mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] @@ -3664,8 +5429,8 @@ movaps [rsp+ 8], xmm6 movaps [rsp+24], xmm7 %endif - mova m6, [base+obmc_masks+32*2] - vbroadcasti128 m7, [base+obmc_masks+32*3] + mova m6, [base+obmc_masks_avx2+32*2] + vbroadcasti128 m7, [base+obmc_masks_avx2+32*3] .w32_loop: mova m0, [dstq+dsq*0+32*0] psubw m3, m0, [tmpq +32*0] @@ -3720,7 +5485,7 @@ mov hd, hm movsxd wq, [r5+wq*4] add wq, r5 - lea maskq, [base+obmc_masks+hq*2] + lea maskq, [base+obmc_masks_avx2+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] @@ -4016,7 +5781,7 @@ vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm - DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr, _, pxmax + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pd_64] diff -Nru dav1d-0.9.2/src/x86/mc16_avx512.asm dav1d-1.0.0/src/x86/mc16_avx512.asm --- dav1d-0.9.2/src/x86/mc16_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-1.0.0/src/x86/mc16_avx512.asm 2022-03-18 14:31:56.026356000 +0000 @@ -0,0 +1,4858 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 + db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41 +spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17 + db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49 + db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25 + db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57 +spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 + db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45 +spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21 + db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53 + db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29 + db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61 +spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 + db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 + db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 + db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 +spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 + db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 +prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46 + db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110 + db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126 +prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78 + db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94 + db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110 + db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126 +spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 + db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46 +spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46 + db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 +spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 + db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78 + db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 + db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110 +spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78 + db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 + db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110 + db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 +spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46 + db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62 + db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110 + db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126 +spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78 + db 17, 18, 81, 82, 21, 
22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94 + db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110 + db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126 +spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38 + db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14 + db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46 +spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30 +spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21 + db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25 +w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 + db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 +w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 + db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 + db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94 + db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126 +w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 + db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 + db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94 + db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126 +w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 + db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 + db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110 + db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126 +warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37 + db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41 + db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45 + db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 +warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 + db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53 + db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57 + db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61 +warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 + db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 + db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 + db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 +deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7 +pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 + dd 1 +pw_2048: times 2 dw 2048 + dd 3 +pw_8192: times 2 dw 8192 +avg_shift: dw 5, 5, 3, 3 +pw_27615: times 2 dw 27615 +pw_32766: times 2 dw 32766 +warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 +warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 +warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29 +resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31 +resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13 +resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15 +resize_permE: dq 0, 2, 4, 6 +resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, 
-1, 12, -1, 13 +resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 + +prep_hv_shift: dq 6, 4 +put_bilin_h_rnd: dw 8, 8, 10, 10 +prep_mul: dw 16, 16, 4, 4 +put_8tap_h_rnd: dd 34, 40 +prep_8tap_rnd: dd 128 - (8192 << 8) +warp_8x8_rnd_h: dd 512, 2048 +warp_8x8_rnd_v: dd 262144, 65536 +warp_8x8t_rnd_v: dd 16384 - (8192 << 15) +avg_round: dw -16400, -16400, -16388, -16388 +w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4) +mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6) +w_mask_round: dd 128, 64 +bidir_shift: dw 6, 6, 4, 4 + +pb_64: times 4 db 64 +pw_m512: times 2 dw -512 +pw_2: times 2 dw 2 +pw_64: times 2 dw 64 +pd_32: dd 32 +pd_63: dd 63 +pd_128: dd 128 +pd_640: dd 640 +pd_2176: dd 2176 +pd_16384: dd 16384 +pd_0_4: dd 0, 4 + +%define pw_16 prep_mul +%define pd_512 warp_8x8_rnd_h + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put) +%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep) + +BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter +cextern obmc_masks_avx2 +cextern resize_filter + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 4 +%else +DECLARE_REG_TMP 8 +%endif + +INIT_ZMM avx512icl +cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy + mov mxyd, r6m ; mx + lea r7, 
[put_avx512icl] + tzcnt t0d, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx t0d, word [r7+t0*2+table_offset(put,)] + add t0, r7 + jmp t0 +.put_w2: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu xmm0, [srcq+ssq*0] + movu xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], xmm0 + mova [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu ym0, [srcq+ssq*0] + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], ym0 + mova [dstq+dsq*1], ym1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+64*0], m0 + mova [dstq+dsq*0+64*1], m1 + mova [dstq+dsq*1+64*0], m2 + mova [dstq+dsq*1+64*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + movu m2, [srcq+64*2] + movu m3, [srcq+64*3] + add srcq, ssq + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + vpbroadcastw m5, mxyd + mov mxyd, r7m ; my + vpbroadcastd m4, [pw_16] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)] + mov r6d, r8m ; bitdepth_max + add t0, r7 + shr r6d, 11 + vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] + jmp t0 +.h_w2: + movq xmm1, [srcq+ssq*0] + movhps xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw xmm0, xmm1, xm4 + psrlq xmm1, 16 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 4 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq xmm0, [srcq+ssq*0+0] + movhps xmm0, [srcq+ssq*1+0] + movq xmm1, [srcq+ssq*0+2] + movhps xmm1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw xmm0, xm4 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 4 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+ssq*0+0] + vinserti32x4 ym0, [srcq+ssq*1+0], 1 + movu xm1, [srcq+ssq*0+2] + vinserti32x4 ym1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw ym0, ym4 + pmullw ym1, ym5 + paddw ym0, ym6 + paddw ym0, ym1 + psrlw ym0, 4 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu ym0, [srcq+ssq*0+0] + vinserti32x8 m0, [srcq+ssq*1+0], 1 + movu ym1, [srcq+ssq*0+2] + vinserti32x8 m1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m6 + paddw m0, m1 + psrlw m0, 4 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m2, m5, 
[srcq+ssq*0+2] + pmullw m1, m4, [srcq+ssq*1+0] + pmullw m3, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + pmullw m0, m4, [srcq+64*0+0] + pmullw m2, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] + pmullw m3, m5, [srcq+64*1+2] + add srcq, ssq + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + pmullw m0, m4, [srcq+64*0+0] + pmullw m7, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] + pmullw m8, m5, [srcq+64*1+2] + pmullw m2, m4, [srcq+64*2+0] + pmullw m9, m5, [srcq+64*2+2] + pmullw m3, m4, [srcq+64*3+0] + pmullw m10, m5, [srcq+64*3+2] + add srcq, ssq + REPX {paddw x, m6}, m0, m1, m2, m3 + paddw m0, m7 + paddw m1, m8 + paddw m2, m9 + paddw m3, m10 + REPX {psrlw x, 4}, m0, m1, m2, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)] + shl mxyd, 11 + vpbroadcastw m8, mxyd + add t0, r7 + jmp t0 +.v_w2: + movd xmm0, [srcq+ssq*0] +.v_w2_loop: + movd xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq xmm2, xmm0, xmm1 + movd xmm0, [srcq+ssq*0] + punpckldq xmm1, xmm0 + psubw xmm1, xmm2 + pmulhrsw xmm1, xm8 + paddw xmm1, xmm2 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xmm0, [srcq+ssq*0] +.v_w4_loop: + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq xmm2, xmm0, xmm1 + movq xmm0, [srcq+ssq*0] + punpcklqdq xmm1, xmm0 + psubw xmm1, xmm2 + pmulhrsw xmm1, xm8 + paddw xmm1, xmm2 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu xmm0, [srcq+ssq*0] +.v_w8_loop: + vbroadcasti128 ymm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd ymm2, ymm0, ymm1, 0xf0 + vbroadcasti128 ymm0, [srcq+ssq*0] + vpblendd ymm1, ymm0, 0xf0 + psubw ymm1, ymm2 + pmulhrsw ymm1, ym8 + paddw ymm1, ymm2 + mova [dstq+dsq*0], xmm1 + vextracti128 [dstq+dsq*1], ymm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + vzeroupper + RET +.v_w16: + movu ym0, [srcq+ssq*0] +.v_w16_loop: + movu ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw ym1, ym3, ym0 + pmulhrsw ym1, ym8 + paddw ym1, ym0 + movu ym0, [srcq+ssq*0] + psubw ym2, ym0, ym3 + pmulhrsw ym2, ym8 + paddw ym2, ym3 + mova [dstq+dsq*0], ym1 + mova [dstq+dsq*1], ym2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + movu m0, [srcq+ssq*0] +.v_w32_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m8 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m8 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w64: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] +.v_w64_loop: + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + psubw m4, m2, m0 + pmulhrsw m4, m8 + paddw m4, m0 + movu m0, [srcq+ssq*0+64*0] + psubw m5, m3, m1 + pmulhrsw m5, m8 + paddw m5, m1 + movu m1, [srcq+ssq*0+64*1] + psubw m6, m0, m2 + pmulhrsw m6, m8 + psubw m7, m1, m3 + pmulhrsw m7, m8 + mova [dstq+dsq*0+64*0], m4 + mova [dstq+dsq*0+64*1], m5 + paddw m6, m2 + paddw 
m7, m3 + mova [dstq+dsq*1+64*0], m6 + mova [dstq+dsq*1+64*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*0+64*2] + movu m3, [srcq+ssq*0+64*3] +.v_w128_loop: + movu m4, [srcq+ssq*1+64*0] + movu m5, [srcq+ssq*1+64*1] + movu m6, [srcq+ssq*1+64*2] + movu m7, [srcq+ssq*1+64*3] + lea srcq, [srcq+ssq*2] + psubw m9, m4, m0 + pmulhrsw m9, m8 + paddw m9, m0 + movu m0, [srcq+ssq*0+64*0] + psubw m10, m5, m1 + pmulhrsw m10, m8 + paddw m10, m1 + movu m1, [srcq+ssq*0+64*1] + psubw m11, m6, m2 + pmulhrsw m11, m8 + paddw m11, m2 + movu m2, [srcq+ssq*0+64*2] + psubw m12, m7, m3 + pmulhrsw m12, m8 + paddw m12, m3 + movu m3, [srcq+ssq*0+64*3] + mova [dstq+dsq*0+64*0], m9 + psubw m9, m0, m4 + pmulhrsw m9, m8 + mova [dstq+dsq*0+64*1], m10 + psubw m10, m1, m5 + pmulhrsw m10, m8 + mova [dstq+dsq*0+64*2], m11 + psubw m11, m2, m6 + pmulhrsw m11, m8 + mova [dstq+dsq*0+64*3], m12 + psubw m12, m3, m7 + pmulhrsw m12, m8 + paddw m9, m4 + paddw m10, m5 + mova [dstq+dsq*1+64*0], m9 + mova [dstq+dsq*1+64*1], m10 + paddw m11, m6 + paddw m12, m7 + mova [dstq+dsq*1+64*2], m11 + mova [dstq+dsq*1+64*3], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w128_loop + RET +.hv: + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)] + shl mxyd, 11 + vpbroadcastd m6, [pw_2] + vpbroadcastw m7, mxyd + vpbroadcastd m8, [pw_8192] + add t0, r7 + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + vpbroadcastd m8, [pw_2048] +.hv_12bpc: + jmp t0 +.hv_w2: + vpbroadcastq xmm1, [srcq+ssq*0] + pmullw xmm0, xmm1, xm4 + psrlq xmm1, 16 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 +.hv_w2_loop: + movq xmm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm2, [srcq+ssq*0] + pmullw xmm1, xmm2, xm4 + psrlq xmm2, 16 + pmullw xmm2, xm5 + paddw xmm1, xm6 + paddw xmm1, xmm2 + psrlw xmm1, 2 ; 1 _ 2 _ + shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _ + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm7 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm8 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + pmullw xmm0, xm4, [srcq+ssq*0-8] + pmullw xmm1, xm5, [srcq+ssq*0-6] + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 +.hv_w4_loop: + movq xmm1, [srcq+ssq*1+0] + movq xmm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps xmm1, [srcq+ssq*0+0] + movhps xmm2, [srcq+ssq*0+2] + pmullw xmm1, xm4 + pmullw xmm2, xm5 + paddw xmm1, xm6 + paddw xmm1, xmm2 + psrlw xmm1, 2 ; 1 2 + shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1 + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm7 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm8 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xmm0, xm4, [srcq+ssq*0+0] + pmullw xmm1, xm5, [srcq+ssq*0+2] + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 + vinserti32x4 ym0, xmm0, 1 +.hv_w8_loop: + movu xm1, [srcq+ssq*1+0] + movu xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym1, [srcq+ssq*0+0], 1 + vinserti32x4 ym2, [srcq+ssq*0+2], 1 + pmullw ym1, ym4 + pmullw ym2, ym5 + paddw ym1, ym6 + paddw ym1, ym2 + psrlw ym1, 2 ; 1 2 + vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1 + mova ym0, ym1 + psubw ym1, ym2 + paddw ym1, ym1 + pmulhw ym1, ym7 + paddw ym1, ym2 + pmulhrsw ym1, ym8 + mova [dstq+dsq*0], xm1 + vextracti32x4 [dstq+dsq*1], ym1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + pmullw ym0, 
ym4, [srcq+ssq*0+0] + pmullw ym1, ym5, [srcq+ssq*0+2] + paddw ym0, ym6 + paddw ym0, ym1 + psrlw ym0, 2 + vinserti32x8 m0, ym0, 1 +.hv_w16_loop: + movu ym1, [srcq+ssq*1+0] + movu ym2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + vinserti32x8 m1, [srcq+ssq*0+0], 1 + vinserti32x8 m2, [srcq+ssq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m6 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + vshufi32x4 m2, m0, m1, q1032 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m7 + paddw m1, m2 + pmulhrsw m1, m8 + mova [dstq+dsq*0], ym1 + vextracti32x8 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: +.hv_w64: +.hv_w128: + movifnidn wd, wm + lea r6d, [hq+wq*8-256] + mov r4, srcq + mov r7, dstq +.hv_w32_loop0: + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m6 + paddw m0, m1 + psrlw m0, 2 +.hv_w32_loop: + pmullw m3, m4, [srcq+ssq*1+0] + pmullw m1, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m3, m6 + paddw m3, m1 + psrlw m3, 2 + psubw m1, m3, m0 + paddw m1, m1 + pmulhw m1, m7 + paddw m1, m0 + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m2, m5, [srcq+ssq*0+2] + paddw m0, m6 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m3 + paddw m2, m2 + pmulhw m2, m7 + paddw m2, m3 + pmulhrsw m1, m8 + pmulhrsw m2, m8 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w32_loop + add r4, 64 + add r7, 64 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w32_loop0 + RET + +cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep_avx512icl] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [r6-prep_avx512icl+pw_8192] + add wq, r6 + shr r5d, 11 + vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq xmm0, [srcq+strideq*0] + movhps xmm0, [srcq+strideq*1] + vpbroadcastq ymm1, [srcq+strideq*2] + vpbroadcastq ymm2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm0, ymm1, 0x30 + vpblendd ymm0, ymm2, 0xc0 + pmullw ymm0, ym4 + psubw ymm0, ym5 + mova [tmpq], ymm0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + vzeroupper + RET +.prep_w8: + movu xm0, [srcq+strideq*0] + vinserti32x4 ym0, [srcq+strideq*1], 1 + vinserti32x4 m0, [srcq+strideq*2], 2 + vinserti32x4 m0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + psubw m0, m5 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + movu ym1, [srcq+strideq*2] + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m4, [srcq+strideq*2] + pmullw m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .prep_w32 + RET +.prep_w64: + pmullw m0, m4, [srcq+strideq*0+64*0] + pmullw m1, m4, [srcq+strideq*0+64*1] + pmullw m2, m4, [srcq+strideq*1+64*0] + pmullw m3, m4, [srcq+strideq*1+64*1] + lea srcq, [srcq+strideq*2] + REPX {psubw x, m5}, m0, m1, m2, m3 + mova 
[tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .prep_w64 + RET +.prep_w128: + pmullw m0, m4, [srcq+64*0] + pmullw m1, m4, [srcq+64*1] + pmullw m2, m4, [srcq+64*2] + pmullw m3, m4, [srcq+64*3] + add srcq, strideq + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + dec hd + jg .prep_w128 + RET +.h: + vpbroadcastw m5, mxyd + mov mxyd, r6m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastd m6, [pw_32766] + psubw m4, m5 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m4, 2 + psllw m5, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + movu xm1, [srcq+strideq*0] + vinserti32x4 ym1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti32x4 ym2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1, ym2 + psrldq ym1, 2 + psrldq ym2, 2 + pmullw ym0, ym4 + punpcklqdq ym1, ym2 + pmullw ym1, ym5 + psubw ym0, ym6 + paddw ym0, ym1 + psraw ym0, 2 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+strideq*0+0] + movu xm1, [srcq+strideq*0+2] + vinserti32x4 ym0, [srcq+strideq*1+0], 1 + vinserti32x4 ym1, [srcq+strideq*1+2], 1 + vinserti32x4 m0, [srcq+strideq*2+0], 2 + vinserti32x4 m1, [srcq+strideq*2+2], 2 + vinserti32x4 m0, [srcq+stride3q +0], 3 + vinserti32x4 m1, [srcq+stride3q +2], 3 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m6 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8 + RET +.h_w16: + movu ym0, [srcq+strideq*0+0] + vinserti32x8 m0, [srcq+strideq*1+0], 1 + movu ym1, [srcq+strideq*0+2] + vinserti32x8 m1, [srcq+strideq*1+2], 1 + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m6 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m2, m5, [srcq+strideq*0+2] + pmullw m1, m4, [srcq+strideq*1+0] + pmullw m3, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m0, m6 + psubw m1, m6 + paddw m0, m2 + paddw m1, m3 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + sub hd, 2 + jg .h_w32 + RET +.h_w64: + pmullw m0, m4, [srcq+ 0] + pmullw m2, m5, [srcq+ 2] + pmullw m1, m4, [srcq+64] + pmullw m3, m5, [srcq+66] + add srcq, strideq + psubw m0, m6 + psubw m1, m6 + paddw m0, m2 + paddw m1, m3 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + dec hd + jg .h_w64 + RET +.h_w128: + pmullw m0, m4, [srcq+ 0] + pmullw m7, m5, [srcq+ 2] + pmullw m1, m4, [srcq+ 64] + pmullw m8, m5, [srcq+ 66] + pmullw m2, m4, [srcq+128] + pmullw m9, m5, [srcq+130] + pmullw m3, m4, [srcq+192] + pmullw m10, m5, [srcq+194] + add srcq, strideq + REPX {psubw x, m6}, m0, m1, m2, m3 + paddw m0, m7 + paddw m1, m8 + paddw m2, m9 + paddw m3, m10 + REPX {psraw x, 2}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + vpbroadcastw m9, mxyd + vpbroadcastd m8, [pw_16] + vpbroadcastd m10, [pw_32766] + add wq, r6 + lea stride3q, [strideq*3] + psubw m8, m9 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m8, 2 + psllw m9, 2 +.v_12bpc: + jmp wq +.v_w4: + movq xmm0, [srcq+strideq*0] +.v_w4_loop: + 
vpbroadcastq xmm2, [srcq+strideq*1] + vpbroadcastq ymm1, [srcq+strideq*2] + vpbroadcastq ymm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm2, ymm1, 0x30 + vpblendd ymm2, ymm3, 0xc0 + vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 + movq xmm0, [srcq+strideq*0] + valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 + pmullw ymm1, ym8 + pmullw ymm2, ym9 + psubw ymm1, ym10 + paddw ymm1, ymm2 + psraw ymm1, 2 + mova [tmpq], ymm1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + movu xm0, [srcq+strideq*0] +.v_w8_loop: + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vinserti32x4 m1, [srcq+strideq*2], 2 + vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 + lea srcq, [srcq+strideq*4] + movu xm0, [srcq+strideq*0] + valignq m2, m0, m1, 2 ; 1 2 3 4 + pmullw m1, m8 + pmullw m2, m9 + psubw m1, m10 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + movu ym0, [srcq+strideq*0] +.v_w16_loop: + vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 + movu ym3, [srcq+strideq*2] + vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 + lea srcq, [srcq+strideq*4] + movu ym0, [srcq+strideq*0] + vshufi32x4 m3, m1, m3, q1032 ; 1 2 + vshufi32x4 m4, m2, m0, q1032 ; 3 4 + pmullw m1, m8 + pmullw m2, m8 + pmullw m3, m9 + pmullw m4, m9 + psubw m1, m10 + psubw m2, m10 + paddw m1, m3 + paddw m2, m4 + psraw m1, 2 + psraw m2, 2 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + movu m0, [srcq+strideq*0] +.v_w32_loop: + movu m3, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m1, m8, m0 + movu m0, [srcq+strideq*0] + pmullw m2, m8, m3 + pmullw m3, m9 + pmullw m4, m9, m0 + psubw m1, m10 + psubw m2, m10 + paddw m1, m3 + paddw m2, m4 + psraw m1, 2 + psraw m2, 2 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + RET +.v_w64: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] +.v_w64_loop: + add srcq, strideq + pmullw m2, m8, m0 + movu m0, [srcq+64*0] + pmullw m3, m8, m1 + movu m1, [srcq+64*1] + pmullw m4, m9, m0 + pmullw m5, m9, m1 + psubw m2, m10 + psubw m3, m10 + paddw m2, m4 + paddw m3, m5 + psraw m2, 2 + psraw m3, 2 + mova [tmpq+64*0], m2 + mova [tmpq+64*1], m3 + add tmpq, 64*2 + dec hd + jg .v_w64_loop + RET +.v_w128: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + movu m2, [srcq+64*2] + movu m3, [srcq+64*3] +.v_w128_loop: + add srcq, strideq + pmullw m4, m8, m0 + movu m0, [srcq+64*0] + pmullw m5, m8, m1 + movu m1, [srcq+64*1] + pmullw m6, m8, m2 + movu m2, [srcq+64*2] + pmullw m7, m8, m3 + movu m3, [srcq+64*3] + pmullw m11, m9, m0 + pmullw m12, m9, m1 + pmullw m13, m9, m2 + pmullw m14, m9, m3 + REPX {psubw x, m10}, m4, m5, m6, m7 + paddw m4, m11 + paddw m5, m12 + paddw m6, m13 + paddw m7, m14 + REPX {psraw x, 2}, m4, m5, m6, m7 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m5 + mova [tmpq+64*2], m6 + mova [tmpq+64*3], m7 + add tmpq, 64*4 + dec hd + jg .v_w128_loop + RET +.hv: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + vpbroadcastw m7, mxyd + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.hv_w4: + movq xmm0, [srcq+strideq*0+0] + movq xmm1, [srcq+strideq*0+2] + pmullw xmm0, xm4 + pmullw xmm1, xm5 + psubw xmm0, xm6 + paddw xmm0, xmm1 + psraw xmm0, 2 + vpbroadcastq ym0, xmm0 +.hv_w4_loop: + movu xm1, [srcq+strideq*1] + vinserti128 ym1, [srcq+stride3q ], 1 + movu xm2, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vinserti128 ym2, [srcq+strideq*0], 1 + punpcklqdq ym3, ym1, ym2 + psrldq ym1, 2 + psrldq ym2, 2 + pmullw ym3, ym4 + punpcklqdq ym1, 
ym2 + pmullw ym1, ym5 + psubw ym3, ym6 + paddw ym1, ym3 + psraw ym1, 2 ; 1 2 3 4 + valignq ym2, ym1, ym0, 3 ; 0 1 2 3 + mova ym0, ym1 + psubw ym1, ym2 + pmulhrsw ym1, ym7 + paddw ym1, ym2 + mova [tmpq], ym1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xm0, xm4, [srcq+strideq*0+0] + pmullw xm1, xm5, [srcq+strideq*0+2] + psubw xm0, xm6 + paddw xm0, xm1 + psraw xm0, 2 + vinserti32x4 m0, xm0, 3 +.hv_w8_loop: + movu xm1, [srcq+strideq*1+0] + movu xm2, [srcq+strideq*1+2] + vinserti32x4 ym1, [srcq+strideq*2+0], 1 + vinserti32x4 ym2, [srcq+strideq*2+2], 1 + vinserti32x4 m1, [srcq+stride3q +0], 2 + vinserti32x4 m2, [srcq+stride3q +2], 2 + lea srcq, [srcq+strideq*4] + vinserti32x4 m1, [srcq+strideq*0+0], 3 + vinserti32x4 m2, [srcq+strideq*0+2], 3 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m6 + paddw m1, m2 + psraw m1, 2 ; 1 2 3 4 + valignq m2, m1, m0, 6 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m7 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + pmullw ym0, ym4, [srcq+strideq*0+0] + pmullw ym1, ym5, [srcq+strideq*0+2] + psubw ym0, ym6 + paddw ym0, ym1 + psraw ym0, 2 + vinserti32x8 m0, ym0, 1 +.hv_w16_loop: + movu ym1, [srcq+strideq*1+0] + movu ym2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + vinserti32x8 m1, [srcq+strideq*0+0], 1 + vinserti32x8 m2, [srcq+strideq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m6 + paddw m1, m2 + psraw m1, 2 ; 1 2 + vshufi32x4 m2, m0, m1, q1032 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m7 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m1, m5, [srcq+strideq*0+2] + psubw m0, m6 + paddw m0, m1 + psraw m0, 2 +.hv_w32_loop: + pmullw m3, m4, [srcq+strideq*1+0] + pmullw m1, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m3, m6 + paddw m3, m1 + psraw m3, 2 + psubw m1, m3, m0 + pmulhrsw m1, m7 + paddw m1, m0 + pmullw m0, m4, [srcq+strideq*0+0] + pmullw m2, m5, [srcq+strideq*0+2] + psubw m0, m6 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m3 + pmulhrsw m2, m7 + paddw m2, m3 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 2 + jg .hv_w32_loop + RET +.hv_w64: + pmullw m0, m4, [srcq+ 0] + pmullw m2, m5, [srcq+ 2] + pmullw m1, m4, [srcq+64] + pmullw m3, m5, [srcq+66] + psubw m0, m6 + psubw m1, m6 + paddw m0, m2 + paddw m1, m3 + psraw m0, 2 + psraw m1, 2 +.hv_w64_loop: + add srcq, strideq + pmullw m2, m4, [srcq+ 0] + pmullw m8, m5, [srcq+ 2] + pmullw m3, m4, [srcq+64] + pmullw m9, m5, [srcq+66] + psubw m2, m6 + psubw m3, m6 + paddw m2, m8 + paddw m3, m9 + psraw m2, 2 + psraw m3, 2 + psubw m8, m2, m0 + psubw m9, m3, m1 + pmulhrsw m8, m7 + pmulhrsw m9, m7 + paddw m8, m0 + mova m0, m2 + paddw m9, m1 + mova m1, m3 + mova [tmpq+64*0], m8 + mova [tmpq+64*1], m9 + add tmpq, 64*2 + dec hd + jg .hv_w64_loop + RET +.hv_w128: + pmullw m0, m4, [srcq+ 0] + pmullw m8, m5, [srcq+ 2] + pmullw m1, m4, [srcq+ 64] + pmullw m9, m5, [srcq+ 66] + pmullw m2, m4, [srcq+128] + pmullw m10, m5, [srcq+130] + pmullw m3, m4, [srcq+192] + pmullw m11, m5, [srcq+194] + REPX {psubw x, m6}, m0, m1, m2, m3 + paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + REPX {psraw x, 2}, m0, m1, m2, m3 +.hv_w128_loop: + add srcq, strideq + pmullw m8, m4, [srcq+ 0] + pmullw m12, m5, [srcq+ 2] + pmullw m9, m4, [srcq+ 64] + pmullw m13, m5, [srcq+ 66] + pmullw m10, m4, [srcq+128] + pmullw m14, m5, [srcq+130] + pmullw m11, m4, [srcq+192] + pmullw m15, m5, [srcq+194] + REPX {psubw x, m6}, m8, m9, 
m10, m11 + paddw m8, m12 + paddw m9, m13 + paddw m10, m14 + paddw m11, m15 + REPX {psraw x, 2}, m8, m9, m10, m11 + psubw m12, m8, m0 + psubw m13, m9, m1 + psubw m14, m10, m2 + psubw m15, m11, m3 + REPX {pmulhrsw x, m7}, m12, m13, m14, m15 + paddw m12, m0 + mova m0, m8 + paddw m13, m1 + mova m1, m9 + mova [tmpq+64*0], m12 + mova [tmpq+64*1], m13 + paddw m14, m2 + mova m2, m10 + paddw m15, m3 + mova m3, m11 + mova [tmpq+64*2], m14 + mova [tmpq+64*3], m15 + add tmpq, 64*4 + dec hd + jg .hv_w128_loop + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v +cglobal %1_8tap_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%endif +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%define buf rsp+stack_offset+8 ; shadow space +%else +DECLARE_REG_TMP 7, 8 +%define buf rsp-40 ; red zone +%endif + +MC_8TAP_FN put, sharp, SHARP, SHARP +MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN put, smooth, SMOOTH, SMOOTH +MC_8TAP_FN put, sharp_regular, SHARP, REGULAR +MC_8TAP_FN put, regular_sharp, REGULAR, SHARP +MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN put, regular, REGULAR, REGULAR + +cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my +%define base r8-put_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 +%if WIN64 + pop r8 +%endif + jmp wq +.h_w2: + movzx mxd, mxb + sub srcq, 2 + mova ym2, [spel_h_shuf2a] + pmovsxbw xmm4, [base+subpel_filters+mxq*8] + pshufd xmm3, xmm4, q1111 + pshufd xmm4, xmm4, q2222 +.h_w2_loop: + movu xm1, [srcq+ssq*0] + vinserti32x4 ym1, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova xmm0, xm8 + vpermb ym1, ym2, ym1 + vpdpwssd xmm0, xmm3, xm1 + vextracti32x4 xm1, ym1, 1 + vpdpwssd xmm0, xmm4, xm1 + psrad xmm0, 6 + packusdw xmm0, xmm0 + pminsw xmm0, xm9 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + vbroadcasti32x4 ym4, [spel_h_shufA] + vbroadcasti32x4 ym5, [spel_h_shufB] + pshufd xmm0, xmm0, q2211 + vpbroadcastq ym6, xmm0 + vpermq ym7, ymm0, q1111 +.h_w4_loop: + movu xm2, [srcq+ssq*0] + vinserti32x4 ym2, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova ym0, ym8 + pshufb ym1, ym2, ym4 + vpdpwssd ym0, ym6, ym1 + pshufb ym2, ym5 + vpdpwssd ym0, ym7, ym2 + psrad ym0, 6 + vextracti32x4 xm1, ym0, 1 + packusdw xm0, xm1 + pminsw xmm0, xm0, xm9 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + mov r7d, r8m + vpbroadcastw m9, r8m + shr r7d, 11 + vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] + cmp wd, 4 + je .h_w4 + jl .h_w2 + shr mxd, 16 + sub srcq, 6 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mova [buf], xmm0 + vpbroadcastd m10, xmm0 + vpbroadcastd m11, [buf+ 4] + vpbroadcastd m12, [buf+ 8] + vpbroadcastd m13, [buf+12] + 
cmp wd, 16 + je .h_w16 + jg .h_w32 +.h_w8: + mova m4, [spel_h_shufA] + movu m5, [spel_h_shufB] + movu m6, [spel_h_shufC] + mova m7, [spel_h_shufD] +.h_w8_loop: + movu ym2, [srcq+ssq*0] + vinserti32x8 m2, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + mova m0, m8 + vpermb m1, m4, m2 + vpdpwssd m0, m10, m1 + vpermb m1, m5, m2 + vpdpwssd m0, m11, m1 + vpermb m1, m6, m2 + vpdpwssd m0, m12, m1 + vpermb m1, m7, m2 + vpdpwssd m0, m13, m1 + psrad m0, 6 + vextracti32x8 ym1, m0, 1 + packusdw ym0, ym1 + pminsw ym0, ym9 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8_loop + RET +.h_w16: + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] +.h_w16_loop: + movu ym2, [srcq+ssq*0+ 0] + vinserti32x8 m2, [srcq+ssq*1+ 0], 1 + movu ym3, [srcq+ssq*0+16] + vinserti32x8 m3, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + mova m0, m8 + mova m1, m8 + pshufb m4, m2, m6 + vpdpwssd m0, m10, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m12, m4 ; b2 + pshufb m4, m2, m7 + vpdpwssd m0, m11, m4 ; a1 + pshufb m4, m3, m7 + vpdpwssd m1, m13, m4 ; b3 + shufpd m2, m3, 0x55 + pshufb m4, m2, m6 + vpdpwssd m0, m12, m4 ; a2 + vpdpwssd m1, m10, m4 ; b0 + pshufb m2, m7 + vpdpwssd m0, m13, m2 ; a3 + vpdpwssd m1, m11, m2 ; b1 + psrad m0, 6 + psrad m1, 6 + packusdw m0, m1 + pminsw m0, m9 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + lea srcq, [srcq+wq*2] + vbroadcasti32x4 m6, [spel_h_shufA] + lea dstq, [dstq+wq*2] + vbroadcasti32x4 m7, [spel_h_shufB] + neg wq +.h_w32_loop0: + mov r6, wq +.h_w32_loop: + movu m2, [srcq+r6*2+ 0] + movu m3, [srcq+r6*2+ 8] + mova m0, m8 + mova m1, m8 + pshufb m4, m2, m6 + vpdpwssd m0, m10, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m10, m4 ; b0 + vpdpwssd m0, m12, m4 ; a2 + movu m4, [srcq+r6*2+16] + pshufb m3, m7 + vpdpwssd m1, m11, m3 ; b1 + vpdpwssd m0, m13, m3 ; a3 + pshufb m3, m4, m6 + vpdpwssd m1, m12, m3 ; b2 + pshufb m2, m7 + vpdpwssd m0, m11, m2 ; a1 + pshufb m4, m7 + vpdpwssd m1, m13, m4 ; b3 + psrad m0, 6 + psrad m1, 6 + packusdw m0, m1 + pminsw m0, m9 + mova [dstq+r6*2], m0 + add r6, 32 + jl .h_w32_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w32_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastd m10, [pd_32] + pmovsxbw xmm0, [base+subpel_filters+myq*8] + tzcnt r7d, wd + vpbroadcastw m11, r8m + lea r6, [ssq*3] + movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] + sub srcq, r6 + mova [rsp+stack_offset+8], xmm0 + vpbroadcastd m12, xmm0 + add r7, r8 + vpbroadcastd m13, [rsp+stack_offset+12] + vpbroadcastd m14, [rsp+stack_offset+16] + vpbroadcastd m15, [rsp+stack_offset+20] + jmp r7 +.v_w2: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 + add srcq, r6 + pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, r6 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklwd xmm3, xmm1 ; 45 56 + punpcklwd xmm1, xmm2, xmm4 ; 01 12 + punpckhwd xmm2, xmm4 ; 23 34 +.v_w2_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xmm5, xm10 + vpdpwssd xmm5, xm12, xmm1 ; a0 b0 + mova xmm1, xmm2 + vpdpwssd xmm5, xm13, xmm2 ; a1 b1 + mova xmm2, xmm3 + vpdpwssd xmm5, xm14, xmm3 ; a2 b2 + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm0, 0x02 ; 7 8 + punpcklwd 
xmm3, xmm4 ; 67 78 + vpdpwssd xmm5, xm15, xmm3 ; a3 b3 + psrad xmm5, 6 + packusdw xmm5, xmm5 + pminsw xmm5, xm11 + movd [dstq+dsq*0], xmm5 + pextrd [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] + add srcq, r6 + vpbroadcastq ymm4, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm5, [srcq+ssq*2] + add srcq, r6 + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklwd ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm4, 0x30 + vpblendd ymm4, ymm3, 0x30 + punpcklwd ymm2, ymm4 ; 23 34 + vpblendd ymm3, ymm5, 0x30 + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 45 56 +.v_w4_loop: + vpbroadcastq ymm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova ymm4, ym10 + vpdpwssd ymm4, ym12, ymm1 ; a0 b0 + mova ymm1, ymm2 + vpdpwssd ymm4, ym13, ymm2 ; a1 b1 + mova ymm2, ymm3 + vpdpwssd ymm4, ym14, ymm3 ; a2 b2 + vpblendd ymm3, ymm0, ymm5, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 67 78 + vpdpwssd ymm4, ym15, ymm3 ; a3 b3 + psrad ymm4, 6 + vextracti128 xmm5, ymm4, 1 + packusdw xmm4, xmm5 + pminsw xmm4, xm11 + movq [dstq+dsq*0], xmm4 + movhps [dstq+dsq*1], xmm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + vbroadcasti32x4 m2, [srcq+ssq*2] + vinserti32x4 m1, m2, [srcq+ssq*0], 0 + vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 + add srcq, r6 + vinserti32x4 ym2, [srcq+ssq*0], 1 + vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 + mova m6, [spel_v_shuf8] + movu xm0, [srcq+ssq*1] + vinserti32x4 ym0, [srcq+ssq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 + vpermb m1, m6, m1 ; 01 12 + vpermb m2, m6, m2 ; 23 34 + vpermb m3, m6, m0 ; 45 56 +.v_w8_loop: + vinserti32x4 m0, [srcq+ssq*1], 3 + lea srcq, [srcq+ssq*2] + movu xm5, [srcq+ssq*0] + mova m4, m10 + vpdpwssd m4, m12, m1 ; a0 b0 + mova m1, m2 + vshufi32x4 m0, m5, q1032 ; 6 7 8 + vpdpwssd m4, m13, m2 ; a1 b1 + mova m2, m3 + vpdpwssd m4, m14, m3 ; a2 b2 + vpermb m3, m6, m0 ; 67 78 + vpdpwssd m4, m15, m3 ; a3 b3 + psrad m4, 6 + vextracti32x8 ym5, m4, 1 + packusdw ym4, ym5 + pminsw ym4, ym11 + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti32x8 m1, [srcq+ssq*1] + vinserti32x8 m0, m1, [srcq+ssq*0], 0 + vinserti32x8 m1, [srcq+ssq*2], 1 + mova m8, [spel_v_shuf16] + add srcq, r6 + movu ym3, [srcq+ssq*0] + vinserti32x8 m3, [srcq+ssq*1], 1 + movu ym5, [srcq+ssq*2] + add srcq, r6 + vinserti32x8 m5, [srcq+ssq*0], 1 + vpermb m0, m8, m0 ; 01 + vpermb m1, m8, m1 ; 12 + vpermb m3, m8, m3 ; 34 + vpermb m5, m8, m5 ; 56 + mova m9, [deint_q_shuf] + vpshrdd m2, m1, m3, 16 ; 23 + vpshrdd m4, m3, m5, 16 ; 45 +.v_w16_loop: + mova m6, m10 + mova m7, m10 + vpdpwssd m6, m12, m0 ; a0 + mova m0, m2 + vpdpwssd m7, m12, m1 ; b0 + mova m1, m3 + vpdpwssd m6, m13, m2 ; a1 + mova m2, m4 + vpdpwssd m7, m13, m3 ; b1 + mova m3, m5 + vpdpwssd m6, m14, m4 ; a2 + mova m4, m5 + vpdpwssd m7, m14, m5 ; b2 + movu ym5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m5, [srcq+ssq*0], 1 + vpermb m5, m8, m5 ; 78 + vpshrdd m4, m5, 16 ; 67 + vpdpwssd m6, m15, m4 ; a3 + vpdpwssd m7, m15, m5 ; b3 + psrad m6, 6 + psrad m7, 6 + packusdw m6, m7 + pminsw m6, m11 + vpermq m6, m9, m6 + mova [dstq+dsq*0], ym6 + vextracti32x8 [dstq+dsq*1], m6, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + movaps 
[rsp+stack_offset+8], xmm6 +%endif + lea wd, [hq+wq*8-256] + mov r7, srcq + mov r8, dstq +.v_w32_loop0: + movu m16, [srcq+ssq*0] + movu m17, [srcq+ssq*1] + movu m18, [srcq+ssq*2] + add srcq, r6 + movu m19, [srcq+ssq*0] + movu m20, [srcq+ssq*1] + movu m21, [srcq+ssq*2] + add srcq, r6 + movu m22, [srcq+ssq*0] + punpcklwd m0, m16, m17 ; 01l + punpckhwd m16, m17 ; 01h + punpcklwd m1, m17, m18 ; 12l + punpckhwd m17, m18 ; 12h + punpcklwd m2, m18, m19 ; 23l + punpckhwd m18, m19 ; 23h + punpcklwd m3, m19, m20 ; 34l + punpckhwd m19, m20 ; 34h + punpcklwd m4, m20, m21 ; 45l + punpckhwd m20, m21 ; 45h + punpcklwd m5, m21, m22 ; 56l + punpckhwd m21, m22 ; 56h +.v_w32_loop: + mova m6, m10 + vpdpwssd m6, m12, m0 ; a0l + mova m8, m10 + vpdpwssd m8, m12, m16 ; a0h + mova m7, m10 + vpdpwssd m7, m12, m1 ; b0l + mova m9, m10 + vpdpwssd m9, m12, m17 ; b0h + mova m0, m2 + vpdpwssd m6, m13, m2 ; a1l + mova m16, m18 + vpdpwssd m8, m13, m18 ; a1h + mova m1, m3 + vpdpwssd m7, m13, m3 ; b1l + mova m17, m19 + vpdpwssd m9, m13, m19 ; b1h + mova m2, m4 + vpdpwssd m6, m14, m4 ; a2l + mova m18, m20 + vpdpwssd m8, m14, m20 ; a2h + mova m3, m5 + vpdpwssd m7, m14, m5 ; b2l + mova m19, m21 + vpdpwssd m9, m14, m21 ; b2h + movu m21, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m4, m22, m21 ; 67l + punpckhwd m20, m22, m21 ; 67h + movu m22, [srcq+ssq*0] + vpdpwssd m6, m15, m4 ; a3l + vpdpwssd m8, m15, m20 ; a3h + punpcklwd m5, m21, m22 ; 78l + punpckhwd m21, m22 ; 78h + vpdpwssd m7, m15, m5 ; b3l + vpdpwssd m9, m15, m21 ; b3h + REPX {psrad x, 6}, m6, m8, m7, m9 + packusdw m6, m8 + packusdw m7, m9 + pminsw m6, m11 + pminsw m7, m11 + mova [dstq+dsq*0], m6 + mova [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + add r7, 64 + add r8, 64 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .v_w32_loop0 +%if WIN64 + movaps xmm6, [rsp+stack_offset+8] +%endif + vzeroupper + RET +.hv: + vpbroadcastw m11, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + test dword r8m, 0x800 + jnz .hv_12bit + vpbroadcastd m10, [pd_2176] + psllw xmm0, 6 + jmp .hv_main +.hv_12bit: + vpbroadcastd m10, [pd_640] + psllw xmm0, 4 + psllw xmm1, 2 +.hv_main: + mova [buf+ 0], xmm0 + mova [buf+16], xmm1 + vpbroadcastd m8, [buf+ 4] + vpbroadcastd m9, [buf+ 8] + vpbroadcastd ym12, xmm1 + vpbroadcastd ym13, [buf+20] + vpbroadcastd ym14, [buf+24] + vpbroadcastd ym15, [buf+28] + movu xm4, [srcq+ssq*0] + vinserti32x4 ym4, [srcq+ssq*1], 1 + vinserti32x4 m4, [srcq+ssq*2], 2 + add srcq, r6 + vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 + movu xm0, [srcq+ssq*1] + vinserti32x4 ym0, [srcq+ssq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 + cmp wd, 4 + je .hv_w4 + vbroadcasti32x4 m2, [spel_h_shufA] + mova m3, [spel_h_shuf2b] + mova ym6, [spel_h_shuf2a] + mova xm7, [spel_shuf2] + mova m1, m10 + pshufb m4, m2 + pshufb m0, m2 + punpcklqdq m2, m4, m0 + vpdpwssd m1, m8, m2 ; 04 15 26 3_ + punpckhqdq m4, m0 + vpdpwssd m1, m9, m4 + vpermb m1, m3, m1 ; 01 12 + vextracti32x4 xm2, ym1, 1 ; 23 34 + vextracti32x4 xm3, m1, 2 ; 45 56 +.hv_w2_loop: + movu xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym5, [srcq+ssq*0], 1 + mova xm4, xm10 + vpermb ym5, ym6, ym5 + pmaddwd xmm0, xm12, xm1 ; a0 b0 + vpdpwssd xm4, xm8, xm5 + vextracti32x4 xm5, ym5, 1 + mova xm1, xm2 + vpdpwssd xmm0, xm13, xm2 ; a1 b1 + vpdpwssd xm4, xm9, xm5 ; 7 8 + mova 
xm2, xm3 + vpdpwssd xmm0, xm14, xm3 ; a2 b2 + vpermt2b xm3, xm7, xm4 ; 67 78 + vpdpwssd xmm0, xm15, xm3 ; a3 b3 + psrad xmm0, 10 + packusdw xmm0, xmm0 + pminsw xmm0, xm11 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + vbroadcasti32x4 m19, [spel_h_shufA] + vbroadcasti32x4 m20, [spel_h_shufB] + mova ym6, [spel_shuf4a] + mova ym7, [spel_shuf4b] + mova m2, m10 + mova m3, m10 + pshufb m1, m4, m19 + vpdpwssd m2, m8, m1 + pshufb m1, m0, m19 + vpdpwssd m3, m8, m1 + pshufb m4, m20 + vpdpwssd m2, m9, m4 + pshufb m0, m20 + vpdpwssd m3, m9, m0 + vpermb m1, m6, m2 ; 01 12 + vshufi32x4 m2, m3, q1032 + vpermb m3, m6, m3 ; 45 56 + vpermb m2, m6, m2 ; 23 34 +.hv_w4_loop: + movu xm18, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 ym18, [srcq+ssq*0], 1 + mova ym4, ym10 + pshufb ym17, ym18, ym19 + pmaddwd ym16, ym12, ym1 ; a0 b0 + vpdpwssd ym4, ym8, ym17 + pshufb ym18, ym20 + mova ym1, ym2 + vpdpwssd ym16, ym13, ym2 ; a1 b1 + vpdpwssd ym4, ym9, ym18 ; 7 8 + mova ym2, ym3 + vpdpwssd ym16, ym14, ym3 ; a2 b2 + vpermt2b ym3, ym7, ym4 ; 67 78 + vpdpwssd ym16, ym15, ym3 ; a3 b3 + psrad ym16, 10 + vextracti128 xm17, ym16, 1 + packusdw xm16, xm17 + pminsw xm16, xm11 + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + vzeroupper + RET +.hv_w8: + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + test dword r8m, 0x800 + jnz .hv_w8_12bit + vpbroadcastd m10, [pd_2176] + psllw xmm0, 6 + jmp .hv_w8_main +.hv_w8_12bit: + vpbroadcastd m10, [pd_640] + psllw xmm0, 4 + psllw xmm1, 2 +.hv_w8_main: + mova [buf+ 0], xmm0 + mova [buf+16], xmm1 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [buf+ 4] + vpbroadcastd m14, [buf+ 8] + vpbroadcastd m15, [buf+12] + vpbroadcastd m16, xmm1 + vpbroadcastd m17, [buf+20] + vpbroadcastd m18, [buf+24] + vpbroadcastd m19, [buf+28] + cmp wd, 16 + je .hv_w16 + jg .hv_w32 + mova m5, [spel_h_shufA] + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 + movu ym9, [srcq+ssq*2] + add srcq, r6 + vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 + movu ym20, [srcq+ssq*1] + vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 + add srcq, r6 + movu ym21, [srcq+ssq*0] ; 6 + movu m6, [spel_h_shufB] + movu m7, [spel_h_shufC] + vpermb m8, m5, m0 + mova m1, m10 + vpdpwssd m1, m12, m8 ; a0 b0 + vpermb m8, m5, m9 + mova m2, m10 + vpdpwssd m2, m12, m8 ; c0 d0 + vpermb m8, m5, m20 + mova m3, m10 + vpdpwssd m3, m12, m8 ; e0 f0 + vpermb m8, m5, m21 + mova m4, m10 + vpdpwssd m4, m12, m8 ; g0 + vpermb m8, m6, m0 + vpdpwssd m1, m13, m8 ; a1 b1 + vpermb m8, m6, m9 + vpdpwssd m2, m13, m8 ; c1 d1 + vpermb m8, m6, m20 + vpdpwssd m3, m13, m8 ; e1 f1 + vpermb m8, m6, m21 + vpdpwssd m4, m13, m8 ; g1 + vpermb m8, m7, m0 + vpdpwssd m1, m14, m8 ; a2 b2 + vpermb m8, m7, m9 + vpdpwssd m2, m14, m8 ; c2 d2 + vpermb m8, m7, m20 + vpdpwssd m3, m14, m8 ; e2 f2 + vpermb m8, m7, m21 + vpdpwssd m4, m14, m8 ; g2 + mova m8, [spel_h_shufD] + vpermb m0, m8, m0 + vpdpwssd m1, m15, m0 ; a3 b3 + mova m0, [spel_shuf8a] + vpermb m9, m8, m9 + vpdpwssd m2, m15, m9 ; c3 d3 + mova m9, [spel_shuf8b] + vpermb m20, m8, m20 + vpdpwssd m3, m15, m20 ; e3 f3 + vpermb m21, m8, m21 + vpdpwssd m4, m15, m21 ; g3 + vpermt2b m1, m0, m2 ; 01 12 + vpermt2b m2, m0, m3 ; 23 34 + vpermt2b m3, m0, m4 ; 45 56 +.hv_w8_loop: + movu ym0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 
m0, [srcq+ssq*0], 1 + mova m4, m10 + vpermb m21, m5, m0 + vpdpwssd m4, m12, m21 ; h0 i0 + vpermb m21, m6, m0 + pmaddwd m20, m16, m1 ; A0 B0 + vpdpwssd m4, m13, m21 ; h1 i1 + vpermb m21, m7, m0 + mova m1, m2 + vpdpwssd m20, m17, m2 ; A1 B1 + vpdpwssd m4, m14, m21 ; h2 i2 + vpermb m21, m8, m0 + mova m2, m3 + vpdpwssd m20, m18, m3 ; A2 B2 + vpdpwssd m4, m15, m21 ; h3 i3 + vpermt2b m3, m9, m4 ; 67 78 + vpdpwssd m20, m19, m3 ; A3 B3 + psrad m20, 10 + vextracti32x8 ym21, m20, 1 + packusdw ym20, ym21 + pminsw ym20, ym11 + mova [dstq+dsq*0], xm20 + vextracti128 [dstq+dsq*1], ym20, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + WIN64_SPILL_XMM 26 + vbroadcasti32x8 m5, [srcq+ssq*0+ 8] + vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 + vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 + movu ym6, [srcq+ssq*1+ 0] + movu ym7, [srcq+ssq*1+16] + vinserti32x8 m6, [srcq+ssq*2+ 0], 1 + vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 + add srcq, r6 + movu ym22, [srcq+ssq*0+ 0] + movu ym23, [srcq+ssq*0+16] + vinserti32x8 m22, [srcq+ssq*1+ 0], 1 + vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4 + movu ym24, [srcq+ssq*2+ 0] + movu ym25, [srcq+ssq*2+16] + add srcq, r6 + vinserti32x8 m24, [srcq+ssq*0+ 0], 1 + vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6 + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m9, [spel_shuf16] + pshufb m0, m4, m20 + mova m1, m10 + vpdpwssd m1, m12, m0 ; a0 + pshufb m0, m6, m20 + mova m2, m10 + vpdpwssd m2, m12, m0 ; b0 + pshufb m0, m7, m20 + mova m3, m10 + vpdpwssd m3, m14, m0 ; c2 + pshufb m0, m4, m21 + vpdpwssd m1, m13, m0 ; a1 + pshufb m0, m6, m21 + vpdpwssd m2, m13, m0 ; b1 + pshufb m0, m7, m21 + vpdpwssd m3, m15, m0 ; c3 + pshufb m0, m5, m20 + vpdpwssd m1, m14, m0 ; a2 + shufpd m6, m7, 0x55 + pshufb m7, m6, m20 + vpdpwssd m2, m14, m7 ; b2 + vpdpwssd m3, m12, m7 ; c0 + pshufb m5, m21 + vpdpwssd m1, m15, m5 ; a3 + pshufb m6, m21 + vpdpwssd m2, m15, m6 ; b3 + vpdpwssd m3, m13, m6 ; c1 + pshufb m0, m22, m20 + mova m4, m10 + vpdpwssd m4, m12, m0 ; d0 + pshufb m0, m23, m20 + mova m5, m10 + vpdpwssd m5, m14, m0 ; e2 + pshufb m0, m24, m20 + mova m6, m10 + vpdpwssd m6, m12, m0 ; f0 + pshufb m0, m25, m20 + mova m7, m10 + vpdpwssd m7, m14, m0 ; g2 + pshufb m0, m22, m21 + vpdpwssd m4, m13, m0 ; d1 + pshufb m0, m23, m21 + vpdpwssd m5, m15, m0 ; e3 + pshufb m0, m24, m21 + vpdpwssd m6, m13, m0 ; f1 + pshufb m0, m25, m21 + vpdpwssd m7, m15, m0 ; g3 + shufpd m22, m23, 0x55 + pshufb m23, m22, m20 + vpdpwssd m4, m14, m23 ; d2 + vpdpwssd m5, m12, m23 ; e0 + shufpd m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m6, m14, m25 ; f2 + vpdpwssd m7, m12, m25 ; g0 + pshufb m22, m21 + vpdpwssd m4, m15, m22 ; d3 + vpdpwssd m5, m13, m22 ; e1 + pshufb m24, m21 + vpdpwssd m6, m15, m24 ; f3 + vpdpwssd m7, m13, m24 ; g1 + pslldq m1, 1 + vpermt2b m2, m9, m3 ; 12 + vpermt2b m4, m9, m5 ; 34 + vpermt2b m6, m9, m7 ; 56 + vpshrdd m1, m2, 16 ; 01 + vpshrdd m3, m2, m4, 16 ; 23 + vpshrdd m5, m4, m6, 16 ; 45 +.hv_w16_loop: + movu ym24, [srcq+ssq*1+ 0] + movu ym25, [srcq+ssq*1+16] + lea srcq, [srcq+ssq*2] + vinserti32x8 m24, [srcq+ssq*0+ 0], 1 + vinserti32x8 m25, [srcq+ssq*0+16], 1 + mova m7, m10 + mova m8, m10 + pshufb m0, m24, m20 + vpdpwssd m7, m12, m0 ; h0 + pshufb m0, m25, m20 + vpdpwssd m8, m14, m0 ; i2 + pmaddwd m22, m16, m1 ; A0 + mova m1, m3 + pmaddwd m23, m16, m2 ; B0 + mova m2, m4 + pshufb m0, m24, m21 + vpdpwssd m7, m13, m0 ; h1 + pshufb m0, m25, m21 + vpdpwssd m8, m15, m0 ; i3 + vpdpwssd m22, m17, m3 ; A1 + mova m3, m5 + vpdpwssd m23, m17, m4 ; B1 + mova m4, m6 + shufpd 
m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m7, m14, m25 ; h2 + vpdpwssd m8, m12, m25 ; i0 + vpdpwssd m22, m18, m5 ; A2 + vpdpwssd m23, m18, m6 ; B2 + pshufb m24, m21 + vpdpwssd m7, m15, m24 ; h3 + vpdpwssd m8, m13, m24 ; i1 + vpermt2b m7, m9, m8 ; 78 + vpshrdd m5, m6, m7, 16 ; 67 + vpdpwssd m22, m19, m5 ; A3 + vpdpwssd m23, m19, m7 ; B3 + mova m6, m7 + psrad m22, 10 + psrad m23, 10 + vshufi32x4 m0, m22, m23, q3232 + vinserti32x8 m22, ym23, 1 + packusdw m22, m0 + pminsw m22, m11 + mova [dstq+dsq*0], ym22 + vextracti32x8 [dstq+dsq*1], m22, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 32 + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m22, [spel_shuf32] + lea wd, [hq+wq*8-256] + mov r7, srcq + mov r8, dstq +.hv_w32_loop0: + movu m6, [srcq+ssq*0+ 0] + movu m7, [srcq+ssq*0+ 8] + movu m8, [srcq+ssq*0+16] + mova m0, m10 + mova m23, m10 + pshufb m9, m6, m20 + vpdpwssd m0, m12, m9 ; a0l + pshufb m9, m7, m20 + vpdpwssd m23, m12, m9 ; a0h + vpdpwssd m0, m14, m9 ; a2l + pshufb m7, m21 + vpdpwssd m23, m13, m7 ; a1h + vpdpwssd m0, m15, m7 ; a3l + pshufb m7, m8, m20 + vpdpwssd m23, m14, m7 ; a2h + pshufb m6, m21 + vpdpwssd m0, m13, m6 ; a1l + pshufb m8, m21 + vpdpwssd m23, m15, m8 ; a3h +%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2] + movu m6, [srcq+%3*%4+ 0] + movu m7, [srcq+%3*%4+ 8] + movu m8, [srcq+%3*%4+16] +%if %4 == 2 + add srcq, r6 +%endif + movu m29, [srcq+%3*%5+ 0] + movu m30, [srcq+%3*%5+ 8] + movu m31, [srcq+%3*%5+16] +%if %5 == 2 + add srcq, r6 +%endif + mova m%1, m10 + mova m9, m10 + pshufb m%2, m6, m20 + vpdpwssd m%1, m12, m%2 ; x0l + pshufb m%2, m29, m20 + vpdpwssd m9, m12, m%2 ; y0l + pshufb m6, m21 + vpdpwssd m%1, m13, m6 ; x1l + pshufb m29, m21 + vpdpwssd m9, m13, m29 ; y1l + pshufb m6, m7, m20 + mova m%2, m10 + vpdpwssd m%2, m12, m6 ; x0h + pshufb m29, m30, m20 + vpdpwssd m%1, m14, m6 ; y2l + mova m6, m10 + vpdpwssd m6, m12, m29 ; x0h + pshufb m7, m21 + vpdpwssd m9, m14, m29 ; y2l + pshufb m30, m21 + vpdpwssd m%2, m13, m7 ; x1h + vpdpwssd m%1, m15, m7 ; x3l + pshufb m7, m8, m20 + vpdpwssd m6, m13, m30 ; y1h + vpdpwssd m9, m15, m30 ; y3l + pshufb m30, m31, m20 + vpdpwssd m%2, m14, m7 ; x2h + pshufb m8, m21 + vpdpwssd m6, m14, m30 ; y2h + pshufb m31, m21 + vpdpwssd m%2, m15, m8 ; x3h + vpdpwssd m6, m15, m31 ; y3h +%if %1 == 1 + vpermt2b m0, m22, m%1 ; 01l + vpermt2b m23, m22, m%2 ; 01h +%endif + vpermt2b m%1, m22, m9 ; xyl + vpermt2b m%2, m22, m6 ; xyh +%endmacro + PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12 + PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34 + PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56 + vpshrdd m2, m1, m3, 16 ; 23l + vpshrdd m25, m24, m26, 16 ; 23h + vpshrdd m4, m3, m5, 16 ; 45l + vpshrdd m27, m26, m28, 16 ; 45h +.hv_w32_loop: + movu m7, [srcq+ssq*1+ 0] + movu m9, [srcq+ssq*2+ 0] + movu m6, [srcq+ssq*1+ 8] + movu m8, [srcq+ssq*2+ 8] + mova m29, m10 + mova m31, m10 + pshufb m30, m7, m20 + vpdpwssd m29, m12, m30 ; h0l + pshufb m30, m9, m20 + vpdpwssd m31, m12, m30 ; i0l + pshufb m7, m21 + vpdpwssd m29, m13, m7 ; h1l + pshufb m9, m21 + vpdpwssd m31, m13, m9 ; i1l + pshufb m7, m6, m20 + vpdpwssd m29, m14, m7 ; h2l + pshufb m9, m8, m20 + vpdpwssd m31, m14, m9 ; i2l + pshufb m6, m21 + vpdpwssd m29, m15, m6 ; h3l + pshufb m8, m21 + vpdpwssd m31, m15, m8 ; i3l + mova m30, m10 + vpdpwssd m30, m12, m7 ; h0h + movu m7, [srcq+ssq*1+16] + lea srcq, [srcq+ssq*2] + vpermt2b m29, m22, m31 ; 78l + mova m31, m10 + vpdpwssd m31, m12, m9 ; i0h + 
movu m9, [srcq+ssq*0+16] + vpdpwssd m30, m13, m6 ; h1h + pshufb m6, m7, m20 + vpdpwssd m31, m13, m8 ; i1h + pshufb m8, m9, m20 + vpdpwssd m30, m14, m6 ; h2h + pmaddwd m6, m16, m0 ; A0l + pshufb m7, m21 + vpdpwssd m31, m14, m8 ; i2h + pmaddwd m8, m16, m23 ; A0h + pshufb m9, m21 + vpdpwssd m30, m15, m7 ; h3h + pmaddwd m7, m16, m1 ; B0l + vpdpwssd m31, m15, m9 ; i3h + pmaddwd m9, m16, m24 ; B0h + mova m0, m2 + vpdpwssd m6, m17, m2 ; A1l + mova m23, m25 + vpdpwssd m8, m17, m25 ; A1h + mova m1, m3 + vpdpwssd m7, m17, m3 ; B1l + mova m24, m26 + vpdpwssd m9, m17, m26 ; B1h + vpermt2b m30, m22, m31 ; 78h + vpdpwssd m6, m18, m4 ; A2l + mova m2, m4 + vpdpwssd m8, m18, m27 ; A2h + mova m25, m27 + vpdpwssd m7, m18, m5 ; B2l + mova m3, m5 + vpdpwssd m9, m18, m28 ; B2h + mova m26, m28 + vpshrdd m4, m5, m29, 16 ; 67l + vpdpwssd m6, m19, m4 ; A3l + vpshrdd m27, m28, m30, 16 ; 67h + vpdpwssd m8, m19, m27 ; A3h + mova m5, m29 + vpdpwssd m7, m19, m29 ; B3l + mova m28, m30 + vpdpwssd m9, m19, m30 ; B3h + REPX {psrad x, 10}, m6, m8, m7, m9 + packusdw m6, m8 + packusdw m7, m9 + pminsw m6, m11 + pminsw m7, m11 + mova [dstq+dsq*0], m6 + mova [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w32_loop + add r7, 64 + add r8, 64 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .hv_w32_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +MC_8TAP_FN prep, sharp, SHARP, SHARP +MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH +MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR +MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP +MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN prep, regular, REGULAR, REGULAR + +cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 +%define base r7-prep_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + mov wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [pw_8192] + movzx wd, word [r7+wq*2+table_offset(prep,)] + shr r5d, 11 + vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] + add wq, r7 + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mov r5d, r7m + vbroadcasti32x4 m4, [spel_h_shufA] + vbroadcasti32x4 m5, [spel_h_shufB] + shr r5d, 11 + mova ym9, [prep_endA] + psllw xmm0, [base+prep_hv_shift+r5*8] + mova [tmpq], xmm0 + vpbroadcastd m6, [tmpq+4] + vpbroadcastd m7, [tmpq+8] +.h_w4_loop: + movu xm2, [srcq+strideq*0] + vinserti32x4 ym2, [srcq+strideq*1], 1 + vinserti32x4 m2, [srcq+strideq*2], 2 + vinserti32x4 m2, [srcq+r6 ], 3 + lea srcq, [srcq+strideq*4] + mova m0, m10 + pshufb m1, m2, m4 + vpdpwssd m0, m6, m1 + pshufb m2, m5 + vpdpwssd m0, m7, m2 + vpermb m0, m9, m0 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m10, [prep_8tap_rnd] + lea r6, [strideq*3] + cmp wd, 4 + je .h_w4 + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + mov r5d, r7m + sub srcq, 6 + shr r5d, 11 + psllw xmm0, [base+prep_hv_shift+r5*8] + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + cmp wd, 16 + je .h_w16 + jg .h_w32 +.h_w8: + mova m6, 
[spel_h_shufA] + movu m7, [spel_h_shufB] + movu m8, [spel_h_shufC] + mova m9, [spel_h_shufD] + mova m11, [prep_endB] +.h_w8_loop: + movu ym4, [srcq+strideq*0] + vinserti32x8 m4, [srcq+strideq*1], 1 + movu ym5, [srcq+strideq*2] + vinserti32x8 m5, [srcq+r6 ], 1 + lea srcq, [srcq+strideq*4] + mova m0, m10 + mova m1, m10 + vpermb m2, m6, m4 + vpermb m3, m6, m5 + vpdpwssd m0, m12, m2 + vpdpwssd m1, m12, m3 + vpermb m2, m7, m4 + vpermb m3, m7, m5 + vpdpwssd m0, m13, m2 + vpdpwssd m1, m13, m3 + vpermb m2, m8, m4 + vpermb m3, m8, m5 + vpdpwssd m0, m14, m2 + vpdpwssd m1, m14, m3 + vpermb m2, m9, m4 + vpermb m3, m9, m5 + vpdpwssd m0, m15, m2 + vpdpwssd m1, m15, m3 + vpermt2b m0, m11, m1 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + vbroadcasti32x4 m6, [spel_h_shufA] + vbroadcasti32x4 m7, [spel_h_shufB] + mova m11, [prep_endC] +.h_w16_loop: + movu ym2, [srcq+strideq*0+ 0] + vinserti32x8 m2, [srcq+strideq*1+ 0], 1 + movu ym3, [srcq+strideq*0+16] + vinserti32x8 m3, [srcq+strideq*1+16], 1 + lea srcq, [srcq+strideq*2] + mova m0, m10 + mova m1, m10 + pshufb m4, m2, m6 + vpdpwssd m0, m12, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m14, m4 ; b2 + pshufb m4, m2, m7 + vpdpwssd m0, m13, m4 ; a1 + pshufb m4, m3, m7 + vpdpwssd m1, m15, m4 ; b3 + shufpd m2, m3, 0x55 + pshufb m4, m2, m6 + vpdpwssd m0, m14, m4 ; a2 + vpdpwssd m1, m12, m4 ; b0 + pshufb m2, m7 + vpdpwssd m0, m15, m2 ; a3 + vpdpwssd m1, m13, m2 ; b1 + vpermt2b m0, m11, m1 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + vbroadcasti32x4 m6, [spel_h_shufA] + lea srcq, [srcq+wq*2] + vbroadcasti32x4 m7, [spel_h_shufB] + neg wq + mova m11, [prep_endC] +.h_w32_loop0: + mov r6, wq +.h_w32_loop: + movu m2, [srcq+r6*2+ 0] + movu m3, [srcq+r6*2+ 8] + mova m0, m10 + mova m1, m10 + pshufb m4, m2, m6 + vpdpwssd m0, m12, m4 ; a0 + pshufb m4, m3, m6 + vpdpwssd m1, m12, m4 ; b0 + vpdpwssd m0, m14, m4 ; a2 + movu m4, [srcq+r6*2+16] + pshufb m3, m7 + vpdpwssd m1, m13, m3 ; b1 + vpdpwssd m0, m15, m3 ; a3 + pshufb m3, m4, m6 + vpdpwssd m1, m14, m3 ; b2 + pshufb m2, m7 + vpdpwssd m0, m13, m2 ; a1 + pshufb m4, m7 + vpdpwssd m1, m15, m4 ; b3 + vpermt2b m0, m11, m1 + mova [tmpq], m0 + add tmpq, 64 + add r6, 32 + jl .h_w32_loop + add srcq, strideq + dec hd + jg .h_w32_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mov r5d, r7m + vpbroadcastd m10, [prep_8tap_rnd] + pmovsxbw xmm0, [base+subpel_filters+myq*8] + tzcnt r6d, wd + shr r5d, 11 + movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] + psllw xmm0, [base+prep_hv_shift+r5*8] + add r7, r6 + lea r6, [strideq*3] + sub srcq, r6 + mova [tmpq], xmm0 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + jmp r7 +.v_w4: + movq xmm1, [srcq+strideq*0] + vpbroadcastq ymm0, [srcq+strideq*1] + vpbroadcastq ymm2, [srcq+strideq*2] + add srcq, r6 + vpbroadcastq ymm4, [srcq+strideq*0] + vpbroadcastq ymm3, [srcq+strideq*1] + vpbroadcastq ymm5, [srcq+strideq*2] + mova xm11, [prep_endA] + add srcq, r6 + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklwd ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+strideq*0] + vpblendd ymm2, ymm4, 0x30 + vpblendd ymm4, ymm3, 0x30 + punpcklwd ymm2, ymm4 ; 23 34 + vpblendd ymm3, ymm5, 0x30 + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 45 56 +.v_w4_loop: + vpbroadcastq ymm5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + mova ymm4, ym10 + vpdpwssd ymm4, ym12, ymm1 ; a0 b0 + mova ymm1, ymm2 + vpdpwssd ymm4, ym13, ymm2 ; a1 b1 + mova 
ymm2, ymm3 + vpdpwssd ymm4, ym14, ymm3 ; a2 b2 + vpblendd ymm3, ymm0, ymm5, 0x30 + vpbroadcastq ymm0, [srcq+strideq*0] + vpblendd ymm5, ymm0, 0x30 + punpcklwd ymm3, ymm5 ; 67 78 + vpdpwssd ymm4, ym15, ymm3 ; a3 b3 + vpermb ymm4, ym11, ymm4 + mova [tmpq], xmm4 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + vbroadcasti32x4 m2, [srcq+strideq*2] + vinserti32x4 m1, m2, [srcq+strideq*0], 0 + vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2 + add srcq, r6 + vinserti32x4 ym2, [srcq+strideq*0], 1 + vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4 + mova m6, [spel_v_shuf8] + movu xm0, [srcq+strideq*1] + vinserti32x4 ym0, [srcq+strideq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 + mova ym11, [prep_endB] + vpermb m1, m6, m1 ; 01 12 + vpermb m2, m6, m2 ; 23 34 + vpermb m3, m6, m0 ; 45 56 +.v_w8_loop: + vinserti32x4 m0, [srcq+strideq*1], 3 + lea srcq, [srcq+strideq*2] + movu xm5, [srcq+strideq*0] + mova m4, m10 + vpdpwssd m4, m12, m1 ; a0 b0 + mova m1, m2 + vshufi32x4 m0, m5, q1032 ; 6 7 8 + vpdpwssd m4, m13, m2 ; a1 b1 + mova m2, m3 + vpdpwssd m4, m14, m3 ; a2 b2 + vpermb m3, m6, m0 ; 67 78 + vpdpwssd m4, m15, m3 ; a3 b3 + vpermb m4, m11, m4 + mova [tmpq], ym4 + add tmpq, 32 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti32x8 m1, [srcq+strideq*1] + vinserti32x8 m0, m1, [srcq+strideq*0], 0 + vinserti32x8 m1, [srcq+strideq*2], 1 + mova m8, [spel_v_shuf16] + add srcq, r6 + movu ym3, [srcq+strideq*0] + vinserti32x8 m3, [srcq+strideq*1], 1 + movu ym5, [srcq+strideq*2] + add srcq, r6 + vinserti32x8 m5, [srcq+strideq*0], 1 + mova m11, [prep_endA] + vpermb m0, m8, m0 ; 01 + vpermb m1, m8, m1 ; 12 + vpermb m3, m8, m3 ; 34 + vpermb m5, m8, m5 ; 56 + vpshrdd m2, m1, m3, 16 ; 23 + vpshrdd m4, m3, m5, 16 ; 45 +.v_w16_loop: + mova m6, m10 + mova m7, m10 + vpdpwssd m6, m12, m0 ; a0 + mova m0, m2 + vpdpwssd m7, m12, m1 ; b0 + mova m1, m3 + vpdpwssd m6, m13, m2 ; a1 + mova m2, m4 + vpdpwssd m7, m13, m3 ; b1 + mova m3, m5 + vpdpwssd m6, m14, m4 ; a2 + mova m4, m5 + vpdpwssd m7, m14, m5 ; b2 + movu ym5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m5, [srcq+strideq*0], 1 + vpermb m5, m8, m5 ; 78 + vpshrdd m4, m5, 16 ; 67 + vpdpwssd m6, m15, m4 ; a3 + vpdpwssd m7, m15, m5 ; b3 + vpermt2b m6, m11, m7 + mova [tmpq], m6 + add tmpq, 64 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + PUSH r8 + movaps [rsp+stack_offset+8], xmm6 +%endif + lea r5, [hq+wq*8-256] + mov r7, srcq + mov r8, tmpq +.v_w32_loop0: + movu m16, [srcq+strideq*0] + movu m17, [srcq+strideq*1] + movu m18, [srcq+strideq*2] + add srcq, r6 + movu m19, [srcq+strideq*0] + movu m20, [srcq+strideq*1] + movu m21, [srcq+strideq*2] + add srcq, r6 + movu m22, [srcq+strideq*0] + mova m11, [prep_endC] + punpcklwd m0, m16, m17 ; 01l + punpckhwd m16, m17 ; 01h + punpcklwd m1, m17, m18 ; 12l + punpckhwd m17, m18 ; 12h + punpcklwd m2, m18, m19 ; 23l + punpckhwd m18, m19 ; 23h + punpcklwd m3, m19, m20 ; 34l + punpckhwd m19, m20 ; 34h + punpcklwd m4, m20, m21 ; 45l + punpckhwd m20, m21 ; 45h + punpcklwd m5, m21, m22 ; 56l + punpckhwd m21, m22 ; 56h +.v_w32_loop: + mova m6, m10 + vpdpwssd m6, m12, m0 ; a0l + mova m8, m10 + vpdpwssd m8, m12, m16 ; a0h + mova m7, m10 + vpdpwssd m7, m12, m1 ; b0l + mova m9, m10 + vpdpwssd m9, m12, m17 ; b0h + mova m0, m2 + vpdpwssd m6, m13, m2 ; a1l + mova m16, m18 + vpdpwssd m8, m13, m18 ; a1h + mova m1, m3 + vpdpwssd m7, m13, m3 ; b1l + mova m17, m19 + vpdpwssd m9, m13, m19 ; b1h + mova m2, m4 + vpdpwssd m6, m14, m4 ; a2l + mova m18, m20 + vpdpwssd m8, 
m14, m20 ; a2h + mova m3, m5 + vpdpwssd m7, m14, m5 ; b2l + mova m19, m21 + vpdpwssd m9, m14, m21 ; b2h + movu m21, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklwd m4, m22, m21 ; 67l + punpckhwd m20, m22, m21 ; 67h + movu m22, [srcq+strideq*0] + vpdpwssd m6, m15, m4 ; a3l + vpdpwssd m8, m15, m20 ; a3h + punpcklwd m5, m21, m22 ; 78l + punpckhwd m21, m22 ; 78h + vpdpwssd m7, m15, m5 ; b3l + vpdpwssd m9, m15, m21 ; b3h + vpermt2b m6, m11, m8 + vpermt2b m7, m11, m9 + mova [tmpq+wq*0], m6 + mova [tmpq+wq*2], m7 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w32_loop + add r7, 64 + add r8, 64 + movzx hd, r5b + mov srcq, r7 + mov tmpq, r8 + sub r5d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + movaps xmm6, [rsp+stack_offset+8] + POP r8 +%endif + vzeroupper + RET +.hv: + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + mov r5d, r7m + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [strideq*3] + sub srcq, 2 + shr r5d, 11 + sub srcq, r6 + psllw xmm0, [base+prep_hv_shift+r5*8] + psllw xmm1, 2 + vpbroadcastd m10, [prep_8tap_rnd] + vpbroadcastd ym11, [pd_128] + mova xm21, [prep_endA] + mova [tmpq+ 0], xmm0 + mova [tmpq+16], xmm1 + vpbroadcastd m8, [tmpq+ 4] + vpbroadcastd m9, [tmpq+ 8] + vpbroadcastd ym12, xmm1 + vpbroadcastd ym13, [tmpq+20] + vpbroadcastd ym14, [tmpq+24] + vpbroadcastd ym15, [tmpq+28] + movu xm4, [srcq+strideq*0] + vinserti32x4 ym4, [srcq+strideq*1], 1 + vinserti32x4 m4, [srcq+strideq*2], 2 + add srcq, r6 + vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3 + movu xm0, [srcq+strideq*1] + vinserti32x4 ym0, [srcq+strideq*2], 1 + add srcq, r6 + vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 + vbroadcasti32x4 m19, [spel_h_shufA] + vbroadcasti32x4 m20, [spel_h_shufB] + mova ym6, [spel_shuf4a] + mova ym7, [spel_shuf4b] + mova m2, m10 + mova m3, m10 + pshufb m1, m4, m19 + vpdpwssd m2, m8, m1 + pshufb m1, m0, m19 + vpdpwssd m3, m8, m1 + pshufb m4, m20 + vpdpwssd m2, m9, m4 + pshufb m0, m20 + vpdpwssd m3, m9, m0 + vpermb m1, m6, m2 ; 01 12 + vshufi32x4 m2, m3, q1032 + vpermb m3, m6, m3 ; 45 56 + vpermb m2, m6, m2 ; 23 34 +.hv_w4_loop: + movu xm18, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti128 ym18, [srcq+strideq*0], 1 + mova ym16, ym11 + mova ym4, ym10 + pshufb ym17, ym18, ym19 + vpdpwssd ym16, ym12, ym1 ; a0 b0 + vpdpwssd ym4, ym8, ym17 + pshufb ym18, ym20 + mova ym1, ym2 + vpdpwssd ym16, ym13, ym2 ; a1 b1 + vpdpwssd ym4, ym9, ym18 ; 7 8 + mova ym2, ym3 + vpdpwssd ym16, ym14, ym3 ; a2 b2 + vpermt2b ym3, ym7, ym4 ; 67 78 + vpdpwssd ym16, ym15, ym3 ; a3 b3 + vpermb ym16, ym21, ym16 + mova [tmpq], xm16 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + vzeroupper + RET +.hv_w8: + shr mxd, 16 + pmovsxbw xmm0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + mov r5d, r7m + pmovsxbw xmm1, [base+subpel_filters+myq*8] + lea r6, [strideq*3] + sub srcq, 6 + shr r5d, 11 + sub srcq, r6 + vpbroadcastd m10, [prep_8tap_rnd] + vpbroadcastd m11, [pd_128] + psllw xmm0, [base+prep_hv_shift+r5*8] + psllw xmm1, 2 + mova [tmpq+ 0], xmm0 + mova [tmpq+16], xmm1 + vpbroadcastd m12, xmm0 + vpbroadcastd m13, [tmpq+ 4] + vpbroadcastd m14, [tmpq+ 8] + vpbroadcastd m15, [tmpq+12] + vpbroadcastd m16, xmm1 + vpbroadcastd m17, [tmpq+20] + vpbroadcastd m18, [tmpq+24] + vpbroadcastd m19, [tmpq+28] + cmp wd, 16 + je .hv_w16 + jg .hv_w32 + WIN64_SPILL_XMM 23 + mova m5, [spel_h_shufA] + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 + movu ym9, [srcq+strideq*2] + 
add srcq, r6 + vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 + movu ym20, [srcq+strideq*1] + vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 + add srcq, r6 + movu ym21, [srcq+strideq*0] ; 6 + movu m6, [spel_h_shufB] + movu m7, [spel_h_shufC] + mova ym22, [prep_endB] + vpermb m8, m5, m0 + mova m1, m10 + vpdpwssd m1, m12, m8 ; a0 b0 + vpermb m8, m5, m9 + mova m2, m10 + vpdpwssd m2, m12, m8 ; c0 d0 + vpermb m8, m5, m20 + mova m3, m10 + vpdpwssd m3, m12, m8 ; e0 f0 + vpermb m8, m5, m21 + mova m4, m10 + vpdpwssd m4, m12, m8 ; g0 + vpermb m8, m6, m0 + vpdpwssd m1, m13, m8 ; a1 b1 + vpermb m8, m6, m9 + vpdpwssd m2, m13, m8 ; c1 d1 + vpermb m8, m6, m20 + vpdpwssd m3, m13, m8 ; e1 f1 + vpermb m8, m6, m21 + vpdpwssd m4, m13, m8 ; g1 + vpermb m8, m7, m0 + vpdpwssd m1, m14, m8 ; a2 b2 + vpermb m8, m7, m9 + vpdpwssd m2, m14, m8 ; c2 d2 + vpermb m8, m7, m20 + vpdpwssd m3, m14, m8 ; e2 f2 + vpermb m8, m7, m21 + vpdpwssd m4, m14, m8 ; g2 + mova m8, [spel_h_shufD] + vpermb m0, m8, m0 + vpdpwssd m1, m15, m0 ; a3 b3 + mova m0, [spel_shuf8a] + vpermb m9, m8, m9 + vpdpwssd m2, m15, m9 ; c3 d3 + mova m9, [spel_shuf8b] + vpermb m20, m8, m20 + vpdpwssd m3, m15, m20 ; e3 f3 + vpermb m21, m8, m21 + vpdpwssd m4, m15, m21 ; g3 + vpermt2b m1, m0, m2 ; 01 12 + vpermt2b m2, m0, m3 ; 23 34 + vpermt2b m3, m0, m4 ; 45 56 +.hv_w8_loop: + movu ym0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m0, [srcq+strideq*0], 1 + mova m4, m10 + mova m20, m11 + vpermb m21, m5, m0 + vpdpwssd m4, m12, m21 ; h0 i0 + vpermb m21, m6, m0 + vpdpwssd m20, m16, m1 ; A0 B0 + vpdpwssd m4, m13, m21 ; h1 i1 + vpermb m21, m7, m0 + mova m1, m2 + vpdpwssd m20, m17, m2 ; A1 B1 + vpdpwssd m4, m14, m21 ; h2 i2 + vpermb m21, m8, m0 + mova m2, m3 + vpdpwssd m20, m18, m3 ; A2 B2 + vpdpwssd m4, m15, m21 ; h3 i3 + vpermt2b m3, m9, m4 ; 67 78 + vpdpwssd m20, m19, m3 ; A3 B3 + vpermb m20, m22, m20 + mova [tmpq], ym20 + add tmpq, 32 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 27 + vbroadcasti32x8 m5, [srcq+strideq*0+ 8] + vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 + vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 + movu ym6, [srcq+strideq*1+ 0] + movu ym7, [srcq+strideq*1+16] + vinserti32x8 m6, [srcq+strideq*2+ 0], 1 + vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 + add srcq, r6 + movu ym22, [srcq+strideq*0+ 0] + movu ym23, [srcq+strideq*0+16] + vinserti32x8 m22, [srcq+strideq*1+ 0], 1 + vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4 + movu ym24, [srcq+strideq*2+ 0] + movu ym25, [srcq+strideq*2+16] + add srcq, r6 + vinserti32x8 m24, [srcq+strideq*0+ 0], 1 + vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6 + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m9, [spel_shuf16] + mova m26, [prep_endB] + pshufb m0, m4, m20 + mova m1, m10 + vpdpwssd m1, m12, m0 ; a0 + pshufb m0, m6, m20 + mova m2, m10 + vpdpwssd m2, m12, m0 ; b0 + pshufb m0, m7, m20 + mova m3, m10 + vpdpwssd m3, m14, m0 ; c2 + pshufb m0, m4, m21 + vpdpwssd m1, m13, m0 ; a1 + pshufb m0, m6, m21 + vpdpwssd m2, m13, m0 ; b1 + pshufb m0, m7, m21 + vpdpwssd m3, m15, m0 ; c3 + pshufb m0, m5, m20 + vpdpwssd m1, m14, m0 ; a2 + shufpd m6, m7, 0x55 + pshufb m7, m6, m20 + vpdpwssd m2, m14, m7 ; b2 + vpdpwssd m3, m12, m7 ; c0 + pshufb m5, m21 + vpdpwssd m1, m15, m5 ; a3 + pshufb m6, m21 + vpdpwssd m2, m15, m6 ; b3 + vpdpwssd m3, m13, m6 ; c1 + pshufb m0, m22, m20 + mova m4, m10 + vpdpwssd m4, m12, m0 ; d0 + pshufb m0, m23, m20 + mova m5, m10 + vpdpwssd m5, m14, m0 ; e2 + pshufb m0, m24, m20 + mova m6, m10 + 
vpdpwssd m6, m12, m0 ; f0 + pshufb m0, m25, m20 + mova m7, m10 + vpdpwssd m7, m14, m0 ; g2 + pshufb m0, m22, m21 + vpdpwssd m4, m13, m0 ; d1 + pshufb m0, m23, m21 + vpdpwssd m5, m15, m0 ; e3 + pshufb m0, m24, m21 + vpdpwssd m6, m13, m0 ; f1 + pshufb m0, m25, m21 + vpdpwssd m7, m15, m0 ; g3 + shufpd m22, m23, 0x55 + pshufb m23, m22, m20 + vpdpwssd m4, m14, m23 ; d2 + vpdpwssd m5, m12, m23 ; e0 + shufpd m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m6, m14, m25 ; f2 + vpdpwssd m7, m12, m25 ; g0 + pshufb m22, m21 + vpdpwssd m4, m15, m22 ; d3 + vpdpwssd m5, m13, m22 ; e1 + pshufb m24, m21 + vpdpwssd m6, m15, m24 ; f3 + vpdpwssd m7, m13, m24 ; g1 + pslldq m1, 1 + vpermt2b m2, m9, m3 ; 12 + vpermt2b m4, m9, m5 ; 34 + vpermt2b m6, m9, m7 ; 56 + vpshrdd m1, m2, 16 ; 01 + vpshrdd m3, m2, m4, 16 ; 23 + vpshrdd m5, m4, m6, 16 ; 45 +.hv_w16_loop: + movu ym24, [srcq+strideq*1+ 0] + movu ym25, [srcq+strideq*1+16] + lea srcq, [srcq+strideq*2] + vinserti32x8 m24, [srcq+strideq*0+ 0], 1 + vinserti32x8 m25, [srcq+strideq*0+16], 1 + mova m7, m10 + mova m8, m10 + pshufb m0, m24, m20 + vpdpwssd m7, m12, m0 ; h0 + mova m22, m11 + pshufb m0, m25, m20 + vpdpwssd m8, m14, m0 ; i2 + mova m23, m11 + vpdpwssd m22, m16, m1 ; A0 + mova m1, m3 + vpdpwssd m23, m16, m2 ; B0 + mova m2, m4 + pshufb m0, m24, m21 + vpdpwssd m7, m13, m0 ; h1 + pshufb m0, m25, m21 + vpdpwssd m8, m15, m0 ; i3 + vpdpwssd m22, m17, m3 ; A1 + mova m3, m5 + vpdpwssd m23, m17, m4 ; B1 + mova m4, m6 + shufpd m24, m25, 0x55 + pshufb m25, m24, m20 + vpdpwssd m7, m14, m25 ; h2 + vpdpwssd m8, m12, m25 ; i0 + vpdpwssd m22, m18, m5 ; A2 + vpdpwssd m23, m18, m6 ; B2 + pshufb m24, m21 + vpdpwssd m7, m15, m24 ; h3 + vpdpwssd m8, m13, m24 ; i1 + vpermt2b m7, m9, m8 ; 78 + vpshrdd m5, m6, m7, 16 ; 67 + vpdpwssd m22, m19, m5 ; A3 + vpdpwssd m23, m19, m7 ; B3 + mova m6, m7 + vpermt2b m22, m26, m23 + mova [tmpq], m22 + add tmpq, 64 + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + PUSH r8 + %assign regs_used regs_used + 1 + WIN64_SPILL_XMM 32 +%endif + vbroadcasti32x4 m20, [spel_h_shufA] + vbroadcasti32x4 m21, [spel_h_shufB] + mova m22, [spel_shuf32] + lea r5d, [hq+wq*8-256] + mov r7, srcq + mov r8, tmpq +.hv_w32_loop0: + movu m6, [srcq+strideq*0+ 0] + movu m7, [srcq+strideq*0+ 8] + movu m8, [srcq+strideq*0+16] + mova m0, m10 + mova m23, m10 + pshufb m9, m6, m20 + vpdpwssd m0, m12, m9 ; a0l + pshufb m9, m7, m20 + vpdpwssd m23, m12, m9 ; a0h + vpdpwssd m0, m14, m9 ; a2l + pshufb m7, m21 + vpdpwssd m23, m13, m7 ; a1h + vpdpwssd m0, m15, m7 ; a3l + pshufb m7, m8, m20 + vpdpwssd m23, m14, m7 ; a2h + pshufb m6, m21 + vpdpwssd m0, m13, m6 ; a1l + pshufb m8, m21 + vpdpwssd m23, m15, m8 ; a3h + PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12 + PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34 + PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56 + vpshrdd m2, m1, m3, 16 ; 23l + vpshrdd m25, m24, m26, 16 ; 23h + vpshrdd m4, m3, m5, 16 ; 45l + vpshrdd m27, m26, m28, 16 ; 45h +.hv_w32_loop: + movu m7, [srcq+strideq*1+ 0] + movu m9, [srcq+strideq*2+ 0] + movu m6, [srcq+strideq*1+ 8] + movu m8, [srcq+strideq*2+ 8] + mova m29, m10 + mova m31, m10 + pshufb m30, m7, m20 + vpdpwssd m29, m12, m30 ; h0l + pshufb m30, m9, m20 + vpdpwssd m31, m12, m30 ; i0l + pshufb m7, m21 + vpdpwssd m29, m13, m7 ; h1l + pshufb m9, m21 + vpdpwssd m31, m13, m9 ; i1l + pshufb m7, m6, m20 + vpdpwssd m29, m14, m7 ; h2l + pshufb m9, m8, m20 + vpdpwssd m31, m14, m9 ; i2l + pshufb m6, m21 + vpdpwssd m29, m15, m6 ; h3l + pshufb m8, m21 + vpdpwssd m31, m15, m8 ; i3l 
+ mova m30, m10 + vpdpwssd m30, m12, m7 ; h0h + movu m7, [srcq+strideq*1+16] + lea srcq, [srcq+strideq*2] + vpermt2b m29, m22, m31 ; 78l + mova m31, m10 + vpdpwssd m31, m12, m9 ; i0h + movu m9, [srcq+strideq*0+16] + vpdpwssd m30, m13, m6 ; h1h + pshufb m6, m7, m20 + vpdpwssd m31, m13, m8 ; i1h + pshufb m8, m9, m20 + vpdpwssd m30, m14, m6 ; h2h + mova m6, m11 + vpdpwssd m6, m16, m0 ; A0l + pshufb m7, m21 + vpdpwssd m31, m14, m8 ; i2h + mova m8, m11 + vpdpwssd m8, m16, m23 ; A0h + pshufb m9, m21 + vpdpwssd m30, m15, m7 ; h3h + mova m7, m11 + vpdpwssd m7, m16, m1 ; B0l + vpdpwssd m31, m15, m9 ; i3h + mova m9, m11 + vpdpwssd m9, m16, m24 ; B0h + mova m0, m2 + vpdpwssd m6, m17, m2 ; A1l + mova m23, m25 + vpdpwssd m8, m17, m25 ; A1h + mova m1, m3 + vpdpwssd m7, m17, m3 ; B1l + mova m24, m26 + vpdpwssd m9, m17, m26 ; B1h + vpermt2b m30, m22, m31 ; 78h + mova m31, [prep_endC] + vpdpwssd m6, m18, m4 ; A2l + mova m2, m4 + vpdpwssd m8, m18, m27 ; A2h + mova m25, m27 + vpdpwssd m7, m18, m5 ; B2l + mova m3, m5 + vpdpwssd m9, m18, m28 ; B2h + mova m26, m28 + vpshrdd m4, m5, m29, 16 ; 67l + vpdpwssd m6, m19, m4 ; A3l + vpshrdd m27, m28, m30, 16 ; 67h + vpdpwssd m8, m19, m27 ; A3h + mova m5, m29 + vpdpwssd m7, m19, m29 ; B3l + mova m28, m30 + vpdpwssd m9, m19, m30 ; B3h + vpermt2b m6, m31, m8 + vpermt2b m7, m31, m9 + mova [tmpq+wq*0], m6 + mova [tmpq+wq*2], m7 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w32_loop + add r7, 64 + add r8, 64 + movzx hd, r5b + mov srcq, r7 + mov tmpq, r8 + sub r5d, 1<<8 + jg .hv_w32_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts +%define base r6-pd_0to7 + mov t0d, r7m + lea r6, [pd_0to7] + shr t0d, 11 + vpbroadcastd m8, [base+warp_8x8t_rnd_v] + vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main + psrad m14, m16, 15 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + psrad m16, 15 + packssdw m14, m16 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + psrad m15, m16, 15 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + add tsq, tsq + psrad m16, 15 + packssdw m15, m16 + jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end + +cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd + mov t0d, r7m ; pixel_max + lea r6, [pd_0to7] + shr t0d, 11 + vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] + vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] + call .main + psrad m14, m16, 13 + call .main2 + psrad m16, 13 + packusdw m14, m16 + call .main2 + psrad m15, m16, 13 + call .main2 + vpbroadcastd m0, [base+bidir_shift+t0*4] + vpsrlvw m14, m0 + psrad m16, 13 + packusdw m15, m16 + vpsrlvw m15, m0 +.end: + mova m0, [base+warp8x8_end] + vpermb m16, m0, m14 + lea r2, [dsq*3] + mova [dstq+dsq*0], xm16 + vextracti128 [dstq+dsq*1], ym16, 1 + vextracti32x4 [dstq+dsq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + vpermb m16, m0, m15 + lea dstq, [dstq+dsq*4] + mova [dstq+dsq*0], xm16 + vextracti128 [dstq+dsq*1], ym16, 1 + vextracti32x4 [dstq+dsq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + RET +.main: + vpbroadcastd ym3, [base+pd_512] +%if WIN64 + mov abcdq, r5mp + vpaddd ym18, ym3, r6m {1to8} ; mx +%else + add r5d, 512 + vpbroadcastd ym18, r5d +%endif + vpaddd ym20, ym3, r7m {1to8} ; my + mova ym16, [base+pd_0to7] + vpbroadcastd ym19, [abcdq+4*0] ; alpha + vpbroadcastd ym21, [abcdq+4*1] ; gamma + lea r4, [ssq*3+6] + vpdpwssd ym18, ym19, ym16 ; tmx + vpdpwssd ym20, ym21, 
ym16 ; tmy + sub srcq, r4 + mova m10, [base+warp8x8_permA] + lea r4, [mc_warp_filter+64*8] + vbroadcasti32x4 m12, [base+warp8x8_permC] + kxnorb k1, k1, k1 + vbroadcasti32x4 m13, [base+warp8x8_permD] + movu ym5, [srcq+0] + vinserti32x8 m5, [srcq+8], 1 + psrad ym17, ym18, 10 + mova m11, [base+warp8x8_permB] + kmovb k2, k1 + vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 + psrad ym19, 16 ; beta + psrad ym21, 16 ; delta + paddd ym18, ym19 + vpermb m4, m10, m5 + vpbroadcastq m9, [base+warp_shift_h+t0*8] + pshufd m3, m3, q3120 + paddd m7, m1, m1 + pshufb m2, m3, m12 + vpdpwssd m1, m4, m2 + vpermb m5, m11, m5 + vshufi32x4 m4, m5, q1021 + pshufb m3, m13 + vpdpwssd m1, m4, m3 + call .h + psllq m2, m1, 32 + paddd m1, m2 + vpmultishiftqb m1, m9, m1 + vpshrdq m1, m0, 48 ; 01 12 + call .h + vpshrdq m2, m1, m0, 48 ; 23 34 + call .h + vpshrdq m3, m2, m0, 48 ; 45 56 +.main2: + call .h + psrad ym6, ym20, 10 + kmovb k1, k2 + paddd ym17, ym20, ym21 ; my += delta + vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 + psrad ym16, ym17, 10 + kmovb k2, k1 + vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 + shufps m5, m20, m6, q2020 + mova m16, m8 + pshufb m4, m5, m12 + vpdpwssd m16, m1, m4 ; a0 b0 + pshufb m5, m13 + mova m1, m2 + vpdpwssd m16, m2, m5 ; a1 b1 + shufps m6, m20, m6, q3131 + paddd ym20, ym17, ym21 + pshufb m4, m6, m12 + mova m2, m3 + vpdpwssd m16, m3, m4 ; a2 b2 + vpshrdq m3, m0, 48 ; 67 78 + pshufb m6, m13 + vpdpwssd m16, m3, m6 ; a3 b3 + ret +ALIGN function_align +.h: + movu ym16, [srcq+ssq*1] + psrad ym6, ym18, 10 + lea srcq, [srcq+ssq*2] + vinserti32x8 m5, m16, [srcq+ssq*0], 1 + kmovb k1, k2 + paddd ym17, ym18, ym19 ; mx += beta + vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 + psrad ym16, ym17, 10 + kmovb k2, k1 + vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 + vpermb m4, m10, m5 + shufps m16, m18, m6, q2020 + shufps m6, m18, m6, q3131 + mova m0, m7 + pshufb m18, m16, m12 + vpdpwssd m0, m4, m18 ; a0 b0 + vpermb m5, m11, m5 + pshufb m18, m6, m13 + vpdpwssd m0, m5, m18 ; a3 b3 + paddd ym18, ym17, ym19 + vshufi32x4 m17, m4, m5, q1021 + pshufb m16, m13 + vpdpwssd m0, m17, m16 ; a1 b1 + vshufi32x4 m4, m5, q2132 + pshufb m6, m12 + vpdpwssd m0, m4, m6 ; a2 b2 + vpmultishiftqb m0, m9, m0 ; a a b b + ret + +%macro BIDIR_FN 0 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xmm0, ym0, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + cmp hd, 8 + jl .w4_end + vextracti32x4 xmm0, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m0, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq ], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xmm0, ym1, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + vextracti32x4 xmm0, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m1, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: 
+ mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx512icl_table + lea r6, [avg_avx512icl_table] + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + vpbroadcastd m2, [base+avg_round+t0*4] + vpbroadcastd m3, [base+avg_shift+t0*4] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+64*0] + paddsw m0, [tmp2q+64*0] + mova m1, [tmp1q+64*1] + paddsw m1, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + vpsrlvw m0, m3 + vpsrlvw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg_avx512icl_table + lea r6, [w_avg_avx512icl_table] + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + vpbroadcastd m5, [base+w_avg_round+t0*4] + vpbroadcastd m7, [base+bidir_shift+t0*4] + add wq, r6 + mov r6d, r6m ; weight + lea t0d, [r6-16] + shl r6d, 16 + sub r6d, t0d ; 16-weight, weight + movifnidn hd, hm + vpbroadcastd m6, r6d + BIDIR_FN +ALIGN function_align +.main: + mova m3, [tmp1q+64*0] + mova m1, [tmp2q+64*0] + mova m0, [tmp1q+64*1] + mova m4, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + mova m0, m5 + vpdpwssd m0, m6, m2 + mova m2, m5 + vpdpwssd m2, m6, m1 + mova m1, m5 + vpdpwssd m1, m6, m3 + mova m3, m5 + vpdpwssd m3, m6, m4 + REPX {psrad x, 2}, m0, m2, m1, m3 + packusdw m0, m2 + packusdw m1, m3 + vpsrlvw m0, m7 + vpsrlvw m1, m7 + ret + +cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx512icl_table + lea r7, [mask_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_64] + vpbroadcastd m9, [base+mask_round+r6*4] + vpbroadcastd m10, [base+bidir_shift+r6*4] + mov maskq, maskmp + add wq, r7 + BIDIR_FN +ALIGN function_align +.main: + pmovzxbw m1, [maskq+32*0] + mova m4, [tmp1q+64*0] + mova m2, [tmp2q+64*0] + pmovzxbw m6, [maskq+32*1] + mova m5, [tmp1q+64*1] + mova m3, [tmp2q+64*1] + add maskq, 32*2 + add tmp1q, 64*2 + add tmp2q, 64*2 + punpcklwd m7, m4, m2 + punpckhwd m4, m2 + psubw m0, m8, m1 + punpcklwd m2, m1, m0 ; m, 64-m + punpckhwd m1, m0 + mova m0, m9 + vpdpwssd m0, m7, m2 + mova m2, m9 + vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) + punpcklwd m7, m5, m3 + punpckhwd m5, m3 + psubw m1, m8, m6 + punpcklwd m3, m6, m1 + punpckhwd m6, m1 + mova m1, m9 + vpdpwssd m1, m7, m3 + mova m3, m9 + vpdpwssd m3, m5, m6 + REPX {psrad x, 4}, m0, m2, m1, m3 + packusdw m0, m2 + packusdw m1, m3 + vpsrlvw m0, m10 + vpsrlvw m1, m10 + ret + +cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base 
r7-w_mask_420_avx512icl_table + lea r7, [w_mask_420_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+mask_round+r6*4] + vpbroadcastd m13, [base+bidir_shift+r6*4] + mov r6d, r7m ; sign + vpbroadcastd m14, [base+w_mask_round+r6*4] + mova ym15, [w_mask_end42x] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + mova m4, [w_mask_shuf4] + vpermt2b m2, m4, m3 + mova m3, m14 + vpdpbusd m3, m2, [pb_64] {1to16} + vpermb m3, m15, m3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xmm0, ym0, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + mova [maskq], xm3 + cmp hd, 8 + jl .w4_end + vextracti32x4 xmm0, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m0, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xmm0, ym1, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + vextracti32x4 xmm0, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m1, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 +.w4_end: + RET +.w8: + mova m8, [w_mask_shuf8] + vpbroadcastd m9, [pb_64] + jmp .w8_start +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8_start: + vpermt2b m2, m8, m3 + mova m3, m14 + vpdpbusd m3, m2, m9 + vpermb m3, m15, m3 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + mova [maskq], xm3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16: + mova m8, [w_mask_shuf16] + vpbroadcastd m9, [pb_64] + jmp .w16_start +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16_start: + vpermt2b m2, m8, m3 + mova m3, m14 + vpdpbusd m3, m2, m9 + vpermb m3, m15, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + mova [maskq], xm3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w32: + paddw m2, m3 + mova m8, m14 + vpdpwssd m8, m11, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + call .main + paddw m2, m3 + mova m3, m14 + vpdpwssd m3, m11, m2 + vpermt2b m8, m15, m3 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + mova [maskq], ym8 + sub hd, 4 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w64: + mova m8, m2 + mova m9, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + call .main + paddw m8, m2 + paddw m9, m3 + mova m2, m14 + vpdpwssd m2, m11, m8 + mova m3, m14 + vpdpwssd m3, m11, m9 + vpermt2b m2, m15, m3 + mova [dstq+strideq*1+64*0], m0 + mova [dstq+strideq*1+64*1], m1 + mova [maskq], ym2 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 64 +.w128: + mova m16, m2 + mova m8, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + call 
.main + mova m17, m2 + mova m9, m3 + mova [dstq+strideq*0+64*2], m0 + mova [dstq+strideq*0+64*3], m1 + call .main + paddw m2, m16 + paddw m3, m8 + mova m16, m14 + vpdpwssd m16, m11, m2 + mova m8, m14 + vpdpwssd m8, m11, m3 + mova [dstq+strideq*1+64*0], m0 + mova [dstq+strideq*1+64*1], m1 + call .main + paddw m2, m17 + paddw m3, m9 + mova m17, m14 + vpdpwssd m17, m11, m2 + mova m9, m14 + vpdpwssd m9, m11, m3 + vpermt2b m16, m15, m8 + vpermt2b m17, m15, m9 + mova [dstq+strideq*1+64*2], m0 + mova [dstq+strideq*1+64*3], m1 + mova [maskq+32*0], ym16 + mova [maskq+32*1], ym17 + sub hd, 2 + jg .w128_loop + vzeroupper + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m10, m6 + psrlw m6, 10 ; 64-m + psubw m2, m11, m6 ; m + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m12 + vpdpwssd m0, m5, m1 + mova m1, m12 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m10, m5 + psrlw m5, 10 + psubw m3, m11, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m12 + vpdpwssd m1, m6, m4 + mova m4, m12 + vpdpwssd m4, m7, m5 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpsrlvw m0, m13 + vpsrlvw m1, m13 + ret + +cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx512icl_table + lea r7, [w_mask_422_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m9, [base+pw_64] + vpbroadcastd m10, [base+mask_round+r6*4] + vpbroadcastd m11, [base+bidir_shift+r6*4] + mov r6d, r7m ; sign + vpbroadcastd m12, [base+w_mask_round+r6*4] + mova ym13, [w_mask_end42x] + mov maskq, maskmp + add wq, r7 + paddw m14, m9, m9 ; pw_128 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xmm0, ym0, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + cmp hd, 8 + jl .w4_end + vextracti32x4 xmm0, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m0, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xmm0, ym1, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + vextracti32x4 xmm0, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m1, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + 
jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m8, m6 + psrlw m6, 10 + psubw m2, m9, m6 + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m10 + vpdpwssd m0, m5, m1 + mova m1, m10 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m8, m5 + psrlw m5, 10 + psubw m3, m9, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m10 + vpdpwssd m1, m6, m4 + mova m4, m10 + vpdpwssd m4, m7, m5 + mova m5, m12 + vpdpwssd m5, m14, m2 + mova m2, m12 + vpdpwssd m2, m14, m3 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpermt2b m5, m13, m2 + vpsrlvw m0, m11 + vpsrlvw m1, m11 + mova [maskq], ym5 + add maskq, 32 + ret + +cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx512icl_table + lea r7, [w_mask_444_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m9, [base+pw_64] + vpbroadcastd m10, [base+mask_round+r6*4] + mova m11, [w_mask_end444] + vpbroadcastd m12, [base+bidir_shift+r6*4] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xmm0, ym0, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + cmp hd, 8 + jl .w4_end + vextracti32x4 xmm0, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m0, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xmm0, ym1, 1 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 + vextracti32x4 xmm0, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + vextracti32x4 xmm0, m1, 3 + movq [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm0 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + 
mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m8, m6 + psrlw m6, 10 + psubw m2, m9, m6 + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m10 + vpdpwssd m0, m5, m1 + mova m1, m10 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m8, m5 + psrlw m5, 10 + psubw m3, m9, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m10 + vpdpwssd m1, m6, m4 + mova m4, m10 + vpdpwssd m4, m7, m5 + vpermt2b m2, m11, m3 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpsrlvw m0, m12 + vpsrlvw m1, m12 + mova [maskq], m2 + add maskq, 64 + ret + +cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx512icl_table + lea r6, [blend_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + vpbroadcastd m6, [base+pw_m512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + pmovzxbw ym19, [maskq] + movq xm16, [dstq+dsq*0] + movhps xm16, [dstq+dsq*1] + vpbroadcastq ym17, [dstq+dsq*2] + vpbroadcastq ym18, [dstq+r6 ] + pmullw ym19, ym6 + vpblendd ym16, ym17, 0x30 + vpblendd ym16, ym18, 0xc0 + psubw ym17, ym16, [tmpq] + add maskq, 16 + add tmpq, 32 + pmulhrsw ym17, ym19 + paddw ym16, ym17 + vextracti128 xm17, ym16, 1 + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + movq [dstq+dsq*2], xm17 + movhps [dstq+r6 ], xm17 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + vzeroupper + RET +.w8: + pmovzxbw m2, [maskq] + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + vinserti32x4 m0, [dstq+dsq*2], 2 + vinserti32x4 m0, [dstq+r6 ], 3 + pmullw m2, m6 + psubw m1, m0, [tmpq] + add maskq, 32 + add tmpq, 64 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + vextracti32x4 [dstq+dsq*2], m0, 2 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + mova ym1, [dstq+dsq*2] + vinserti32x8 m1, [dstq+r6 ], 1 + pmullw m4, m6 + pmullw m5, m6 + psubw m2, m0, [tmpq+64*0] + psubw m3, m1, [tmpq+64*1] + add maskq, 32*2 + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + mova [dstq+dsq*2], ym1 + vextracti32x8 [dstq+r6 ], m1, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w16 + RET +.w32: + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova m0, [dstq+dsq*0] + mova m1, [dstq+dsq*1] + pmullw m4, m6 + pmullw m5, m6 + psubw m2, m0, [tmpq+ 64*0] + psubw m3, m1, [tmpq+ 64*1] + add maskq, 32*2 + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + lea r5, [blend_v_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm 
+ movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + vpbroadcastd xmm2, [obmc_masks_avx2+2*2] +.w2_loop: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movq xmm1, [tmpq] + add tmpq, 4*2 + psubw xmm1, xmm0, xmm1 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + vpbroadcastq xmm2, [obmc_masks_avx2+4*2] +.w4_loop: + movq xmm0, [dstq+dsq*0] + movhps xmm0, [dstq+dsq*1] + psubw xmm1, xmm0, [tmpq] + add tmpq, 8*2 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + psubw ym1, ym0, [tmpq] + add tmpq, 16*2 + pmulhrsw ym1, ym2 + paddw ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] +.w16_loop: + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + psubw m1, m0, [tmpq] + add tmpq, 32*2 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + mova m4, [obmc_masks_avx2+32*2] +.w32_loop: + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 64*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop + RET + +cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask +%define base r6-$$ + lea r6, [$$] + tzcnt wd, wm + mov hd, hm + movsxd wq, [base+blend_h_avx512icl_table+wq*4] + lea maskq, [base+obmc_masks_avx2+hq*2] + lea hd, [hq*3] + lea wq, [base+blend_h_avx512icl_table+wq] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] + movq xmm1, [tmpq] + add tmpq, 4*2 + punpcklwd xmm2, xmm2 + psubw xmm1, xmm0, xmm1 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova xmm3, [blend_shuf] +.w4_loop: + movq xmm0, [dstq+dsq*0] + movhps xmm0, [dstq+dsq*1] + movd xmm2, [maskq+hq*2] + psubw xmm1, xmm0, [tmpq] + add tmpq, 8*2 + pshufb xmm2, xmm3 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + vbroadcasti32x4 ym3, [blend_shuf] + shufpd ym3, ym3, 0x0c +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + vpbroadcastd ym2, [maskq+hq*2] + psubw ym1, ym0, [tmpq] + add tmpq, 16*2 + pshufb ym2, ym3 + pmulhrsw ym1, ym2 + paddw ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +.w16: + vbroadcasti32x4 m3, [blend_shuf] + shufpd m3, m3, 0xf0 +.w16_loop: + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + vpbroadcastd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 32*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +.w32: + vpbroadcastw m4, [maskq+hq*2] + vpbroadcastw m5, [maskq+hq*2+2] + mova m0, [dstq+dsq*0] + psubw m2, m0, 
[tmpq+ 64*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w32 + RET +.w64: + vpbroadcastw m4, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m2, m0, [tmpq+64*0] + mova m1, [dstq+64*1] + psubw m3, m1, [tmpq+64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m8, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m4, m0, [tmpq+64*0] + mova m1, [dstq+64*1] + psubw m5, m1, [tmpq+64*1] + mova m2, [dstq+64*2] + psubw m6, m2, [tmpq+64*2] + mova m3, [dstq+64*3] + psubw m7, m3, [tmpq+64*3] + add tmpq, 64*4 + REPX {pmulhrsw x, m8}, m4, m5, m6, m7 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + inc hq + jl .w128 + RET + +cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0, pxmax + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + mov r6, ~0 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + kmovq k6, r6 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax + LEA r7, $$ +%define base r7-$$ + vpbroadcastd m3, [base+pd_16384] + vpbroadcastd m7, [base+pd_63] + mova m24, [base+resize_permA] + mova m25, [base+resize_permB] + mova m26, [base+resize_permC] + mova m27, [base+resize_permD] + vbroadcasti32x4 m28, [base+resize_shufA] + vbroadcasti32x4 m29, [base+resize_shufB] + mova m30, [base+resize_permE] + vpbroadcastw ym31, pxmaxm + vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] + pslld m5, 4 ; dx*16 + pslld m6, 14 + pxor m2, m2 +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + vptestmd k5, m1, m1 + pand m9, m7 ; filter offset (masked) + ktestw k5, k5 + jz .load + vpbroadcastq m14, [base+pd_0_4] + vpermq m10, m0, q1100 + vpermq m11, m0, q3322 + vpermq m20, m1, q1100 + vpermq m21, m1, q3322 + punpckldq m10, m10 + punpckldq m11, m11 + punpckldq m20, m20 + punpckldq m21, m21 + paddd m10, m14 + paddd m11, m14 + paddd m20, m14 + paddd m21, m14 + vextracti32x8 ym12, m10, 1 + vextracti32x8 ym13, m11, 1 + vextracti32x8 ym22, m20, 1 + vextracti32x8 ym23, m21, 1 + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 + vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7 + vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B + vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] + vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2] + vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2] + vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2] + pshufb m16, m0 + pshufb m17, m1 + pshufb m18, m14 + pshufb m19, m15 + mova m20, m24 + mova m22, m24 + mova m21, m25 + mova m23, m25 + vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b + vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d + vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb + vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd + mova m15, m26 + mova m17, m26 + mova m16, m27 + mova m18, m27 + vpermi2q m15, m20, 
m22 ; 0-3a 4-7a 8-Ba C-Fa + vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb + vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc + vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd + kmovq k1, k6 + kmovq k2, k6 + vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] + pshufb m10, m11, m28 + pshufb m11, m11, m29 + pshufb m12, m13, m28 + pshufb m13, m13, m29 + jmp .filter +.load: + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] + pshufb m10, m11, m28 + pshufb m11, m11, m29 + pshufb m12, m13, m28 + pshufb m13, m13, m29 + vpgatherdd m15{k3}, [srcq+m0*2+ 0] + vpgatherdd m16{k4}, [srcq+m0*2+ 4] + kmovq k1, k6 + kmovq k2, k6 + vpgatherdd m17{k1}, [srcq+m0*2+ 8] + vpgatherdd m18{k2}, [srcq+m0*2+12] +.filter: + mova m14, m2 + vpdpwssd m14, m15, m10 + vpdpwssd m14, m16, m11 + vpdpwssd m14, m17, m12 + vpdpwssd m14, m18, m13 + psubd m14, m3, m14 + psrad m14, 15 + packusdw m14, m14 + vpermq m14, m30, m14 + pminsw ym14, ym31 + mova [dstq+xq*2], ym14 + paddd m4, m5 + add xd, 16 + cmp xd, dst_wd + jl .loop_x + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/mc16_sse.asm dav1d-1.0.0/src/x86/mc16_sse.asm --- dav1d-0.9.2/src/x86/mc16_sse.asm 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/mc16_sse.asm 2022-03-18 14:31:56.030356000 +0000 @@ -41,9 +41,16 @@ spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 rescale_mul: dd 0, 1, 2, 3 resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 +bdct_lb_q: times 8 db 0 + times 8 db 4 + times 8 db 8 + times 8 db 12 pw_2: times 8 dw 2 pw_16: times 4 dw 16 @@ -60,10 +67,21 @@ pd_63: times 4 dd 63 pd_64: times 4 dd 64 pd_512: times 4 dd 512 +pd_m524256: times 4 dd -524256 ; -8192 << 6 + 32 +pd_0x3ff: times 4 dd 0x3ff +pd_0x4000: times 4 dd 0x4000 +pq_0x400000: times 2 dq 0x400000 +pq_0x40000000: times 2 dq 0x40000000 pd_65538: times 2 dd 65538 put_bilin_h_rnd: times 4 dw 8 times 4 dw 10 +s_8tap_h_rnd: times 2 dd 2 + times 2 dd 8 +put_s_8tap_v_rnd: times 2 dd 512 + times 2 dd 128 +s_8tap_h_sh: dd 2, 4 +put_s_8tap_v_sh: dd 10, 8 bidir_rnd: times 4 dw -16400 times 4 dw -16388 put_8tap_h_rnd: dd 34, 34, 40, 40 @@ -113,6 +131,33 @@ BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 +%macro SCALED_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) +%%table: + %rep %0 - 2 + dw %%base %+ .w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_1024: + %xdefine %1_%2_dy1_table (%%dy_1024 - %3) + %rep %0 - 2 + dw %%base %+ .dy1_w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_2048: + %xdefine %1_%2_dy2_table (%%dy_2048 - %3) + %rep %0 - 2 + dw %%base %+ .dy2_w%3 - %%base + %rotate 1 + %endrep +%endmacro + +SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 + cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) 
@@ -121,14 +166,6 @@ SECTION .text -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %if UNIX64 DECLARE_REG_TMP 7 %else @@ -289,7 +326,8 @@ sub wd, 8 jg .h_w16 je .h_w8 - jp .h_w4 + cmp wd, -4 + je .h_w4 .h_w2: movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] @@ -1120,8 +1158,8 @@ %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 -%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v -cglobal %1_8tap_%2_16bpc +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d @@ -1129,7 +1167,7 @@ mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) + jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) %endif %endmacro @@ -1141,15 +1179,16 @@ DECLARE_REG_TMP 7, 8, 8 %endif -MC_8TAP_FN put, sharp, SHARP, SHARP -MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH -MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP -MC_8TAP_FN put, smooth, SMOOTH, SMOOTH -MC_8TAP_FN put, sharp_regular, SHARP, REGULAR -MC_8TAP_FN put, regular_sharp, REGULAR, SHARP -MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR -MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH -MC_8TAP_FN put, regular, REGULAR, REGULAR +%define PUT_8TAP_FN FN put_8tap, +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR %if ARCH_X86_32 cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my @@ -1971,15 +2010,16 @@ DECLARE_REG_TMP 6, 7, 7, 8 %endif -MC_8TAP_FN prep, sharp, SHARP, SHARP -MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH -MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP -MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH -MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR -MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP -MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR -MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH -MC_8TAP_FN prep, regular, REGULAR, REGULAR +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR %if ARCH_X86_32 cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my @@ -2540,6 +2580,3917 @@ RET %undef tmp +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro SAVE_REG 1 + %xdefine r%1_save r%1 + %xdefine r%1q_save r%1q + %xdefine r%1d_save r%1d + %if ARCH_X86_32 + %define r%1m_save [rstk+stack_offset+(%1+1)*4] + %endif +%endmacro + +%macro LOAD_REG 1 + %xdefine r%1 r%1_save + %xdefine r%1q r%1q_save + %xdefine r%1d r%1d_save + %if ARCH_X86_32 + %define r%1m r%1m_save + %endif + %undef r%1d_save + %undef r%1q_save + %undef r%1_save +%endmacro + +%macro REMAP_REG 2-3 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d + %if ARCH_X86_32 + %if %3 == 0 + %xdefine r%1m r%2m + %else + %define r%1m [rstk+stack_offset+(%1+1)*4] + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %if ARCH_X86_64 + SAVE_REG 14 + %assign %%i 14 + 
%rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %else + SAVE_REG 5 + %assign %%i 5 + %rep 5 + %assign %%j %%i-1 + REMAP_REG %%i, %%j, 0 + %assign %%i %%i-1 + %endrep + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %if ARCH_X86_64 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + LOAD_REG 14 + %else + %rep 4 + %assign %%j %%i+1 + REMAP_REG %%i, %%j, 1 + %assign %%i %%i+1 + %endrep + LOAD_REG 5 + %endif + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%if ARCH_X86_32 + %macro MC_4TAP_SCALED_H 1 ; dst_mem + movu m7, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m5, [r4 +ssq*0] + movu m6, [r4 +ssq*1] + lea srcq, [srcq+ssq*2] + lea r4, [r4 +ssq*2] + REPX {pshufb x, m12}, m7, m2 + REPX {pmaddwd x, m13}, m7, m2 + REPX {pshufb x, m14}, m5, m6 + REPX {pmaddwd x, m15}, m5, m6 + phaddd m7, m5 + phaddd m2, m6 + mova m5, [esp+0x00] + movd m6, [esp+0x10] + paddd m7, m5 + paddd m2, m5 + psrad m7, m6 + psrad m2, m6 + packssdw m7, m2 + mova [stk+%1], m7 + %endmacro +%endif + +%if ARCH_X86_64 + %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movu m%1, [srcq+ r4*2] + movu m%2, [srcq+ r6*2] + movu m%3, [srcq+ r7*2] + movu m%4, [srcq+ r9*2] + movu m%5, [srcq+r10*2] + movu m%6, [srcq+r11*2] + movu m%7, [srcq+r13*2] + movu m%8, [srcq+ rX*2] + add srcq, ssq + pmaddwd m%1, [stk+0x10] + pmaddwd m%2, [stk+0x20] + pmaddwd m%3, [stk+0x30] + pmaddwd m%4, [stk+0x40] + pmaddwd m%5, [stk+0x50] + pmaddwd m%6, [stk+0x60] + pmaddwd m%7, [stk+0x70] + pmaddwd m%8, [stk+0x80] + phaddd m%1, m%2 + phaddd m%3, m%4 + phaddd m%5, m%6 + phaddd m%7, m%8 + phaddd m%1, m%3 + phaddd m%5, m%7 + paddd m%1, hround + paddd m%5, hround + psrad m%1, m12 + psrad m%5, m12 + packssdw m%1, m%5 + %endmacro +%else + %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets + %if %3 == 1 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + %endif + movu m0, [srcq+r0*2] + movu m1, [srcq+rX*2] + movu m2, [srcq+r4*2] + movu m3, [srcq+r5*2] + mov r0, [stk+16] + mov rX, [stk+20] + mov r4, [stk+24] + mov r5, [stk+28] + pmaddwd m0, [stk+%1+0x00] + pmaddwd m1, [stk+%1+0x10] + pmaddwd m2, [stk+%1+0x20] + pmaddwd m3, [stk+%1+0x30] + phaddd m0, m1 + phaddd m2, m3 + movu m4, [srcq+r0*2] + movu m5, [srcq+rX*2] + movu m6, [srcq+r4*2] + movu m7, [srcq+r5*2] + add srcq, ssq + pmaddwd m4, [stk+%1+0xa0] + pmaddwd m5, [stk+%1+0xb0] + pmaddwd m6, [stk+%1+0xc0] + pmaddwd m7, [stk+%1+0xd0] + phaddd m4, m5 + phaddd m6, m7 + phaddd m0, m2 + phaddd m4, m6 + paddd m0, hround + paddd m4, hround + psrad m0, m12 + psrad m4, m12 + packssdw m0, m4 + %if %2 != 0 + mova [stk+%2], m0 + %endif + %endmacro +%endif + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isput 1 + %assign isprep 0 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %endif + %xdefine base_reg r12 +%else 
; prep + %assign isput 0 + %assign isprep 1 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %define tmp_stridem qword [stk+0x138] + %endif + %xdefine base_reg r11 + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %define tmp_stridem dword [stk+0x138] + %endif +%endif +%if ARCH_X86_32 + mov [esp+0x1f0], t0d + mov [esp+0x1f4], t1d + %if isput && required_stack_alignment > STACK_ALIGNMENT + mov dstd, dstm + mov dsd, dsm + mov srcd, srcm + mov ssd, ssm + mov hd, hm + mov r4, mxm + %define r0m [esp+0x200] + %define dsm [esp+0x204] + %define dsmp dsm + %define r1m dsm + %define r2m [esp+0x208] + %define ssm [esp+0x20c] + %define r3m ssm + %define hm [esp+0x210] + %define mxm [esp+0x214] + mov r0m, dstd + mov dsm, dsd + mov r2m, srcd + mov ssm, ssd + mov hm, hd + mov r0, mym + mov r1, dxm + mov r2, dym + %define mym [esp+0x218] + %define dxm [esp+0x21c] + %define dym [esp+0x220] + mov mxm, r4 + mov mym, r0 + mov dxm, r1 + mov dym, r2 + tzcnt wd, wm + %endif + %if isput + mov r3, pxmaxm + %define pxmaxm r3 + %else + mov r2, pxmaxm + %endif + %if isprep && required_stack_alignment > STACK_ALIGNMENT + %xdefine base_reg r5 + %else + %xdefine base_reg r6 + %endif +%endif + LEA base_reg, %1_8tap_scaled_16bpc_ssse3 +%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 +%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT + tzcnt wd, wm +%endif +%if ARCH_X86_64 + %if isput + mov r7d, pxmaxm + %endif +%else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 +%endif + movd m8, dxm + movd m14, mxm +%if isput + movd m15, pxmaxm +%endif + pshufd m8, m8, q0000 + pshufd m14, m14, q0000 +%if isput + pshuflw m15, m15, q0000 + punpcklqdq m15, m15 +%endif +%if isprep + %if UNIX64 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 + %endif + %if ARCH_X86_64 + mov r6d, pxmaxm + %endif +%endif +%if ARCH_X86_64 + mov dyd, dym +%endif +%if isput + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %elif ARCH_X86_64 + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %else + %endif + %if ARCH_X86_64 + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x138] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif + %else + %define rX r1 + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %elif ARCH_X86_64 + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %xdefine hm r7m + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %if ARCH_X86_64 + %define rX r14 + %define rXd r14d + %else + %define rX r3 + %endif +%endif +%if ARCH_X86_64 + shr r7d, 11 + mova m10, [base+pd_0x3ff] + movddup m11, [base+s_8tap_h_rnd+r7*8] + movd m12, [base+s_8tap_h_sh+r7*4] + %if isput + movddup m13, [base+put_s_8tap_v_rnd+r7*8] + movd m7, [base+put_s_8tap_v_sh+r7*4] + %define pxmaxm [rsp] + mova pxmaxm, m15 + punpcklqdq m12, m7 + %endif + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q 
+%else + %define m10 [base+pd_0x3ff] + %define m11 [esp+0x00] + %define m12 [esp+0x10] + shr r3, 11 + movddup m1, [base+s_8tap_h_rnd+r3*8] + movd m2, [base+s_8tap_h_sh+r3*4] + %if isput + %define m13 [esp+0x20] + %define pxmaxm [esp+0x30] + %define stk esp+0x40 + movddup m5, [base+put_s_8tap_v_rnd+r3*8] + movd m6, [base+put_s_8tap_v_sh+r3*4] + mova pxmaxm, m15 + punpcklqdq m2, m6 + mova m13, m5 + %else + %define m13 [base+pd_m524256] + %endif + mov ssd, ssm + mova m11, m1 + mova m12, m2 + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + mov r1, [esp+0x1f4] + lea r0, [ssd*3] + movzx r2, r1b + shr r1, 16 + cmp dword hm, 6 + cmovs r1, r2 + mov [esp+0x1f4], r1 + %if isprep + mov r1, r1m + %endif + mov r2, r2m + sub srcq, r0 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define ss3q r0 + %define myd r4 + %define dyd dword dym + %define hd dword hm +%endif + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + movzx r4, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r4 + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6, m7 + REPX {pmaddwd x, m15}, m4, m5, m6, m7 + phaddd m0, m1 + phaddd m2, m3 + phaddd m4, m5 + phaddd m6, m7 + REPX {paddd x, m11}, m0, m2, m4, m6 + REPX {psrad x, m12}, m0, m2, m4, m6 + packssdw m0, m2 ; 0 1 2 3 + packssdw m4, m6 ; 4 5 6 7 + SWAP m1, m4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m2, m3 + movu m1, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m14}, m1, m7, m6, m3 + REPX {pmaddwd x, m15}, m1, m7, m6, m3 + phaddd m1, m7 + phaddd m6, m3 + REPX {paddd x, m11}, m0, m2, m1, m6 + REPX {psrad x, m12}, m0, m2, m1, m6 + packssdw m0, m2 + packssdw m1, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + %endif + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mov myd, mym + mov r0, r0m + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + 
%endif +.w2_loop: + and myd, 0x3ff + %if ARCH_X86_64 + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m10, r6q + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pmaddwd m5, m3, m7 + pmaddwd m6, m0, m8 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddwd m7, m2, m9 + pmaddwd m8, m4, m10 + paddd m5, m6 + paddd m7, m8 + %else + mov r1, [esp+0x1f4] + xor r3, r3 + mov r5, myd + shr r5, 6 + lea r1, [r1+r5] + mov r5, 64 << 24 + cmovnz r3, [base+subpel_filters+r1*8+4] + cmovnz r5, [base+subpel_filters+r1*8+0] + movd m6, r3 + movd m7, r5 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m5, m7, q0000 + pshufd m6, m7, q1111 + pmaddwd m3, m5 + pmaddwd m0, m6 + pshufd m5, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m2, m5 + pmaddwd m4, m7 + paddd m3, m0 + paddd m2, m4 + SWAP m5, m3 + SWAP m7, m2 + %define m8 m3 + %endif + paddd m5, m13 + pshufd m6, m12, q1032 + pxor m8, m8 + paddd m5, m7 + psrad m5, m6 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, pxmaxm + movd [dstq], m5 + add dstq, dsmp + dec hd + jz .ret + %if ARCH_X86_64 + add myd, dyd + %else + add myd, dym + %endif + test myd, ~0x3ff + %if ARCH_X86_32 + SWAP m3, m5 + SWAP m2, m7 + mova m3, [stk+0x20] + mova m0, [stk+0x30] + mova m2, [stk+0x40] + mova m4, [stk+0x50] + %endif + jz .w2_loop + %if ARCH_X86_32 + mov r3, r3m + %endif + movu m5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps m3, m0, q1032 ; 01 12 + shufps m0, m2, q1032 ; 23 34 + shufps m2, m4, q1032 ; 45 56 + pshufb m5, m14 + pmaddwd m5, m15 + phaddd m5, m5 + paddd m5, m11 + psrad m5, m12 + packssdw m5, m5 + palignr m4, m5, m1, 12 + punpcklqdq m1, m4, m4 ; 6 7 6 7 + punpcklwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif + jmp .w2_loop +.w2_skip_line: + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m3, m0 ; 01 12 + mova m0, m2 ; 23 34 + pshufb m5, m14 + pshufb m6, m14 + pmaddwd m5, m15 + pmaddwd m6, m15 + phaddd m5, m6 + paddd m5, m11 + psrad m5, m12 + packssdw m5, m5 ; 6 7 6 7 + punpckhqdq m1, m5 ; 4 5 6 7 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif + jmp .w2_loop +%endif +INIT_XMM ssse3 +.w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + movzx r4, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%else + %define m9 [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd rX, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, 
[base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + movifprep r3, r3m + SWAP m4, m7 + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x20], m13 + mova [stk+0x30], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + movu m1, [srcq+r4 ] + movu m3, [srcq+r6 ] + movu m2, [srcq+r11 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m13}, m7, m9, m8, m10 + REPX {pshufb x, m14}, m1, m2, m3, m4 + REPX {pmaddwd x, m15}, m1, m2, m3, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m4, [srcq+ss3q ] + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 + packssdw m8, m10 ; 2 3 + movu m0, [srcq+r4 ] + movu m9, [srcq+r6 ] + movu m10, [srcq+r11 ] + movu m11, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m1, m2, m3, m4 + REPX {pmaddwd x, m13}, m1, m2, m3, m4 + REPX {pshufb x, m14}, m0, m9, m10, m11 + REPX {pmaddwd x, m15}, m0, m9, m10, m11 + phaddd m1, m0 + phaddd m2, m9 + phaddd m3, m10 + phaddd m4, m11 + REPX {paddd x, m5}, m1, m2, m3, m4 + REPX {psrad x, xm6}, m1, m2, m3, m4 + packssdw m1, m2 ; 4 5 + packssdw m3, m4 ; 6 7 + SWAP m9, m1 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + pshufd m10, m3, q1032 ; 7 _ + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + punpcklwd m3, m10 ; 67 + mova [rsp+0x40], m7 + mova [rsp+0x50], m8 + mova [rsp+0x60], m9 +%else + mova [stk+0x00], m12 + mova [stk+0x10], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x40 ; 0 1 + MC_4TAP_SCALED_H 0x50 ; 2 3 + MC_4TAP_SCALED_H 0x60 ; 4 5 + MC_4TAP_SCALED_H 0x70 ; 6 7 + mova m4, [stk+0x40] + mova m5, [stk+0x50] + mova m6, [stk+0x60] + mova m7, [stk+0x70] + mov [stk+0xc0], r4 + shufps m1, m4, m5, q1032 ; 1 2 + shufps m2, m5, m6, q1032 ; 3 4 + shufps m3, m6, m7, q1032 ; 5 6 + pshufd m0, m7, q1032 ; 7 _ + mova [stk+0xb0], m0 + punpcklwd m0, m4, m1 ; 01 + punpckhwd m4, m1 ; 12 + punpcklwd m1, m5, m2 ; 23 + punpckhwd m5, m2 ; 34 + punpcklwd m2, m6, m3 ; 45 + punpckhwd m6, m3 ; 56 + punpcklwd m3, m7, [stk+0xb0] ; 67 + mov myd, mym + mov r0, r0m + mova [stk+0x40], m0 ; 01 + mova [stk+0x50], m1 ; 23 + mova [stk+0x60], m2 ; 45 + mova [stk+0x70], m3 ; 67 + mova [stk+0x80], m4 ; 12 + mova [stk+0x90], m5 ; 34 + mova [stk+0xa0], m6 ; 56 + %define m12 [stk+0x00] + %define m14 [stk+0x10] + %define m13 [stk+0x20] + %define m15 [stk+0x30] + %define hrnd_mem [esp+0x00] + %define 
hsh_mem [esp+0x10] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%endif +.w4_loop: + and myd, 0x3ff +%if ARCH_X86_64 + mov r11d, 64 << 24 + mov r13d, myd + shr r13d, 6 + lea r13d, [t1+r13] + cmovnz r11q, [base+subpel_filters+r13*8] + movq m9, r11q + punpcklbw m9, m9 + psraw m9, 8 + pshufd m7, m9, q0000 + pshufd m8, m9, q1111 + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pshufd m7, m9, q2222 + pshufd m9, m9, q3333 + pmaddwd m6, m2, m7 + pmaddwd m8, m3, m9 + %if isput + movd m9, [rsp+0x28] + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif + paddd m4, m5 + paddd m6, m8 + paddd m4, m6 + paddd m4, vrnd_mem +%else + mov mym, myd + mov r5, [esp+0x1f4] + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m0, m4 + pmaddwd m1, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + %if isput + movd m4, [esp+0x18] + %endif + paddd m0, m1 + paddd m2, m3 + paddd m0, vrnd_mem + paddd m0, m2 + SWAP m4, m0 + %define m9 m0 +%endif +%if isput + pxor m5, m5 + psrad m4, m9 + packssdw m4, m4 + pmaxsw m4, m5 + pminsw m4, pxmaxm + movq [dstq], m4 + add dstq, dsmp +%else + psrad m4, 6 + packssdw m4, m4 + movq [tmpq], m4 + add tmpq, 8 +%endif + dec hd + jz .ret +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + mova m8, [rsp+0x10] + movd m9, [rsp+0x20] + movu m4, [srcq] + movu m5, [srcq+r4] + test myd, 0x400 + jz .w4_skip_line + mova m0, [rsp+0x40] + mova [rsp+0x40], m1 + mova m1, [rsp+0x50] + mova [rsp+0x50], m2 + mova m2, [rsp+0x60] + mova [rsp+0x60], m3 + pshufb m4, m12 + pshufb m5, m14 + pmaddwd m4, m13 + pmaddwd m5, m15 + phaddd m4, m5 + paddd m4, m8 + psrad m4, m9 + packssdw m4, m4 + punpcklwd m3, m10, m4 + mova m10, m4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m6, [srcq+ssq*1] + movu m7, [srcq+r6] + mova m0, [rsp+0x50] + mova m11, [rsp+0x60] + pshufb m4, m12 + pshufb m6, m12 + pshufb m5, m14 + pshufb m7, m14 + pmaddwd m4, m13 + pmaddwd m6, m13 + pmaddwd m5, m15 + pmaddwd m7, m15 + mova [rsp+0x40], m0 + mova [rsp+0x50], m11 + phaddd m4, m5 + phaddd m6, m7 + paddd m4, m8 + paddd m6, m8 + psrad m4, m9 + psrad m6, m9 + packssdw m4, m6 + punpcklwd m9, m10, m4 + mova [rsp+0x60], m9 + pshufd m10, m4, q1032 + mova m0, m1 + mova m1, m2 + mova m2, m3 + punpcklwd m3, m4, m10 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +%else + SWAP m0, m4 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + jnz .w4_next_line + mova m0, [stk+0x40] + mova m1, [stk+0x50] + mova m2, [stk+0x60] + mova m3, [stk+0x70] + jmp .w4_loop +.w4_next_line: + mov r5, [stk+0xc0] + movu m4, [srcq] + movu m5, [r5] + test myd, 0x400 + jz .w4_skip_line + add [stk+0xc0], ssq + mova m0, [stk+0x80] + mova m3, [stk+0x50] + mova [stk+0x40], m0 + mova [stk+0x80], m3 + mova m1, [stk+0x90] + mova m6, [stk+0x60] + mova [stk+0x50], m1 + mova [stk+0x90], m6 + mova m2, [stk+0xa0] + mova m7, [stk+0x70] + mova [stk+0x60], m2 + mova [stk+0xa0], m7 + pshufb m4, m12 + pshufb m5, m14 + pmaddwd m4, m13 + pmaddwd m5, m15 + phaddd m4, m5 + paddd m4, hrnd_mem + psrad m4, hsh_mem + packssdw m4, m4 + punpcklwd m3, [stk+0xb0], m4 + mova [stk+0xb0], m4 + mova [stk+0x70], m3 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m6, [srcq+ssq*1] + movu m7, [r5 +ssq*1] + lea r5, [r5 +ssq*2] + mov [stk+0xc0], r5 + 
mova m0, [stk+0x50] + mova m1, [stk+0x60] + mova m2, [stk+0x70] + mova m3, [stk+0x90] + pshufb m4, m12 + pshufb m6, m12 + pshufb m5, m14 + pshufb m7, m14 + pmaddwd m4, m13 + pmaddwd m6, m13 + pmaddwd m5, m15 + pmaddwd m7, m15 + mova [stk+0x40], m0 + mova [stk+0x50], m1 + mova [stk+0x60], m2 + mova [stk+0x80], m3 + phaddd m4, m5 + phaddd m6, m7 + mova m5, [stk+0xa0] + mova m7, [stk+0xb0] + paddd m4, hrnd_mem + paddd m6, hrnd_mem + psrad m4, hsh_mem + psrad m6, hsh_mem + packssdw m4, m6 + punpcklwd m7, m4 + pshufd m6, m4, q1032 + mova [stk+0x90], m5 + mova [stk+0xa0], m7 + mova [stk+0xb0], m6 + punpcklwd m3, m4, m6 + mova [stk+0x70], m3 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +%endif +INIT_XMM ssse3 +%if ARCH_X86_64 + %define stk rsp+0x20 +%endif +.w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.w_start: +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + %define hround m11 + shr t0d, 16 + movd m15, t0d + %if isprep + mova m13, [base+pd_m524256] + %endif +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq ssm + %endif + mov r4, [esp+0x1f0] + shr r4, 16 + movd m15, r4 + mov r0, r0m + mov myd, mym +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq +%if ARCH_X86_64 && UNIX64 + mov hm, hd +%elif ARCH_X86_32 + mov r5, hm + mov [stk+0x0f4], myd + mov [stk+0x134], r5 +%endif + jmp .hloop +.hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov myd, [stk+0x0f4] + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov mym, myd + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, 
m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + mov myd, mym + mov dyd, dym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m5, m11, q0000 + pshufd m7, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m4, m5, m0 + pmaddwd m5, m5, m1 + pmaddwd m6, m7, m2 + pmaddwd m7, m7, m3 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m10 + pmaddwd m7, [stk+0xa0], m10 + pmaddwd m8, [stk+0xb0], m11 + pmaddwd m9, [stk+0xc0], m11 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, 
m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov myd, mym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.vloop: + mov r0, r0m + mov r5, [esp+0x1f4] + and myd, 0x3ff + mov mym, myd + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [stk+0x140], myd + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + jz .skip_line + mova m14, [base+unpckw] + movu m8, [srcq+r10*2] + movu m9, [srcq+r11*2] + movu m10, [srcq+r13*2] + movu m11, [srcq+ rX*2] + movu m4, [srcq+ r4*2] + movu m5, [srcq+ r6*2] + movu m6, [srcq+ r7*2] + movu m7, [srcq+ r9*2] + add srcq, ssq + mov myd, [stk+0x140] + mov dyd, dym + pshufd m15, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m15 ; 3a 2a + pshufb m3, m15 ; 3b 2b + pmaddwd m8, [stk+0x50] + pmaddwd m9, [stk+0x60] + pmaddwd m10, [stk+0x70] + pmaddwd m11, [stk+0x80] + pmaddwd m4, [stk+0x10] + pmaddwd m5, [stk+0x20] + pmaddwd m6, [stk+0x30] + pmaddwd m7, [stk+0x40] + phaddd m8, m9 + phaddd m10, m11 + mova m11, hround + phaddd m4, m5 + phaddd m6, m7 + phaddd m8, m10 + phaddd m4, m6 + paddd m4, m11 + paddd m8, m11 + psrad m4, m12 + psrad m8, m12 + packssdw m4, m8 + pshufb m5, [stk+0x90], m14 ; 4a 5a + pshufb m6, [stk+0xa0], m14 ; 4b 5b + pshufb m7, [stk+0xb0], m15 ; 7a 6a + pshufb m8, [stk+0xc0], m15 ; 7b 6b + punpckhwd m0, m2 ; 12a + 
punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + punpckhwd m5, m7 ; 56a + punpckhwd m6, m8 ; 56b + punpcklwd m7, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m8, m4 ; 78b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m8 + jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 + MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 + mov myd, [stk+0x140] + mov dyd, dym + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [stk+0x90] ; 23a + mova m3, [stk+0xa0] ; 23b + mova m5, [stk+0xb0] ; 45a + mova m6, [stk+0xc0] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m4 +%else + mov r0m, r0 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + mov mym, myd + jnz .next_line + mova m0, [stk+0x20] + mova m1, [stk+0x30] + mova m2, [stk+0x40] + mova m3, [stk+0x50] + jmp .vloop +.next_line: + test myd, 0x400 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + jz .skip_line + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + mova m7, [base+unpckw] + pshufd m4, m7, q1032 + pshufb m0, [stk+0x20], m7 ; 0a 1a + pshufb m1, [stk+0x30], m7 ; 0b 1b + pshufb m2, [stk+0x40], m4 ; 3a 2a + pshufb m3, [stk+0x50], m4 ; 3b 2b + pshufb m5, [stk+0x60], m7 ; 4a 5a + pshufb m6, [stk+0x70], m7 ; 4b 5b + pshufb m7, [stk+0x80], m4 ; 7a 6a + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + punpckhwd m5, m7 ; 56a + mova [stk+0x60], m5 + pshufb m5, [stk+0x90], m4 ; 7b 6b + punpcklwd m7, [stk+0xe0] ; 78a + punpckhwd m6, m5 ; 56b + mova [stk+0x70], m6 + movq m6, [stk+0xe8] + mova [stk+0x80], m7 + punpcklwd m5, m6 + mov myd, mym + mova [stk+0x90], m5 + jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + MC_8TAP_SCALED_H 0xa0, 0 ; 9 + mova m7, [stk+0xe0] + mova m2, [stk+0x60] ; 23a + mova m3, [stk+0x70] ; 23b + mova m4, [stk+0x80] ; 45a + mova m5, [stk+0x90] ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova m0, [stk+0x40] ; 01a + mova m1, [stk+0x50] ; 01b + mov myd, mym + mova [stk+0x40], m2 + mova [stk+0x50], m3 + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova [stk+0x20], m0 + mova [stk+0x30], m1 +%endif + jmp .vloop +INIT_XMM ssse3 +.dy1: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy1_w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + %define m11 [esp+0x00] + %define m12 [esp+0x10] + %define m13 [esp+0x20] + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 + mov r1, r1m + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 
+ %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + mov [stk+0x20], r3 + mov r3, r3m + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6 + REPX {pmaddwd x, m15}, m4, m5, m6 + phaddd m0, m1 + phaddd m2, m3 + phaddd m4, m5 + phaddd m6, m6 + REPX {paddd x, m11}, m0, m2, m4, m6 + REPX {psrad x, m12}, m0, m2, m4, m6 + packssdw m0, m2 ; 0 1 2 3 + packssdw m4, m6 ; 4 5 6 + SWAP m1, m4 + movq m10, r4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m2, m3 + movu m1, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + REPX {pshufb x, m14}, m1, m7, m6 + REPX {pmaddwd x, m15}, m1, m7, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + phaddd m1, m7 + phaddd m6, m6 + REPX {paddd x, m11}, m0, m2, m1, m6 + REPX {psrad x, m12}, m0, m2, m1, m6 + packssdw m0, m2 + packssdw m1, m6 + %define m8 m6 + %define m9 m4 + %define m10 m5 + movd m10, r4 + movd m9, [stk+0x20] + punpckldq m10, m9 + %endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + %if ARCH_X86_32 + mova [stk+0x50], m7 + mova [stk+0x60], m8 + mova [stk+0x70], m9 + mova [stk+0x80], m10 + %define m7 [stk+0x50] + %define m8 [stk+0x60] + %define m9 [stk+0x70] + %define m10 [stk+0x80] + %endif + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m4, m1, q2121 ; 5 6 5 6 + punpcklwd m2, m1, m4 ; 45 56 + %if ARCH_X86_32 + mov r0, r0m + %endif +.dy1_w2_loop: + movu m1, [srcq+ssq*0] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m3, m7 + mova m3, m0 + pmaddwd m0, m8 + pshufb m1, m14 + pshufb m6, m14 + pmaddwd m1, m15 + pmaddwd m6, m15 + phaddd m1, m6 + paddd m1, m11 + psrad m1, m12 + packssdw m1, m1 + paddd m5, m0 + mova m0, m2 + pmaddwd m2, m9 + paddd m5, m2 + palignr m2, m1, m4, 12 + punpcklwd m2, m1 ; 67 78 + pmaddwd m4, m2, m10 + paddd m5, m13 + paddd m5, m4 + pxor m6, m6 + mova m4, m1 + pshufd m1, m12, q1032 + psrad m5, m1 + packssdw m5, m5 + pmaxsw m5, m6 + pminsw m5, pxmaxm + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q1032 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy1_w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %define vrnd_mem [rsp+0x30] + %define stk rsp+0x40 + %else + %define vrnd_mem [base+pd_m524256] + %define stk rsp+0x30 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m9 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 
+ mova m9, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd rX, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + SWAP m4, m7 + %if isprep + mov r3, r3m + %endif + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x40], m13 + mova [stk+0x50], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + movu m1, [srcq+r4 ] + movu m3, [srcq+r6 ] + movu m2, [srcq+r11 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m13}, m7, m9, m8, m10 + REPX {pshufb x, m14}, m1, m3, m2, m4 + REPX {pmaddwd x, m15}, m1, m3, m2, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 + packssdw m8, m10 ; 2 3 + movu m0, [srcq+r4 ] + movu m9, [srcq+r6 ] + movu m10, [srcq+r11 ] + add srcq, ss3q + REPX {pshufb x, m12}, m1, m2, m3 + REPX {pmaddwd x, m13}, m1, m2, m3 + REPX {pshufb x, m14}, m0, m9, m10 + REPX {pmaddwd x, m15}, m0, m9, m10 + phaddd m1, m0 + phaddd m2, m9 + phaddd m3, m10 + shr myd, 6 + mov r13d, 64 << 24 + lea myd, [t1+myq] + cmovnz r13q, [base+subpel_filters+myq*8] + REPX {paddd x, m5}, m1, m2, m3 + REPX {psrad x, xm6}, m1, m2, m3 + packssdw m1, m2 ; 4 5 + packssdw m3, m3 ; 6 6 + SWAP m9, m1 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + movq m10, r13 + mova [stk+0x00], m1 + mova [stk+0x10], m8 + mova [stk+0x20], m2 + mova [stk+0x30], m9 + mova [stk+0x40], m3 + %define hrnd_mem [rsp+0x10] + %define hsh_mem [rsp+0x20] + %define vsh_mem [rsp+0x28] + %if isput + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%else + mova [stk+0x20], m12 + mova [stk+0x30], m14 + add r4, srcq + MC_4TAP_SCALED_H 
0x60 ; 0 1 + MC_4TAP_SCALED_H 0x70 ; 2 3 + MC_4TAP_SCALED_H 0x80 ; 4 5 + movu m7, [srcq] + movu m2, [r4] + add srcq, ssq + add r4, ssq + mov [stk+0xb0], r4 + pshufb m7, m12 + pshufb m2, m14 + pmaddwd m7, m13 + pmaddwd m2, m15 + phaddd m7, m2 + paddd m7, [esp+0x00] + psrad m7, [esp+0x10] + packssdw m7, m7 ; 6 6 + mova m4, [stk+0x60] + mova m5, [stk+0x70] + mova m6, [stk+0x80] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + shufps m1, m4, m5, q1032 ; 1 2 + shufps m2, m5, m6, q1032 ; 3 4 + shufps m3, m6, m7, q1032 ; 5 6 + mova [stk+0xa0], m7 + punpcklwd m0, m4, m1 ; 01 + punpckhwd m4, m1 ; 12 + punpcklwd m1, m5, m2 ; 23 + punpckhwd m5, m2 ; 34 + punpcklwd m2, m6, m3 ; 45 + punpckhwd m6, m3 ; 56 + movd m7, r4 + movd m3, r5 + mov r0, r0m + %if isput + mov r1, r1m + %endif + mov r4, [stk+0xb0] + mova [stk+0xc0], m4 ; 12 + mova [stk+0x60], m1 ; 23 + mova [stk+0x70], m2 ; 45 + mova [stk+0x80], m5 ; 34 + mova [stk+0x90], m6 ; 56 + %define m12 [stk+0x20] + %define m14 [stk+0x30] + %define m13 [stk+0x40] + %define m15 [stk+0x50] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %define vsh_mem [esp+0x18] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif + %define m10 m7 + punpckldq m10, m3 +%endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m3, m10, q0000 + pshufd m4, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 +%if ARCH_X86_32 + %xdefine m8 m3 + %xdefine m9 m6 + %xdefine m11 m5 + %xdefine m6 m4 + mova [stk+0x100], m3 + mova [stk+0x110], m4 + mova [stk+0x120], m5 + mova [stk+0x130], m10 + %define m3 [stk+0x100] + %define m4 [stk+0x110] + %define m5 [stk+0x120] + %define m10 [stk+0x130] + mova m7, [stk+0xc0] + mova m8, [stk+0x80] +%endif +.dy1_w4_loop: + movu m11, [srcq+ssq*0] + movu m6, [srcq+ssq*1] + pmaddwd m0, m3 + pmaddwd m7, m3 + pmaddwd m1, m4 + pmaddwd m8, m4 + pmaddwd m2, m5 + pmaddwd m9, m5 + paddd m1, m0 + paddd m8, m7 +%if ARCH_X86_64 + movu m0, [srcq+r4] + movu m7, [srcq+r6] +%else + movu m0, [r4+ssq*0] + movu m7, [r4+ssq*1] + lea r4, [r4+ssq*2] +%endif + lea srcq, [srcq+ssq*2] + paddd m1, m2 + paddd m8, m9 + pshufb m11, m12 + pshufb m6, m12 + pmaddwd m11, m13 + pmaddwd m6, m13 + pshufb m0, m14 + pshufb m7, m14 + pmaddwd m0, m15 + pmaddwd m7, m15 + phaddd m11, m0 + phaddd m6, m7 + paddd m11, hrnd_mem + paddd m6, hrnd_mem + psrad m11, hsh_mem + psrad m6, hsh_mem + packssdw m11, m6 ; 7 8 +%if ARCH_X86_64 + shufps m9, [stk+0x40], m11, q1032 ; 6 7 + mova m0, [stk+0x00] + mova [stk+0x40], m11 +%else + shufps m9, [stk+0xa0], m11, q1032 ; 6 7 + mova m0, [stk+0x60] + mova [stk+0xa0], m11 +%endif + punpcklwd m2, m9, m11 ; 67 + punpckhwd m9, m11 ; 78 + pmaddwd m6, m2, m10 + pmaddwd m7, m9, m10 +%if isput + movd m11, vsh_mem +%endif + paddd m1, vrnd_mem + paddd m8, vrnd_mem + paddd m1, m6 + paddd m8, m7 +%if ARCH_X86_64 + mova m7, [stk+0x10] +%else + mova m7, [stk+0x80] +%endif +%if isput + psrad m1, m11 + psrad m8, m11 +%else + psrad m1, 6 + psrad m8, 6 +%endif + packssdw m1, m8 +%if ARCH_X86_64 + mova m8, [stk+0x30] +%else + mova m8, [stk+0x90] +%endif +%if isput + pxor m6, m6 + pmaxsw m1, m6 + pminsw m1, pxmaxm + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m1 + add tmpq, 16 +%endif +%if ARCH_X86_64 + mova m1, [stk+0x20] + mova [stk+0x10], m8 + mova [stk+0x00], m1 + mova [stk+0x20], m2 + mova [stk+0x30], m9 +%else + mova m1, 
[stk+0x70] + mova [stk+0x80], m8 + mova [stk+0x60], m1 + mova [stk+0x70], m2 + mova [stk+0x90], m9 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET ; why not jz .ret? +INIT_XMM ssse3 +.dy1_w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %define hround m11 + %if isput + mova [rsp+0x30], m13 + %else + mova m13, [base+pd_m524256] + %endif + shr t0d, 16 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %xdefine m15 m3 + %if isprep + %define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + SWAP m3, m5 +%endif + punpcklbw m3, m3 + psraw m3, 8 + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 +%if ARCH_X86_64 + mova [stk+0x140], m0 + mova [stk+0x150], m1 + mova [stk+0x160], m2 + mova [stk+0x170], m3 + %if UNIX64 + mov hm, hd + %endif +%else + mova [stk+0x180], m0 + mova [stk+0x190], m1 + mova [stk+0x1a0], m2 + mova [stk+0x1b0], m3 + SWAP m5, m3 + mov r5, hm + mov [stk+0x134], r5 +%endif + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy1_hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + 
psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova m10, [stk+0x140] + mova m11, [stk+0x150] + mova m14, [stk+0x160] + mova m15, [stk+0x170] + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] + %define shift [rsp+0x20] + %if isput + %define vround [rsp+0x30] + %else + %define vround [base+pd_m524256] + %endif +.dy1_vloop: + pmaddwd m4, m0, m10 + pmaddwd m5, m1, m10 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m11 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m14 + pmaddwd m7, [stk+0xa0], m14 + pmaddwd m8, [stk+0xb0], m15 + pmaddwd m9, [stk+0xc0], m15 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, 
[stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov r0, r0m + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.dy1_vloop: + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep +%if ARCH_X86_64 + movu m8, [srcq+r10*2] + movu m9, [srcq+r11*2] + movu m12, [srcq+r13*2] + movu m13, [srcq+ rX*2] + movu m4, [srcq+ r4*2] + movu m5, [srcq+ r6*2] + movu m6, [srcq+ r7*2] + movu m7, [srcq+ r9*2] + add srcq, ssq + pmaddwd m8, [stk+0x50] + pmaddwd m9, [stk+0x60] + pmaddwd m12, [stk+0x70] + pmaddwd m13, [stk+0x80] + pmaddwd m4, [stk+0x10] + pmaddwd m5, [stk+0x20] + pmaddwd m6, [stk+0x30] + pmaddwd m7, [stk+0x40] + phaddd m8, m9 + phaddd m12, m13 + mova m9, [base+unpckw] + mova m13, hround + phaddd m4, m5 + phaddd m6, m7 + phaddd m8, m12 + phaddd m4, m6 + pshufd m5, m9, q1032 + pshufb m0, m9 ; 0a 1a + pshufb m1, m9 ; 0b 1b + pshufb m2, m5 ; 3a 2a + pshufb m3, m5 ; 3b 2b + mova m12, shift + paddd m4, m13 + paddd m8, m13 + psrad m4, m12 + psrad m8, m12 + packssdw m4, m8 + pshufb m6, [stk+0x90], m9 ; 4a 5a + pshufb m7, [stk+0xa0], m9 ; 4b 5b + pshufb m8, [stk+0xb0], m5 ; 7a 6a + pshufb m13, [stk+0xc0], m5 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m6 ; 34a + punpcklwd m3, m7 ; 34b + punpckhwd m6, m8 ; 56a + punpckhwd m7, m13 ; 56b + punpcklwd m8, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m13, m4 ; 78b + mova [stk+0x90], m6 + mova [stk+0xa0], m7 + mova [stk+0xb0], m8 + mova [stk+0xc0], m13 + mova m13, vround +%else + mov r0m, r0 + mov r3, r3m + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, 
[stk+ 8] + mov r5, [stk+12] + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + mova m7, [base+unpckw] + pshufd m4, m7, q1032 + pshufb m0, [stk+0x20], m7 ; 0a 1a + pshufb m1, [stk+0x30], m7 ; 0b 1b + pshufb m2, [stk+0x40], m4 ; 3a 2a + pshufb m3, [stk+0x50], m4 ; 3b 2b + pshufb m5, [stk+0x60], m7 ; 4a 5a + pshufb m6, [stk+0x70], m7 ; 4b 5b + pshufb m7, [stk+0x80], m4 ; 7a 6a + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + punpckhwd m5, m7 ; 56a + mova [stk+0x60], m5 + pshufb m5, [stk+0x90], m4 ; 7b 6b + punpcklwd m7, [stk+0xe0] ; 78a + mova m4, [stk+0x180] + punpckhwd m6, m5 ; 56b + mova [stk+0x70], m6 + movq m6, [stk+0xe8] + mova [stk+0x80], m7 + mova m7, [stk+0x1b0] + punpcklwd m5, m6 + mova m6, [stk+0x1a0] + mova [stk+0x90], m5 + mova m5, [stk+0x190] + mov r0, r0m +%endif + jmp .dy1_vloop +INIT_XMM ssse3 +%if ARCH_X86_64 + %define stk rsp+0x20 +%endif +.dy2: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy2_w2: + %if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m13 + %define vrnd_mem [rsp+0x10] + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + %define m11 [esp+0x00] + %define m12 [esp+0x10] + %define vrnd_mem [esp+0x20] + mov r1, r1m + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*2] + movu m2, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*1] + movu m5, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + mov [stk+0x20], r3 + mov r3, r3m + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2 + REPX {pmaddwd x, m15}, m0, m1, m2 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6 + REPX {pmaddwd x, m15}, m4, m5, m6 + phaddd m0, m1 + phaddd m1, m2 + phaddd m4, m5 + phaddd m5, m6 + REPX {paddd x, m11}, m0, m1, m4, m5 + REPX {psrad x, m12}, m0, m1, m4, m5 + packssdw m0, m1 ; 0 2 2 4 + packssdw m4, m5 ; 1 3 3 5 + SWAP m2, m4 + movq m10, r4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m1, m2 + movu m2, [srcq+ssq*1] + movu m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + REPX {pshufb x, 
m14}, m2, m7, m6 + REPX {pmaddwd x, m15}, m2, m7, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + phaddd m2, m7 + phaddd m7, m6 + REPX {paddd x, m11}, m0, m1, m2, m7 + REPX {psrad x, m12}, m0, m1, m2, m7 + packssdw m0, m1 + packssdw m2, m7 + %define m8 m6 + %define m9 m4 + %define m10 m5 + movd m10, r4 + movd m9, [stk+0x20] + punpckldq m10, m9 + %endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + %if ARCH_X86_32 + mova [stk+0x50], m7 + mova [stk+0x60], m8 + mova [stk+0x70], m9 + mova [stk+0x80], m10 + %xdefine m13 m7 + %define m7 [stk+0x50] + %define m8 [stk+0x60] + %define m9 [stk+0x70] + %define m10 [stk+0x80] + %endif + punpcklwd m1, m0, m2 ; 01 23 + punpckhwd m3, m0, m2 ; 23 45 + %if ARCH_X86_32 + mov r4, r0m + %define dstq r4 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + %endif +.dy2_w2_loop: + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m13, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd m3, m8 + REPX {pshufb x, m14}, m4, m5, m6, m13 + REPX {pmaddwd x, m15}, m4, m5, m6, m13 + phaddd m4, m5 + phaddd m6, m13 + pmaddwd m5, m1, m7 + paddd m4, m11 + paddd m6, m11 + psrad m4, m12 + psrad m6, m12 + packssdw m4, m6 ; 6 7 8 9 + paddd m5, m3 + pshufd m3, m4, q2200 + pshufd m4, m4, q3311 + palignr m3, m0, 12 ; 4 6 6 8 + palignr m4, m2, 12 ; 5 7 7 9 + mova m0, m3 + mova m2, m4 + punpcklwd m1, m3, m4 + punpckhwd m3, m4 + pmaddwd m6, m1, m9 + pmaddwd m4, m3, m10 + paddd m5, vrnd_mem + paddd m6, m4 + paddd m5, m6 + pshufd m4, m12, q1032 + pxor m6, m6 + psrad m5, m4 + packssdw m5, m5 + pmaxsw m5, m6 + pminsw m5, pxmaxm + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q1032 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy2_w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %define vrnd_mem [rsp+0x30] + %define stk rsp+0x40 + %else + %define vrnd_mem [base+pd_m524256] + %define stk rsp+0x30 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m9 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r1, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r3, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r1*8+2] + movd m2, [base+subpel_filters+r3*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + SWAP m4, m7 + mov r3, r3m + %if isprep + lea ss3q, [ssq*3] + %endif + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, 
m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x40], m13 + mova [stk+0x50], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m1, [srcq+ssq*0] + movu m8, [srcq+ssq*2] + movu m9, [srcq+ssq*1] + movu m10, [srcq+ss3q ] + movu m7, [srcq+r4 ] + movu m2, [srcq+r11 ] + movu m3, [srcq+r6 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m1, m9, m8, m10 + REPX {pmaddwd x, m13}, m1, m9, m8, m10 + REPX {pshufb x, m14}, m7, m3, m2, m4 + REPX {pmaddwd x, m15}, m7, m3, m2, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m1, m7 + phaddd m8, m2 + phaddd m9, m3 + phaddd m10, m4 + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + REPX {paddd x, m5}, m1, m9, m8, m10 + REPX {psrad x, xm6}, m1, m9, m8, m10 + packssdw m1, m8 ; 0 2 + packssdw m9, m10 ; 1 3 + movu m0, [srcq+r4 ] + movu m8, [srcq+r6 ] + lea srcq, [srcq+ssq*2] + REPX {pshufb x, m12}, m2, m3 + REPX {pmaddwd x, m13}, m2, m3 + REPX {pshufb x, m14}, m0, m8 + REPX {pmaddwd x, m15}, m0, m8 + phaddd m2, m0 + phaddd m3, m8 + shr myd, 6 + mov r9d, 64 << 24 + lea myd, [t1+myq] + cmovnz r9q, [base+subpel_filters+myq*8] + REPX {paddd x, m5}, m2, m3 + REPX {psrad x, xm6}, m2, m3 + packssdw m2, m3 ; 4 5 + pshufd m3, m2, q1032 ; 5 _ + punpcklwd m0, m1, m9 ; 01 + punpckhwd m1, m9 ; 23 + punpcklwd m2, m3 ; 45 + movq m10, r9 + %define hrnd_mem [rsp+0x10] + %define hsh_mem [rsp+0x20] + %define vsh_mem [rsp+0x28] + %if isput + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%else + mova [stk+0x20], m12 + mova [stk+0x30], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x60 ; 0 1 + MC_4TAP_SCALED_H 0x70 ; 2 3 + MC_4TAP_SCALED_H 0x80 ; 4 5 + mov [stk+0xe0], r4 + mova m3, [base+spel_s_shuf8] + mova m0, [stk+0x60] + mova m1, [stk+0x70] + mova m2, [stk+0x80] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + pshufb m0, m3 ; 01 + pshufb m1, m3 ; 23 + pshufb m2, m3 ; 45 + movd m7, r4 + movd m4, r5 + mov r5, r0m + %if isput + mov r1, r1m + %endif + mov r4, [stk+0xe0] + %define dstq r5 + %define tmpq r5 + %define m12 [stk+0x20] + %define m14 [stk+0x30] + %define m13 [stk+0x40] + %define m15 [stk+0x50] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %define vsh_mem [esp+0x18] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif + %define m10 m7 + punpckldq m10, m4 +%endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m3, m10, q0000 + pshufd m4, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 +%if ARCH_X86_32 + %xdefine m8 m3 + %xdefine m9 m6 + %xdefine m11 m5 + %xdefine m6 m4 + mova [stk+0x100], m3 + mova [stk+0x110], m4 + mova [stk+0x120], m5 + mova [stk+0x130], m10 + %define m3 [stk+0x100] + %define m4 [stk+0x110] + %define m5 [stk+0x120] + %define m10 [stk+0x130] +%endif +.dy2_w4_loop: + pmaddwd m8, m0, m3 + pmaddwd m9, m1, m3 + mova m0, m2 + pmaddwd m1, m4 + pmaddwd m11, m2, m4 + paddd m8, vrnd_mem + paddd m9, vrnd_mem + pmaddwd m2, m5 + paddd m8, m1 + 
paddd m9, m11 + paddd m8, m2 + movu m6, [srcq+ssq*0] + movu m1, [srcq+ssq*2] +%if ARCH_X86_64 + movu m11, [srcq+r4 ] + movu m2, [srcq+r11] +%else + movu m11, [r4+ssq*0] + movu m2, [r4+ssq*2] +%endif + pshufb m6, m12 + pshufb m1, m12 + pmaddwd m6, m13 + pmaddwd m1, m13 + pshufb m11, m14 + pshufb m2, m14 + pmaddwd m11, m15 + pmaddwd m2, m15 + phaddd m6, m11 + phaddd m1, m2 + paddd m6, hrnd_mem + paddd m1, hrnd_mem + psrad m6, hsh_mem + psrad m1, hsh_mem + movu m7, [srcq+ssq*1] + movu m11, [srcq+ss3q ] + packssdw m6, m1 ; 6 8 +%if ARCH_X86_64 + movu m2, [srcq+r6 ] + movu m1, [srcq+r13] +%else + movu m2, [r4+ssq*1] + movu m1, [r4+ss3q ] +%endif + pshufb m7, m12 + pshufb m11, m12 + pmaddwd m7, m13 + pmaddwd m11, m13 + pshufb m2, m14 + pshufb m1, m14 + pmaddwd m2, m15 + pmaddwd m1, m15 + phaddd m7, m2 + phaddd m11, m1 + paddd m7, hrnd_mem + paddd m11, hrnd_mem + psrad m7, hsh_mem + psrad m11, hsh_mem + packssdw m7, m11 ; 7 9 +%if ARCH_X86_32 + lea r4, [r4+ssq*4] +%endif + lea srcq, [srcq+ssq*4] + punpcklwd m1, m6, m7 ; 67 + punpckhwd m6, m7 ; 89 + mova m2, m6 + pmaddwd m11, m1, m5 + pmaddwd m7, m1, m10 + pmaddwd m6, m10 + paddd m9, m11 +%if isput + movd m11, vsh_mem +%endif + paddd m8, m7 + paddd m9, m6 +%if isput + psrad m8, m11 + psrad m9, m11 + packssdw m8, m9 + pxor m7, m7 + pmaxsw m8, m7 + pminsw m8, pxmaxm + movq [dstq+dsq*0], m8 + movhps [dstq+dsq*1], m8 + lea dstq, [dstq+dsq*2] +%else + psrad m8, 6 + psrad m9, 6 + packssdw m8, m9 + mova [tmpq], m8 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET ; why not jz .ret? +INIT_XMM ssse3 +.dy2_w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %define hround m11 + %if isput + mova [rsp+0x30], m13 + %else + mova m13, [base+pd_m524256] + %endif + shr t0d, 16 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %xdefine m15 m3 + %if isput + %define dstq r0 + %else + %define tmpq r0 + %define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + SWAP m3, m5 +%endif + punpcklbw m3, m3 + psraw m3, 8 + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 +%if ARCH_X86_64 + mova [stk+0x140], m0 + mova [stk+0x150], m1 + mova [stk+0x160], m2 + mova [stk+0x170], m3 + %if UNIX64 + mov hm, hd + %endif +%else + mova [stk+0x180], m0 + mova [stk+0x190], m1 + mova [stk+0x1a0], m2 + mova [stk+0x1b0], 
m3 + SWAP m5, m3 + mov r5, hm + mov [stk+0x134], r5 +%endif + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy2_hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, 
m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova m10, [stk+0x140] + mova m11, [stk+0x150] + mova m14, [stk+0x160] + mova m15, [stk+0x170] + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] + %define shift [rsp+0x20] + %if isput + %define vround [rsp+0x30] + %else + %define vround [base+pd_m524256] + %endif +.dy2_vloop: + pmaddwd m4, m0, m10 + pmaddwd m5, m1, m10 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m11 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m14 + pmaddwd m7, [stk+0xa0], m14 + pmaddwd m8, [stk+0xb0], m15 + pmaddwd m9, [stk+0xc0], m15 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov r0, r0m + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.dy2_vloop: + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem 
+%endif + dec hd + jz .dy2_hloop_prep +%if ARCH_X86_64 + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 + mova [stk+0xd0], m4 + MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 + mova m4, [stk+0xd0] + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [stk+0x90] ; 23a + mova m3, [stk+0xa0] ; 23b + mova m5, [stk+0xb0] ; 45a + mova m6, [stk+0xc0] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m4 +%else + mov r0m, r0 + mov r3, r3m + MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 + MC_8TAP_SCALED_H 0xa0, 0 ; 9 + mova m7, [stk+0xe0] + mova m2, [stk+0x60] ; 23a + mova m3, [stk+0x70] ; 23b + mova m4, [stk+0x80] ; 45a + mova m5, [stk+0x90] ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova m0, [stk+0x40] ; 01a + mova m1, [stk+0x50] ; 01b + mova [stk+0x40], m2 + mova [stk+0x50], m3 + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mov r0, r0m +%endif + jmp .dy2_vloop +INIT_XMM ssse3 +.ret: + MC_8TAP_SCALED_RET 0 +%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT + %define r0m [rstk+stack_offset+ 4] + %define r1m [rstk+stack_offset+ 8] + %define r2m [rstk+stack_offset+12] + %define r3m [rstk+stack_offset+16] +%endif +%undef isput +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled_16bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 + jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 8 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN put +FN put_8tap_scaled, sharp, SHARP, SHARP +FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN put_8tap_scaled, smooth, SMOOTH, SMOOTH +FN put_8tap_scaled, sharp_regular, SHARP, REGULAR +FN put_8tap_scaled, regular_sharp, REGULAR, SHARP +FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN put_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 7 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN prep +FN prep_8tap_scaled, sharp, SHARP, SHARP +FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH +FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR +FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP +FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN prep_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + +%if ARCH_X86_64 +DECLARE_REG_TMP 6 +%else +DECLARE_REG_TMP 2 +%endif + %if ARCH_X86_64 ; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that ; by allocating 16 bytes more stack space so that stack offsets match up. 
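The .dy2 code above specializes the scaled 8-tap path for a vertical step of exactly two source rows per output row: each pass through .dy2_vloop consumes four freshly H-filtered rows and emits two output rows, clamped pixels for put and 16-bit intermediates for prep. Below is a minimal scalar C sketch of that vertical stage for the put case; it is only meant to show the data flow, and everything in it (vfilter_dy2, mid, MID_STRIDE, the shift of 6 and the 10-bit pixel_max) is an illustrative assumption, not dav1d's actual reference code or rounding.

/* Scalar sketch of the ".dy2" vertical stage: an 8-tap filter applied down a
 * column of horizontally pre-filtered rows ("mid"), advancing by two source
 * rows per output row. All names and constants here are assumptions chosen
 * for illustration; they are not taken from dav1d's sources. */
#include <stdint.h>
#include <stdio.h>

#define MID_STRIDE 8   /* width of the toy intermediate buffer */

static int clamp_pixel(int v, int pixel_max)
{
    return v < 0 ? 0 : v > pixel_max ? pixel_max : v;
}

/* h output rows, vertical source step of 2 (the dy2 case). */
static void vfilter_dy2(uint16_t *dst, int dst_stride,
                        const int16_t *mid, const int8_t filter[8],
                        int w, int h, int shift, int pixel_max)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = 1 << (shift - 1);          /* round to nearest */
            for (int k = 0; k < 8; k++)
                sum += filter[k] * mid[(2 * y + k) * MID_STRIDE + x];
            dst[y * dst_stride + x] = (uint16_t)clamp_pixel(sum >> shift, pixel_max);
        }
    }
}

int main(void)
{
    /* 8-tap filter with all weight on one tap, scaled by 64. */
    static const int8_t filter[8] = { 0, 0, 0, 64, 0, 0, 0, 0 };
    int16_t mid[(2 * 4 + 7) * MID_STRIDE];
    uint16_t dst[4 * MID_STRIDE];

    for (int i = 0; i < (int)(sizeof(mid) / sizeof(mid[0])); i++)
        mid[i] = (int16_t)(i & 1023);
    vfilter_dy2(dst, MID_STRIDE, mid, filter, 4, 4, 6, 1023);
    printf("dst[0]=%d dst[1]=%d\n", (int)dst[0], (int)dst[1]);
    return 0;
}

With an all-zero filter except a single tap of 64, the routine degenerates to a rounded copy of every second intermediate row, which is a convenient way to sanity-check the indexing.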
@@ -4589,7 +8540,7 @@ pshufd m5, m5, q0000 mova [rsp+16*3*ARCH_X86_32], m4 %if ARCH_X86_64 - DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ %else diff -Nru dav1d-0.9.2/src/x86/mc_avx2.asm dav1d-1.0.0/src/x86/mc_avx2.asm --- dav1d-0.9.2/src/x86/mc_avx2.asm 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/mc_avx2.asm 2022-03-18 14:31:56.030356000 +0000 @@ -1457,7 +1457,6 @@ %endif %define PUT_8TAP_FN FN put_8tap, - PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP @@ -2123,7 +2122,6 @@ %endif %define PREP_8TAP_FN FN prep_8tap, - PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP @@ -2816,7 +2814,7 @@ dec srcq movd xm15, t0d punpckldq m8, m9, m8 - paddd m14, m8 ; mx+dx*[0-1] + paddd m14, m8 ; mx+dx*[0,1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 @@ -2867,8 +2865,7 @@ lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q - punpcklbw xm11, xm11 - psraw xm11, 8 + pmovsxbw xm11, xm11 pshufd xm8, xm11, q0000 pshufd xm9, xm11, q1111 pshufd xm10, xm11, q2222 @@ -2996,8 +2993,7 @@ lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm10, r6q - punpcklbw xm10, xm10 - psraw xm10, 8 + pmovsxbw xm10, xm10 pshufd xm7, xm10, q0000 pshufd xm8, xm10, q1111 pshufd xm9, xm10, q2222 @@ -3171,9 +3167,8 @@ lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 + punpcklqdq xm11, xm11 + pmovsxbw m11, xm11 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pmaddwd m4, m0, m8 @@ -3319,8 +3314,7 @@ vpbroadcastq m2, [srcq+ssq*1] add srcq, ss3q movq xm10, r4q - punpcklbw xm10, xm10 - psraw xm10, 8 + pmovsxbw xm10, xm10 vpblendd m15, m7, 0xaa pblendvb m15, m11, m8 pshufd xm8, xm10, q0000 @@ -3416,9 +3410,8 @@ punpcklqdq m15, m15 pblendvb m15, m11, m8 movq xm10, r4q - punpcklbw xm10, xm10 - psraw xm10, 8 - vinserti128 m10, xm10, 1 + punpcklqdq xm10, xm10 + pmovsxbw m10, xm10 pshufb m2, m14 pshufb m3, m14 pshufb m4, m14 @@ -3525,8 +3518,7 @@ vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q - punpcklbw xm0, xm0 - psraw xm0, 8 + pmovsxbw xm0, xm0 mova [rsp+96], xm0 jmp .dy1_hloop .dy1_hloop_prep: @@ -3694,8 +3686,7 @@ pmaddubsw m0, m15 pmaddubsw m1, m15 movq xm11, r4q - punpcklbw xm11, xm11 - psraw xm11, 8 + pmovsxbw xm11, xm11 phaddw m0, m1 pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 pshufd xm8, xm11, q0000 @@ -3791,9 +3782,8 @@ pmaddubsw xm1, xm15 pmaddubsw m3, m15 movq xm11, r4q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 + punpcklqdq xm11, xm11 + pmovsxbw m11, xm11 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 @@ -3888,8 +3878,7 @@ vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q - punpcklbw xm0, xm0 - psraw xm0, 8 + pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: @@ -4515,11 +4504,12 @@ %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm - movifnidn hd, hm movifnidn maskq, maskmp + movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m4, [base+pb_64] vpbroadcastd m5, [base+pw_512] + sub tmpq, maskq add wq, r6 lea r6, [dsq*3] jmp wq @@ -4532,9 +4522,8 @@ psubb xm3, xm4, xm6 punpcklbw xm2, xm3, xm6 punpckhbw xm3, xm6 - mova xm6, [tmpq] + mova xm6, [maskq+tmpq] add maskq, 4*4 - add tmpq, 4*4 punpcklbw xm0, xm6 punpckhbw xm1, xm6 pmaddubsw xm0, xm2 
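In the mc_avx2.asm hunks above, the filter-coefficient expansion is simplified: the old punpcklbw x, x followed by psraw x, 8, which sign-extends the packed 8-bit subpel coefficients to 16-bit words, is replaced by a single pmovsxbw, and where a full-width ymm copy is needed the vinserti128 broadcast is folded into a punpcklqdq before the widening move. The following standalone C snippet (not dav1d code; the coefficient values are arbitrary) checks that the two byte-to-word expansions really are equivalent for the low eight bytes:

/* Standalone check (not from dav1d): sign-extending the low 8 bytes of an
 * XMM register to 8 words via punpcklbw+psraw is equivalent to pmovsxbw. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <smmintrin.h>  /* SSE4.1 for _mm_cvtepi8_epi16 (pmovsxbw) */

int main(void)
{
    const int8_t coef[16] = { -2, 10, -18, 60, 47, -12, 6, -1,
                               0,  0,   0,  0,  0,   0, 0,  0 };
    __m128i v = _mm_loadu_si128((const __m128i *)coef);

    /* Old sequence: duplicate each byte into both halves of a word, then
     * arithmetic-shift right by 8, which replicates the sign bit. */
    __m128i a = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);

    /* New single instruction: pmovsxbw. */
    __m128i b = _mm_cvtepi8_epi16(v);

    int16_t ra[8], rb[8];
    _mm_storeu_si128((__m128i *)ra, a);
    _mm_storeu_si128((__m128i *)rb, b);
    printf("equal: %d\n", !memcmp(ra, rb, sizeof(ra)));
    return 0;
}

The nearby blend changes follow a similar one-instruction-saved theme: after sub tmpq, maskq, the loops address the second source as [maskq+tmpq] and advance only maskq, dropping the separate add tmpq, ... from every iteration.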
@@ -4557,9 +4546,8 @@ vpbroadcastq m2, [dstq+dsq*2] vpbroadcastq m3, [dstq+r6 ] mova m0, [maskq] - mova m6, [tmpq] + mova m6, [maskq+tmpq] add maskq, 8*4 - add tmpq, 8*4 vpblendd m1, m2, 0x30 vpblendd m1, m3, 0xc0 psubb m3, m4, m0 @@ -4589,9 +4577,8 @@ psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 - mova m6, [tmpq] + mova m6, [maskq+tmpq] add maskq, 16*2 - add tmpq, 16*2 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 @@ -4609,9 +4596,8 @@ .w32: mova m0, [maskq] mova m1, [dstq] - mova m6, [tmpq] + mova m6, [maskq+tmpq] add maskq, 32 - add tmpq, 32 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 @@ -4675,21 +4661,21 @@ RET ALIGN function_align .w8: - vbroadcasti128 m4, [maskq+8*2] + mova xm3, [maskq+8*2] .w8_loop: - vpbroadcastq m2, [dstq+dsq*0] - movq xm0, [dstq+dsq*1] - vpblendd m0, m2, 0x30 - movq xm1, [tmpq+8*1] - vinserti128 m1, [tmpq+8*0], 1 + movq xm0, [dstq+dsq*0] + vpbroadcastq xm1, [dstq+dsq*1] + mova xm2, [tmpq] add tmpq, 8*2 - punpcklbw m0, m1 - pmaddubsw m0, m4 - pmulhrsw m0, m5 - vextracti128 xm1, m0, 1 + punpcklbw xm0, xm2 + punpckhbw xm1, xm2 + pmaddubsw xm0, xm3 + pmaddubsw xm1, xm3 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 packuswb xm0, xm1 - movhps [dstq+dsq*0], xm0 - movq [dstq+dsq*1], xm0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop @@ -5060,11 +5046,11 @@ vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm - DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ - vpbroadcastd m3, [base+pw_m256] + vpbroadcastd xm3, [base+pw_m256] vpbroadcastd m7, [base+pd_63] vbroadcasti128 m15, [base+pb_8x0_8x8] pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] diff -Nru dav1d-0.9.2/src/x86/mc_avx512.asm dav1d-1.0.0/src/x86/mc_avx512.asm --- dav1d-0.9.2/src/x86/mc_avx512.asm 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/mc_avx512.asm 2022-03-18 14:31:56.034356000 +0000 @@ -26,10 +26,44 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -%if HAVE_AVX512ICL && ARCH_X86_64 +%if ARCH_X86_64 SECTION_RODATA 64 +obmc_masks: +pw_512: times 2 dw 512 + ; 2 + db 45, 19, 64, 0 + ; 4 + db 39, 25, 50, 14, 59, 5, 64, 0 + ; 8 + db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 + ; 16 + db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 + db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 + ; 32 + db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 + db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 + db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 + db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 + +warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20 + db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22 + db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24 + db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26 +warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24 + db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26 + db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28 + db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30 +warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 +warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 +pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 +warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43 +pd_16384: dd 16384 +pd_262144: dd 262144 +warp_8x8_end: db 0, 4, 16, 20, 32, 
36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54 +warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59 + db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63 bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 @@ -100,40 +134,98 @@ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 +spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 + db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 +spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 + db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 + db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 + db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47 spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 +spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13 + db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 +spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 + db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 + db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 + db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 +spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55 + db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63 + db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71 + db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79 +spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 + db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 + db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 + db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 +spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 + db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 +spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 + db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 +spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 + db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 + db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52 + db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54 +spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40 + db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42 + db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48 + db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50 +spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 + db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 + db 8, 9, 10, 
11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 + db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18 +spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12 + db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14 + db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20 + db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22 +spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 +subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 +resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +resize_permC: dd 0, 4, 8, 12 pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 wm_420_perm64: dq 0xfedcba9876543210 wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 -pb_127: times 4 db 127 -pw_m128 times 2 dw -128 -pw_512: times 2 dw 512 -pw_1024: times 2 dw 1024 -pw_2048: times 2 dw 2048 -pw_6903: times 2 dw 6903 -pw_8192: times 2 dw 8192 -pd_2: dd 2 -pd_32: dd 32 -pd_32768: dd 32768 +pb_8x0_8x8: times 8 db 0 + times 8 db 8 +pb_127: times 4 db 127 +pw_m128 times 2 dw -128 +pw_m256: times 2 dw -256 +pw_1024: times 2 dw 1024 +pw_2048: times 2 dw 2048 +pw_6903: times 2 dw 6903 +pw_8192: times 2 dw 8192 +pd_32: dd 32 +pd_34: dd 34 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 %define pb_m64 (wm_sign+4) %define pb_64 (wm_sign+8) +%define pd_2 (pd_0to7+8) cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) +cextern mc_warp_filter +cextern resize_filter %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) @@ -188,12 +280,16 @@ %endrep %endmacro +%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX +BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 @@ -201,26 +297,608 @@ BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 SECTION .text -%macro REPX 2-* - %xdefine %%f(x) 
%1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - %macro WRAP_YMM 1+ INIT_YMM cpuname %1 INIT_ZMM cpuname %endmacro +INIT_ZMM avx512icl +cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + lea r7, [put_avx512icl] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + movzx r6d, word [srcq+ssq*0] + movzx r7d, word [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6w + mov [dstq+dsq*1], r7w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu xmm0, [srcq+ssq*0] + movu xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], xmm0 + mova [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu ym0, [srcq+ssq*0] + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], ym0 + mova [dstq+dsq*1], ym1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+64*0], m0 + mova [dstq+dsq*0+64*1], m1 + mova [dstq+dsq*1+64*0], m2 + mova [dstq+dsq*1+64*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w128 + RET +.h: + ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 0xff01 + vbroadcasti128 m4, [bilin_h_shuf8] + add mxyd, 16 << 8 + vpbroadcastw m5, mxyd + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .hv + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + vpbroadcastd m3, [pw_2048] + add wq, r7 + jmp wq +.h_w2: + movd xmm0, [srcq+ssq*0] + pinsrd xmm0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb xmm0, xm4 + pmaddubsw xmm0, xm5 + pmulhrsw xmm0, xm3 + packuswb xmm0, xmm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + mova xmm4, [bilin_h_shuf4] +.h_w4_loop: + movq xmm0, [srcq+ssq*0] + movhps xmm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xm5 + pmulhrsw xmm0, xm3 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb ym0, ym4 + pmaddubsw ym0, ym5 + pmulhrsw ym0, ym3 + vpmovuswb xmm0, ym0 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mova m4, [bilin_h_perm16] +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + vpermb m0, m4, m0 + pmaddubsw m0, m5 + pmulhrsw m0, m3 + vpmovuswb ym0, m0 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + movu ym0, 
[srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + add srcq, ssq + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + movu m0, [srcq+8*0] + movu m2, [srcq+8*1] + movu m1, [srcq+8*8] + movu m6, [srcq+8*9] + add srcq, ssq + REPX {pshufb x, m4}, m0, m2, m1, m6 + REPX {pmaddubsw x, m5}, m0, m2, m1, m6 + REPX {pmulhrsw x, m3}, m0, m2, m1, m6 + packuswb m0, m2 + packuswb m1, m6 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 0xff01 + vpbroadcastd m5, [pw_2048] + add mxyd, 16 << 8 + add wq, r7 + vpbroadcastw m4, mxyd + jmp wq +.v_w2: + movd xmm0, [srcq+ssq*0] +.v_w2_loop: + pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1 + lea srcq, [srcq+ssq*2] + pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 + pshuflw xmm1, xmm1, q2301 ; 1 0 + punpcklbw xmm1, xmm0, xmm1 + pmaddubsw xmm1, xm4 + pmulhrsw xmm1, xm5 + packuswb xmm1, xmm1 + pextrw [dstq+dsq*0], xmm1, 1 + pextrw [dstq+dsq*1], xmm1, 0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xmm0, [srcq+ssq*0] +.v_w4_loop: + vpbroadcastd xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm1, xmm0, 0x02 ; 1 2 + punpcklbw xmm1, xmm2 + pmaddubsw xmm1, xm4 + pmulhrsw xmm1, xm5 + packuswb xmm1, xmm1 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xmm0, [srcq+ssq*0] +.v_w8_loop: + movq xmm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw xmm1, xmm3, xmm0 + movq xmm0, [srcq+ssq*0] + punpcklbw xmm2, xmm0, xmm3 + pmaddubsw xmm1, xm4 + pmaddubsw xmm2, xm4 + pmulhrsw xmm1, xm5 + pmulhrsw xmm2, xm5 + packuswb xmm1, xmm2 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu xmm0, [srcq+ssq*0] +.v_w16_loop: + vbroadcasti128 ymm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1 + vbroadcasti128 ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2 + punpcklbw ymm1, ymm2, ymm3 + punpckhbw ymm2, ymm3 + pmaddubsw ymm1, ym4 + pmaddubsw ymm2, ym4 + pmulhrsw ymm1, ym5 + pmulhrsw ymm2, ym5 + packuswb ymm1, ymm2 + mova [dstq+dsq*0], xmm1 + vextracti128 [dstq+dsq*1], ymm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + vzeroupper + RET +.v_w32: + movu ym0, [srcq+ssq*0] + kxnorb k1, k1, k1 +.v_w32_loop: + vbroadcasti32x8 m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendmd m3{k1}, m2, m0 ; 0 1 + vbroadcasti32x8 m0, [srcq+ssq*0] + vpblendmd m2{k1}, m0, m2 ; 1 2 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + mova [dstq+dsq*0], ym1 + vextracti32x8 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w64: + movu m0, [srcq+ssq*0] +.v_w64_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m1, m3, m0 + punpckhbw m6, m3, m0 + 
movu m0, [srcq+ssq*0] + pmaddubsw m1, m4 + pmaddubsw m6, m4 + punpcklbw m2, m0, m3 + punpckhbw m7, m0, m3 + pmaddubsw m2, m4 + pmaddubsw m7, m4 + REPX {pmulhrsw x, m5}, m1, m6, m2, m7 + packuswb m1, m6 + packuswb m2, m7 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] +.v_w128_loop: + add srcq, ssq + movu m2, [srcq+64*0] + movu m3, [srcq+64*1] + punpcklbw m6, m2, m0 + pmaddubsw m6, m4 + punpckhbw m0, m2, m0 + pmaddubsw m0, m4 + punpcklbw m7, m3, m1 + pmaddubsw m7, m4 + punpckhbw m1, m3, m1 + pmaddubsw m1, m4 + REPX {pmulhrsw x, m5}, m6, m0, m7, m1 + packuswb m6, m0 + mova m0, m2 + packuswb m7, m1 + mova m1, m3 + mova [dstq+64*0], m6 + mova [dstq+64*1], m7 + add dstq, dsq + dec hd + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + vpbroadcastd m7, [pw_2048] + add wq, r7 + vpbroadcastw m6, mxyd + jmp wq +.hv_w2: + vpbroadcastd xmm0, [srcq+ssq*0] + pshufb xmm0, xm4 + pmaddubsw xmm0, xm5 +.hv_w2_loop: + movd xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pinsrd xmm1, [srcq+ssq*0], 1 + pshufb xmm1, xm4 + pmaddubsw xmm1, xm5 ; 1 _ 2 _ + shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _ + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm6 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm7 + packuswb xmm1, xmm1 + pextrw [dstq+dsq*0], xmm1, 0 + pextrw [dstq+dsq*1], xmm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova xmm4, [bilin_h_shuf4] + movddup xmm0, [srcq+ssq*0] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xm5 +.hv_w4_loop: + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm1, [srcq+ssq*0] + pshufb xmm1, xmm4 + pmaddubsw xmm1, xm5 ; 1 2 + shufps xmm2, xmm0, xmm1, q1032 ; 0 1 + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm6 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm7 + packuswb xmm1, xmm1 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti128 ym0, [srcq+ssq*0] + pshufb ym0, ym4 + pmaddubsw ym0, ym5 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 ym1, [srcq+ssq*0], 1 + pshufb ym1, ym4 + pmaddubsw ym1, ym5 ; 1 2 + valignq ym2, ym1, ym0, 2 + mova ym0, ym1 + psubw ym1, ym2 + paddw ym1, ym1 + pmulhw ym1, ym6 + paddw ym1, ym2 + pmulhrsw ym1, ym7 + vpmovuswb xmm1, ym1 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + vbroadcasti32x8 m0, [srcq+ssq*0] + mova m4, [bilin_h_perm16] + vpermb m0, m4, m0 + pmaddubsw m0, m5 +.hv_w16_loop: + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m1, [srcq+ssq*0], 1 + vpermb m1, m4, m1 + pmaddubsw m1, m5 ; 1 2 + valignq m2, m1, m0, 4 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + vpmovuswb ym1, m1 + mova [dstq+dsq*0], xm1 + vextracti32x4 [dstq+dsq*1], ym1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+ssq*0] + pmovzxbq m8, [pb_02461357] + pmaddubsw m0, m5 +.hv_w32_loop: + vpermb m2, m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpermb m3, m4, [srcq+ssq*0] + pmaddubsw m2, m5 + psubw m1, m2, m0 + paddw m1, m1 + pmulhw 
m1, m6 + paddw m1, m0 + pmaddubsw m0, m3, m5 + psubw m3, m0, m2 + paddw m3, m3 + pmulhw m3, m6 + paddw m3, m2 + pmulhrsw m1, m7 + pmulhrsw m3, m7 + packuswb m1, m3 + vpermq m1, m8, m1 + mova [dstq+dsq*0], ym1 + vextracti32x8 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w32_loop + RET +.hv_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w64_loop: + add srcq, ssq + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m8, m2, m0 + psubw m9, m3, m1 + paddw m8, m8 + pmulhw m8, m6 + paddw m9, m9 + pmulhw m9, m6 + paddw m8, m0 + pmulhrsw m8, m7 + paddw m9, m1 + pmulhrsw m9, m7 + mova m0, m2 + mova m1, m3 + packuswb m8, m9 + mova [dstq], m8 + add dstq, dsq + dec hd + jg .hv_w64_loop + RET +.hv_w128: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + movu m2, [srcq+8*8] + movu m3, [srcq+8*9] + REPX {pshufb x, m4}, m0, m1, m2, m3 + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 +.hv_w128_loop: + add srcq, ssq + movu m8, [srcq+8*0] + movu m9, [srcq+8*1] + movu m10, [srcq+8*8] + movu m11, [srcq+8*9] + REPX {pshufb x, m4}, m8, m9, m10, m11 + REPX {pmaddubsw x, m5}, m8, m9, m10, m11 + psubw m12, m8, m0 + psubw m13, m9, m1 + psubw m14, m10, m2 + psubw m15, m11, m3 + paddw m12, m12 + pmulhw m12, m6 + paddw m13, m13 + pmulhw m13, m6 + paddw m14, m14 + pmulhw m14, m6 + paddw m15, m15 + pmulhw m15, m6 + paddw m12, m0 + pmulhrsw m12, m7 + paddw m13, m1 + pmulhrsw m13, m7 + paddw m14, m2 + pmulhrsw m14, m7 + paddw m15, m3 + pmulhrsw m15, m7 + mova m0, m8 + mova m1, m9 + mova m2, m10 + mova m3, m11 + packuswb m12, m13 + packuswb m14, m15 + mova [dstq+64*0], m12 + mova [dstq+64*1], m14 + add dstq, dsq + dec hd + jg .hv_w128_loop + RET + DECLARE_REG_TMP 3, 5, 6 -INIT_ZMM avx512icl cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea t2, [prep_avx512icl] @@ -784,6 +1462,871 @@ %endif %endmacro +%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb +%if %5 + vpermb m%2, m6, m%1 + vpermb m%3, m7, m%1 + vpermb m%4, m8, m%1 +%else +%if %2 < %4 ; reuse a previous value if possible + pshufb m%2, m%1, m6 +%endif + pshufb m%3, m%1, m7 + pshufb m%4, m%1, m8 +%endif + mova m%1, m5 + vpdpbusd m%1, m%2, m9 + mova m%2, m5 + vpdpbusd m%2, m%3, m9 + vpdpbusd m%1, m%3, m10 + vpdpbusd m%2, m%4, m10 + packusdw m%1, m%2 + psrlw m%1, 6 +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +%define PUT_8TAP_FN FN put_8tap, + +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +%define base r8-put_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 + lea r6, [ssq*3] + lea r7, [dsq*3] +%if WIN64 + pop r8 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) + WIN64_SPILL_XMM 11 + cmp wd, 4 + jl .h_w2 + vbroadcasti128 m6, 
[subpel_h_shufA] + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m7, [subpel_h_shufB] + vbroadcasti128 m8, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [base+mxq*8+subpel_filters+0] + vpbroadcastd m10, [base+mxq*8+subpel_filters+4] + add wq, r8 + jmp wq +.h_w2: + movzx mxd, mxb + dec srcq + mova xmm4, [subpel_h_shuf4] + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w2_loop: + movq xmm0, [srcq+ssq*0] + movhps xmm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xmm4 + mova xmm1, xm5 + vpdpbusd xmm1, xmm0, xmm3 + packssdw xmm0, xmm1, xmm1 + psraw xmm0, 6 + packuswb xmm0, xm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w4_loop: + movq xmm0, [srcq+ssq*0] + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xm6 + pshufb xmm1, xm6 + mova xmm2, xm5 + vpdpbusd xmm2, xmm0, xmm3 + mova xmm0, xm5 + vpdpbusd xmm0, xmm1, xmm3 + packssdw xmm0, xmm2, xmm0 + psraw xmm0, 6 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xmm0, [srcq+ssq*0] + vinserti32x4 ym0, ymm0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 + vpmovuswb xmm0, ym0 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mova m6, [spel_h_perm16a] + mova m7, [spel_h_perm16b] + mova m8, [spel_h_perm16c] +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3, 1 + vpmovuswb ym0, m0 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + movu ym0, [srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + movu m0, [srcq+8*0] + movu m2, [srcq+8*1] + movu m1, [srcq+8*8] + movu m3, [srcq+8*9] + add srcq, ssq + PUT_8TAP_H 0, 4, 11, 12 + PUT_8TAP_H 2, 12, 11, 4 + PUT_8TAP_H 1, 4, 11, 12 + PUT_8TAP_H 3, 12, 11, 4 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] + vpbroadcastd m7, [pw_512] + lea myq, [base+subpel_filters+myq*8] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + add r6, r8 + lea ss3q, [ssq*3] + sub srcq, ss3q + jmp r6 +.v_w2: + movd xmm2, [srcq+ssq*0] + pinsrw xmm2, [srcq+ssq*1], 2 + pinsrw xmm2, [srcq+ssq*2], 4 + add srcq, ss3q + pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw 
xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w2_loop: + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 + add srcq, ss3q + pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w4_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + movd [dstq+dsq*0], xmm5 + pextrd [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm5, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklbw ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm5, 0x30 + vpblendd ymm5, ymm3, 0x30 + punpcklbw ymm2, ymm5 ; 23 34 + vpblendd ymm3, ymm4, 0x30 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 45 56 +.v_w8_loop: + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw ymm5, ymm1, ym8 ; a0 b0 + mova ymm1, ymm2 + pmaddubsw ymm2, ym9 ; a1 b1 + paddw ymm5, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym10 ; a2 b2 + paddw ymm5, ymm3 + vpblendd ymm3, ymm0, ymm4, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm4, ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 67 78 + pmaddubsw ymm4, ymm3, ym11 ; a3 b3 + paddw ymm5, ymm4 + pmulhrsw ymm5, ym7 + vextracti128 xmm4, ymm5, 1 + packuswb xmm5, xmm4 + movq [dstq+dsq*0], xmm5 + movhps [dstq+dsq*1], xmm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + vzeroupper + RET +.v_w16: + mova m12, [spel_v_perm16] + vbroadcasti32x4 m1, [srcq+ssq*0] + vbroadcasti32x4 ym4, [srcq+ssq*1] + mov r6d, 0x0f + vbroadcasti32x4 m2, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 ym5, [srcq+ssq*0] + kmovb k1, r6d + vbroadcasti32x4 m3, [srcq+ssq*1] + vbroadcasti32x4 ym6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 +.v_w16_loop: + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m2 + pmaddubsw m5, m2, 
m9 ; a1 b1 + mova m2, m3 + pmaddubsw m6, m3, m10 ; a2 b2 + mova m3, m0 + paddw m4, m5 + vbroadcasti32x4 ym5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m3{k1}, m5, m0, 0xcc + vpermb m3, m12, m3 ; 67 78 + pmaddubsw m5, m3, m11 ; a3 b3 + paddw m4, m6 + paddw m4, m5 + pmulhrsw m4, m7 + vextracti32x8 ym5, m4, 1 + packuswb ym4, ym5 + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + mova m12, [spel_v_perm32] + pmovzxbq m14, [pb_02461357] + vpshrdw m13, m12, m12, 8 + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + vpermb m1, m12, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*2], 0 + add srcq, ss3q + vpermb m2, m13, m0 ; 12 + vinserti32x8 m0, [srcq+ssq*0], 1 + vpermb m3, m12, m0 ; 23 + vinserti32x8 m0, [srcq+ssq*1], 0 + vpermb m4, m13, m0 ; 34 + vinserti32x8 m0, [srcq+ssq*2], 1 + add srcq, ss3q + vpermb m5, m12, m0 ; 45 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m6, m13, m0 ; 56 +.v_w32_loop: + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddubsw m15, m1, m8 + mova m1, m3 + pmaddubsw m16, m2, m8 + mova m2, m4 + pmaddubsw m17, m3, m9 + mova m3, m5 + pmaddubsw m18, m4, m9 + mova m4, m6 + pmaddubsw m19, m5, m10 + vpermb m5, m12, m0 ; 67 + vinserti32x8 m0, [srcq+ssq*0], 0 + pmaddubsw m20, m6, m10 + vpermb m6, m13, m0 ; 78 + paddw m15, m17 + pmaddubsw m17, m5, m11 + paddw m16, m18 + pmaddubsw m18, m6, m11 + paddw m15, m19 + paddw m16, m20 + paddw m15, m17 + paddw m16, m18 + pmulhrsw m15, m7 + pmulhrsw m16, m7 + packuswb m15, m16 + vpermq m15, m14, m15 + mova [dstq+dsq*0], ym15 + vextracti32x8 [dstq+dsq*1], m15, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + lea r6d, [hq+wq*4-256] + mov r4, srcq + mov r7, dstq +.v_loop0: + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + movu m17, [srcq+ssq*2] + add srcq, ss3q + movu m0, [srcq+ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m6 ; 12l + punpckhbw m4, m6 ; 12h + punpcklbw m5, m6, m13 ; 23l + punpckhbw m6, m13 ; 23h + punpcklbw m12, m13, m15 ; 34l + punpckhbw m13, m15 ; 34h + punpcklbw m14, m15, m17 ; 45l + punpckhbw m15, m17 ; 45h + punpcklbw m16, m17, m0 ; 56l + punpckhbw m17, m0 ; 56h +.v_loop: + pmaddubsw m18, m1, m8 ; a0l + mova m1, m5 + pmaddubsw m19, m2, m8 ; a0h + mova m2, m6 + pmaddubsw m20, m3, m8 ; b0l + mova m3, m12 + pmaddubsw m21, m4, m8 ; b0h + mova m4, m13 + pmaddubsw m5, m9 ; a1l + pmaddubsw m6, m9 ; a1h + pmaddubsw m12, m9 ; b1l + pmaddubsw m13, m9 ; b1h + paddw m18, m5 + mova m5, m14 + pmaddubsw m14, m10 ; a2l + paddw m19, m6 + mova m6, m15 + pmaddubsw m15, m10 ; a2h + paddw m20, m12 + mova m12, m16 + pmaddubsw m16, m10 ; b2l + paddw m21, m13 + mova m13, m17 + pmaddubsw m17, m10 ; b2h + paddw m18, m14 + paddw m19, m15 + paddw m20, m16 + paddw m21, m17 + movu m17, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m14, m0, m17 ; 67l + punpckhbw m15, m0, m17 ; 67h + pmaddubsw m16, m14, m11 ; a3l + pmaddubsw m0, m15, m11 ; a3h + paddw m18, m16 + paddw m19, m0 + movu m0, [srcq+ssq*0] + punpcklbw m16, m17, m0 ; 78l + punpckhbw m17, m0 ; 78h + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*0], m18 + pmaddubsw m18, m16, m11 ; b3l + pmaddubsw m19, m17, m11 ; b3h + paddw m18, m20 + paddw m19, m21 + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*1], m18 + lea dstq, [dstq+dsq*2] + sub hd, 2 
+ jg .v_loop + add r4, 64 + add r7, 64 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 256 + jg .v_loop0 + vzeroupper + RET +.hv: + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastd m8, [pd_2] + vpbroadcastq ym0, [base+subpel_filters+myq*8] + lea ss3q, [ssq*3] + vpbroadcastd ym9, [pd_32768] + mov r6, srcq + punpcklbw ym0, ym8, ym0 + sub r6, ss3q + psraw ym0, 2 ; << 6 + mova xm14, [spel_hv_end] + pshufd ym10, ym0, q0000 + pshufd ym11, ym0, q1111 + pshufd ym12, ym0, q2222 + pshufd ym13, ym0, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 ym6, [subpel_h_shuf4] + movq xmm2, [r6+ssq*0] + movhps xmm2, [r6+ssq*1] + movq xmm0, [r6+ssq*2] + movhps xmm0, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm1, [srcq+ssq*0] + vpblendd ymm2, ymm3, 0x30 + vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ + vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 + pshufb ymm2, ym6 + pshufb ymm0, ym6 + mova ymm1, ym8 + vpdpbusd ymm1, ymm2, ym7 + mova ymm2, ym8 + vpdpbusd ymm2, ymm0, ym7 + packssdw ymm2, ymm1, ymm2 + psraw ymm2, 2 + vextracti128 xmm3, ymm2, 1 + palignr xmm4, xmm3, xmm2, 4 + punpcklwd xmm1, xmm2, xmm4 ; 01 12 + punpckhwd xmm2, xmm4 ; 23 34 + pshufd xmm0, xmm3, q2121 + punpcklwd xmm3, xmm0 ; 45 56 +.hv_w2_loop: + movq xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm4, [srcq+ssq*0] + mova xmm5, xm9 + vpdpwssd xmm5, xmm1, xm10 ; a0 b0 + mova xmm1, xmm2 + vpdpwssd xmm5, xmm2, xm11 ; a1 b1 + pshufb xmm4, xm6 + mova xmm2, xmm3 + vpdpwssd xmm5, xmm3, xm12 ; a2 b2 + mova xmm3, xm8 + vpdpbusd xmm3, xmm4, xm7 + packssdw xmm4, xmm3, xmm3 + psraw xmm4, 2 + palignr xmm3, xmm4, xmm0, 12 + mova xmm0, xmm4 + punpcklwd xmm3, xmm4 ; 67 78 + vpdpwssd xmm5, xmm3, xm13 ; a3 b3 + packuswb xmm5, xmm5 + pshufb xmm5, xm14 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + vzeroupper + RET +.hv_w4: + movq xmm1, [r6+ssq*0] + vpbroadcastq ym2, [r6+ssq*1] + vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 + vinserti32x4 m2, [srcq+ssq*0], 2 + vinserti32x4 m1, [srcq+ssq*1], 2 + vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 + vbroadcasti32x4 m6, [subpel_h_shufA] + add srcq, ss3q + vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 + pshufb m2, m6 + pshufb m1, m6 + mova m0, m8 + vpdpbusd m0, m2, m7 + mova m4, m8 + vpdpbusd m4, m1, m7 + mova ym1, [spel_hv_perm4a] + mova ym2, [spel_hv_perm4b] + mova ym3, [spel_hv_perm4c] + packssdw m0, m4 + psraw m0, 2 ; _ 0 1 2 3 4 5 6 + mov r6d, 0x5555 + vpermb ym1, ym1, ym0 ; 01 12 + vpermb m2, m2, m0 ; 23 34 + vpermb m3, m3, m0 ; 45 56 + kmovw k1, r6d + mova ym15, [spel_hv_perm4d] +.hv_w4_loop: + movq xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 + mova ym5, ym9 + vpdpwssd ym5, ym1, ym10 ; a0 b0 + mova ym1, ym2 + pshufb ym4, ym6 + mova ym0, ym8 + vpdpbusd ym0, ym4, ym7 + vpdpwssd ym5, ym2, ym11 ; a1 b1 + mova ym2, ym3 + vpdpwssd ym5, ym3, ym12 ; a2 b2 + vpsraw ym3{k1}, ym0, 2 ; 7 8 + vpermb ym3, ym15, ym3 ; 67 78 + vpdpwssd ym5, ym3, ym13 ; a3 b3 + packuswb ym5, ym5 + vpermb ym5, ym14, ym5 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastd m8, [pd_2] + vpbroadcastq 
m0, [base+subpel_filters+myq*8] + vpbroadcastd m9, [pd_32768] + punpcklbw m0, m8, m0 + lea ss3q, [ssq*3] + psraw m0, 2 ; << 6 + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + cmp wd, 8 + jne .hv_w16 + mov r6, srcq + sub r6, ss3q + movu xmm1, [r6+ssq*0] + vinserti128 ymm1, [r6+ssq*1], 1 + movu xmm2, [srcq+ssq*1] + vinserti32x4 m6, zmm1, [r6+ssq*2], 2 + vinserti128 ymm2, [srcq+ssq*2], 1 + vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 + add srcq, ss3q + vbroadcasti32x4 m4, [subpel_h_shufA] + vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ + vbroadcasti32x4 m7, [subpel_h_shufB] + vbroadcasti32x4 m17, [subpel_h_shufC] + pshufb m1, m6, m4 ; 0 1 2 3 0123 + mova m2, m8 + vpdpbusd m2, m1, m10 + pshufb m5, m6, m7 ; 0 1 2 3 4567 + mova m1, m8 + vpdpbusd m1, m5, m10 + pshufb m4, m0, m4 ; 4 5 6 _ 0123 + mova m3, m8 + vpdpbusd m3, m4, m10 + pshufb m7, m0, m7 ; 4 5 6 _ 4567 + mova m4, m8 + vpdpbusd m4, m7, m10 + pshufb m6, m17 + vpdpbusd m2, m5, m11 + vpdpbusd m1, m6, m11 + pshufb m6, m0, m17 + vpdpbusd m3, m7, m11 + vpdpbusd m4, m6, m11 + mova m5, [spel_hv_perm8a] + mova m0, [spel_hv_perm8b] + mov r6, 0x55555555ff00 + packssdw m2, m1 + packssdw m3, m4 + mova m18, [spel_hv_perm8c] + psraw m2, 2 ; 0 1 2 3 + psraw m3, 2 ; 4 5 6 _ + vpermb m1, m5, m2 ; 01 12 + vbroadcasti32x8 m6, [subpel_h_shufA] + kmovq k1, r6 + vpermt2b m2, m0, m3 ; 23 34 + vbroadcasti32x8 m7, [subpel_h_shufB] + kshiftrq k2, k1, 16 + mova xm16, [spel_hv_end] + vpermb m3, m5, m3 ; 45 56 +.hv_w8_loop: + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m4{k1}, [srcq+ssq*0] + mova m0, m9 + vpdpwssd m0, m1, m12 ; a0 b0 + pshufb m1, m4, m6 ; 7 8 0123 4567 + mova m5, m8 + vpdpbusd m5, m1, m10 + pshufb m4, m7 ; 7 8 4567 89ab + vpdpwssd m0, m2, m13 ; a1 b1 + mova m1, m2 + vpdpbusd m5, m4, m11 + mova m2, m3 + vpdpwssd m0, m3, m14 ; a2 b2 + psraw m3{k2}, m5, 2 ; 75 86 + vpermb m3, m18, m3 ; 67 78 + vpdpwssd m0, m3, m15 ; a3 b3 + packuswb m0, m0 + vpermb zmm1, m16, m0 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + movu m7, [spel_hv_perm16a] + sub srcq, ss3q + mova m20, [spel_hv_perm16b] + lea r6d, [wq*2-32] + mova m21, [spel_hv_perm16c] + mov r4, srcq + mov r7, dstq + mova ym16, [spel_hv_end16] + lea r6d, [hq+r6*8] +.hv_w16_loop0: + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 + movu ym18, [srcq+ssq*2] + add srcq, ss3q + vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3 + movu ym19, [srcq+ssq*1] + vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5 + add srcq, ss3q + vpermb m2, m7, m17 ; 0 1 0123 89ab + vpermb m0, m20, m17 ; 0 1 4567 cdef + vpermb m4, m7, m18 ; 2 3 0123 89ab + mova m1, m8 + vpdpbusd m1, m2, m10 + vpermb m5, m20, m18 ; 2 3 4567 cdef + mova m2, m8 + vpdpbusd m2, m0, m10 + vpermb m17, m21, m17 ; 0 1 89ab ghij + mova m3, m8 + vpdpbusd m3, m4, m10 + vpermb m6, m7, m19 ; 4 5 0123 89ab + mova m4, m8 + vpdpbusd m4, m5, m10 + vpermb m18, m21, m18 ; 2 3 89ab ghij + vpdpbusd m1, m0, m11 + movu ym0, [srcq+ssq*0] ; 6 + vpdpbusd m2, m17, m11 + vpermb m17, m20, m19 ; 4 5 4567 cdef + vpdpbusd m3, m5, m11 + mova m5, m8 + vpdpbusd m5, m6, m10 + mova m6, m8 + vpdpbusd m6, m17, m10 + vpdpbusd m4, m18, m11 + mova m18, [spel_hv_perm16d] + vpermb m18, m18, m0 ; 6 0145 2367 89cd abef + vpdpbusd m5, m17, m11 + vpermb m19, m21, m19 ; 4 5 89ab ghij + mova m17, m8 + vpdpbusd m17, m18, m10 + mova m18, [spel_hv_perm16e] + vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij + packssdw m1, m2 ; 01 + vpdpbusd 
m6, m19, m11 + packssdw m3, m4 ; 23 + vpdpbusd m17, m0, m11 + psraw m1, 2 + packssdw m5, m6 ; 45 + psraw m3, 2 + vpshrdd m2, m1, m3, 16 ; 12 + psraw m5, 2 + vpshrdd m4, m3, m5, 16 ; 34 + psraw m17, 2 + vpshrdd m6, m5, m17, 16 ; 56 +.hv_w16_loop: + movu ym18, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m18, [srcq+ssq*0], 1 + mova m0, m9 + vpdpwssd m0, m1, m12 ; a0 + vpermb m1, m7, m18 ; 7 8 0123 89ab + mova m17, m9 + vpdpwssd m17, m2, m12 ; b0 + vpermb m2, m20, m18 ; 7 8 4567 cdef + mova m19, m8 + vpdpbusd m19, m1, m10 + vpermb m18, m21, m18 + mova m1, m8 + vpdpbusd m1, m2, m10 + vpdpwssd m0, m3, m13 ; a1 + vpdpwssd m17, m4, m13 ; b1 + vpdpbusd m19, m2, m11 + mova m2, m4 + vpdpbusd m1, m18, m11 + mova m4, m6 + vpdpwssd m0, m5, m14 ; a2 + vpdpwssd m17, m6, m14 ; b2 + packssdw m19, m1 + mova m1, m3 + mova m3, m5 + psraw m6, m19, 2 ; 7 8 + vpshrdd m5, m4, m6, 16 ; 6 7 + vpdpwssd m17, m6, m15 ; b3 + vpdpwssd m0, m5, m15 ; a3 + packuswb m0, m17 + vpermb zmm1, m16, m0 + mova [dstq+dsq*0], xmm1 + vextracti128 [dstq+dsq*1], ymm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w16_loop0 + vzeroupper + RET + %macro PREP_8TAP_H 0 vpermb m10, m5, m0 vpermb m11, m5, m1 @@ -1629,6 +3172,135 @@ jg .hv_loop0 RET +cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts + vpbroadcastd m9, [pd_16384] + mova ym15, [warp_8x8t_end] + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main + jmp .start +.loop: + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2 + lea tmpq, [tmpq+tsq*4] +.start: + paddd m16, m16 + vpermb m16, m15, m16 + mova [tmpq+tsq*0], xm16 + vextracti128 [tmpq+tsq*2], ym16, 1 + sub r6d, 0x1800 + jg .loop + RET + +cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter + vpbroadcastd m9, [pd_262144] + mova xm15, [warp_8x8_end] + call .main + jmp .start +.loop: + call .main2 + lea dstq, [dstq+dsq*2] +.start: + psrad m16, 19 + packuswb m16, m16 + vpermb m16, m15, m16 + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + sub r6d, 0x1800 + jg .loop + RET +ALIGN function_align +.main: + vpbroadcastd m1, [pd_512] +%if WIN64 + mov abcdq, r5mp + vpaddd ym18, ym1, r6m {1to8} ; mx +%else + add r5d, 512 + vpbroadcastd ym18, r5d +%endif + vpaddd ym20, ym1, r7m {1to8} ; my + mova ym16, [pd_0to7] + vpbroadcastd ym19, [abcdq+4*0] + vpbroadcastd ym21, [abcdq+4*1] + lea r4, [ssq*3+3] + mova m10, [warp_8x8_permA] + mov r6d, 0x5555 + mova m11, [warp_8x8_permB] + lea filterq, [mc_warp_filter+64*8] + vpbroadcastq m12, [warp_8x8_hpack] + sub srcq, r4 ; src -= src_stride*3 + 3 + vbroadcasti32x4 m13, [warp_8x8_permC] + kxnorb k2, k2, k2 + vbroadcasti32x4 m14, [warp_8x8_permD] + vpdpwssd ym18, ym19, ym16 ; alpha + vpdpwssd ym20, ym21, ym16 ; gamma + vbroadcasti32x4 m0, [srcq] + psrad ym19, 16 ; beta + psrad ym21, 16 ; delta + kmovw k1, r6d + psrad ym16, ym18, 10 + kmovb k3, k2 + paddd ym18, ym19 + vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0 + psrld m1, 8 ; pd_2 + pshufb m0, m11 + paddd m8, m1, m1 ; pd_4 + vpdpbusd m1, m0, m2 + call .h + psllq m2, m1, 45 + pslld m1, 13 + paddd m1, m2 + vpshrdq m1, m0, 48 ; 01 12 + call .h + vpshrdq m2, m1, m0, 48 ; 23 34 + call .h + vpshrdq m3, m2, m0, 48 ; 45 56 +.main2: + call .h + psrad ym17, ym20, 10 + kmovb k2, k3 + paddd ym20, ym21 + vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0 + psrad ym16, ym20, 10 + kmovb k3, k2 + paddd ym20, ym21 + vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1 + shufps m5, m7, m17, q2020 ; a0 a1 
a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3 + mova m16, m9 + pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1 + vpdpwssd m16, m1, m4 + pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3 + mova m1, m2 + vpdpwssd m16, m2, m5 + shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7 + mova m2, m3 + pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5 + vpdpwssd m16, m3, m4 + vpshrdq m3, m0, 48 ; 67 78 + pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7 + vpdpwssd m16, m3, m5 + ret +ALIGN function_align +.h: + movu xm5, [srcq+ssq*1] + psrad ym16, ym18, 10 + lea srcq, [srcq+ssq*2] + vinserti32x4 ym5, [srcq+ssq*0], 1 + kmovb k2, k3 + paddd ym18, ym19 + vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1 + psrad ym17, ym18, 10 + kmovb k3, k2 + paddd ym18, ym19 + vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2 + mova m0, m8 + vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7 + vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3 + vpdpbusd m0, m4, m17 + vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb + vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7 + vpdpbusd m0, m5, m16 + vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3) + ret + %macro BIDIR_FN 1 ; op lea stride3q, [strideq*3] jmp wq @@ -2392,4 +4064,475 @@ jg .w128_loop RET -%endif ; HAVE_AVX512ICL && ARCH_X86_64 +cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask +%define base r6-blend_avx512icl_table + lea r6, [blend_avx512icl_table] + tzcnt wd, wm + movifnidn maskq, maskmp + movifnidn hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m6, [base+pb_64] + vpbroadcastd m7, [base+pw_512] + sub tmpq, maskq + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + vpbroadcastd xmm1, [dstq+dsq*2] + pinsrd xmm1, [dstq+r6 ], 3 + mova xmm4, [maskq] + mova xmm5, [maskq+tmpq] + add maskq, 4*4 + psubb xmm3, xm6, xmm4 + punpcklbw xmm0, xmm5 + punpcklbw xmm2, xmm3, xmm4 + punpckhbw xmm1, xmm5 + punpckhbw xmm3, xmm4 + pmaddubsw xmm0, xmm2 + pmaddubsw xmm1, xmm3 + pmulhrsw xmm0, xm7 + pmulhrsw xmm1, xm7 + packuswb xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + pextrd [dstq+dsq*2], xmm0, 2 + pextrd [dstq+r6 ], xmm0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + RET +.w8: + movq xmm0, [dstq+dsq*0] + vpbroadcastq xmm1, [dstq+dsq*1] + vpbroadcastq ymm2, [dstq+dsq*2] + vpbroadcastq ymm3, [dstq+r6 ] + mova ymm4, [maskq] + mova ymm5, [maskq+tmpq] + add maskq, 8*4 + vpblendd ymm0, ymm2, 0x30 + vpblendd ymm1, ymm3, 0xc0 + psubb ymm3, ym6, ymm4 + punpcklbw ymm0, ymm5 + punpcklbw ymm2, ymm3, ymm4 + punpckhbw ymm1, ymm5 + punpckhbw ymm3, ymm4 + pmaddubsw ymm0, ymm2 + pmaddubsw ymm1, ymm3 + pmulhrsw ymm0, ym7 + pmulhrsw ymm1, ym7 + packuswb ymm0, ymm1 + vextracti128 xmm1, ymm0, 1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + movq [dstq+dsq*2], xmm1 + movhps [dstq+r6 ], xmm1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + vzeroupper + RET +.w16: + mova xm1, [dstq+dsq*0] + vinserti32x4 ym1, [dstq+dsq*1], 1 + vinserti32x4 m1, [dstq+dsq*2], 2 + mova m4, [maskq] + vinserti32x4 m1, [dstq+r6 ], 3 + mova m5, [maskq+tmpq] + add maskq, 16*4 + psubb m3, m6, m4 + punpcklbw m0, m1, m5 + punpcklbw m2, m3, m4 + punpckhbw m1, m5 + punpckhbw m3, m4 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + vextracti32x4 [dstq+dsq*2], m0, 2 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w16 + RET +.w32: + mova ym1, [dstq+dsq*0] + vinserti32x8 m1, [dstq+dsq*1], 1 + mova 
m4, [maskq] + mova m5, [maskq+tmpq] + add maskq, 32*2 + psubb m3, m6, m4 + punpcklbw m0, m1, m5 + punpcklbw m2, m3, m4 + punpckhbw m1, m5 + punpckhbw m3, m4 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32 + RET + +cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_v_avx512icl_table + lea r5, [blend_v_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add wq, r5 + add maskq, obmc_masks-blend_v_avx512icl_table + jmp wq +.w2: + vpbroadcastd xmm2, [maskq+2*2] +.w2_s0_loop: + movd xmm0, [dstq+dsq*0] + pinsrw xmm0, [dstq+dsq*1], 1 + movd xmm1, [tmpq] + add tmpq, 2*2 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_s0_loop + RET +.w4: + vpbroadcastq xmm2, [maskq+4*2] +.w4_loop: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movq xmm1, [tmpq] + add tmpq, 4*2 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova xmm3, [maskq+8*2] +.w8_loop: + movq xmm0, [dstq+dsq*0] + vpbroadcastq xmm1, [dstq+dsq*1] + mova xmm2, [tmpq] + add tmpq, 8*2 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm3 + pmaddubsw xmm1, xmm3 + pmulhrsw xmm0, xm5 + pmulhrsw xmm1, xm5 + packuswb xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + vbroadcasti32x4 ym3, [maskq+16*2] + vbroadcasti32x4 ym4, [maskq+16*3] +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti32x4 ym1, [dstq+dsq*1], 1 + mova ym2, [tmpq] + add tmpq, 16*2 + punpcklbw ym0, ym1, ym2 + punpckhbw ym1, ym2 + pmaddubsw ym0, ym3 + pmaddubsw ym1, ym4 + pmulhrsw ym0, ym5 + pmulhrsw ym1, ym5 + packuswb ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + mova m4, [maskq+32*2] + vshufi32x4 m3, m4, m4, q2020 + vshufi32x4 m4, m4, q3131 +.w32_loop: + mova ym1, [dstq+dsq*0] + vinserti32x8 m1, [dstq+dsq*1], 1 + mova m2, [tmpq] + add tmpq, 32*2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop + RET + +cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base r6-blend_h_avx512icl_table + lea r6, [blend_h_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea maskq, [base+obmc_masks+hq*2] + vpbroadcastd m5, [base+pw_512] + lea hd, [hq*3] + add wq, r6 + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xmm0, [dstq+dsq*0] + pinsrw xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] + movd xmm1, [tmpq] + add tmpq, 2*2 + punpcklwd xmm2, xmm2 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova xmm3, [blend_shuf] +.w4_loop: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] + movq xmm1, [tmpq] + add tmpq, 4*2 + pshufb xmm2, 
xmm3 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + vbroadcasti128 ymm4, [blend_shuf] + shufpd ymm4, ymm4, 0x03 +.w8_loop: + vpbroadcastq ymm1, [dstq+dsq*0] + movq xmm0, [dstq+dsq*1] + vpblendd ymm0, ymm1, 0x30 + vpbroadcastd ymm3, [maskq+hq*2] + movq xmm1, [tmpq+8*1] + vinserti128 ymm1, [tmpq+8*0], 1 + add tmpq, 8*2 + pshufb ymm3, ymm4 + punpcklbw ymm0, ymm1 + pmaddubsw ymm0, ymm3 + pmulhrsw ymm0, ym5 + vextracti128 xmm1, ymm0, 1 + packuswb xmm0, xmm1 + movhps [dstq+dsq*0], xmm0 + movq [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + vzeroupper + RET +.w16: + vbroadcasti32x4 ym4, [blend_shuf] + shufpd ym4, ym4, 0x0c +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti32x4 ym1, [dstq+dsq*1], 1 + vpbroadcastd ym3, [maskq+hq*2] + mova ym2, [tmpq] + add tmpq, 16*2 + pshufb ym3, ym4 + punpcklbw ym0, ym1, ym2 + punpckhbw ym1, ym2 + pmaddubsw ym0, ym3 + pmaddubsw ym1, ym3 + pmulhrsw ym0, ym5 + pmulhrsw ym1, ym5 + packuswb ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +.w32: + vbroadcasti32x4 m4, [blend_shuf] + shufpd m4, m4, 0xf0 +.w32_loop: + mova ym1, [dstq+dsq*0] + vinserti32x8 m1, [dstq+dsq*1], 1 + vpbroadcastd m3, [maskq+hq*2] + mova m2, [tmpq] + add tmpq, 32*2 + pshufb m3, m4 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w32_loop + RET +.w64: + vpbroadcastw m3, [maskq+hq*2] + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32*2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m6, [maskq+hq*2] + mova m2, [dstq+64*0] + mova m1, [tmpq+64*0] + mova m3, [dstq+64*1] + mova m4, [tmpq+64*1] + add tmpq, 64*2 + punpcklbw m0, m2, m1 + punpckhbw m2, m1 + pmaddubsw m0, m6 + pmaddubsw m2, m6 + punpcklbw m1, m3, m4 + punpckhbw m3, m4 + pmaddubsw m1, m6 + pmaddubsw m3, m6 + REPX {pmulhrsw x, m5}, m0, m2, m1, m3 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + inc hq + jl .w128 + RET + +cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + mov r6, ~0 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + kmovq k3, r6 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x + LEA r7, $$ +%define base r7-$$ + vpbroadcastd m3, [base+pw_m256] + vpbroadcastd m7, [base+pd_63] + vbroadcasti32x4 m15, [base+pb_8x0_8x8] + vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] + pslld m5, 4 ; dx*16 + pslld m6, 14 + pxor m2, m2 + mova m16, [base+resize_permA] + mova m17, [base+resize_permB] + mova xm18, [base+resize_permC] +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + vptestmd k4, m1, m1 + pand m9, m7 ; filter offset (masked) + ktestw k4, k4 + jz .load + vextracti32x8 ym12, m0, 1 + vextracti32x8 ym13, 
m1, 1 + kmovq k1, k3 + kmovq k2, k3 + vpgatherdq m10{k1}, [srcq+ym0] + vpgatherdq m11{k2}, [srcq+ym12] + kmovq k1, k3 + kmovq k2, k3 + vpgatherdq m14{k1}, [base+resize_shuf+4+ym1] + vpgatherdq m0{k2}, [base+resize_shuf+4+ym13] + mova m12, m16 + mova m13, m17 + paddb m14, m15 + paddb m0, m15 + pshufb m10, m14 + pshufb m11, m0 + vpermi2d m12, m10, m11 + vpermi2d m13, m10, m11 + jmp .filter +.load: + kmovq k1, k3 + kmovq k2, k3 + vpgatherdd m12{k1}, [srcq+m0+0] + vpgatherdd m13{k2}, [srcq+m0+4] +.filter: + kmovq k1, k3 + kmovq k2, k3 + vpgatherdd m10{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m11{k2}, [base+resize_filter+m9*8+4] + mova m14, m2 + vpdpbusd m14, m12, m10 + vpdpbusd m14, m13, m11 + packssdw m14, m14 + pmulhrsw m14, m3 + packuswb m14, m14 + vpermd m14, m18, m14 + mova [dstq+xq], xm14 + paddd m4, m5 + add xd, 16 + cmp xd, dst_wd + jl .loop_x + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/mc_init_tmpl.c dav1d-1.0.0/src/x86/mc_init_tmpl.c --- dav1d-0.9.2/src/x86/mc_init_tmpl.c 2021-09-03 15:51:24.425037100 +0000 +++ dav1d-1.0.0/src/x86/mc_init_tmpl.c 2022-03-18 14:31:56.034356000 +0000 @@ -152,7 +152,6 @@ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); -#if BITDEPTH == 8 && ARCH_X86_64 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); @@ -174,7 +173,6 @@ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); -#endif c->avg = BF(dav1d_avg, ssse3); c->w_avg = BF(dav1d_w_avg, ssse3); @@ -224,7 +222,6 @@ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); -#if BITDEPTH == 8 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); @@ -246,7 +243,6 @@ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); -#endif c->avg = BF(dav1d_avg, avx2); c->w_avg = BF(dav1d_w_avg, avx2); @@ -265,7 +261,17 @@ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; -#if HAVE_AVX512ICL && BITDEPTH == 8 + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); 
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); @@ -283,6 +289,11 @@ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl); c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl); c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl); -#endif + c->blend = BF(dav1d_blend, avx512icl); + c->blend_v = BF(dav1d_blend_v, avx512icl); + c->blend_h = BF(dav1d_blend_h, avx512icl); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl); + c->resize = BF(dav1d_resize, avx512icl); #endif } diff -Nru dav1d-0.9.2/src/x86/mc_sse.asm dav1d-1.0.0/src/x86/mc_sse.asm --- dav1d-0.9.2/src/x86/mc_sse.asm 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/src/x86/mc_sse.asm 2022-03-18 14:31:56.034356000 +0000 @@ -308,10 +308,8 @@ %endrep %endmacro -%if ARCH_X86_64 SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 -%endif %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX @@ -3943,34 +3941,80 @@ %endif %endmacro -%macro REMAP_REG 2 +%macro SAVE_REG 1 + %xdefine r%1_save r%1 + %xdefine r%1q_save r%1q + %xdefine r%1d_save r%1d + %if ARCH_X86_32 + %define r%1m_save [rstk+stack_offset+(%1+1)*4] + %endif +%endmacro + +%macro LOAD_REG 1 + %xdefine r%1 r%1_save + %xdefine r%1q r%1q_save + %xdefine r%1d r%1d_save + %if ARCH_X86_32 + %define r%1m r%1m_save + %endif + %undef r%1d_save + %undef r%1q_save + %undef r%1_save +%endmacro + +%macro REMAP_REG 2-3 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d + %if ARCH_X86_32 + %if %3 == 0 + %xdefine r%1m r%2m + %else + %define r%1m [rstk+stack_offset+(%1+1)*4] + %endif + %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep - %xdefine r14_save r14 - %assign %%i 14 - %rep 14 - %assign %%j %%i-1 - REMAP_REG %%i, %%j - %assign %%i %%i-1 - %endrep + %if ARCH_X86_64 + SAVE_REG 14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %else + SAVE_REG 5 + %assign %%i 5 + %rep 5 + %assign %%j %%i-1 + REMAP_REG %%i, %%j, 0 + %assign %%i %%i-1 + %endrep + %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 - %rep 13 - %assign %%j %%i+1 - REMAP_REG %%i, %%j - %assign %%i %%i+1 - %endrep - %xdefine r14 r14_save - %undef r14_save + %if ARCH_X86_64 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + LOAD_REG 14 + %else + %rep 4 + %assign %%j %%i+1 + REMAP_REG %%i, %%j, 1 + %assign %%i %%i+1 + %endrep + LOAD_REG 5 + %endif %endif %endmacro @@ -3982,7 +4026,8 @@ %endif %endmacro -%macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3] +%if ARCH_X86_64 + %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3] SWAP m%2, m%5 movq m%1, [srcq+ r4] movq m%2, [srcq+ r6] @@ -4019,33 +4064,155 @@ pmulhrsw m%1, m12 pmulhrsw m%5, m12 SWAP m%2, m%5 -%endmacro + %endmacro +%else + %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets + %if %3 == 1 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + %endif + movq m0, [srcq+r0] + movq m1, [srcq+rX] + movhps m0, [srcq+r4] + movhps m1, [srcq+r5] + add srcq, ssq + movq m4, [srcq+r0] + movq m5, [srcq+rX] + movhps m4, [srcq+r4] + movhps m5, [srcq+r5] + mov r0, [esp+16] + mov rX, [esp+24] + mov r4, [esp+20] + mov r5, [esp+28] + sub srcq, ssq + movq m2, [srcq+r0] + movq m3, [srcq+rX] + movhps m2, [srcq+r4] + movhps m3, [srcq+r5] + add srcq, ssq + movq m6, [srcq+r0] + movq m7, [srcq+rX] + movhps m6, 
[srcq+r4] + movhps m7, [srcq+r5] + add srcq, ssq + pmaddubsw m0, [esp+%1+ 0] + pmaddubsw m4, [esp+%1+ 0] + pmaddubsw m1, [esp+%1+16] + pmaddubsw m5, [esp+%1+16] + pmaddubsw m2, [esp+%1+32] + pmaddubsw m6, [esp+%1+32] + pmaddubsw m3, [esp+%1+48] + pmaddubsw m7, [esp+%1+48] + phaddw m0, m1 + phaddw m4, m5 + phaddw m2, m3 + phaddw m6, m7 + phaddw m0, m2 + phaddw m4, m6 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + %if %2 != 0 + mova [esp+%2+ 0], m0 + mova [esp+%2+16], m4 + %endif + %endmacro +%endif %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal put_8tap_scaled_8bpc, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy - %else -cglobal put_8tap_scaled_8bpc, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif %endif %xdefine base_reg r12 %define rndshift 10 -%else +%else ; prep %assign isprep 1 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy - %xdefine tmp_stridem r14q - %else -cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy - %define tmp_stridem qword [rsp+0x138] + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+0x138] + %endif + %xdefine base_reg r11 + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy + %else +cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy + %endif + %define tmp_stridem dword [esp+0x138] %endif - %xdefine base_reg r11 %define rndshift 6 %endif +%if ARCH_X86_32 + mov [esp+0x1f0], t0d + mov [esp+0x1f4], t1d + %if !isprep && required_stack_alignment > STACK_ALIGNMENT + mov dstd, dstm + mov dsd, dsm + mov srcd, srcm + mov ssd, ssm + mov hd, hm + mov r4, mxm + %define r0m [esp+0x200] + %define dsm [esp+0x204] + %define dsmp dsm + %define r1m dsm + %define r2m [esp+0x208] + %define ssm [esp+0x20c] + %define r3m ssm + %define hm [esp+0x210] + %define mxm [esp+0x214] + mov r0m, dstd + mov dsm, dsd + mov r2m, srcd + mov ssm, ssd + mov hm, hd + mov r0, mym + mov r1, dxm + mov r2, dym + %define mym [esp+0x218] + %define dxm [esp+0x09c] + %define dym [esp+0x21c] + mov mxm, r4 + mov mym, r0 + mov dxm, r1 + mov dym, r2 + tzcnt wd, wm + %endif + %if isprep && required_stack_alignment > STACK_ALIGNMENT + %xdefine base_reg r5 + %else + %xdefine base_reg r6 + %endif + mov ssd, ssm +%endif LEA base_reg, %1_8tap_scaled_8bpc_ssse3 -%define base base_reg-%1_8tap_scaled_8bpc_ssse3 +%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3 +%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT tzcnt wd, wm +%endif +%if ARCH_X86_32 + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 
m3 +%endif movd m8, dxm movd m14, mxm pshufd m8, m8, q0000 @@ -4054,25 +4221,31 @@ mov r5d, t0d DECLARE_REG_TMP 5, 7 %endif +%if ARCH_X86_64 mov dyd, dym +%endif %ifidn %1, put %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m - %else + %elif ARCH_X86_64 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+0x138] - %define rX r1 - %define rXd r1d + %if ARCH_X86_64 + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x138] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif %else - %define dsm dsq - %define rX r14 - %define rXd r14d + %define rX r1 %endif %else ; prep %if WIN64 @@ -4080,28 +4253,61 @@ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m - %else + %elif ARCH_X86_64 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x94] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV - %define rX r14 - %define rXd r14d + %if ARCH_X86_64 + %define rX r14 + %define rXd r14d + %else + %define rX r3 + %endif %endif +%if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m12, [base+pw_8192] -%ifidn %1, put + %ifidn %1, put mova m13, [base+pd_512] -%else + %else mova m13, [base+pd_32] + %endif +%else + %define m10 [base+pd_0x3ff] + %define m12 [base+pw_8192] + %ifidn %1, put + %define m13 [base+pd_512] + %else + %define m13 [base+pd_32] + %endif %endif pxor m9, m9 +%if ARCH_X86_64 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q +%else + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + mov r1, [esp+0x1f4] + lea r0, [ssq*3] + movzx r2, r1b + shr r1, 16 + cmp dword hm, 6 + cmovs r1, r2 + mov [esp+0x1f4], r1 + mov r1, r1m + mov r2, r2m + sub srcq, r0 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define ss3q r0 + %define myd r4 + %define dyd dword dym + %define hd dword hm +%endif cmp dyd, 1024 je .dy1 cmp dyd, 2048 @@ -4111,35 +4317,65 @@ jmp wq %ifidn %1, put .w2: + %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d + %else + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 + %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] + %if ARCH_X86_64 mova m11, [base+pd_0x4000] + %else + %define m11 [base+pd_0x4000] + %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 + %if ARCH_X86_64 movd r6d, m15 + %else + movd r3d, m15 + %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [rsp+0x180], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m8 m5 + %define m15 m6 + %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] + %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 + %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] @@ -4147,10 +4383,19 @@ lea srcq, [srcq+ssq*4] punpckldq m15, m7 punpcklqdq m15, m15 + %if ARCH_X86_64 pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 + %else + pand m7, m8, m11 + pandn m8, m15 + %define m8 m6 + %define m15 m5 + por m15, m7 + mova [rsp+0x190], m15 + %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 @@ -4169,8 +4414,17 @@ pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 
; 45 56 punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mov myd, mym + mov r0, r0m + mova [rsp+0x1a0], m3 + mova [rsp+0x1b0], m0 + mova [rsp+0x1c0], m2 + mova [rsp+0x1d0], m4 + %endif .w2_loop: and myd, 0x3ff + %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 @@ -4189,19 +4443,69 @@ pmaddwd m8, m4, m11 paddd m5, m6 paddd m7, m8 + %else + mov mym, myd + mov r1, [esp+0x1f4] + xor r3, r3 + shr r4, 6 + lea r1, [r1+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r1*8+0] + cmovnz r3, [base+subpel_filters+r1*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m5, m7, q0000 + pshufd m6, m7, q1111 + pmaddwd m3, m5 + pmaddwd m0, m6 + pshufd m5, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m2, m5 + pmaddwd m4, m7 + paddd m3, m0 + paddd m2, m4 + SWAP m5, m3 + SWAP m7, m2 + %endif paddd m5, m13 paddd m5, m7 psrad m5, 10 packssdw m5, m5 packuswb m5, m5 + %if ARCH_X86_64 pextrw r6d, m5, 0 mov [dstq], r6w add dstq, dsq dec hd jz .ret add myd, dyd + %else + pextrw r3d, m5, 0 + mov [dstq], r3w + add dstq, dsm + dec hd + jz .ret + mov myd, mym + add myd, dym + %endif test myd, ~0x3ff + %if ARCH_X86_32 + SWAP m3, m5 + SWAP m2, m7 + mova m3, [rsp+0x1a0] + mova m0, [rsp+0x1b0] + mova m2, [rsp+0x1c0] + mova m4, [rsp+0x1d0] + %define m14 [esp+0x180] + %define m15 [esp+0x190] + %endif jz .w2_loop + %if ARCH_X86_32 + mov r3, r3m + %endif movq m5, [srcq] test myd, 0x400 jz .w2_skip_line @@ -4216,6 +4520,12 @@ palignr m4, m5, m1, 12 punpcklqdq m1, m4, m4 ; 6 7 6 7 punpcklwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [rsp+0x1a0], m3 + mova [rsp+0x1b0], m0 + mova [rsp+0x1c0], m2 + mova [rsp+0x1d0], m4 + %endif jmp .w2_loop .w2_skip_line: movhps m5, [srcq+ssq*1] @@ -4231,23 +4541,42 @@ mova m1, m4 punpcklwd m2, m4, m5 ; 45 56 punpckhwd m4, m5 ; 67 __ + %if ARCH_X86_32 + mova [rsp+0x1a0], m3 + mova [rsp+0x1b0], m0 + mova [rsp+0x1c0], m2 + mova [rsp+0x1d0], m4 + %endif jmp .w2_loop - SWAP m15, m8, m9 %endif +INIT_XMM ssse3 .w4: +%if ARCH_X86_64 mov myd, mym - mova m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd m15, t0d - pmaddwd m8, m7 +%else + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 mova m11, [base+pd_0x4000] +%else + %define m11 [base+pd_0x4000] +%endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 psrldq m7, m15, 8 +%if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 @@ -4258,30 +4587,59 @@ movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd rX, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r4, m15 + movd r5, m7 + movd m1, [base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + movifprep r3, r3m + SWAP m4, m7 + %define m15 m1 +%endif mova m5, [base+bdct_lb_dw] movq m6, [base+subpel_s_shuf2] - pcmpeqd m0, m9 psrld m14, 10 - movu m7, [srcq+ssq*0] - movu m9, [srcq+ssq*1] - movu m8, [srcq+ssq*2] - movu m10, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] punpckldq m15, m3 punpckldq m2, m4 - punpcklqdq m6, m6 punpcklqdq m15, m2 + punpcklqdq m6, m6 pshufb m14, m5 paddb m14, m6 +%if ARCH_X86_64 + pcmpeqd m0, m9 + pand m11, m0 +%else + mova [esp+0x180], m14 + SWAP m7, m4 + pxor m3, m3 + pcmpeqd m0, m3 + pand m2, m11, m0 + %define m11 m2 +%endif + pandn m0, m15 +%if ARCH_X86_64 + SWAP m15, m0 +%else + %define m15 m0 
+%endif + por m15, m11 +%if ARCH_X86_64 + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] movu m2, [srcq+ssq*0] movu m4, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m5, [srcq+ss3q ] lea srcq, [srcq+ssq*4] - pand m11, m0 - pandn m0, m15 - SWAP m15, m0 - por m15, m11 pshufb m7, m14 pshufb m9, m14 pshufb m8, m14 @@ -4320,8 +4678,71 @@ mova [rsp+0x00], m7 mova [rsp+0x10], m8 mova [rsp+0x20], m9 +%else + mova [esp+0x190], m15 + lea ss3q, [ssq*3] + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m7, [srcq+ssq*2] + movu m6, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m2, m14 + pshufb m3, m14 + pshufb m7, m14 + pshufb m6, m14 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m7, m15 + pmaddubsw m6, m15 + phaddw m2, m3 + phaddw m7, m6 + movu m1, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m6, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m1, m14 + pshufb m5, m14 + pshufb m3, m14 + pshufb m6, m14 + pmaddubsw m1, m15 + pmaddubsw m5, m15 + pmaddubsw m3, m15 + pmaddubsw m6, m15 + phaddw m1, m5 + phaddw m3, m6 + pmulhrsw m2, m12 + pmulhrsw m7, m12 + pmulhrsw m1, m12 + pmulhrsw m3, m12 + shufps m4, m2, m7, q1032 ; 1 2 + shufps m5, m7, m1, q1032 ; 3 4 + shufps m6, m1, m3, q1032 ; 5 6 + psrldq m0, m3, 8 ; 7 _ + mova [esp+0x1a0], m0 + %define m11 [esp+0x1a0] + punpcklwd m0, m2, m4 ; 01 + punpckhwd m2, m4 ; 12 + punpcklwd m4, m7, m5 ; 23 + punpckhwd m7, m5 ; 34 + punpcklwd m5, m1, m6 ; 45 + punpckhwd m1, m6 ; 56 + punpcklwd m3, [esp+0x1a0] ; 67 + mov myd, mym + mov r0, r0m + mova [esp+0x1b0], m0 ; 01 + mova [esp+0x1c0], m4 ; 23 + mova [esp+0x1d0], m5 ; 45 + mova [esp+0x1e0], m3 ; 67 + mova [rsp+0x00], m2 ; 12 + mova [rsp+0x10], m7 ; 34 + mova [rsp+0x20], m1 ; 56 + SWAP m1, m4 + SWAP m2, m5 +%endif .w4_loop: and myd, 0x3ff +%if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 @@ -4342,39 +4763,107 @@ paddd m6, m7 paddd m4, m13 paddd m4, m6 +%else + mov mym, myd + mov r5, [esp+0x1f4] + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m0, m4 + pmaddwd m1, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + paddd m0, m1 + paddd m2, m3 + paddd m0, m13 + paddd m0, m2 + SWAP m4, m0 +%endif psrad m4, rndshift packssdw m4, m4 %ifidn %1, put packuswb m4, m4 movd [dstq], m4 - add dstq, dsq + add dstq, dsmp %else movq [tmpq], m4 add tmpq, 8 %endif dec hd jz .ret +%if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .w4_loop +%else + SWAP m0, m4 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + jnz .w4_next_line + mova m0, [esp+0x1b0] + mova m1, [esp+0x1c0] + mova m2, [esp+0x1d0] + mova m3, [esp+0x1e0] + jmp .w4_loop +.w4_next_line: + %define m14 [esp+0x180] + %define m15 [esp+0x190] +%endif movu m4, [srcq] test myd, 0x400 jz .w4_skip_line +%if ARCH_X86_64 mova m0, [rsp+0x00] mova [rsp+0x00], m1 mova m1, [rsp+0x10] mova [rsp+0x10], m2 mova m2, [rsp+0x20] mova [rsp+0x20], m3 +%else + mova m5, [esp+0x1c0] + mova m0, [rsp+0x000] + mova [rsp+0x00], m5 + mova [esp+0x1b0], m0 + mova m6, [esp+0x1d0] + mova m1, [rsp+0x010] + mova [rsp+0x10], m6 + mova [esp+0x1c0], m1 + mova m7, [esp+0x1e0] + mova m2, [rsp+0x020] + mova [rsp+0x20], m7 + mova [esp+0x1d0], m2 +%endif pshufb m4, m14 pmaddubsw m4, m15 phaddw m4, m4 pmulhrsw m4, m12 punpcklwd m3, m11, m4 - 
mova m11, m4 +%if ARCH_X86_32 + mova [esp+0x1e0], m3 +%endif + mova m11, m4 add srcq, ssq jmp .w4_loop .w4_skip_line: +%if ARCH_X86_32 + mova m0, [esp+0x1c0] + mova m1, [esp+0x1d0] + mova m2, [esp+0x1e0] +%endif movu m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m6, [rsp+0x10] @@ -4385,17 +4874,27 @@ pmaddubsw m5, m15 phaddw m4, m5 pmulhrsw m4, m12 - punpcklwd m9, m11, m4 + punpcklwd m5, m11, m4 mova [rsp+0x00], m6 mova [rsp+0x10], m7 - mova [rsp+0x20], m9 + mova [rsp+0x20], m5 +%if ARCH_X86_64 psrldq m11, m4, 8 mova m0, m1 mova m1, m2 mova m2, m3 punpcklwd m3, m4, m11 +%else + psrldq m6, m4, 8 + punpcklwd m3, m4, m6 + mova [esp+0x1a0], m6 + mova [esp+0x1b0], m0 + mova [esp+0x1c0], m1 + mova [esp+0x1d0], m2 + mova [esp+0x1e0], m3 +%endif jmp .w4_loop - SWAP m0, m15 +INIT_XMM ssse3 .w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 @@ -4419,9 +4918,23 @@ %ifidn %1, put movifnidn dsm, dsq %endif +%if ARCH_X86_64 shr t0d, 16 - sub srcq, 3 movd m15, t0d +%else + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq ssm + %endif + mov r4, [esp+0x1f0] + shr r4, 16 + movd m15, r4 + mov r0, r0m + mov myd, mym +%endif + sub srcq, 3 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 @@ -4430,42 +4943,81 @@ mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq -%if UNIX64 +%if ARCH_X86_64 && UNIX64 mov hm, hd +%elif ARCH_X86_32 + mov r5, hm + mov [esp+0x094], myd + mov [esp+0x134], r5 %endif jmp .hloop .hloop_prep: dec dword [rsp+0x090] jz .ret +%if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm +%else + add dword [esp+0x130], 8*(isprep+1) + mov myd, [esp+0x094] + mov r5, [esp+0x134] + mov r0, [esp+0x130] +%endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] +%if ARCH_X86_64 mova m10, [base+pd_0x3ff] +%endif mova m15, [rsp+0x120] pxor m9, m9 mov srcq, [rsp+0x098] +%if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq +%else + mov mym, myd + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif paddd m14, m7 .hloop: +%if ARCH_X86_64 mova m11, [base+pq_0x40000000] - psrld m4, m14, 10 - mova [rsp], m4 +%else + %define m11 [base+pq_0x40000000] +%endif + psrld m2, m14, 10 + mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 - psrldq m4, m5, 8 + psrldq m2, m5, 8 +%if ARCH_X86_64 movd r4d, m5 - movd r6d, m4 + movd r6d, m2 psrldq m5, 4 - psrldq m4, 4 + psrldq m2, 4 movd r7d, m5 - movd r9d, m4 + movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] + pxor m2, m2 + %define m9 m2 +%endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 @@ -4473,6 +5025,7 @@ pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 +%if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 @@ -4574,6 +5127,114 @@ paddd m5, m7 paddd m4, m8 paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r4, m15 + movd r5, m4 + mova m14, [esp+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [esp+16], m14 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova [esp+0x20], m0 + mova 
[esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m11, m4 + pand m1, m11, m6 + pand m2, m11, m7 + pand m3, m11, m5 + pandn m4, [esp+0x20] + pandn m6, [esp+0x30] + pandn m7, [esp+0x40] + pandn m5, [esp+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1 + MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3 + MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5 + MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7 + mova m5, [esp+0x180] + mova m6, [esp+0x190] + mova m7, [esp+0x1a0] + mova m0, [esp+0x1b0] + mov myd, mym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [esp+0x180], m4 + mova [esp+0x190], m5 + mova [esp+0x1a0], m6 + mova [esp+0x1b0], m7 + mova m1, [esp+0x140] + mova m2, [esp+0x150] + mova m3, [esp+0x160] + mova m4, [esp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [esp+0x140], m0 + mova [esp+0x150], m1 + mova [esp+0x160], m2 + mova [esp+0x170], m3 +.vloop: + mov r0, r0m + mov r5, [esp+0x1f4] + and myd, 0x3ff + mov mym, myd + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [esp+0x180], m6 + pmaddwd m3, [esp+0x190], m6 + pmaddwd m4, [esp+0x1a0], m7 + pmaddwd m5, [esp+0x1b0], m7 + paddd m0, m2 + paddd m1, m3 + paddd m0, m13 + paddd m1, m13 + paddd m4, m0 + paddd m5, m1 +%endif psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 @@ -4587,6 +5248,7 @@ %endif dec hd jz .hloop_prep +%if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .vloop @@ -4660,63 +5322,238 @@ mova [rsp+0x60], m6 mova [rsp+0x70], m7 mova [rsp+0x80], m4 +%else + mov r0m, r0 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + mov mym, myd + jnz .next_line + mova m0, [esp+0x140] + mova m1, [esp+0x150] + mova m2, [esp+0x160] + mova m3, [esp+0x170] + jmp .vloop +.next_line: + test myd, 0x400 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + jz .skip_line + mova m6, [base+unpckw] + mova m0, [esp+0x140] + mova m1, [esp+0x150] + mova m7, [esp+0x180] + movq m4, [srcq+r0] + movq m5, [srcq+rX] + movhps m4, [srcq+r4] + movhps m5, [srcq+r5] + pshufb m0, m6 ; 0a 1a + pshufb m1, m6 ; 0b 1b + pshufb m7, m6 ; 4a 5a + mov r0, [esp+16] + mov rX, [esp+24] + mov r4, [esp+20] + mov r5, [esp+28] + movq m3, [srcq+r0] + movq m2, [srcq+rX] + movhps m3, [srcq+r4] + movhps m2, [srcq+r5] + add srcq, ssq + pmaddubsw m4, [esp+0x20] + pmaddubsw m5, [esp+0x30] + pmaddubsw m3, [esp+0x40] + pmaddubsw m2, [esp+0x50] + phaddw m4, m5 + phaddw m3, m2 + mova m5, [esp+0x190] + mova m2, [esp+0x160] + phaddw m4, m3 + mova m3, [esp+0x170] + pmulhrsw m4, m12 ; 8a 8b + mov myd, mym + pshufb m5, m6 ; 4b 5b + pshufd m6, m6, q1032 + pshufb m2, m6 ; 3a 2a + pshufb m3, m6 ; 3b 2b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + mova [esp+0x140], m0 + mova [esp+0x150], m1 + mova m0, [esp+0x1a0] + mova m1, [esp+0x1b0] + punpcklwd m2, m7 ; 34a + punpcklwd m3, m5 ; 34b + mova [esp+0x160], m2 + mova [esp+0x170], m3 + 
pshufb m0, m6 ; 7a 6a + pshufb m1, m6 ; 7b 6b + punpckhwd m7, m0 ; 56a + punpckhwd m5, m1 ; 56b + punpcklwd m0, m4 + punpckhqdq m4, m4 + punpcklwd m1, m4 + mova [esp+0x180], m7 + mova [esp+0x190], m5 + mova [esp+0x1a0], m0 + mova [esp+0x1b0], m1 + mova m0, [esp+0x140] + mova m1, [esp+0x150] jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 0x20, 0x1c0, 0 + mov myd, mym + mova m0, [esp+0x160] + mova m1, [esp+0x170] + mova m2, [esp+0x180] + mova m3, [esp+0x190] + mova [esp+0x140], m0 + mova [esp+0x150], m1 + mova m4, [esp+0x1a0] + mova m5, [esp+0x1b0] + mova [esp+0x160], m2 + mova [esp+0x170], m3 + mova m6, [esp+0x1c0] + mova m7, [esp+0x1d0] + mova [esp+0x180], m4 + mova [esp+0x190], m5 + punpcklwd m4, m6, m7 + punpckhwd m6, m7 + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m6 +%endif + jmp .vloop +INIT_XMM ssse3 .dy1: movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy1_w2: + %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + movzx r5, byte [esp+0x1f0] + dec srcd + movd m15, r5 + %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] + %if ARCH_X86_64 mova m11, [base+pd_0x4000] + %else + %define m11 [base+pd_0x4000] + %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 + %if ARCH_X86_64 movd r6d, m15 + %else + movd r3d, m15 + %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [esp+0x00], m14 + %define m14 [esp+0x00] + SWAP m5, m0 + SWAP m6, m3 + %define m8 m5 + %define m15 m6 + %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] + %if ARCH_X86_64 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 + movq m10, r4 + %else + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + %define m10 m4 + movd m10, r4 + movd m3, r3 + mov r3, r3m + punpckldq m10, m3 + %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] add srcq, ss3q - movq m10, r4q punpcklbw m10, m10 psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 + %if ARCH_X86_64 pand m11, m8 + %else + pand m7, m11, m8 + %define m11 m7 + %endif pandn m8, m15 SWAP m15, m8 por m15, m11 + %if ARCH_X86_64 pshufd m8, m10, q0000 pshufd m9, m10, q1111 pshufd m11, m10, q3333 pshufd m10, m10, q2222 + %else + mova [esp+0x10], m15 + %define m15 [esp+0x10] + mov r0, r0m + pshufd m5, m4, q0000 + pshufd m6, m4, q1111 + pshufd m7, m4, q2222 + pshufd m4, m4, q3333 + %define m8 [esp+0x20] + %define m9 [esp+0x30] + %define m10 [esp+0x40] + %define m11 [esp+0x50] + mova m8, m5 + mova m9, m6 + mova m10, m7 + mova m11, m4 + %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 @@ -4758,30 +5595,46 @@ psrad m5, rndshift packssdw m5, m5 packuswb m5, m5 - pextrw r4d, m5, 0 - pextrw r6d, m5, 1 + movd r4d, m5 mov [dstq+dsq*0], r4w - mov [dstq+dsq*1], r6w + shr r4d, 16 + mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET - SWAP m15, m8, m9 %endif +INIT_XMM ssse3 .dy1_w4: +%if ARCH_X86_64 mov myd, mym - mova m7, 
[base+rescale_mul] movzx t0d, t0b dec srcq movd m15, t0d - pmaddwd m8, m7 +%else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 mova m11, [base+pd_0x4000] +%endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 +%if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 @@ -4789,26 +5642,53 @@ movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] - movd m4, [base+subpel_filters+r11*8+2] - movd m5, [base+subpel_filters+ r6*8+2] - movd m7, [base+subpel_filters+r13*8+2] - movq m6, [base+subpel_s_shuf2] + movd m2, [base+subpel_filters+r11*8+2] + movd m3, [base+subpel_filters+ r6*8+2] + movd m4, [base+subpel_filters+r13*8+2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] +%else + movd r1, m15 + movd r3, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r4, m15 + movd r5, m7 + %define m15 m5 + SWAP m4, m7 + movd m15, [base+subpel_filters+r1*8+2] + movd m2, [base+subpel_filters+r3*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m4, [base+subpel_filters+r5*8+2] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + %if isprep + lea ss3q, [ssq*3] + %endif +%endif + punpckldq m15, m3 + punpckldq m2, m4 + punpcklqdq m15, m2 + movq m6, [base+subpel_s_shuf2] +%if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 + pshufb m14, [base+bdct_lb_dw] movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] - punpckldq m15, m5 - punpckldq m4, m7 punpcklqdq m6, m6 - punpcklqdq m15, m4 - pshufb m14, [base+bdct_lb_dw] movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m7, [srcq+ssq*2] @@ -4852,15 +5732,104 @@ punpckhwd m2, m3 ; 34 punpcklwd m9, m4, m5 ; 45 punpckhwd m4, m5 ; 56 +%else + pxor m3, m3 + pcmpeqd m8, m3 + psrld m14, 10 + pshufb m14, [base+bdct_lb_dw] + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + add srcq, ss3q + punpcklqdq m6, m6 + SWAP m4, m7 + pand m7, m11, m8 + pandn m8, m15 + SWAP m5, m0 + por m15, m7 + paddb m14, m6 + movu m0, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + pshufb m0, m14 + pshufb m7, m14 + pshufb m6, m14 + pmaddubsw m1, m15 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + mova [esp+0x00], m14 + mova [esp+0x10], m15 + pmaddubsw m0, m15 + pmaddubsw m7, m15 + pmaddubsw m6, m15 + phaddw m1, m2 + movu m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + mov r0, r0m + phaddw m3, m0 + pshufb m2, m14 + pmaddubsw m2, m15 + %define m14 [esp+0x00] + %define m15 [esp+0x10] + phaddw m7, m6 + phaddw m2, m2 + movd m6, r4 + movd m0, r5 + punpckldq m6, m0 + punpcklbw m6, m6 + psraw m6, 8 + mova [esp+0x20], m6 + pmulhrsw m1, m12 ; 0 1 + pmulhrsw m3, m12 ; 2 3 + pmulhrsw m7, m12 ; 4 5 + pmulhrsw m2, m12 ; 6 _ + shufps m0, m1, m3, q1032 ; 1 2 + shufps m4, m3, m7, q1032 ; 3 4 + shufps m5, m7, m2, q1032 ; 5 6 + punpcklwd m6, m1, m0 ; 01 + punpckhwd m1, m0 ; 12 + mova [esp+0x30], m1 + punpcklwd m1, m3, m4 ; 23 + punpckhwd m3, m4 ; 34 + mova [esp+0x40], m3 + punpcklwd m3, m7, m5 ; 45 + punpckhwd m7, m5 ; 56 + mova [esp+0x50], m7 + mova [esp+0x60], m2 + mova m0, [esp+0x20] + %xdefine m8 m1 + %xdefine m9 m3 + %xdefine 
m10 m0 + SWAP m7, m6 + SWAP m1, m4 + SWAP m3, m2 +%endif pshufd m1, m10, q0000 pshufd m3, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 +%if ARCH_X86_64 mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 +%else + mova [esp+0x70], m8 + mova [esp+0x80], m9 + mova [esp+0x90], m1 + mova [esp+0xa0], m3 + mova [esp+0xb0], m5 + mova [esp+0xc0], m10 + %ifidn %1, put + mov dsd, dsm + %endif + %define m11 m6 +%endif .dy1_w4_loop: +%if ARCH_X86_64 movu m11, [srcq+ssq*0] pmaddwd m7, m1 pmaddwd m8, m3 @@ -4891,10 +5860,55 @@ paddd m7, m2 mova m2, [rsp+0x30] paddd m0, m11 +%else + SWAP m7, m6 + SWAP m1, m4 + SWAP m3, m2 + movu m5, [srcq+ssq*0] + mova m0, [esp+0x30] + mova m2, [esp+0x40] + mova m4, [esp+0x50] + pmaddwd m6, [esp+0x90] + pmaddwd m1, [esp+0xa0] + pmaddwd m0, [esp+0x90] + pmaddwd m2, [esp+0xa0] + pmaddwd m3, [esp+0xb0] + pmaddwd m4, [esp+0xb0] + paddd m6, m1 + paddd m0, m2 + movu m7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m5, m14 + pmaddubsw m5, m15 + paddd m6, m13 + paddd m0, m13 + paddd m6, m3 + paddd m0, m4 + pshufb m7, m14 + pmaddubsw m7, m15 + phaddw m5, m7 + mova m7, [rsp+0x80] + pmulhrsw m5, m12 + punpcklwd m3, [esp+0x60], m5 ; 67 + psrldq m1, m5, 8 + punpcklwd m4, m5, m1 ; 78 + pmaddwd m2, m3, [esp+0xc0] + pmaddwd m5, m4, [esp+0xc0] + mova [esp+0x60], m1 + paddd m6, m2 + mova m2, [esp+0x50] + paddd m0, m5 + SWAP m7, m6 +%endif psrad m7, rndshift psrad m0, rndshift packssdw m7, m0 +%if ARCH_X86_64 mova m0, [rsp+0x10] +%else + mova m0, [esp+0x40] +%define m11 m5 +%endif %ifidn %1, put packuswb m7, m7 psrldq m11, m7, 4 @@ -4907,13 +5921,24 @@ %endif sub hd, 2 jz .ret +%if ARCH_X86_64 mova m7, [rsp+0x00] mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 +%else + mova m7, [esp+0x70] ; 01 + mova m1, [esp+0x80] ; 23 + mova m2, [esp+0x50] ; 34 + mova [esp+0x30], m0 + mova [esp+0x70], m1 + mova [esp+0x40], m2 + mova [esp+0x80], m3 + mova [esp+0x50], m4 +%endif jmp .dy1_w4_loop - SWAP m8, m15 +INIT_XMM ssse3 .dy1_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 @@ -4938,6 +5963,7 @@ %ifidn %1, put movifnidn dsm, dsq %endif +%if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 @@ -4945,13 +5971,44 @@ lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d +%else + %define m8 m0 + %define m9 m1 + %xdefine m14 m4 + %xdefine m15 m3 + %if isprep + %define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + sub srcq, 3 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + punpcklbw m5, m5 + psraw m5, 8 + SWAP m3, m5 +%endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq @@ -4964,42 +6021,81 @@ mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 -%if UNIX64 +%if ARCH_X86_64 && UNIX64 mov hm, hd +%elif ARCH_X86_32 + SWAP m5, m3 + mov r5, hm + mov [esp+0x134], r5 %endif jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0x090] jz .ret +%if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm +%else + add dword [rsp+0x130], 8*(isprep+1) + mov r5, [esp+0x134] + mov r0, [esp+0x130] +%endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] +%if ARCH_X86_64 mova m10, [base+pd_0x3ff] +%else + 
%define m10 [base+pd_0x3ff] +%endif mova m15, [rsp+0x120] - pxor m9, m9 mov srcq, [rsp+0x098] +%if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif paddd m14, m7 .dy1_hloop: + pxor m9, m9 +%if ARCH_X86_64 mova m11, [base+pq_0x40000000] - psrld m4, m14, 10 - mova [rsp], m4 +%else + %define m11 [base+pq_0x40000000] +%endif + psrld m2, m14, 10 + mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 - psrldq m4, m5, 8 + psrldq m2, m5, 8 +%if ARCH_X86_64 movd r4d, m5 - movd r6d, m4 + movd r6d, m2 psrldq m5, 4 - psrldq m4, 4 + psrldq m2, 4 movd r7d, m5 - movd r9d, m4 + movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] + pxor m2, m2 + %define m9 m2 +%endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 @@ -5007,6 +6103,7 @@ pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 +%if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 @@ -5025,8 +6122,6 @@ psrldq m4, 4 movd r13d, m14 movd rXd, m4 - punpcklbw m14, m14 - psraw m14, 8 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] @@ -5083,7 +6178,85 @@ mova [rsp+0x70], m6 mova [rsp+0x80], m7 mova m14, [base+unpckw] +%else + movd r0, m15 + movd rX, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r4, m15 + movd r5, m4 + mova m14, [esp+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [esp+16], m14 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m11, m4 + pand m1, m11, m6 + pand m2, m11, m7 + pand m3, m11, m5 + pandn m4, [esp+0x20] + pandn m6, [esp+0x30] + pandn m7, [esp+0x40] + pandn m5, [esp+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 + MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 + MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 + MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 + mova m5, [esp+0x1a0] + mova m6, [esp+0x1b0] + mova m7, [esp+0x1c0] + mova m0, [esp+0x1d0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m6 + mova [esp+0x1d0], m7 + mova m1, [esp+0x060] + mova m2, [esp+0x070] + mova m3, [esp+0x180] + mova m4, [esp+0x190] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [esp+0x060], m0 + mova [esp+0x070], m1 + mova [esp+0x180], m2 + mova [esp+0x190], m3 + %define m8 [esp+0x140] + %define m9 [esp+0x150] + %define m10 [esp+0x160] + %define m11 [esp+0x170] +%endif .dy1_vloop: +%if ARCH_X86_32 + mov r0, r0m +%endif pmaddwd m4, m0, m8 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 @@ -5092,14 +6265,24 @@ paddd m5, m13 paddd m4, m6 paddd m5, m7 +%if ARCH_X86_64 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 - pmaddwd m15, [rsp+0x70], m11 +%else + pmaddwd m6, 
[rsp+0x1a0], m10 + pmaddwd m7, [rsp+0x1b0], m10 +%endif + paddd m4, m6 + paddd m5, m7 +%if ARCH_X86_64 + pmaddwd m6, [rsp+0x70], m11 + pmaddwd m7, [rsp+0x80], m11 +%else + pmaddwd m6, [rsp+0x1c0], m11 + pmaddwd m7, [rsp+0x1d0], m11 +%endif paddd m4, m6 - pmaddwd m6, [rsp+0x80], m11 paddd m5, m7 - paddd m4, m15 - paddd m5, m6 psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 @@ -5111,8 +6294,12 @@ mova [tmpq], m4 add tmpq, tmp_stridem %endif +%if ARCH_X86_32 + mov r0m, r0 +%endif dec hd jz .dy1_hloop_prep +%if ARCH_X86_64 movq m4, [srcq+ r4] movq m5, [srcq+ r6] movhps m4, [srcq+ r7] @@ -5152,73 +6339,212 @@ mova [rsp+0x60], m15 mova [rsp+0x70], m5 mova [rsp+0x80], m7 +%else + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova m6, [base+unpckw] + mova m0, [esp+0x060] + mova m1, [esp+0x070] + mova m7, [esp+0x1a0] + movq m4, [srcq+r0] + movq m5, [srcq+rX] + movhps m4, [srcq+r4] + movhps m5, [srcq+r5] + pshufb m0, m6 ; 0a 1a + pshufb m1, m6 ; 0b 1b + pshufb m7, m6 ; 4a 5a + mov r0, [esp+16] + mov rX, [esp+24] + mov r4, [esp+20] + mov r5, [esp+28] + movq m3, [srcq+r0] + movq m2, [srcq+rX] + movhps m3, [srcq+r4] + movhps m2, [srcq+r5] + add srcq, ssq + pmaddubsw m4, [esp+0x20] + pmaddubsw m5, [esp+0x30] + pmaddubsw m3, [esp+0x40] + pmaddubsw m2, [esp+0x50] + phaddw m4, m5 + phaddw m3, m2 + mova m5, [esp+0x1b0] + mova m2, [esp+0x180] + phaddw m4, m3 + mova m3, [esp+0x190] + pmulhrsw m4, m12 ; 8a 8b + pshufb m5, m6 ; 4b 5b + pshufd m6, m6, q1032 + pshufb m2, m6 ; 3a 2a + pshufb m3, m6 ; 3b 2b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + mova [esp+0x60], m0 + mova [esp+0x70], m1 + mova m0, [esp+0x1c0] + mova m1, [esp+0x1d0] + punpcklwd m2, m7 ; 34a + punpcklwd m3, m5 ; 34b + mova [esp+0x180], m2 + mova [esp+0x190], m3 + pshufb m0, m6 ; 7a 6a + pshufb m1, m6 ; 7b 6b + punpckhwd m7, m0 ; 56a + punpckhwd m5, m1 ; 56b + punpcklwd m0, m4 + punpckhqdq m4, m4 + punpcklwd m1, m4 + mova [esp+0x1a0], m7 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m0 + mova [esp+0x1d0], m1 + mova m0, [esp+0x60] + mova m1, [esp+0x70] +%endif jmp .dy1_vloop +INIT_XMM ssse3 .dy2: movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: + %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d + %else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + movzx r5, byte [esp+0x1f0] + dec srcd + movd m15, r5 + %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] + %if ARCH_X86_64 mova m11, [base+pd_0x4000] + %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 + %if ARCH_X86_64 movd r6d, m15 + %else + movd r3d, m15 + %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [esp+0x00], m14 + %define m14 [esp+0x00] + SWAP m5, m0 + SWAP m6, m3 + %define m8 m5 + %define m15 m6 + %endif movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] movhps m0, [srcq+ssq*2] movhps m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] + %if ARCH_X86_64 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 + movq m10, r4q + %else + mov myd, mym + mov r3, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea r3, 
[r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r3, r3m + %define m10 m4 + movd m10, r4 + movd m3, r5 + punpckldq m10, m3 + %endif + movq m3, [srcq+ssq*0] + movhps m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m10, m10 + psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 + %if ARCH_X86_64 pand m11, m8 + %else + pand m7, m11, m8 + %define m11 m7 + %endif pandn m8, m15 SWAP m15, m8 por m15, m11 - movq m3, [srcq+ssq*0] - movhps m3, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - shr myd, 6 - mov r4d, 64 << 24 - lea myd, [t1+myq] - cmovnz r4q, [base+subpel_filters+myq*8] + %if ARCH_X86_64 + pshufd m8, m10, q0000 + pshufd m9, m10, q1111 + pshufd m11, m10, q3333 + pshufd m10, m10, q2222 + %else + mova [esp+0x10], m15 + %define m15 [esp+0x10] + mov r5, r0m + %define dstq r5 + mov dsd, dsm + pshufd m5, m4, q0000 + pshufd m6, m4, q1111 + pshufd m7, m4, q2222 + pshufd m4, m4, q3333 + %define m8 [esp+0x20] + %define m9 [esp+0x30] + %define m10 [esp+0x40] + %define m11 [esp+0x50] + mova m8, m5 + mova m9, m6 + mova m10, m7 + mova m11, m4 + %endif pshufb m0, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 - movq m11, r4q - punpcklbw m11, m11 - psraw m11, 8 pslldq m2, m3, 8 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 pmulhrsw m1, m12 ; 1 3 _ 5 - pshufd m8, m11, q0000 - pshufd m9, m11, q1111 - pshufd m10, m11, q2222 - pshufd m11, m11, q3333 pshufd m2, m0, q3110 ; 0 2 2 4 pshufd m1, m1, q3110 ; 1 3 3 5 punpcklwd m3, m2, m1 ; 01 23 @@ -5263,22 +6589,39 @@ sub hd, 2 jg .dy2_w2_loop RET - SWAP m15, m8, m9 %endif +INIT_XMM ssse3 .dy2_w4: +%if ARCH_X86_64 mov myd, mym - mova m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd m15, t0d - pmaddwd m8, m7 +%else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %define dstq r0 + %if isprep + %define ssq r3 + %endif + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 mova m11, [base+pd_0x4000] +%endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 +%if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 @@ -5286,14 +6629,45 @@ movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] - movd m4, [base+subpel_filters+r11*8+2] - movd m5, [base+subpel_filters+ r6*8+2] - movd m7, [base+subpel_filters+r13*8+2] + movd m2, [base+subpel_filters+r11*8+2] + movd m3, [base+subpel_filters+ r6*8+2] + movd m4, [base+subpel_filters+r13*8+2] movq m6, [base+subpel_s_shuf2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] +%else + movd r1, m15 + movd r3, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r4, m15 + movd r5, m7 + %define m15 m5 + SWAP m4, m7 + movd m15, [base+subpel_filters+r1*8+2] + movd m2, [base+subpel_filters+r3*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m4, [base+subpel_filters+r5*8+2] + movq m6, [base+subpel_s_shuf2] + mov myd, mym + mov r3, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r3, r3m + %if isprep + lea ss3q, [ssq*3] + %endif +%endif + punpckldq m15, m3 + punpckldq m2, m4 + punpcklqdq m15, m2 +%if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 movu m0, [srcq+ssq*0] @@ -5301,10 +6675,7 @@ movu m1, [srcq+ssq*1] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] - punpckldq m15, m5 - punpckldq m4, 
m7 punpcklqdq m6, m6 - punpcklqdq m15, m4 pshufb m14, [base+bdct_lb_dw] movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] @@ -5339,6 +6710,75 @@ pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 +%else + pxor m3, m3 + pcmpeqd m8, m3 + psrld m14, 10 + pshufb m14, [base+bdct_lb_dw] + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ssq*1] + add srcq, ss3q + punpcklqdq m6, m6 + SWAP m4, m7 + pand m7, m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m7 + paddb m14, m6 + movu m0, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + pshufb m0, m14 + pshufb m7, m14 + pshufb m6, m14 + pmaddubsw m1, m15 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + mova [esp+0x00], m14 + mova [esp+0x10], m15 + pmaddubsw m0, m15 + pmaddubsw m7, m15 + pmaddubsw m6, m15 + %define m14 [esp+0x00] + %define m15 [esp+0x10] + phaddw m1, m2 + phaddw m3, m0 + phaddw m7, m6 + %ifidn %1, put + mov dsd, dsm + %define dstq r5 + %else + %define tmpq r5 + %endif + movd m6, r4 + movd m0, r5 + punpckldq m6, m0 + punpcklbw m6, m6 + psraw m6, 8 + mov r5, r0m + pmulhrsw m1, m12 ; 0 2 + pmulhrsw m3, m12 ; 1 3 + pmulhrsw m7, m12 ; 4 5 + SWAP m0, m1, m3 + SWAP m4, m7 + pshufd m2, m6, q0000 + pshufd m3, m6, q1111 + pshufd m7, m6, q2222 + pshufd m6, m6, q3333 + mova [esp+0x30], m2 + mova [esp+0x40], m3 + mova [esp+0x50], m7 + mova [esp+0x60], m6 + %define m8 [esp+0x30] + %define m9 [esp+0x40] + %define m10 [esp+0x50] + %define m11 [esp+0x60] +%endif psrldq m5, m4, 8 ; 5 _ punpckhwd m2, m0, m1 ; 23 punpcklwd m0, m1 ; 01 @@ -5400,7 +6840,7 @@ sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET - SWAP m8, m15 +INIT_XMM ssse3 .dy2_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 @@ -5425,6 +6865,7 @@ %ifidn %1, put movifnidn dsm, dsq %endif +%if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 @@ -5432,13 +6873,49 @@ lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %define m9 m1 + %xdefine m14 m4 + %xdefine m15 m3 + %if isprep + %define tmpq r0 + %define ssq ssm + %else + %define dstq r0 + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + sub srcq, 3 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + punpcklbw m5, m5 + psraw m5, 8 + SWAP m3, m5 +%endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq @@ -5451,42 +6928,81 @@ mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 -%if UNIX64 +%if ARCH_X86_64 && UNIX64 mov hm, hd +%elif ARCH_X86_32 + SWAP m5, m3 + mov r5, hm + mov [esp+0x134], r5 %endif jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0x090] jz .ret +%if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm +%else + add dword [rsp+0x130], 8*(isprep+1) + mov r5, [esp+0x134] + mov r0, [esp+0x130] +%endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] +%if ARCH_X86_64 mova m10, [base+pd_0x3ff] +%else + %define m10 [base+pd_0x3ff] +%endif mova m15, [rsp+0x120] - pxor m9, m9 mov srcq, [rsp+0x098] +%if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif paddd m14, m7 
.dy2_hloop: + pxor m9, m9 +%if ARCH_X86_64 mova m11, [base+pq_0x40000000] - psrld m4, m14, 10 - mova [rsp], m4 +%else + %define m11 [base+pq_0x40000000] +%endif + psrld m2, m14, 10 + mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 - psrldq m4, m5, 8 + psrldq m2, m5, 8 +%if ARCH_X86_64 movd r4d, m5 - movd r6d, m4 + movd r6d, m2 psrldq m5, 4 - psrldq m4, 4 + psrldq m2, 4 movd r7d, m5 - movd r9d, m4 + movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] + pxor m2, m2 + %define m9 m2 +%endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 @@ -5494,6 +7010,7 @@ pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 +%if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 @@ -5567,7 +7084,83 @@ mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 +%else + movd r0, m15 + movd rX, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r4, m15 + movd r5, m4 + mova m14, [esp+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [esp+16], m14 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m11, m4 + pand m1, m11, m6 + pand m2, m11, m7 + pand m3, m11, m5 + pandn m4, [esp+0x20] + pandn m6, [esp+0x30] + pandn m7, [esp+0x40] + pandn m5, [esp+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 + MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 + MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 + MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 + mova m5, [esp+0x1a0] + mova m6, [esp+0x1b0] + mova m7, [esp+0x1c0] + mova m0, [esp+0x1d0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m6 + mova [esp+0x1d0], m7 + mova m1, [esp+0x060] + mova m2, [esp+0x070] + mova m3, [esp+0x180] + mova m4, [esp+0x190] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [esp+0x180], m2 + mova [esp+0x190], m3 + %define m8 [esp+0x140] + %define m9 [esp+0x150] + %define m10 [esp+0x160] + %define m11 [esp+0x170] +%endif .dy2_vloop: +%if ARCH_X86_32 + mov r0, r0m +%endif pmaddwd m4, m0, m8 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 @@ -5576,14 +7169,24 @@ paddd m5, m13 paddd m4, m6 paddd m5, m7 +%if ARCH_X86_64 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 - pmaddwd m15, [rsp+0x70], m11 +%else + pmaddwd m6, [esp+0x1a0], m10 + pmaddwd m7, [esp+0x1b0], m10 +%endif + paddd m4, m6 + paddd m5, m7 +%if ARCH_X86_64 + pmaddwd m6, [rsp+0x70], m11 + pmaddwd m7, [rsp+0x80], m11 +%else + pmaddwd m6, [esp+0x1c0], m11 + pmaddwd m7, [esp+0x1d0], m11 +%endif paddd m4, m6 - pmaddwd m6, [rsp+0x80], m11 paddd m5, m7 - paddd m4, m15 - paddd m5, m6 psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 @@ -5595,8 +7198,12 @@ mova [tmpq], m4 add 
tmpq, tmp_stridem %endif +%if ARCH_X86_32 + mov r0m, r0 +%endif dec hd jz .dy2_hloop_prep +%if ARCH_X86_64 mova m8, [rsp+0x10] mova m9, [rsp+0x20] mova m10, [rsp+0x30] @@ -5620,9 +7227,32 @@ mova [rsp+0x80], m2 mova m2, m3 mova m3, m4 +%else + MC_8TAP_SCALED_H 0x20, 0 + punpcklwd m6, m0, m4 + punpckhwd m7, m0, m4 + mova m0, [esp+0x180] ; 01a + mova m1, [esp+0x190] ; 01b + mova m2, [rsp+0x1a0] ; 23a + mova m3, [esp+0x1b0] ; 23b + mova m4, [esp+0x1c0] ; 45a + mova m5, [esp+0x1d0] ; 45b + mova [esp+0x180], m2 + mova [esp+0x190], m3 + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m6 ; 67a + mova [esp+0x1d0], m7 ; 67b +%endif jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 +%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT + %define r0m [rstk+stack_offset+ 4] + %define r1m [rstk+stack_offset+ 8] + %define r2m [rstk+stack_offset+12] + %define r3m [rstk+stack_offset+16] +%endif %undef isprep %endmacro @@ -5633,11 +7263,12 @@ jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) %endmacro -%if ARCH_X86_64 %if WIN64 DECLARE_REG_TMP 6, 5 -%else +%elif ARCH_X86_64 DECLARE_REG_TMP 6, 8 +%else +DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN put FN put_8tap_scaled, sharp, SHARP, SHARP @@ -5653,8 +7284,10 @@ %if WIN64 DECLARE_REG_TMP 5, 4 -%else +%elif ARCH_X86_64 DECLARE_REG_TMP 6, 7 +%else +DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN prep FN prep_8tap_scaled, sharp, SHARP, SHARP @@ -5667,7 +7300,6 @@ FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN prep_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED prep -%endif %if ARCH_X86_32 %macro SAVE_ALPHA_BETA 0 @@ -7772,7 +9404,7 @@ pshufd m5, m5, q0000 %if ARCH_X86_64 - DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ %else diff -Nru dav1d-0.9.2/src/x86/refmvs.asm dav1d-1.0.0/src/x86/refmvs.asm --- dav1d-0.9.2/src/x86/refmvs.asm 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/src/x86/refmvs.asm 2022-03-18 14:31:56.034356000 +0000 @@ -26,7 +26,7 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 %macro JMP_TABLE 2-* %xdefine %%prefix mangle(private_prefix %+ _%1) @@ -41,10 +41,13 @@ %if ARCH_X86_64 splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 -JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 +JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32 +JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 %endif -JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32 +JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32 SECTION .text @@ -166,4 +169,80 @@ dec bh4d jg .loop RET -%endif + +INIT_ZMM avx512icl +cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4 + vbroadcasti32x4 m0, [aq] + lea r1, [splat_mv_avx512icl_table] + tzcnt bw4d, bw4d + lea bx4d, [bx4q*3] + pshufb m0, [splat_mv_shuf] + movsxd bw4q, [r1+bw4q*4] + mov r6d, bh4m + add bw4q, r1 + lea rrq, [rrq+r6*8] + mov r1d, 0x3f + neg r6 + kmovb k1, r1d + jmp bw4q +.w1: + mov r1, [rrq+r6*8] + vmovdqu16 [r1+bx4q*4]{k1}, xm0 + inc r6 + jl .w1 + RET +.w2: + mov r1, [rrq+r6*8] + vmovdqu32 [r1+bx4q*4]{k1}, ym0 + inc r6 + jl .w2 + RET +.w4: + mov r1, [rrq+r6*8] + vmovdqu64 [r1+bx4q*4]{k1}, m0 + inc r6 + jl .w4 + RET +.w8: + pshufd ym1, ym0, q1021 +.w8_loop: + mov r1, [rrq+r6*8+0] + mov r3, [rrq+r6*8+8] + movu [r1+bx4q*4+ 0], m0 + mova [r1+bx4q*4+64], ym1 + movu [r3+bx4q*4+ 0], m0 
+ mova [r3+bx4q*4+64], ym1 + add r6, 2 + jl .w8_loop + RET +.w16: + pshufd m1, m0, q1021 + pshufd m2, m0, q2102 +.w16_loop: + mov r1, [rrq+r6*8+0] + mov r3, [rrq+r6*8+8] + mova [r1+bx4q*4+64*0], m0 + mova [r1+bx4q*4+64*1], m1 + mova [r1+bx4q*4+64*2], m2 + mova [r3+bx4q*4+64*0], m0 + mova [r3+bx4q*4+64*1], m1 + mova [r3+bx4q*4+64*2], m2 + add r6, 2 + jl .w16_loop + RET +.w32: + pshufd m1, m0, q1021 + pshufd m2, m0, q2102 +.w32_loop: + mov r1, [rrq+r6*8] + lea r1, [r1+bx4q*4] + mova [r1+64*0], m0 + mova [r1+64*1], m1 + mova [r1+64*2], m2 + mova [r1+64*3], m0 + mova [r1+64*4], m1 + mova [r1+64*5], m2 + inc r6 + jl .w32_loop + RET +%endif ; ARCH_X86_64 diff -Nru dav1d-0.9.2/src/x86/refmvs_init.c dav1d-1.0.0/src/x86/refmvs_init.c --- dav1d-0.9.2/src/x86/refmvs_init.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/src/x86/refmvs_init.c 2022-03-18 14:31:56.034356000 +0000 @@ -30,6 +30,7 @@ decl_splat_mv_fn(dav1d_splat_mv_sse2); decl_splat_mv_fn(dav1d_splat_mv_avx2); +decl_splat_mv_fn(dav1d_splat_mv_avx512icl); COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); @@ -42,5 +43,9 @@ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; c->splat_mv = dav1d_splat_mv_avx2; + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->splat_mv = dav1d_splat_mv_avx512icl; #endif } diff -Nru dav1d-0.9.2/tests/checkasm/cdef.c dav1d-1.0.0/tests/checkasm/cdef.c --- dav1d-0.9.2/tests/checkasm/cdef.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/cdef.c 2022-03-18 14:31:56.034356000 +0000 @@ -56,12 +56,14 @@ ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8; ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8; ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8; + ALIGN_STK_64(pixel, bot_buf, 16 * 2 + 16, ), *const bot = bot_buf + 8; ALIGN_STK_16(pixel, left, 8,[2]); const ptrdiff_t stride = 16 * sizeof(pixel); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2], - const pixel *top, int pri_strength, int sec_strength, - int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX); + const pixel *top, const pixel *bot, int pri_strength, + int sec_strength, int dir, int damping, + enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX); if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) { for (int dir = 0; dir < 8; dir++) { @@ -75,6 +77,7 @@ init_tmp(c_src, 16 * 10 + 16, bitdepth_max); init_tmp(top_buf, 16 * 2 + 16, bitdepth_max); + init_tmp(bot_buf, 16 * 2 + 16, bitdepth_max); init_tmp((pixel *) left, 8 * 2, bitdepth_max); memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel)); @@ -84,9 +87,9 @@ int sec_strength = lvl & 3; sec_strength += sec_strength == 3; sec_strength <<= bitdepth_min_8; - call_ref(c_dst, stride, left, top, pri_strength, sec_strength, + call_ref(c_dst, stride, left, top, bot, pri_strength, sec_strength, dir, damping, edges HIGHBD_TAIL_SUFFIX); - call_new(a_dst, stride, left, top, pri_strength, sec_strength, + call_new(a_dst, stride, left, top, bot, pri_strength, sec_strength, dir, damping, edges HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) { fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n", @@ -101,8 +104,8 @@ */ pri_strength = (edges & 1) << bitdepth_min_8; sec_strength = (edges & 2) << bitdepth_min_8; - bench_new(a_dst, stride, left, top, pri_strength, sec_strength, - dir, damping, edges HIGHBD_TAIL_SUFFIX); + bench_new(a_dst, stride, left, top, bot, 
pri_strength, + sec_strength, dir, damping, edges HIGHBD_TAIL_SUFFIX); } } } diff -Nru dav1d-0.9.2/tests/checkasm/checkasm.h dav1d-1.0.0/tests/checkasm/checkasm.h --- dav1d-0.9.2/tests/checkasm/checkasm.h 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/checkasm.h 2022-03-18 14:31:56.034356000 +0000 @@ -311,11 +311,12 @@ #endif +#define ROUND_UP(x,a) (((x)+((a)-1)) & ~((a)-1)) #define PIXEL_RECT(name, w, h) \ - ALIGN_STK_64(pixel, name##_buf, ((h)+32)*((w)+64) + 64,); \ - ptrdiff_t name##_stride = sizeof(pixel)*((w)+64); \ + ALIGN_STK_64(pixel, name##_buf, ((h)+32)*(ROUND_UP(w,64)+64) + 64,); \ + ptrdiff_t name##_stride = sizeof(pixel)*(ROUND_UP(w,64)+64); \ (void)name##_stride; \ - pixel *name = name##_buf + ((w)+64)*16 + 64 + pixel *name = name##_buf + (ROUND_UP(w,64)+64)*16 + 64 #define CLEAR_PIXEL_RECT(name) \ memset(name##_buf, 0x99, sizeof(name##_buf)) \ diff -Nru dav1d-0.9.2/tests/checkasm/filmgrain.c dav1d-1.0.0/tests/checkasm/filmgrain.c --- dav1d-0.9.2/tests/checkasm/filmgrain.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/filmgrain.c 2022-03-18 14:31:56.034356000 +0000 @@ -30,7 +30,7 @@ #include #include "src/levels.h" -#include "src/film_grain.h" +#include "src/filmgrain.h" #define UNIT_TEST 1 #include "src/fg_apply_tmpl.c" @@ -47,8 +47,8 @@ }; static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) { - entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH]; - entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH]; + ALIGN_STK_16(entry, grain_lut_c, GRAIN_HEIGHT,[GRAIN_WIDTH]); + ALIGN_STK_16(entry, grain_lut_a, GRAIN_HEIGHT + 1,[GRAIN_WIDTH]); declare_func(void, entry grain_lut[][GRAIN_WIDTH], const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX); @@ -155,6 +155,7 @@ if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) { ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 16,); + ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,); fg_data[0].seed = rnd() & 0xFFFF; #if BITDEPTH == 16 @@ -163,7 +164,6 @@ const int bitdepth_max = 0xff; #endif - uint8_t scaling[SCALING_SIZE]; entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH]; fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; @@ -267,6 +267,7 @@ BITDEPTH, ss_name[layout_idx], csfl)) { ALIGN_STK_16(Dav1dFilmGrainData, fg_data, 1,); + ALIGN_STK_64(uint8_t, scaling, SCALING_SIZE,); fg_data[0].seed = rnd() & 0xFFFF; @@ -278,7 +279,6 @@ const int uv_pl = rnd() & 1; const int is_identity = rnd() & 1; - uint8_t scaling[SCALING_SIZE]; entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH]; fg_data[0].grain_scale_shift = rnd() & 3; fg_data[0].ar_coeff_shift = (rnd() & 3) + 6; @@ -368,7 +368,7 @@ checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride, w, h, "dst", - 32 >> ss_x, 2); + 32 >> ss_x, 4); } } @@ -380,7 +380,7 @@ luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; } } - bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16, + bench_new(a_dst, src, stride, fg_data, 64 >> ss_x, scaling, grain_lut[1], 32 >> ss_y, 1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); } } diff -Nru dav1d-0.9.2/tests/checkasm/ipred.c dav1d-1.0.0/tests/checkasm/ipred.c --- dav1d-0.9.2/tests/checkasm/ipred.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/ipred.c 2022-03-18 14:31:56.034356000 +0000 @@ -192,8 +192,8 @@ } static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) { - ALIGN_STK_64(pixel, c_dst, 32 * 32,); - ALIGN_STK_64(pixel, a_dst, 32 * 32,); + PIXEL_RECT(c_dst, 32, 32); + PIXEL_RECT(a_dst, 32, 32); ALIGN_STK_64(int16_t, ac, 32 
* 32,); ALIGN_STK_64(pixel, topleft_buf, 257,); pixel *const topleft = topleft_buf + 128; @@ -215,8 +215,6 @@ const int bitdepth_max = 0xff; #endif - const ptrdiff_t stride = w * sizeof(pixel); - int alpha = ((rnd() & 15) + 1) * (1 - (rnd() & 2)); for (int i = -h * 2; i <= w * 2; i++) @@ -229,14 +227,17 @@ for (int i = 0; i < w * h; i++) ac[i] -= luma_avg; - call_ref(c_dst, stride, topleft, w, h, ac, alpha + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, topleft, w, h, ac, alpha HIGHBD_TAIL_SUFFIX); - call_new(a_dst, stride, topleft, w, h, ac, alpha + call_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, stride, a_dst, stride, - w, h, "dst"); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); - bench_new(a_dst, stride, topleft, w, h, ac, alpha + bench_new(a_dst, a_dst_stride, topleft, w, h, ac, alpha HIGHBD_TAIL_SUFFIX); } } @@ -244,8 +245,8 @@ } static void check_pal_pred(Dav1dIntraPredDSPContext *const c) { - ALIGN_STK_64(pixel, c_dst, 64 * 64,); - ALIGN_STK_64(pixel, a_dst, 64 * 64,); + PIXEL_RECT(c_dst, 64, 64); + PIXEL_RECT(a_dst, 64, 64); ALIGN_STK_64(uint8_t, idx, 64 * 64,); ALIGN_STK_16(uint16_t, pal, 8,); @@ -261,7 +262,6 @@ #else const int bitdepth_max = 0xff; #endif - const ptrdiff_t stride = w * sizeof(pixel); for (int i = 0; i < 8; i++) pal[i] = rnd() & bitdepth_max; @@ -269,11 +269,15 @@ for (int i = 0; i < w * h; i++) idx[i] = rnd() & 7; - call_ref(c_dst, stride, pal, idx, w, h); - call_new(a_dst, stride, pal, idx, w, h); - checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst"); + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, pal, idx, w, h); + call_new(a_dst, a_dst_stride, pal, idx, w, h); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, w, h, "dst"); - bench_new(a_dst, stride, pal, idx, w, h); + bench_new(a_dst, a_dst_stride, pal, idx, w, h); } report("pal_pred"); } diff -Nru dav1d-0.9.2/tests/checkasm/itx.c dav1d-1.0.0/tests/checkasm/itx.c --- dav1d-0.9.2/tests/checkasm/itx.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/itx.c 2022-03-18 14:31:56.034356000 +0000 @@ -243,8 +243,8 @@ const enum RectTxfmSize tx) { ALIGN_STK_64(coef, coeff, 2, [32 * 32]); - ALIGN_STK_64(pixel, c_dst, 64 * 64,); - ALIGN_STK_64(pixel, a_dst, 64 * 64,); + PIXEL_RECT(c_dst, 64, 64); + PIXEL_RECT(a_dst, 64, 64); static const uint8_t subsh_iters[5] = { 2, 2, 3, 5, 5 }; @@ -275,21 +275,26 @@ const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max); memcpy(coeff[1], coeff[0], sizeof(*coeff)); - for (int j = 0; j < w * h; j++) - c_dst[j] = a_dst[j] = rnd() & bitdepth_max; + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); - call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + + call_ref(c_dst, c_dst_stride, coeff[0], eob HIGHBD_TAIL_SUFFIX); - call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob + call_new(a_dst, a_dst_stride, coeff[1], eob HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, w * sizeof(*c_dst), - a_dst, w * sizeof(*a_dst), - w, h, "dst"); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); if (memcmp(coeff[0], coeff[1], sizeof(*coeff))) fail(); - bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob + bench_new(a_dst, a_dst_stride, coeff[0], eob 
HIGHBD_TAIL_SUFFIX); } } diff -Nru dav1d-0.9.2/tests/checkasm/looprestoration.c dav1d-1.0.0/tests/checkasm/looprestoration.c --- dav1d-0.9.2/tests/checkasm/looprestoration.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/looprestoration.c 2022-03-18 14:31:56.034356000 +0000 @@ -54,17 +54,17 @@ } static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) { - ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32; - ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32; - ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32; + ALIGN_STK_64(pixel, c_src, 448 * 64 + 64,), *const c_dst = c_src + 64; + ALIGN_STK_64(pixel, a_src, 448 * 64 + 64,), *const a_dst = a_src + 64; + ALIGN_STK_64(pixel, edge_buf, 448 * 8 + 64,), *const h_edge = edge_buf + 64; pixel left[64][4]; LooprestorationParams params; int16_t (*const filter)[8] = params.filter; declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*const left)[4], - const pixel *lpf, ptrdiff_t lpf_stride, - int w, int h, const LooprestorationParams *params, + const pixel *lpf, int w, int h, + const LooprestorationParams *params, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); for (int t = 0; t < 2; t++) { @@ -97,11 +97,9 @@ memcpy(a_src, c_src, 448 * 64 * sizeof(pixel)); call_ref(c_dst, 448 * sizeof(pixel), left, - h_edge, 448 * sizeof(pixel), - w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); + h_edge, w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); call_new(a_dst, 448 * sizeof(pixel), left, - h_edge, 448 * sizeof(pixel), - w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); + h_edge, w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel), a_dst, 448 * sizeof(pixel), w, h, "dst")) @@ -111,24 +109,23 @@ break; } } - bench_new(a_dst, 448 * sizeof(pixel), left, - h_edge, 448 * sizeof(pixel), + bench_new(a_dst, 448 * sizeof(pixel), left, h_edge, 256, 64, ¶ms, 0xf HIGHBD_TAIL_SUFFIX); } } } static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) { - ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32; - ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32; - ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32; + ALIGN_STK_64(pixel, c_src, 448 * 64 + 64,), *const c_dst = c_src + 64; + ALIGN_STK_64(pixel, a_src, 448 * 64 + 64,), *const a_dst = a_src + 64; + ALIGN_STK_64(pixel, edge_buf, 448 * 8 + 64,), *const h_edge = edge_buf + 64; pixel left[64][4]; LooprestorationParams params; declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*const left)[4], - const pixel *lpf, ptrdiff_t lpf_stride, - int w, int h, const LooprestorationParams *params, + const pixel *lpf, int w, int h, + const LooprestorationParams *params, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); static const struct { char name[4]; uint8_t idx; } sgr_data[3] = { @@ -159,11 +156,9 @@ memcpy(a_src, c_src, 448 * 64 * sizeof(pixel)); - call_ref(c_dst, 448 * sizeof(pixel), left, - h_edge, 448 * sizeof(pixel), + call_ref(c_dst, 448 * sizeof(pixel), left, h_edge, w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); - call_new(a_dst, 448 * sizeof(pixel), left, - h_edge, 448 * sizeof(pixel), + call_new(a_dst, 448 * sizeof(pixel), left, h_edge, w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel), a_dst, 448 * sizeof(pixel), @@ -174,8 +169,7 @@ break; } } - bench_new(a_dst, 448 * sizeof(pixel), left, - h_edge, 448 * sizeof(pixel), + bench_new(a_dst, 448 * sizeof(pixel), left, h_edge, 256, 64, ¶ms, 0xf 
HIGHBD_TAIL_SUFFIX); } } diff -Nru dav1d-0.9.2/tests/checkasm/mc.c dav1d-1.0.0/tests/checkasm/mc.c --- dav1d-0.9.2/tests/checkasm/mc.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/mc.c 2022-03-18 14:31:56.034356000 +0000 @@ -57,8 +57,8 @@ static void check_mc(Dav1dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, 135 * 135,); - ALIGN_STK_64(pixel, c_dst, 128 * 128,); - ALIGN_STK_64(pixel, a_dst, 128 * 128,); + PIXEL_RECT(c_dst, 128, 128); + PIXEL_RECT(a_dst, 128, 128); const pixel *src = src_buf + 135 * 3 + 3; const ptrdiff_t src_stride = 135 * sizeof(pixel); @@ -68,7 +68,6 @@ for (int filter = 0; filter < N_2D_FILTERS; filter++) for (int w = 2; w <= 128; w <<= 1) { - const ptrdiff_t dst_stride = w * sizeof(pixel); for (int mxy = 0; mxy < 4; mxy++) if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc", filter_names[filter], w, mxy_names[mxy], BITDEPTH)) @@ -87,18 +86,21 @@ for (int i = 0; i < 135 * 135; i++) src_buf[i] = rnd() & bitdepth_max; - call_ref(c_dst, dst_stride, src, src_stride, w, h, + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, src, src_stride, w, h, + call_new(a_dst, a_dst_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, - a_dst, dst_stride, - w, h, "dst"); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); if (filter == FILTER_2D_8TAP_REGULAR || filter == FILTER_2D_BILINEAR) { - bench_new(a_dst, dst_stride, src, src_stride, w, h, + bench_new(a_dst, a_dst_stride, src, src_stride, w, h, mx, my HIGHBD_TAIL_SUFFIX); } } @@ -164,8 +166,8 @@ static void check_mc_scaled(Dav1dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, 263 * 263,); - ALIGN_STK_64(pixel, c_dst, 128 * 128,); - ALIGN_STK_64(pixel, a_dst, 128 * 128,); + PIXEL_RECT(c_dst, 128, 128); + PIXEL_RECT(a_dst, 128, 128); const pixel *src = src_buf + 263 * 3 + 3; const ptrdiff_t src_stride = 263 * sizeof(pixel); #if BITDEPTH == 16 @@ -180,7 +182,6 @@ for (int filter = 0; filter < N_2D_FILTERS; filter++) for (int w = 2; w <= 128; w <<= 1) { - const ptrdiff_t dst_stride = w * sizeof(pixel); for (int p = 0; p < 3; ++p) { if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc", filter_names[filter], w, scaled_paths[p], BITDEPTH)) @@ -198,16 +199,20 @@ for (int k = 0; k < 263 * 263; k++) src_buf[k] = rnd() & bitdepth_max; - call_ref(c_dst, dst_stride, src, src_stride, + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, src, src_stride, + call_new(a_dst, a_dst_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, - a_dst, dst_stride, w, h, "dst"); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); if (filter == FILTER_2D_8TAP_REGULAR || filter == FILTER_2D_BILINEAR) - bench_new(a_dst, dst_stride, src, src_stride, + bench_new(a_dst, a_dst_stride, src, src_stride, w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); } } @@ -281,15 +286,14 @@ static void check_avg(Dav1dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); - ALIGN_STK_64(pixel, c_dst, 135 * 135,); - ALIGN_STK_64(pixel, a_dst, 128 * 128,); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, int w, int h 
HIGHBD_DECL_SUFFIX); for (int w = 4; w <= 128; w <<= 1) if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) { - ptrdiff_t dst_stride = w * sizeof(pixel); for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) { #if BITDEPTH == 16 @@ -299,12 +303,16 @@ #endif init_tmp(c, c_dst, tmp, bitdepth_max); - call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - w, h, "dst"); - bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); } } report("avg"); @@ -312,15 +320,14 @@ static void check_w_avg(Dav1dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); - ALIGN_STK_64(pixel, c_dst, 135 * 135,); - ALIGN_STK_64(pixel, a_dst, 128 * 128,); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1, const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX); for (int w = 4; w <= 128; w <<= 1) if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) { - ptrdiff_t dst_stride = w * sizeof(pixel); for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) { int weight = rnd() % 15 + 1; @@ -331,12 +338,15 @@ #endif init_tmp(c, c_dst, tmp, bitdepth_max); - call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - w, h, "dst"); + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); - bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride,a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); } } report("w_avg"); @@ -344,8 +354,8 @@ static void check_mask(Dav1dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); - ALIGN_STK_64(pixel, c_dst, 135 * 135,); - ALIGN_STK_64(pixel, a_dst, 128 * 128,); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); ALIGN_STK_64(uint8_t, mask, 128 * 128,); for (int i = 0; i < 128 * 128; i++) @@ -357,7 +367,6 @@ for (int w = 4; w <= 128; w <<= 1) if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) { - ptrdiff_t dst_stride = w * sizeof(pixel); for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) { #if BITDEPTH == 16 @@ -366,12 +375,16 @@ const int bitdepth_max = 0xff; #endif init_tmp(c, c_dst, tmp, bitdepth_max); - call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - w, h, "dst"); - bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + 
call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); } } report("mask"); @@ -379,8 +392,8 @@ static void check_w_mask(Dav1dMCDSPContext *const c) { ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]); - ALIGN_STK_64(pixel, c_dst, 135 * 135,); - ALIGN_STK_64(pixel, a_dst, 128 * 128,); + PIXEL_RECT(c_dst, 135, 135); + PIXEL_RECT(a_dst, 128, 128); ALIGN_STK_64(uint8_t, c_mask, 128 * 128,); ALIGN_STK_64(uint8_t, a_mask, 128 * 128,); @@ -397,7 +410,6 @@ if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w, BITDEPTH)) { - ptrdiff_t dst_stride = w * sizeof(pixel); for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) { int sign = rnd() & 1; @@ -408,19 +420,22 @@ #endif init_tmp(c, c_dst, tmp, bitdepth_max); - call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, tmp[0], tmp[1], w, h, c_mask, sign HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, + call_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, a_mask, sign HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, - a_dst, dst_stride, - w, h, "dst"); + checkasm_check_pixel_padded(c_dst, c_dst_stride, + a_dst, a_dst_stride, + w, h, "dst"); checkasm_check(uint8_t, c_mask, w >> ss_hor[i], a_mask, w >> ss_hor[i], w >> ss_hor[i], h >> ss_ver[i], "mask"); - bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, + bench_new(a_dst, a_dst_stride, tmp[0], tmp[1], w, h, a_mask, sign HIGHBD_TAIL_SUFFIX); } } @@ -429,15 +444,14 @@ static void check_blend(Dav1dMCDSPContext *const c) { ALIGN_STK_64(pixel, tmp, 32 * 32,); - ALIGN_STK_64(pixel, c_dst, 32 * 32,); - ALIGN_STK_64(pixel, a_dst, 32 * 32,); + PIXEL_RECT(c_dst, 32, 32); + PIXEL_RECT(a_dst, 32, 32); ALIGN_STK_64(uint8_t, mask, 32 * 32,); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h, const uint8_t *mask); for (int w = 4; w <= 32; w <<= 1) { - const ptrdiff_t dst_stride = w * sizeof(pixel); if (check_func(c->blend, "blend_w%d_%dbpc", w, BITDEPTH)) for (int h = imax(w / 2, 4); h <= imin(w * 2, 32); h <<= 1) { #if BITDEPTH == 16 @@ -449,15 +463,21 @@ tmp[i] = rnd() & bitdepth_max; mask[i] = rnd() % 65; } - for (int i = 0; i < w * h; i++) - c_dst[i] = a_dst[i] = rnd() & bitdepth_max; - call_ref(c_dst, dst_stride, tmp, w, h, mask); - call_new(a_dst, dst_stride, tmp, w, h, mask); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - w, h, "dst"); + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); - bench_new(a_dst, dst_stride, tmp, w, h, mask); + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + + call_ref(c_dst, c_dst_stride, tmp, w, h, mask); + call_new(a_dst, a_dst_stride, tmp, w, h, mask); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); + + bench_new(a_dst, a_dst_stride, tmp, w, h, mask); } } report("blend"); @@ -465,14 +485,13 @@ static void check_blend_v(Dav1dMCDSPContext *const c) { ALIGN_STK_64(pixel, tmp, 32 * 128,); - ALIGN_STK_64(pixel, c_dst, 32 * 128,); - ALIGN_STK_64(pixel, a_dst, 32 * 128,); + PIXEL_RECT(c_dst, 32, 128); + PIXEL_RECT(a_dst, 32, 128); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h); for (int w = 2; w <= 32; w <<= 1) { - const ptrdiff_t 
dst_stride = w * sizeof(pixel); if (check_func(c->blend_v, "blend_v_w%d_%dbpc", w, BITDEPTH)) for (int h = 2; h <= (w == 2 ? 64 : 128); h <<= 1) { #if BITDEPTH == 16 @@ -481,17 +500,23 @@ const int bitdepth_max = 0xff; #endif - for (int i = 0; i < w * h; i++) - c_dst[i] = a_dst[i] = rnd() & bitdepth_max; + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + for (int i = 0; i < 32 * 128; i++) tmp[i] = rnd() & bitdepth_max; - call_ref(c_dst, dst_stride, tmp, w, h); - call_new(a_dst, dst_stride, tmp, w, h); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - w, h, "dst"); + call_ref(c_dst, c_dst_stride, tmp, w, h); + call_new(a_dst, a_dst_stride, tmp, w, h); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); - bench_new(a_dst, dst_stride, tmp, w, h); + bench_new(a_dst, a_dst_stride, tmp, w, h); } } report("blend_v"); @@ -499,14 +524,13 @@ static void check_blend_h(Dav1dMCDSPContext *const c) { ALIGN_STK_64(pixel, tmp, 128 * 32,); - ALIGN_STK_64(pixel, c_dst, 128 * 32,); - ALIGN_STK_64(pixel, a_dst, 128 * 32,); + PIXEL_RECT(c_dst, 128, 32); + PIXEL_RECT(a_dst, 128, 32); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h); for (int w = 2; w <= 128; w <<= 1) { - const ptrdiff_t dst_stride = w * sizeof(pixel); if (check_func(c->blend_h, "blend_h_w%d_%dbpc", w, BITDEPTH)) for (int h = (w == 128 ? 4 : 2); h <= 32; h <<= 1) { #if BITDEPTH == 16 @@ -514,17 +538,23 @@ #else const int bitdepth_max = 0xff; #endif - for (int i = 0; i < w * h; i++) - c_dst[i] = a_dst[i] = rnd() & bitdepth_max; + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + c_dst[y*PXSTRIDE(c_dst_stride) + x] = + a_dst[y*PXSTRIDE(a_dst_stride) + x] = rnd() & bitdepth_max; + for (int i = 0; i < 128 * 32; i++) tmp[i] = rnd() & bitdepth_max; - call_ref(c_dst, dst_stride, tmp, w, h); - call_new(a_dst, dst_stride, tmp, w, h); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - w, h, "dst"); + call_ref(c_dst, c_dst_stride, tmp, w, h); + call_new(a_dst, a_dst_stride, tmp, w, h); + checkasm_check_pixel_padded(c_dst, c_dst_stride, a_dst, a_dst_stride, + w, h, "dst"); - bench_new(a_dst, dst_stride, tmp, w, h); + bench_new(a_dst, a_dst_stride, tmp, w, h); } } report("blend_h"); @@ -532,11 +562,10 @@ static void check_warp8x8(Dav1dMCDSPContext *const c) { ALIGN_STK_64(pixel, src_buf, 15 * 15,); - ALIGN_STK_64(pixel, c_dst, 8 * 8,); - ALIGN_STK_64(pixel, a_dst, 8 * 8,); + PIXEL_RECT(c_dst, 8, 8); + PIXEL_RECT(a_dst, 8, 8); int16_t abcd[4]; const pixel *src = src_buf + 15 * 3 + 3; - const ptrdiff_t dst_stride = 8 * sizeof(pixel); const ptrdiff_t src_stride = 15 * sizeof(pixel); declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, @@ -558,12 +587,15 @@ for (int i = 0; i < 15 * 15; i++) src_buf[i] = rnd() & bitdepth_max; - call_ref(c_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - 8, 8, "dst"); + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + call_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel_padded(c_dst, 
c_dst_stride, a_dst, a_dst_stride, + 8, 8, "dst"); - bench_new(a_dst, dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); + bench_new(a_dst, a_dst_stride, src, src_stride, abcd, mx, my HIGHBD_TAIL_SUFFIX); } report("warp8x8"); } @@ -687,13 +719,12 @@ } static void check_resize(Dav1dMCDSPContext *const c) { - ALIGN_STK_64(pixel, c_dst, 1024 * 64,); - ALIGN_STK_64(pixel, a_dst, 1024 * 64,); - ALIGN_STK_64(pixel, src, 512 * 64,); + PIXEL_RECT(c_dst, 1024, 64); + PIXEL_RECT(a_dst, 1024, 64); + ALIGN_STK_64(pixel, src, 512 * 64,); const int height = 64; const int max_src_width = 512; - const ptrdiff_t dst_stride = 1024 * sizeof(pixel); const ptrdiff_t src_stride = 512 * sizeof(pixel); declare_func(void, pixel *dst, ptrdiff_t dst_stride, @@ -720,14 +751,17 @@ #undef scale_fac const int mx0 = get_upscale_x0(src_w, dst_w, dx); - call_ref(c_dst, dst_stride, src, src_stride, + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + + call_ref(c_dst, c_dst_stride, src, src_stride, dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX); - call_new(a_dst, dst_stride, src, src_stride, + call_new(a_dst, a_dst_stride, src, src_stride, dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, - dst_w, height, "dst"); + checkasm_check_pixel_padded_align(c_dst, c_dst_stride, a_dst, a_dst_stride, + dst_w, height, "dst", 16, 1); - bench_new(a_dst, dst_stride, src, src_stride, + bench_new(a_dst, a_dst_stride, src, src_stride, 512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX); } diff -Nru dav1d-0.9.2/tests/checkasm/msac.c dav1d-1.0.0/tests/checkasm/msac.c --- dav1d-0.9.2/tests/checkasm/msac.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/checkasm/msac.c 2022-03-18 14:31:56.034356000 +0000 @@ -43,13 +43,13 @@ typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f); typedef struct { - decode_symbol_adapt_fn symbol_adapt4; - decode_symbol_adapt_fn symbol_adapt8; - decode_symbol_adapt_fn symbol_adapt16; - decode_adapt_fn bool_adapt; - decode_bool_equi_fn bool_equi; - decode_bool_fn bool; - decode_adapt_fn hi_tok; + decode_symbol_adapt_fn decode_symbol_adapt4; + decode_symbol_adapt_fn decode_symbol_adapt8; + decode_symbol_adapt_fn decode_symbol_adapt16; + decode_adapt_fn decode_bool_adapt; + decode_bool_equi_fn decode_bool_equi; + decode_bool_fn decode_bool; + decode_adapt_fn decode_hi_tok; } MsacDSPContext; static void randomize_cdf(uint16_t *const cdf, const int n) { @@ -104,7 +104,9 @@ } #define CHECK_SYMBOL_ADAPT(n, n_min, n_max) do { \ - if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \ + if (check_func(c->decode_symbol_adapt##n, \ + "msac_decode_symbol_adapt%d", n)) \ + { \ for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \ for (int ns = n_min; ns <= n_max; ns++) { \ dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \ @@ -144,7 +146,7 @@ MsacContext s_c, s_a; declare_func(unsigned, MsacContext *s, uint16_t *cdf); - if (check_func(c->bool_adapt, "msac_decode_bool_adapt")) { + if (check_func(c->decode_bool_adapt, "msac_decode_bool_adapt")) { uint16_t cdf[2][2]; for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); @@ -171,7 +173,7 @@ MsacContext s_c, s_a; declare_func(unsigned, MsacContext *s); - if (check_func(c->bool_equi, "msac_decode_bool_equi")) { + if (check_func(c->decode_bool_equi, "msac_decode_bool_equi")) { dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); s_a = s_c; for (int i = 0; i < 64; i++) { @@ -190,7 +192,7 @@ MsacContext s_c, s_a; 
declare_func(unsigned, MsacContext *s, unsigned f); - if (check_func(c->bool, "msac_decode_bool")) { + if (check_func(c->decode_bool, "msac_decode_bool")) { dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); s_a = s_c; for (int i = 0; i < 64; i++) { @@ -219,7 +221,7 @@ MsacContext s_c, s_a; declare_func(unsigned, MsacContext *s, uint16_t *cdf); - if (check_func(c->hi_tok, "msac_decode_hi_tok")) { + if (check_func(c->decode_hi_tok, "msac_decode_hi_tok")) { for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); s_a = s_c; @@ -245,38 +247,38 @@ void checkasm_check_msac(void) { MsacDSPContext c; - c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c; - c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c; - c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; - c.bool_adapt = dav1d_msac_decode_bool_adapt_c; - c.bool_equi = dav1d_msac_decode_bool_equi_c; - c.bool = dav1d_msac_decode_bool_c; - c.hi_tok = dav1d_msac_decode_hi_tok_c; + c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c; + c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt_c; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; + c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_c; + c.decode_bool_equi = dav1d_msac_decode_bool_equi_c; + c.decode_bool = dav1d_msac_decode_bool_c; + c.decode_hi_tok = dav1d_msac_decode_hi_tok_c; #if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) { - c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon; - c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon; - c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon; - c.bool_adapt = dav1d_msac_decode_bool_adapt_neon; - c.bool_equi = dav1d_msac_decode_bool_equi_neon; - c.bool = dav1d_msac_decode_bool_neon; - c.hi_tok = dav1d_msac_decode_hi_tok_neon; + c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon; + c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon; + c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_neon; + c.decode_bool_equi = dav1d_msac_decode_bool_equi_neon; + c.decode_bool = dav1d_msac_decode_bool_neon; + c.decode_hi_tok = dav1d_msac_decode_hi_tok_neon; } #elif ARCH_X86 && HAVE_ASM if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) { - c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2; - c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2; - c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; - c.bool_adapt = dav1d_msac_decode_bool_adapt_sse2; - c.bool_equi = dav1d_msac_decode_bool_equi_sse2; - c.bool = dav1d_msac_decode_bool_sse2; - c.hi_tok = dav1d_msac_decode_hi_tok_sse2; + c.decode_symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_sse2; + c.decode_symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_sse2; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_sse2; + c.decode_bool_equi = dav1d_msac_decode_bool_equi_sse2; + c.decode_bool = dav1d_msac_decode_bool_sse2; + c.decode_hi_tok = dav1d_msac_decode_hi_tok_sse2; } #if ARCH_X86_64 if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_AVX2) { - c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; + c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; } #endif #endif diff -Nru dav1d-0.9.2/tests/libfuzzer/dav1d_fuzzer.c dav1d-1.0.0/tests/libfuzzer/dav1d_fuzzer.c --- dav1d-0.9.2/tests/libfuzzer/dav1d_fuzzer.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tests/libfuzzer/dav1d_fuzzer.c 2022-03-18 
14:31:56.038356000 +0000 @@ -106,23 +106,23 @@ unsigned h = djb_xor(ptr, 32); unsigned seed = h; unsigned probability = h > (RAND_MAX >> 5) ? RAND_MAX >> 5 : h; - int n_frame_threads = (h & 0xf) + 1; - int n_tile_threads = ((h >> 4) & 0x7) + 1; - if (n_frame_threads > 5) n_frame_threads = 1; - if (n_tile_threads > 3) n_tile_threads = 1; + int max_frame_delay = (h & 0xf) + 1; + int n_threads = ((h >> 4) & 0x7) + 1; + if (max_frame_delay > 5) max_frame_delay = 1; + if (n_threads > 3) n_threads = 1; #endif ptr += 32; // skip ivf header dav1d_default_settings(&settings); #ifdef DAV1D_MT_FUZZING - settings.n_frame_threads = settings.n_tile_threads = 2; + settings.max_frame_delay = settings.n_threads = 4; #elif defined(DAV1D_ALLOC_FAIL) - settings.n_frame_threads = n_frame_threads; - settings.n_tile_threads = n_tile_threads; + settings.max_frame_delay = max_frame_delay; + settings.n_threads = n_threads; dav1d_setup_alloc_fail(seed, probability); #else - settings.n_frame_threads = settings.n_tile_threads = 1; + settings.max_frame_delay = settings.n_threads = 1; #endif #if defined(DAV1D_FUZZ_MAX_SIZE) settings.frame_size_limit = DAV1D_FUZZ_MAX_SIZE; diff -Nru dav1d-0.9.2/THANKS.md dav1d-1.0.0/THANKS.md --- dav1d-0.9.2/THANKS.md 2021-09-03 15:51:24.389037100 +0000 +++ dav1d-1.0.0/THANKS.md 2022-03-18 14:31:55.958356000 +0000 @@ -16,16 +16,18 @@ And all the dav1d Authors (git shortlog -sn), including: -Martin Storsjö, Janne Grunau, Henrik Gramner, Ronald S. Bultje, James Almer, -Marvin Scholz, Luc Trudeau, Victorien Le Couviour--Tuffet, Jean-Baptiste Kempf, -Hugo Beauzée-Luyssen, Matthias Dressel, Konstantin Pavlov, David Michael Barr, -Steve Lhomme, Niklas Haas, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, -Nathan E. Egge, Derek Buitenhuis, Michael Bradshaw, Raphaël Zumer, -Xuefeng Jiang, Luca Barbato, Jan Beich, Wan-Teh Chang, Justin Bull, Boyuan Xiao, -Dale Curtis, Kyle Siefring, Raphael Zumer, Rupert Swarbrick, Thierry Foucu, -Thomas Daede, Colin Lee, Emmanuel Gil Peyrot, Lynne, Michail Alvanos, -Nico Weber, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, -Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, -Mark Shuttleworth, Matthieu Bouron, Nicolas Frattaroli, Pablo Stebler, -Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvestre Ledru, Timo Gurr, -Tristan Matthews, Xavier Claessens, Xu Guangxin, kossh1 and skal. +Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer, +Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz, Luc Trudeau, +Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Niklas Haas, Konstantin Pavlov, +David Michael Barr, Steve Lhomme, Nathan E. 
Egge, Kyle Siefring, Raphaël Zumer, +B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Derek Buitenhuis, +Michael Bradshaw, Wan-Teh Chang, Xuefeng Jiang, Luca Barbato, Jan Beich, +Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot, +Rupert Swarbrick, Thierry Foucu, Thomas Daede, Colin Lee, Jonathan Wright, +Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf, Tristan Laurent, +Vittorio Giovara, Yannis Guyon, André Kempe, Anisse Astier, Anton Mitrofanov, +Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago, +Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, +Pablo Stebler, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvain BERTRAND, +Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens, +Xu Guangxin, kossh1 and skal diff -Nru dav1d-0.9.2/tools/dav1d.c dav1d-1.0.0/tools/dav1d.c --- dav1d-0.9.2/tools/dav1d.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tools/dav1d.c 2022-03-18 14:31:56.038356000 +0000 @@ -29,6 +29,7 @@ #include "vcs_version.h" #include "cli_config.h" +#include #include #include #include @@ -139,6 +140,47 @@ fputs(buf, stderr); } +static int picture_alloc(Dav1dPicture *const p, void *const _) { + const int hbd = p->p.bpc > 8; + const int aligned_w = (p->p.w + 127) & ~127; + const int aligned_h = (p->p.h + 127) & ~127; + const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; + const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; + ptrdiff_t y_stride = aligned_w << hbd; + ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0; + /* Due to how mapping of addresses to sets works in most L1 and L2 cache + * implementations, strides of multiples of certain power-of-two numbers + * may cause multiple rows of the same superblock to map to the same set, + * causing evictions of previous rows resulting in a reduction in cache + * hit rate. Avoid that by slightly padding the stride when necessary. */ + if (!(y_stride & 1023)) + y_stride += DAV1D_PICTURE_ALIGNMENT; + if (!(uv_stride & 1023) && has_chroma) + uv_stride += DAV1D_PICTURE_ALIGNMENT; + p->stride[0] = -y_stride; + p->stride[1] = -uv_stride; + const size_t y_sz = y_stride * aligned_h; + const size_t uv_sz = uv_stride * (aligned_h >> ss_ver); + const size_t pic_size = y_sz + 2 * uv_sz; + + uint8_t *const buf = malloc(pic_size + DAV1D_PICTURE_ALIGNMENT * 2); + if (!buf) return DAV1D_ERR(ENOMEM); + p->allocator_data = buf; + + const ptrdiff_t align_m1 = DAV1D_PICTURE_ALIGNMENT - 1; + uint8_t *const data = (uint8_t *)(((ptrdiff_t)buf + align_m1) & ~align_m1); + p->data[0] = data + y_sz - y_stride; + p->data[1] = has_chroma ? data + y_sz + uv_sz * 1 - uv_stride : NULL; + p->data[2] = has_chroma ? 
data + y_sz + uv_sz * 2 - uv_stride : NULL; + + return 0; +} + +static void picture_release(Dav1dPicture *const p, void *const _) { + free(p->allocator_data); +} + int main(const int argc, char *const *const argv) { const int istty = isatty(fileno(stderr)); int res = 0; @@ -162,6 +204,10 @@ } parse(argc, argv, &cli_settings, &lib_settings); + if (cli_settings.neg_stride) { + lib_settings.allocator.alloc_picture_callback = picture_alloc; + lib_settings.allocator.release_picture_callback = picture_release; + } if ((res = input_open(&in, cli_settings.demuxer, cli_settings.inputfile, @@ -224,9 +270,10 @@ memset(&p, 0, sizeof(p)); if ((res = dav1d_send_data(c, &data)) < 0) { if (res != DAV1D_ERR(EAGAIN)) { + dav1d_data_unref(&data); fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV1D_ERR(res))); - break; + if (res != DAV1D_ERR(EINVAL)) break; } } @@ -234,7 +281,7 @@ if (res != DAV1D_ERR(EAGAIN)) { fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV1D_ERR(res))); - break; + if (res != DAV1D_ERR(EINVAL)) break; } res = 0; } else { @@ -270,6 +317,7 @@ if (res != DAV1D_ERR(EAGAIN)) { fprintf(stderr, "Error decoding frame: %s\n", strerror(DAV1D_ERR(res))); + if (res != DAV1D_ERR(EINVAL)) break; } else { res = 0; break; diff -Nru dav1d-0.9.2/tools/dav1d_cli_parse.c dav1d-1.0.0/tools/dav1d_cli_parse.c --- dav1d-0.9.2/tools/dav1d_cli_parse.c 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tools/dav1d_cli_parse.c 2022-03-18 14:31:56.038356000 +0000 @@ -50,39 +50,45 @@ ARG_FRAME_TIMES, ARG_REALTIME, ARG_REALTIME_CACHE, - ARG_FRAME_THREADS, - ARG_TILE_THREADS, - ARG_POSTFILTER_THREADS, + ARG_THREADS, + ARG_FRAME_DELAY, ARG_VERIFY, ARG_FILM_GRAIN, ARG_OPPOINT, ARG_ALL_LAYERS, ARG_SIZE_LIMIT, + ARG_STRICT_STD_COMPLIANCE, ARG_CPU_MASK, + ARG_NEG_STRIDE, + ARG_OUTPUT_INVISIBLE, + ARG_INLOOP_FILTERS, }; static const struct option long_opts[] = { - { "input", 1, NULL, 'i' }, - { "output", 1, NULL, 'o' }, - { "quiet", 0, NULL, 'q' }, - { "demuxer", 1, NULL, ARG_DEMUXER }, - { "muxer", 1, NULL, ARG_MUXER }, - { "version", 0, NULL, 'v' }, - { "frametimes", 1, NULL, ARG_FRAME_TIMES }, - { "limit", 1, NULL, 'l' }, - { "skip", 1, NULL, 's' }, - { "realtime", 2, NULL, ARG_REALTIME }, - { "realtimecache", 1, NULL, ARG_REALTIME_CACHE }, - { "framethreads", 1, NULL, ARG_FRAME_THREADS }, - { "tilethreads", 1, NULL, ARG_TILE_THREADS }, - { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS }, - { "verify", 1, NULL, ARG_VERIFY }, - { "filmgrain", 1, NULL, ARG_FILM_GRAIN }, - { "oppoint", 1, NULL, ARG_OPPOINT }, - { "alllayers", 1, NULL, ARG_ALL_LAYERS }, - { "sizelimit", 1, NULL, ARG_SIZE_LIMIT }, - { "cpumask", 1, NULL, ARG_CPU_MASK }, - { NULL, 0, NULL, 0 }, + { "input", 1, NULL, 'i' }, + { "output", 1, NULL, 'o' }, + { "quiet", 0, NULL, 'q' }, + { "demuxer", 1, NULL, ARG_DEMUXER }, + { "muxer", 1, NULL, ARG_MUXER }, + { "version", 0, NULL, 'v' }, + { "frametimes", 1, NULL, ARG_FRAME_TIMES }, + { "limit", 1, NULL, 'l' }, + { "skip", 1, NULL, 's' }, + { "realtime", 2, NULL, ARG_REALTIME }, + { "realtimecache", 1, NULL, ARG_REALTIME_CACHE }, + { "threads", 1, NULL, ARG_THREADS }, + { "framedelay", 1, NULL, ARG_FRAME_DELAY }, + { "verify", 1, NULL, ARG_VERIFY }, + { "filmgrain", 1, NULL, ARG_FILM_GRAIN }, + { "oppoint", 1, NULL, ARG_OPPOINT }, + { "alllayers", 1, NULL, ARG_ALL_LAYERS }, + { "sizelimit", 1, NULL, ARG_SIZE_LIMIT }, + { "strict", 1, NULL, ARG_STRICT_STD_COMPLIANCE }, + { "cpumask", 1, NULL, ARG_CPU_MASK }, + { "negstride", 0, NULL, ARG_NEG_STRIDE }, + { "outputinvisible", 1, NULL, 
ARG_OUTPUT_INVISIBLE }, + { "inloopfilters", 1, NULL, ARG_INLOOP_FILTERS }, + { NULL, 0, NULL, 0 }, }; #if HAVE_XXHASH_H @@ -114,9 +120,10 @@ fprintf(stderr, "Usage: %s [options]\n\n", app); fprintf(stderr, "Supported options:\n" " --input/-i $file: input file\n" - " --output/-o $file: output file\n" + " --output/-o $file: output file (%%n, %%w or %%h will be filled in for per-frame files)\n" " --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from content)\n" " --muxer $name: force muxer type (" AVAILABLE_MUXERS "; default: detect from extension)\n" + " use 'frame' as prefix to write per-frame files; if filename contains %%n, will default to writing per-frame files\n" " --quiet/-q: disable status messages\n" " --frametimes $file: dump frame times to file\n" " --limit/-l $num: stop decoding after $num frames\n" @@ -124,15 +131,21 @@ " --realtime [$fract]: limit framerate, optional argument to override input framerate\n" " --realtimecache $num: set the size of the cache in realtime mode (default: 0)\n" " --version/-v: print version and exit\n" - " --framethreads $num: number of frame threads (default: 1)\n" - " --tilethreads $num: number of tile threads (default: 1)\n" - " --pfthreads $num: number of postfilter threads (default: 1)\n" + " --threads $num: number of threads (default: 0)\n" + " --framedelay $num: maximum frame delay, capped at $threads (default: 0);\n" + " set to 1 for low-latency decoding\n" " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5 or xxh3)\n" " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n" " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" " --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n" + " --strict $num: whether to abort decoding on standard compliance violations\n" + " that don't affect bitstream decoding (default: 1)\n" " --verify $md5: verify decoded md5. 
implies --muxer md5, no output\n" - " --cpumask $mask: restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n"); + " --cpumask $mask: restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n" + " --negstride: use negative picture strides\n" + " this is mostly meant as a developer option\n" + " --outputinvisible $num: whether to output invisible (alt-ref) frames (default: 0)\n" + " --inloopfilters $str: which in-loop filters to enable (none, (no)deblock, (no)cdef, (no)restoration or all; default: all)\n"); exit(1); } @@ -211,6 +224,18 @@ { "none", 0 }, }; +static const EnumParseTable inloop_filters_tbl[] = { + { "none", DAV1D_INLOOPFILTER_NONE }, + { "deblock", DAV1D_INLOOPFILTER_DEBLOCK }, + { "nodeblock", DAV1D_INLOOPFILTER_ALL - DAV1D_INLOOPFILTER_DEBLOCK }, + { "cdef", DAV1D_INLOOPFILTER_CDEF }, + { "nocdef", DAV1D_INLOOPFILTER_ALL - DAV1D_INLOOPFILTER_CDEF }, + { "restoration", DAV1D_INLOOPFILTER_RESTORATION }, + { "norestoration", DAV1D_INLOOPFILTER_ALL - DAV1D_INLOOPFILTER_RESTORATION }, + { "all", DAV1D_INLOOPFILTER_ALL }, + { 0 }, +}; + #define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n))) static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl, @@ -255,6 +280,7 @@ memset(cli_settings, 0, sizeof(*cli_settings)); dav1d_default_settings(lib_settings); + lib_settings->strict_std_compliance = 1; // override library default int grain_specified = 0; while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) { @@ -299,17 +325,13 @@ cli_settings->realtime_cache = parse_unsigned(optarg, ARG_REALTIME_CACHE, argv[0]); break; - case ARG_FRAME_THREADS: - lib_settings->n_frame_threads = - parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]); - break; - case ARG_TILE_THREADS: - lib_settings->n_tile_threads = - parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]); - break; - case ARG_POSTFILTER_THREADS: - lib_settings->n_postfilter_threads = - parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]); + case ARG_FRAME_DELAY: + lib_settings->max_frame_delay = + parse_unsigned(optarg, ARG_FRAME_DELAY, argv[0]); + break; + case ARG_THREADS: + lib_settings->n_threads = + parse_unsigned(optarg, ARG_THREADS, argv[0]); break; case ARG_VERIFY: cli_settings->verify = optarg; @@ -337,6 +359,10 @@ lib_settings->frame_size_limit = (unsigned) res; break; } + case ARG_STRICT_STD_COMPLIANCE: + lib_settings->strict_std_compliance = + parse_unsigned(optarg, ARG_STRICT_STD_COMPLIANCE, argv[0]); + break; case 'v': fprintf(stderr, "%s\n", dav1d_version()); exit(0); @@ -344,6 +370,18 @@ dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl, ARRAY_SIZE(cpu_mask_tbl), ARG_CPU_MASK, argv[0])); break; + case ARG_NEG_STRIDE: + cli_settings->neg_stride = 1; + break; + case ARG_OUTPUT_INVISIBLE: + lib_settings->output_invisible_frames = + !!parse_unsigned(optarg, ARG_OUTPUT_INVISIBLE, argv[0]); + break; + case ARG_INLOOP_FILTERS: + lib_settings->inloop_filters = + parse_enum(optarg, inloop_filters_tbl, + ARRAY_SIZE(inloop_filters_tbl),ARG_INLOOP_FILTERS, argv[0]); + break; default: usage(argv[0], NULL); } diff -Nru dav1d-0.9.2/tools/dav1d_cli_parse.h dav1d-1.0.0/tools/dav1d_cli_parse.h --- dav1d-0.9.2/tools/dav1d_cli_parse.h 2021-09-03 15:51:24.429037000 +0000 +++ dav1d-1.0.0/tools/dav1d_cli_parse.h 2022-03-18 14:31:56.038356000 +0000 @@ -46,6 +46,7 @@ } realtime; double realtime_fps; unsigned realtime_cache; + int neg_stride; } CLISettings; void parse(const int argc, char *const *const argv, diff -Nru dav1d-0.9.2/tools/output/output.c 
dav1d-1.0.0/tools/output/output.c --- dav1d-0.9.2/tools/output/output.c 2021-09-03 15:51:24.433037000 +0000 +++ dav1d-1.0.0/tools/output/output.c 2022-03-18 14:31:56.042356000 +0000 @@ -34,6 +34,7 @@ #include #include "common/attributes.h" +#include "common/intops.h" #include "output/output.h" #include "output/muxer.h" @@ -41,6 +42,10 @@ struct MuxerContext { MuxerPriv *data; const Muxer *impl; + int one_file_per_frame; + unsigned fps[2]; + const char *filename; + int framenum; }; extern const Muxer null_muxer; @@ -84,10 +89,12 @@ MuxerContext *c; unsigned i; int res; + int name_offset = 0; if (name) { + name_offset = 5 * !strncmp(name, "frame", 5); for (i = 0; muxers[i]; i++) { - if (!strcmp(muxers[i]->name, name)) { + if (!strcmp(muxers[i]->name, &name[name_offset])) { impl = muxers[i]; break; } @@ -122,7 +129,25 @@ } c->impl = impl; c->data = (MuxerPriv *) &c[1]; - if (impl->write_header && (res = impl->write_header(c->data, filename, p, fps)) < 0) { + int have_num_pattern = 0; + for (const char *ptr = filename ? strchr(filename, '%') : NULL; + !have_num_pattern && ptr; ptr = strchr(ptr, '%')) + { + ptr++; // skip '%' + while (*ptr >= '0' && *ptr <= '9') + ptr++; // skip length indicators + have_num_pattern = *ptr == 'n'; + } + c->one_file_per_frame = name_offset || (!name && have_num_pattern); + + if (c->one_file_per_frame) { + c->fps[0] = fps[0]; + c->fps[1] = fps[1]; + c->filename = filename; + c->framenum = 0; + } else if (impl->write_header && + (res = impl->write_header(c->data, filename, p, fps)) < 0) + { free(c); return res; } @@ -131,13 +156,98 @@ return 0; } +static void safe_strncat(char *const dst, const int dst_len, + const char *const src, const int src_len) +{ + if (!src_len) return; + const int dst_fill = (int) strlen(dst); + assert(dst_fill < dst_len); + const int to_copy = imin(src_len, dst_len - dst_fill - 1); + if (!to_copy) return; + memcpy(dst + dst_fill, src, to_copy); + dst[dst_fill + to_copy] = 0; +} + +static void assemble_field(char *const dst, const int dst_len, + const char *const fmt, const int fmt_len, + const int field) +{ + char fmt_copy[32]; + + assert(fmt[0] == '%'); + fmt_copy[0] = '%'; + if (fmt[1] >= '1' && fmt[1] <= '9') { + fmt_copy[1] = '0'; // pad with zeroes, not spaces + fmt_copy[2] = 0; + } else { + fmt_copy[1] = 0; + } + safe_strncat(fmt_copy, sizeof(fmt_copy), &fmt[1], fmt_len - 1); + safe_strncat(fmt_copy, sizeof(fmt_copy), "d", 1); + + char tmp[32]; + snprintf(tmp, sizeof(tmp), fmt_copy, field); + + safe_strncat(dst, dst_len, tmp, (int) strlen(tmp)); +} + +static void assemble_filename(MuxerContext *const ctx, char *const filename, + const int filename_size, + const Dav1dPictureParameters *const p) +{ + filename[0] = 0; + const int framenum = ctx->framenum++; + assert(ctx->filename); + const char *ptr = ctx->filename, *iptr; + while ((iptr = strchr(ptr, '%'))) { + safe_strncat(filename, filename_size, ptr, (int) (iptr - ptr)); + ptr = iptr; + + const char *iiptr = &iptr[1]; // skip '%' + while (*iiptr >= '0' && *iiptr <= '9') + iiptr++; // skip length indicators + + switch (*iiptr) { + case 'w': + assemble_field(filename, filename_size, ptr, (int) (iiptr - ptr), p->w); + break; + case 'h': + assemble_field(filename, filename_size, ptr, (int) (iiptr - ptr), p->h); + break; + case 'n': + assemble_field(filename, filename_size, ptr, (int) (iiptr - ptr), framenum); + break; + default: + safe_strncat(filename, filename_size, "%", 1); + ptr = &iptr[1]; + continue; + } + + ptr = &iiptr[1]; + } + safe_strncat(filename, filename_size, ptr, (int) 
strlen(ptr)); +} + int output_write(MuxerContext *const ctx, Dav1dPicture *const p) { - const int res = ctx->impl->write_picture(ctx->data, p); - return res < 0 ? res : 0; + int res; + + if (ctx->one_file_per_frame && ctx->impl->write_header) { + char filename[1024]; + assemble_filename(ctx, filename, sizeof(filename), &p->p); + res = ctx->impl->write_header(ctx->data, filename, &p->p, ctx->fps); + if (res < 0) + return res; + } + if ((res = ctx->impl->write_picture(ctx->data, p)) < 0) + return res; + if (ctx->one_file_per_frame && ctx->impl->write_trailer) + ctx->impl->write_trailer(ctx->data); + + return 0; } void output_close(MuxerContext *const ctx) { - if (ctx->impl->write_trailer) + if (!ctx->one_file_per_frame && ctx->impl->write_trailer) ctx->impl->write_trailer(ctx->data); free(ctx); }
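
The changes to tests/libfuzzer/dav1d_fuzzer.c and tools/dav1d_cli_parse.c above track the 1.0.0 threading rework: the separate n_frame_threads/n_tile_threads knobs become a single n_threads worker pool plus a max_frame_delay latency bound. Below is a minimal sketch of how a caller configures these under the new API; the open_decoder() wrapper and its error reporting are illustrative, not part of dav1d or of this package.

    #include <dav1d/dav1d.h>
    #include <stdio.h>

    /* Illustrative sketch: open a decoder with the 1.0.0 threading model.
     * n_threads sizes one shared worker pool (0 = let the library decide),
     * and max_frame_delay caps how many frames may be in flight; 1 gives
     * low-latency decoding, matching the new --framedelay 1 CLI option. */
    static int open_decoder(Dav1dContext **c_out) {
        Dav1dSettings s;
        dav1d_default_settings(&s);
        s.n_threads = 0;
        s.max_frame_delay = 1;
        const int res = dav1d_open(c_out, &s);
        if (res < 0)
            fprintf(stderr, "dav1d_open failed (%d)\n", res);
        return res;
    }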
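
tools/dav1d.c now unrefs the pending Dav1dData when dav1d_send_data() fails and keeps decoding after DAV1D_ERR(EINVAL), so a single rejected frame no longer aborts the whole run. A compressed sketch of that send/receive pattern follows; the decode_step() helper is illustrative and assumes the caller owns both the data chunk and the picture.

    #include <dav1d/dav1d.h>
    #include <errno.h>

    /* Illustrative sketch of one pass of the send/receive loop used by the
     * CLI tool.  DAV1D_ERR(EAGAIN) from either call only means the other
     * side has to make progress first; any other send error leaves
     * ownership of the input chunk with the caller, so it is released here. */
    static int decode_step(Dav1dContext *c, Dav1dData *data, Dav1dPicture *pic) {
        int res = dav1d_send_data(c, data);
        if (res < 0 && res != DAV1D_ERR(EAGAIN)) {
            dav1d_data_unref(data);
            return res;
        }
        res = dav1d_get_picture(c, pic);
        if (res < 0)
            return res == DAV1D_ERR(EAGAIN) ? 0 : res;
        /* ... write or verify *pic here ... */
        dav1d_picture_unref(pic);
        return 0;
    }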
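
The picture_alloc()/picture_release() pair added to tools/dav1d.c backs the new --negstride option: planes are handed out with negative strides, so row 0 sits at the high end of the buffer. Code that addresses rows through data[] and stride[] works unchanged either way; a minimal sketch for an 8-bit picture (the dump_luma() helper is hypothetical):

    #include <dav1d/dav1d.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch: write the luma plane of an 8-bit Dav1dPicture
     * row by row.  Rows are addressed as data[0] + y * stride[0], which is
     * valid whether stride[0] is positive or, as with --negstride, negative. */
    static int dump_luma(const Dav1dPicture *p, FILE *out) {
        if (p->p.bpc != 8)
            return -1; /* 8-bit only in this sketch */
        const uint8_t *row = p->data[0];
        for (int y = 0; y < p->p.h; y++) {
            if (fwrite(row, 1, p->p.w, out) != (size_t)p->p.w)
                return -1;
            row += p->stride[0]; /* may step backwards through memory */
        }
        return 0;
    }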
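
The cache comment inside picture_alloc() is worth a concrete number: strides that are exact multiples of 1024 bytes are bumped by one DAV1D_PICTURE_ALIGNMENT (64 bytes in dav1d's picture.h) so that successive rows stop aliasing to the same cache sets. A small standalone sketch of that rule; pad_stride() is illustrative, not a dav1d function.

    #include <stddef.h>
    #include <stdio.h>

    #define ALIGNMENT 64 /* stands in for DAV1D_PICTURE_ALIGNMENT */

    /* Illustrative sketch of the padding rule from picture_alloc() above:
     * strides that are multiples of 1024 bytes get one extra alignment unit
     * so consecutive rows of a superblock map to different cache sets. */
    static ptrdiff_t pad_stride(ptrdiff_t stride) {
        if (!(stride & 1023))
            stride += ALIGNMENT;
        return stride;
    }

    int main(void) {
        /* A 1920-wide 8-bit luma row keeps its 1920-byte stride, while a
         * 1024-wide one is padded to 1088 bytes. */
        printf("%td %td\n", pad_stride(1920), pad_stride(1024));
        return 0;
    }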
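
Finally, the %n/%w/%h handling in tools/output/output.c and the "frame" muxer-name prefix enable one-file-per-frame output, as advertised in the updated --output/--muxer help text. As an illustrative example, passing -o frame-%03n.yuv with no explicit --muxer should select the raw YUV muxer from the extension and write frame-000.yuv, frame-001.yuv, and so on, with %w and %h available to embed the frame dimensions in the name.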