diff -Nru dav1d-0.7.1/CONTRIBUTING.md dav1d-0.9.1/CONTRIBUTING.md
--- dav1d-0.7.1/CONTRIBUTING.md 2020-06-21 11:48:54.948126300 +0000
+++ dav1d-0.9.1/CONTRIBUTING.md 2021-07-28 21:38:28.849851600 +0000
@@ -12,7 +12,7 @@
 The codebase is developed with the following assumptions:
 
 For the library:
-- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
+- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code.
 - x86 asm in .asm files, using the NASM syntax,
 - arm/arm64 in .S files, using the GAS syntax limited to subset llvm 5.0's internal assembler supports,
 - no C++ is allowed, whatever the version.
Binary files /tmp/tmp5v66y4dz/QaUJRHZGKy/dav1d-0.7.1/dav1d_logo.png and /tmp/tmp5v66y4dz/XxehL0GWKO/dav1d-0.9.1/dav1d_logo.png differ
diff -Nru dav1d-0.7.1/debian/changelog dav1d-0.9.1/debian/changelog
--- dav1d-0.7.1/debian/changelog 2021-08-08 16:42:20.000000000 +0000
+++ dav1d-0.9.1/debian/changelog 2021-07-30 20:28:01.000000000 +0000
@@ -1,8 +1,46 @@
-dav1d (0.7.1-3~ppa20.04+5) focal; urgency=medium
+dav1d (0.9.1-1~20.04.sav0) focal; urgency=medium
 
-  * Try to build for i386 harder?
+  * Backport to Focal
+  * debian/control: Set debhelper-compat (= 12) BD
+  * d/rules: Change override_dh_auto_test to call "meson test" directly, due
+    dh >= 13 calling "meson test" (needed for "-t 10" option) not "ninja test"
 
- -- Nicolas Derive Sun, 08 Aug 2021 18:42:20 +0200
+ -- Rob Savoury Fri, 30 Jul 2021 13:28:01 -0700
+
+dav1d (0.9.1-1) experimental; urgency=medium
+
+  * New upstream release.
+  * Add more copyright holders in d/copyright.
+
+ -- Dylan Aïssi Fri, 30 Jul 2021 11:21:25 +0200
+
+dav1d (0.9.0-1) experimental; urgency=medium
+
+  * New upstream release.
+  * Update d/libdav1d5.symbols.
+  * Bump year in d/copyright.
+
+ -- Dylan Aïssi Mon, 17 May 2021 14:45:02 +0200
+
+dav1d (0.8.2-1) experimental; urgency=medium
+
+  * New upstream release.
+  * Bump year in d/copyright.
+
+ -- Dylan Aïssi Wed, 12 May 2021 10:45:22 +0200
+
+dav1d (0.8.1-1) experimental; urgency=medium
+
+  [ Vasyl Gello ]
+  * Team upload.
+  * New upstream version 0.8.1
+  * Bump SONAME to libdav1d5
+
+  [ Dylan Aïssi ]
+  * Remove part to disable asm on x32 in d/rules,
+    should be fixed in upstream makefile.
+
+ -- Dylan Aïssi Thu, 28 Jan 2021 21:10:20 +0100
 
 dav1d (0.7.1-3) unstable; urgency=high
 
diff -Nru dav1d-0.7.1/debian/compat dav1d-0.9.1/debian/compat
--- dav1d-0.7.1/debian/compat 2021-08-08 16:42:20.000000000 +0000
+++ dav1d-0.9.1/debian/compat 1970-01-01 00:00:00.000000000 +0000
@@ -1 +0,0 @@
-10
diff -Nru dav1d-0.7.1/debian/control dav1d-0.9.1/debian/control
--- dav1d-0.7.1/debian/control 2021-08-08 16:41:40.000000000 +0000
+++ dav1d-0.9.1/debian/control 2021-07-30 20:28:01.000000000 +0000
@@ -3,11 +3,11 @@
 Priority: optional
 Maintainer: Debian Multimedia Maintainers
 Uploaders: Dylan Aïssi
-Build-Depends: debhelper (>= 10),
+Build-Depends: debhelper-compat (= 12),
 meson (>= 0.47),
 ninja-build,
- nasm (>= 2.14)
-Standards-Version: 4.5.0
+ nasm (>= 2.14) [any-amd64 any-i386]
+Standards-Version: 4.5.1
 Rules-Requires-Root: no
 Homepage: https://www.videolan.org/projects/dav1d.html
 Vcs-Browser: https://salsa.debian.org/multimedia-team/dav1d
@@ -30,8 +30,9 @@
 * full acceleration for ARMv8 chips
 * partial acceleration for ARMv7 chips
 
-Package: libdav1d4
+Package: libdav1d5
 Architecture: any
+Multi-Arch: same
 Section: libs
 Depends: ${misc:Depends},
 ${shlibs:Depends}
@@ -52,8 +53,9 @@
 
 Package: libdav1d-dev
 Architecture: any
+Multi-Arch: same
 Section: libdevel
-Depends: libdav1d4 (= ${binary:Version}),
+Depends: libdav1d5 (= ${binary:Version}),
 ${misc:Depends}
 Description: fast and small AV1 video stream decoder (development files)
 dav1d is an AOMedia Video 1 (AV1) cross-platform decoder and focused on speed
diff -Nru dav1d-0.7.1/debian/copyright dav1d-0.9.1/debian/copyright
--- dav1d-0.7.1/debian/copyright 2020-07-08 21:07:41.000000000 +0000
+++ dav1d-0.9.1/debian/copyright 2021-07-30 09:21:25.000000000 +0000
@@ -115,10 +115,10 @@
 License was issued.
 
 Files: *
-Copyright: 2018-2020, VideoLAN and dav1d authors
-           2018-2019, Two Orioles, LLC
+Copyright: 2018-2021, VideoLAN and dav1d authors
+           2018-2021, Two Orioles, LLC
            2015-2019, Janne Grunau
-           2015-2020, Martin Storsjo
+           2015-2021, Martin Storsjo
            2018, Niklas Haas
            2018-2019, VideoLabs
            2019, Luca Barbato
@@ -126,6 +126,9 @@
            2019, B Krishnan Iyer
            2019, James Almer
            2001-2016, Alliance for Open Media
+           2017-2021, The rav1e contributors
+           2020, Nathan Egge
+           2021, Matthias Dressel
 License: BSD-2-clause
 
 Files: include/compat/getopt.h
@@ -149,7 +152,7 @@
 License: ISC and BSD-2-clause
 
 Files: debian/*
-Copyright: 2018-2020 Dylan Aïssi
+Copyright: 2018-2021 Dylan Aïssi
 License: BSD-2-clause
 
 License: BSD-2-clause
diff -Nru dav1d-0.7.1/debian/libdav1d4.install dav1d-0.9.1/debian/libdav1d4.install
--- dav1d-0.7.1/debian/libdav1d4.install 2020-07-08 21:07:41.000000000 +0000
+++ dav1d-0.9.1/debian/libdav1d4.install 1970-01-01 00:00:00.000000000 +0000
@@ -1 +0,0 @@
-usr/lib/*/libdav1d.so.*
diff -Nru dav1d-0.7.1/debian/libdav1d4.lintian-overrides dav1d-0.9.1/debian/libdav1d4.lintian-overrides
--- dav1d-0.7.1/debian/libdav1d4.lintian-overrides 2020-07-08 21:07:41.000000000 +0000
+++ dav1d-0.9.1/debian/libdav1d4.lintian-overrides 1970-01-01 00:00:00.000000000 +0000
@@ -1,2 +0,0 @@
-# False positive
-spelling-error-in-binary usr/lib/x86_64-linux-gnu/libdav1d.so.4.0.2 AfE Safe
diff -Nru dav1d-0.7.1/debian/libdav1d4.symbols dav1d-0.9.1/debian/libdav1d4.symbols
--- dav1d-0.7.1/debian/libdav1d4.symbols 2020-07-08 21:07:41.000000000 +0000
+++ dav1d-0.9.1/debian/libdav1d4.symbols 1970-01-01 00:00:00.000000000 +0000
@@ -1,16 +0,0 @@
-libdav1d.so.4 #PACKAGE# #MINVER#
-* Build-Depends-Package: libdav1d-dev
- dav1d_close@Base 0.1.0
- dav1d_data_create@Base 0.1.0
-
dav1d_data_unref@Base 0.1.0 - dav1d_data_wrap@Base 0.1.0 - dav1d_data_wrap_user_data@Base 0.2.1 - dav1d_default_settings@Base 0.1.0 - dav1d_flush@Base 0.1.0 - dav1d_get_picture@Base 0.1.0 - dav1d_open@Base 0.1.0 - dav1d_parse_sequence_header@Base 0.1.0 - dav1d_picture_unref@Base 0.1.0 - dav1d_send_data@Base 0.1.0 - dav1d_set_cpu_flags_mask@Base 0.1.0 - dav1d_version@Base 0.1.0 diff -Nru dav1d-0.7.1/debian/libdav1d5.install dav1d-0.9.1/debian/libdav1d5.install --- dav1d-0.7.1/debian/libdav1d5.install 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/debian/libdav1d5.install 2021-07-30 09:21:25.000000000 +0000 @@ -0,0 +1 @@ +usr/lib/*/libdav1d.so.* diff -Nru dav1d-0.7.1/debian/libdav1d5.symbols dav1d-0.9.1/debian/libdav1d5.symbols --- dav1d-0.7.1/debian/libdav1d5.symbols 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/debian/libdav1d5.symbols 2021-07-30 09:21:25.000000000 +0000 @@ -0,0 +1,17 @@ +libdav1d.so.5 #PACKAGE# #MINVER# +* Build-Depends-Package: libdav1d-dev + dav1d_close@Base 0.1.0 + dav1d_data_create@Base 0.1.0 + dav1d_data_unref@Base 0.1.0 + dav1d_data_wrap@Base 0.1.0 + dav1d_data_wrap_user_data@Base 0.2.1 + dav1d_default_settings@Base 0.1.0 + dav1d_flush@Base 0.1.0 + dav1d_get_event_flags@Base 0.9.0 + dav1d_get_picture@Base 0.1.0 + dav1d_open@Base 0.1.0 + dav1d_parse_sequence_header@Base 0.1.0 + dav1d_picture_unref@Base 0.1.0 + dav1d_send_data@Base 0.1.0 + dav1d_set_cpu_flags_mask@Base 0.1.0 + dav1d_version@Base 0.1.0 diff -Nru dav1d-0.7.1/debian/rules dav1d-0.9.1/debian/rules --- dav1d-0.7.1/debian/rules 2021-08-08 16:42:17.000000000 +0000 +++ dav1d-0.9.1/debian/rules 2021-07-30 20:27:28.000000000 +0000 @@ -7,10 +7,12 @@ CONFIG_ARGS:= --buildtype="release" -DEB_HOST_ARCH?=$(shell dpkg-architecture -qDEB_HOST_ARCH) -ifeq (x32,$(DEB_HOST_ARCH)) -CONFIG_ARGS+= -Denable_asm=false -endif +# Should be fixed in upstream makefile since 0.8.0 +# https://code.videolan.org/videolan/dav1d/-/commit/725f37684d +#DEB_HOST_ARCH?=$(shell dpkg-architecture -qDEB_HOST_ARCH) +#ifeq (x32,$(DEB_HOST_ARCH)) +#CONFIG_ARGS+= -Denable_asm=false +#endif override_dh_auto_configure: dh_auto_configure -- ${CONFIG_ARGS} @@ -19,4 +21,4 @@ # Since 0.7.1, test timeout on armel # https://bugs.debian.org/964249 # So, increase test timeout values - dh_auto_test -- -t 10 + cd obj-* && LC_ALL=C.UTF-8 MESON_TESTTHREADS=1 meson test -t 10 Binary files /tmp/tmp5v66y4dz/QaUJRHZGKy/dav1d-0.7.1/doc/dav1d_logo.png and /tmp/tmp5v66y4dz/XxehL0GWKO/dav1d-0.9.1/doc/dav1d_logo.png differ diff -Nru dav1d-0.7.1/examples/dav1dplay.c dav1d-0.9.1/examples/dav1dplay.c --- dav1d-0.7.1/examples/dav1dplay.c 2020-06-21 11:48:54.948126300 +0000 +++ dav1d-0.9.1/examples/dav1dplay.c 2021-07-28 21:38:28.853851800 +0000 @@ -39,6 +39,11 @@ #include "dp_fifo.h" #include "dp_renderer.h" +#define FRAME_OFFSET_TO_PTS(foff) \ + (uint64_t)(((foff) * rd_ctx->spf) * 1000000000.0 + .5) +#define TS_TO_PTS(ts) \ + (uint64_t)(((ts) * rd_ctx->timebase) * 1000000000.0 + .5) + // Selected renderer callbacks and cookie static const Dav1dPlayRenderInfo *renderer_info = { NULL }; @@ -59,27 +64,43 @@ // Lock to protect access to the context structure SDL_mutex *lock; - // Timestamp of previous decoded frame - int64_t last_pts; - // Timestamp of current decoded frame - int64_t current_pts; + // Timestamp of last displayed frame (in timebase unit) + int64_t last_ts; + // Timestamp of last decoded frame (in timebase unit) + int64_t current_ts; // Ticks when last frame was received uint32_t last_ticks; // PTS time base double timebase; + // Seconds per frame + 
double spf; + // Number of frames + uint32_t total; // Fifo Dav1dPlayPtrFifo *fifo; - // Custom SDL2 event type - uint32_t renderer_event_type; + // Custom SDL2 event types + uint32_t event_types; + + // User pause state + uint8_t user_paused; + // Internal pause state + uint8_t paused; + // Start of internal pause state + uint32_t pause_start; + // Duration of internal pause state + uint32_t pause_time; + + // Seek accumulator + int seek; // Indicates if termination of the decoder thread was requested uint8_t dec_should_terminate; } Dav1dPlayRenderContext; static void dp_settings_print_usage(const char *const app, - const char *const reason, ...) + const char *const reason, ...) { if (reason) { va_list args; @@ -95,6 +116,7 @@ " --untimed/-u: ignore PTS, render as fast as possible\n" " --framethreads $num: number of frame threads (default: 1)\n" " --tilethreads $num: number of tile threads (default: 1)\n" + " --pfthreads $num: number of postfilter threads(default: 1)\n" " --highquality: enable high quality rendering\n" " --zerocopy/-z: enable zero copy upload path\n" " --gpugrain/-g: enable GPU grain synthesis\n" @@ -115,7 +137,7 @@ } static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, - const int argc, char *const *const argv) + const int argc, char *const *const argv) { int o; Dav1dPlaySettings *settings = &rd_ctx->settings; @@ -127,6 +149,7 @@ enum { ARG_FRAME_THREADS = 256, ARG_TILE_THREADS, + ARG_POSTFILTER_THREADS, ARG_HIGH_QUALITY, }; @@ -137,6 +160,7 @@ { "untimed", 0, NULL, 'u' }, { "framethreads", 1, NULL, ARG_FRAME_THREADS }, { "tilethreads", 1, NULL, ARG_TILE_THREADS }, + { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS }, { "highquality", 0, NULL, ARG_HIGH_QUALITY }, { "zerocopy", 0, NULL, 'z' }, { "gpugrain", 0, NULL, 'g' }, @@ -175,6 +199,10 @@ lib_settings->n_tile_threads = parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]); break; + case ARG_POSTFILTER_THREADS: + lib_settings->n_postfilter_threads = + parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]); + break; default: dp_settings_print_usage(argv[0], NULL); } @@ -213,16 +241,16 @@ Dav1dPlayRenderContext *rd_ctx; // Alloc - rd_ctx = malloc(sizeof(Dav1dPlayRenderContext)); + rd_ctx = calloc(1, sizeof(Dav1dPlayRenderContext)); if (rd_ctx == NULL) { return NULL; } // Register a custom event to notify our SDL main thread // about new frames - rd_ctx->renderer_event_type = SDL_RegisterEvents(1); - if (rd_ctx->renderer_event_type == UINT32_MAX) { - fprintf(stderr, "Failure to create custom SDL event type!\n"); + rd_ctx->event_types = SDL_RegisterEvents(3); + if (rd_ctx->event_types == UINT32_MAX) { + fprintf(stderr, "Failure to create custom SDL event types!\n"); free(rd_ctx); return NULL; } @@ -265,24 +293,17 @@ return NULL; } - rd_ctx->last_pts = 0; - rd_ctx->last_ticks = 0; - rd_ctx->current_pts = 0; - rd_ctx->timebase = 0; - rd_ctx->dec_should_terminate = 0; - return rd_ctx; } /** - * Notify about new available frame + * Notify about new event */ -static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code) +static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t type) { SDL_Event event; SDL_zero(event); - event.type = rd_ctx->renderer_event_type; - event.user.code = code; + event.type = type; SDL_PushEvent(&event); } @@ -294,10 +315,137 @@ * new picture. 
*/ static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx, - Dav1dPicture *dav1d_pic) + Dav1dPicture *dav1d_pic) { + rd_ctx->current_ts = dav1d_pic->m.timestamp; renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings); - rd_ctx->current_pts = dav1d_pic->m.timestamp; +} + +/** + * Toggle pause state + */ +static void dp_rd_ctx_toggle_pause(Dav1dPlayRenderContext *rd_ctx) +{ + SDL_LockMutex(rd_ctx->lock); + rd_ctx->user_paused = !rd_ctx->user_paused; + if (rd_ctx->seek) + goto out; + rd_ctx->paused = rd_ctx->user_paused; + uint32_t now = SDL_GetTicks(); + if (rd_ctx->paused) + rd_ctx->pause_start = now; + else { + rd_ctx->pause_time += now - rd_ctx->pause_start; + rd_ctx->pause_start = 0; + rd_ctx->last_ticks = now; + } +out: + SDL_UnlockMutex(rd_ctx->lock); +} + +/** + * Query pause state + */ +static int dp_rd_ctx_is_paused(Dav1dPlayRenderContext *rd_ctx) +{ + int ret; + SDL_LockMutex(rd_ctx->lock); + ret = rd_ctx->paused; + SDL_UnlockMutex(rd_ctx->lock); + return ret; +} + +/** + * Request seeking, in seconds + */ +static void dp_rd_ctx_seek(Dav1dPlayRenderContext *rd_ctx, int sec) +{ + SDL_LockMutex(rd_ctx->lock); + rd_ctx->seek += sec; + if (!rd_ctx->paused) + rd_ctx->pause_start = SDL_GetTicks(); + rd_ctx->paused = 1; + SDL_UnlockMutex(rd_ctx->lock); +} + +static int decode_frame(Dav1dPicture **p, Dav1dContext *c, + Dav1dData *data, DemuxerContext *in_ctx); +static inline void destroy_pic(void *a); + +/** + * Seek the stream, if requested + */ +static int dp_rd_ctx_handle_seek(Dav1dPlayRenderContext *rd_ctx, + DemuxerContext *in_ctx, + Dav1dContext *c, Dav1dData *data) +{ + int res = 0; + SDL_LockMutex(rd_ctx->lock); + if (!rd_ctx->seek) + goto out; + int64_t seek = rd_ctx->seek * 1000000000ULL; + uint64_t pts = TS_TO_PTS(rd_ctx->current_ts); + pts = ((int64_t)pts > -seek) ? 
pts + seek : 0; + int end = pts >= FRAME_OFFSET_TO_PTS(rd_ctx->total); + if (end) + pts = FRAME_OFFSET_TO_PTS(rd_ctx->total - 1); + uint64_t target_pts = pts; + dav1d_flush(c); + uint64_t shift = FRAME_OFFSET_TO_PTS(5); + while (1) { + if (shift > pts) + shift = pts; + if ((res = input_seek(in_ctx, pts - shift))) + goto out; + Dav1dSequenceHeader seq; + uint64_t cur_pts; + do { + if ((res = input_read(in_ctx, data))) + break; + cur_pts = TS_TO_PTS(data->m.timestamp); + res = dav1d_parse_sequence_header(&seq, data->data, data->sz); + } while (res && cur_pts < pts); + if (!res && cur_pts <= pts) + break; + if (shift > pts) + shift = pts; + pts -= shift; + } + if (!res) { + pts = TS_TO_PTS(data->m.timestamp); + while (pts < target_pts) { + Dav1dPicture *p; + if ((res = decode_frame(&p, c, data, in_ctx))) + break; + if (p) { + pts = TS_TO_PTS(p->m.timestamp); + if (pts < target_pts) + destroy_pic(p); + else { + dp_fifo_push(rd_ctx->fifo, p); + uint32_t type = rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME; + dp_rd_ctx_post_event(rd_ctx, type); + } + } + } + if (!res) { + rd_ctx->last_ts = data->m.timestamp - rd_ctx->spf / rd_ctx->timebase; + rd_ctx->current_ts = data->m.timestamp; + } + } +out: + rd_ctx->paused = rd_ctx->user_paused; + if (!rd_ctx->paused && rd_ctx->seek) { + uint32_t now = SDL_GetTicks(); + rd_ctx->pause_time += now - rd_ctx->pause_start; + rd_ctx->pause_start = 0; + rd_ctx->last_ticks = now; + } + rd_ctx->seek = 0; + SDL_UnlockMutex(rd_ctx->lock); + if (res) + fprintf(stderr, "Error seeking, aborting\n"); + return res; } /** @@ -329,14 +477,15 @@ */ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx) { + SDL_LockMutex(rd_ctx->lock); // Calculate time since last frame was received uint32_t ticks_now = SDL_GetTicks(); uint32_t ticks_diff = (rd_ctx->last_ticks != 0) ? ticks_now - rd_ctx->last_ticks : 0; // Calculate when to display the frame - int64_t pts_diff = rd_ctx->current_pts - rd_ctx->last_pts; - int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff; - rd_ctx->last_pts = rd_ctx->current_pts; + int64_t ts_diff = rd_ctx->current_ts - rd_ctx->last_ts; + int32_t pts_diff = (ts_diff * rd_ctx->timebase) * 1000.0 + .5; + int32_t wait_time = pts_diff - ticks_diff; // In untimed mode, simply don't wait if (rd_ctx->settings.untimed) @@ -347,13 +496,59 @@ // accurate player this would need to be done in a better way. if (wait_time > 0) { SDL_Delay(wait_time); - } else if (wait_time < -10) { // Do not warn for minor time drifts - fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000); + } else if (wait_time < -10 && !rd_ctx->paused) { // Do not warn for minor time drifts + fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time / 1000.0); } renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings); + rd_ctx->last_ts = rd_ctx->current_ts; rd_ctx->last_ticks = SDL_GetTicks(); + + SDL_UnlockMutex(rd_ctx->lock); +} + +static int decode_frame(Dav1dPicture **p, Dav1dContext *c, + Dav1dData *data, DemuxerContext *in_ctx) +{ + int res; + // Send data packets we got from the demuxer to dav1d + if ((res = dav1d_send_data(c, data)) < 0) { + // On EAGAIN, dav1d can not consume more data and + // dav1d_get_picture needs to be called first, which + // will happen below, so just keep going in that case + // and do not error out. 
+ if (res != DAV1D_ERR(EAGAIN)) { + dav1d_data_unref(data); + goto err; + } + } + *p = calloc(1, sizeof(**p)); + // Try to get a decoded frame + if ((res = dav1d_get_picture(c, *p)) < 0) { + // In all error cases, even EAGAIN, p needs to be freed as + // it is never added to the queue and would leak. + free(*p); + *p = NULL; + // On EAGAIN, it means dav1d has not enough data to decode + // therefore this is not a decoding error but just means + // we need to feed it more data, which happens in the next + // run of the decoder loop. + if (res != DAV1D_ERR(EAGAIN)) + goto err; + } + return data->sz == 0 ? input_read(in_ctx, data) : 0; +err: + fprintf(stderr, "Error decoding frame: %s\n", + strerror(-res)); + return res; +} + +static inline void destroy_pic(void *a) +{ + Dav1dPicture *p = (Dav1dPicture *)a; + dav1d_picture_unref(p); + free(p); } /* Decoder thread "main" function */ @@ -366,10 +561,7 @@ Dav1dData data; DemuxerContext *in_ctx = NULL; int res = 0; - unsigned n_out = 0, total, timebase[2], fps[2]; - - // Store current ticks for stats calculation - uint32_t decoder_start = SDL_GetTicks(); + unsigned total, timebase[2], fps[2]; Dav1dPlaySettings settings = rd_ctx->settings; @@ -382,8 +574,9 @@ goto cleanup; } - double timebase_d = timebase[1]/(double)timebase[0]; - rd_ctx->timebase = timebase_d; + rd_ctx->timebase = (double)timebase[1] / timebase[0]; + rd_ctx->spf = (double)fps[1] / fps[0]; + rd_ctx->total = total; if ((res = dav1d_open(&c, &rd_ctx->lib_settings))) { fprintf(stderr, "Failed opening dav1d decoder\n"); @@ -398,55 +591,29 @@ } // Decoder loop - do { - if (dp_rd_ctx_should_terminate(rd_ctx)) + while (1) { + if (dp_rd_ctx_should_terminate(rd_ctx) || + (res = dp_rd_ctx_handle_seek(rd_ctx, in_ctx, c, &data)) || + (res = decode_frame(&p, c, &data, in_ctx))) + { break; - - // Send data packets we got from the demuxer to dav1d - if ((res = dav1d_send_data(c, &data)) < 0) { - // On EAGAIN, dav1d can not consume more data and - // dav1d_get_picture needs to be called first, which - // will happen below, so just keep going in that case - // and do not error out. - if (res != DAV1D_ERR(EAGAIN)) { - dav1d_data_unref(&data); - fprintf(stderr, "Error decoding frame: %s\n", - strerror(-res)); - break; - } } - - p = calloc(1, sizeof(*p)); - - // Try to get a decoded frame - if ((res = dav1d_get_picture(c, p)) < 0) { - // In all error cases, even EAGAIN, p needs to be freed as - // it is never added to the queue and would leak. - free(p); - - // On EAGAIN, it means dav1d has not enough data to decode - // therefore this is not a decoding error but just means - // we need to feed it more data, which happens in the next - // run of this decoder loop. - if (res != DAV1D_ERR(EAGAIN)) { - fprintf(stderr, "Error decoding frame: %s\n", - strerror(-res)); - break; - } - res = 0; - } else { - + else if (p) { // Queue frame - dp_fifo_push(rd_ctx->fifo, p); - dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME); - - n_out++; + SDL_LockMutex(rd_ctx->lock); + int seek = rd_ctx->seek; + SDL_UnlockMutex(rd_ctx->lock); + if (!seek) { + dp_fifo_push(rd_ctx->fifo, p); + uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME; + dp_rd_ctx_post_event(rd_ctx, type); + } } - } while ((data.sz > 0 || !input_read(in_ctx, &data))); + } // Release remaining data - if (data.sz > 0) dav1d_data_unref(&data); - + if (data.sz > 0) + dav1d_data_unref(&data); // Do not drain in case an error occured and caused us to leave the // decoding loop early. 
if (res < 0) @@ -461,7 +628,6 @@ do { if (dp_rd_ctx_should_terminate(rd_ctx)) break; - p = calloc(1, sizeof(*p)); res = dav1d_get_picture(c, p); if (res < 0) { @@ -474,19 +640,13 @@ } else { // Queue frame dp_fifo_push(rd_ctx->fifo, p); - dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_NEW_FRAME); - - n_out++; + uint32_t type = rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME; + dp_rd_ctx_post_event(rd_ctx, type); } } while (res != DAV1D_ERR(EAGAIN)); - // Print stats - uint32_t decoding_time_ms = SDL_GetTicks() - decoder_start; - printf("Decoded %u frames in %d seconds, avg %.02f fps\n", - n_out, decoding_time_ms/1000, n_out / (decoding_time_ms / 1000.0)); - cleanup: - dp_rd_ctx_post_event(rd_ctx, DAV1D_EVENT_DEC_QUIT); + dp_rd_ctx_post_event(rd_ctx, rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT); if (in_ctx) input_close(in_ctx); @@ -543,41 +703,84 @@ decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx); // Main loop +#define NUM_MAX_EVENTS 8 + SDL_Event events[NUM_MAX_EVENTS]; + int num_frame_events = 0; + uint32_t start_time = 0, n_out = 0; while (1) { - - SDL_Event e; - if (SDL_WaitEvent(&e)) { - if (e.type == SDL_QUIT) { + int num_events = 0; + SDL_WaitEvent(NULL); + while (num_events < NUM_MAX_EVENTS && SDL_PollEvent(&events[num_events++])) + break; + for (int i = 0; i < num_events; ++i) { + SDL_Event *e = &events[i]; + if (e->type == SDL_QUIT) { dp_rd_ctx_request_shutdown(rd_ctx); - } else if (e.type == SDL_WINDOWEVENT) { - if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) { + dp_fifo_flush(rd_ctx->fifo, destroy_pic); + SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME); + SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME); + num_frame_events = 0; + } else if (e->type == SDL_WINDOWEVENT) { + if (e->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) { // TODO: Handle window resizes + } else if(e->window.event == SDL_WINDOWEVENT_EXPOSED) { + dp_rd_ctx_render(rd_ctx); } - } else if (e.type == rd_ctx->renderer_event_type) { - if (e.user.code == DAV1D_EVENT_NEW_FRAME) { - // Dequeue frame and update the render context with it - Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo); - - // Do not update textures during termination - if (!dp_rd_ctx_should_terminate(rd_ctx)) - dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p); - dav1d_picture_unref(p); - free(p); - } else if (e.user.code == DAV1D_EVENT_DEC_QUIT) { - break; + } else if (e->type == SDL_KEYDOWN) { + SDL_KeyboardEvent *kbde = (SDL_KeyboardEvent *)e; + if (kbde->keysym.sym == SDLK_SPACE) { + dp_rd_ctx_toggle_pause(rd_ctx); + } else if (kbde->keysym.sym == SDLK_LEFT || + kbde->keysym.sym == SDLK_RIGHT) + { + if (kbde->keysym.sym == SDLK_LEFT) + dp_rd_ctx_seek(rd_ctx, -5); + else if (kbde->keysym.sym == SDLK_RIGHT) + dp_rd_ctx_seek(rd_ctx, +5); + dp_fifo_flush(rd_ctx->fifo, destroy_pic); + SDL_FlushEvent(rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME); + num_frame_events = 0; } + } else if (e->type == rd_ctx->event_types + DAV1D_EVENT_NEW_FRAME) { + num_frame_events++; + // Store current ticks for stats calculation + if (start_time == 0) + start_time = SDL_GetTicks(); + } else if (e->type == rd_ctx->event_types + DAV1D_EVENT_SEEK_FRAME) { + // Dequeue frame and update the render context with it + Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo); + // Do not update textures during termination + if (!dp_rd_ctx_should_terminate(rd_ctx)) { + dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p); + n_out++; + } + destroy_pic(p); + } else if (e->type == rd_ctx->event_types + DAV1D_EVENT_DEC_QUIT) { + goto out; } } - - // Do not 
render during termination - if (!dp_rd_ctx_should_terminate(rd_ctx)) - dp_rd_ctx_render(rd_ctx); + if (num_frame_events && !dp_rd_ctx_is_paused(rd_ctx)) { + // Dequeue frame and update the render context with it + Dav1dPicture *p = dp_fifo_shift(rd_ctx->fifo); + // Do not update textures during termination + if (!dp_rd_ctx_should_terminate(rd_ctx)) { + dp_rd_ctx_update_with_dav1d_picture(rd_ctx, p); + dp_rd_ctx_render(rd_ctx); + n_out++; + } + destroy_pic(p); + num_frame_events--; + } } +out:; + // Print stats + uint32_t time_ms = SDL_GetTicks() - start_time - rd_ctx->pause_time; + printf("Decoded %u frames in %d seconds, avg %.02f fps\n", + n_out, time_ms / 1000, n_out/ (time_ms / 1000.0)); + int decoder_ret = 0; SDL_WaitThread(decoder_thread, &decoder_ret); - dp_rd_ctx_destroy(rd_ctx); - return decoder_ret; } diff -Nru dav1d-0.7.1/examples/dp_fifo.c dav1d-0.9.1/examples/dp_fifo.c --- dav1d-0.7.1/examples/dp_fifo.c 2020-06-21 11:48:54.948126300 +0000 +++ dav1d-0.9.1/examples/dp_fifo.c 2021-07-28 21:38:28.853851800 +0000 @@ -37,6 +37,8 @@ size_t capacity; size_t count; void **entries; + int push_wait; + int flush; }; @@ -54,6 +56,8 @@ fifo->capacity = capacity; fifo->count = 0; + fifo->push_wait = 0; + fifo->flush = 0; fifo->lock = SDL_CreateMutex(); if (fifo->lock == NULL) { @@ -90,8 +94,16 @@ void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element) { SDL_LockMutex(fifo->lock); - while (fifo->count == fifo->capacity) + while (fifo->count == fifo->capacity) { + fifo->push_wait = 1; SDL_CondWait(fifo->cond_change, fifo->lock); + fifo->push_wait = 0; + if (fifo->flush) { + SDL_CondSignal(fifo->cond_change); + SDL_UnlockMutex(fifo->lock); + return; + } + } fifo->entries[fifo->count++] = element; if (fifo->count == 1) SDL_CondSignal(fifo->cond_change); @@ -120,4 +132,16 @@ return res; } - +void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *)) +{ + SDL_LockMutex(fifo->lock); + fifo->flush = 1; + if (fifo->push_wait) { + SDL_CondSignal(fifo->cond_change); + SDL_CondWait(fifo->cond_change, fifo->lock); + } + while (fifo->count) + destroy_elem(fifo->entries[--fifo->count]); + fifo->flush = 0; + SDL_UnlockMutex(fifo->lock); +} diff -Nru dav1d-0.7.1/examples/dp_fifo.h dav1d-0.9.1/examples/dp_fifo.h --- dav1d-0.7.1/examples/dp_fifo.h 2020-06-21 11:48:54.948126300 +0000 +++ dav1d-0.9.1/examples/dp_fifo.h 2021-07-28 21:38:28.853851800 +0000 @@ -59,3 +59,5 @@ * other thread will call dp_fifo_shift will lead to a deadlock. 
*/ void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element); + +void dp_fifo_flush(Dav1dPlayPtrFifo *fifo, void (*destroy_elem)(void *)); diff -Nru dav1d-0.7.1/examples/dp_renderer.h dav1d-0.9.1/examples/dp_renderer.h --- dav1d-0.7.1/examples/dp_renderer.h 2020-06-21 11:48:54.948126300 +0000 +++ dav1d-0.9.1/examples/dp_renderer.h 2021-07-28 21:38:28.853851800 +0000 @@ -66,8 +66,11 @@ #define WINDOW_WIDTH 910 #define WINDOW_HEIGHT 512 -#define DAV1D_EVENT_NEW_FRAME 1 -#define DAV1D_EVENT_DEC_QUIT 2 +enum { + DAV1D_EVENT_NEW_FRAME, + DAV1D_EVENT_SEEK_FRAME, + DAV1D_EVENT_DEC_QUIT +}; /** * Renderer info @@ -79,12 +82,12 @@ // Cookie passed to the renderer implementation callbacks void *cookie; // Callback to create the renderer - void* (*create_renderer)(); + void* (*create_renderer)(void); // Callback to destroy the renderer void (*destroy_renderer)(void *cookie); // Callback to the render function that renders a prevously sent frame void (*render)(void *cookie, const Dav1dPlaySettings *settings); - // Callback to the send frame function + // Callback to the send frame function, _may_ also unref dav1d_pic! int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic, const Dav1dPlaySettings *settings); // Callback for alloc/release pictures (optional) diff -Nru dav1d-0.7.1/examples/dp_renderer_placebo.c dav1d-0.9.1/examples/dp_renderer_placebo.c --- dav1d-0.7.1/examples/dp_renderer_placebo.c 2020-06-21 11:48:54.952126300 +0000 +++ dav1d-0.9.1/examples/dp_renderer_placebo.c 2021-07-28 21:38:28.853851800 +0000 @@ -30,7 +30,7 @@ #include #include -#include +#include #ifdef HAVE_PLACEBO_VULKAN # include @@ -72,7 +72,7 @@ // Lock protecting access to the texture SDL_mutex *lock; // Image to render, and planes backing them - struct pl_image image; + struct pl_frame image; const struct pl_tex *plane_tex[3]; } Dav1dPlayRendererPrivateContext; @@ -319,22 +319,15 @@ if (settings->highquality) render_params = pl_render_default_params; - struct pl_render_target target; - pl_render_target_from_swapchain(&target, &frame); - target.profile = (struct pl_icc_profile) { - .data = NULL, - .len = 0, - }; - -#if PL_API_VER >= 66 - pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0); - if (pl_render_target_partial(&target)) - pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 }); -#endif + struct pl_frame target; + pl_frame_from_swapchain(&target, &frame); + pl_rect2df_aspect_copy(&target.crop, &rd_priv_ctx->image.crop, 0.0); + if (pl_frame_is_cropped(&target)) + pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 0.0 }); if (!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) { fprintf(stderr, "Failed rendering frame!\n"); - pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 }); + pl_tex_clear(rd_priv_ctx->gpu, frame.fbo, (float[4]){ 1.0 }); } ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain); @@ -351,320 +344,37 @@ static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic, const Dav1dPlaySettings *settings) { - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - - if (dav1d_pic == NULL) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return 0; - } - - int width = dav1d_pic->p.w; - int height = dav1d_pic->p.h; - int sub_x = 0, sub_y = 0; - int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up - enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN; - - struct pl_image *image = &rd_priv_ctx->image; - *image = (struct pl_image) { - .num_planes = 3, - 
.width = width, - .height = height, - .src_rect = {0, 0, width, height}, - - .repr = { - .bits = { - .sample_depth = bytes * 8, - .color_depth = dav1d_pic->p.bpc, - }, - }, + Dav1dPlayRendererPrivateContext *p = cookie; + assert(p != NULL); + int ret = 0; + + if (!dav1d_pic) + return ret; + + struct pl_dav1d_upload_params params = { + .picture = dav1d_pic, + .film_grain = settings->gpugrain, + .gpu_allocated = settings->zerocopy, + .asynchronous = true, }; - // Figure out the correct plane dimensions/count - switch (dav1d_pic->p.layout) { - case DAV1D_PIXEL_LAYOUT_I400: - image->num_planes = 1; - break; - case DAV1D_PIXEL_LAYOUT_I420: - sub_x = sub_y = 1; - break; - case DAV1D_PIXEL_LAYOUT_I422: - sub_x = 1; - break; - case DAV1D_PIXEL_LAYOUT_I444: - break; - } - - // Set the right colorspace metadata etc. - switch (dav1d_pic->seq_hdr->pri) { - case DAV1D_COLOR_PRI_UNKNOWN: image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break; - case DAV1D_COLOR_PRI_BT709: image->color.primaries = PL_COLOR_PRIM_BT_709; break; - case DAV1D_COLOR_PRI_BT470M: image->color.primaries = PL_COLOR_PRIM_BT_470M; break; - case DAV1D_COLOR_PRI_BT470BG: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break; - case DAV1D_COLOR_PRI_BT601: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break; - case DAV1D_COLOR_PRI_BT2020: image->color.primaries = PL_COLOR_PRIM_BT_2020; break; - - case DAV1D_COLOR_PRI_XYZ: - // Handled below - assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY); - break; - - default: - printf("warning: unknown dav1d color primaries %d.. ignoring, picture " - "may be very incorrect\n", dav1d_pic->seq_hdr->pri); - break; - } - - switch (dav1d_pic->seq_hdr->trc) { - case DAV1D_TRC_BT709: - case DAV1D_TRC_BT470M: - case DAV1D_TRC_BT470BG: - case DAV1D_TRC_BT601: - case DAV1D_TRC_SMPTE240: - case DAV1D_TRC_BT2020_10BIT: - case DAV1D_TRC_BT2020_12BIT: - // These all map to the effective "SDR" CRT-based EOTF, BT.1886 - image->color.transfer = PL_COLOR_TRC_BT_1886; - break; - - case DAV1D_TRC_UNKNOWN: image->color.transfer = PL_COLOR_TRC_UNKNOWN; break; - case DAV1D_TRC_LINEAR: image->color.transfer = PL_COLOR_TRC_LINEAR; break; - case DAV1D_TRC_SRGB: image->color.transfer = PL_COLOR_TRC_SRGB; break; - case DAV1D_TRC_SMPTE2084: image->color.transfer = PL_COLOR_TRC_PQ; break; - case DAV1D_TRC_HLG: image->color.transfer = PL_COLOR_TRC_HLG; break; - - default: - printf("warning: unknown dav1d color transfer %d.. 
ignoring, picture " - "may be very incorrect\n", dav1d_pic->seq_hdr->trc); - break; - } - - switch (dav1d_pic->seq_hdr->mtrx) { - case DAV1D_MC_IDENTITY: - // This is going to be either RGB or XYZ - if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) { - image->repr.sys = PL_COLOR_SYSTEM_XYZ; - } else { - image->repr.sys = PL_COLOR_SYSTEM_RGB; - } - break; - - case DAV1D_MC_UNKNOWN: - // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one - image->repr.sys = pl_color_system_guess_ycbcr(width, height); - break; - - case DAV1D_MC_BT709: image->repr.sys = PL_COLOR_SYSTEM_BT_709; break; - case DAV1D_MC_BT601: image->repr.sys = PL_COLOR_SYSTEM_BT_601; break; - case DAV1D_MC_SMPTE240: image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break; - case DAV1D_MC_SMPTE_YCGCO: image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break; - case DAV1D_MC_BT2020_NCL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break; - case DAV1D_MC_BT2020_CL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break; - - case DAV1D_MC_ICTCP: - // This one is split up based on the actual HDR curve in use - if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) { - image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG; - } else { - image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ; - } - break; - - default: - printf("warning: unknown dav1d color matrix %d.. ignoring, picture " - "may be very incorrect\n", dav1d_pic->seq_hdr->mtrx); - break; - } - - if (dav1d_pic->seq_hdr->color_range) { - image->repr.levels = PL_COLOR_LEVELS_PC; - } else { - image->repr.levels = PL_COLOR_LEVELS_TV; - } - - switch (dav1d_pic->seq_hdr->chr) { - case DAV1D_CHR_UNKNOWN: chroma_loc = PL_CHROMA_UNKNOWN; break; - case DAV1D_CHR_VERTICAL: chroma_loc = PL_CHROMA_LEFT; break; - case DAV1D_CHR_COLOCATED: chroma_loc = PL_CHROMA_TOP_LEFT; break; - } - -#if PL_API_VER >= 63 - if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) { - Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data; - struct pl_av1_grain_data *dst = &image->av1_grain; - *dst = (struct pl_av1_grain_data) { - .grain_seed = src->seed, - .num_points_y = src->num_y_points, - .chroma_scaling_from_luma = src->chroma_scaling_from_luma, - .num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] }, - .scaling_shift = src->scaling_shift, - .ar_coeff_lag = src->ar_coeff_lag, - .ar_coeff_shift = src->ar_coeff_shift, - .grain_scale_shift = src->grain_scale_shift, - .uv_mult = { src->uv_mult[0], src->uv_mult[1] }, - .uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] }, - .uv_offset = { src->uv_offset[0], src->uv_offset[1] }, - .overlap = src->overlap_flag, - }; - - assert(sizeof(dst->points_y) == sizeof(src->y_points)); - assert(sizeof(dst->points_uv) == sizeof(src->uv_points)); - assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y)); - memcpy(dst->points_y, src->y_points, sizeof(src->y_points)); - memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points)); - memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y)); - - // this one has different row sizes for alignment - for (int c = 0; c < 2; c++) { - for (int i = 0; i < 25; i++) - dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i]; - } - } -#endif - - // Upload the actual planes - struct pl_plane_data data[3] = { - { - // Y plane - .type = PL_FMT_UNORM, - .width = width, - .height = height, - .pixel_stride = bytes, - .row_stride = dav1d_pic->stride[0], - .component_size = {bytes * 8}, - .component_map = {0}, - }, { - // U plane - .type = PL_FMT_UNORM, - .width = width >> sub_x, - .height = height >> sub_y, - 
.pixel_stride = bytes, - .row_stride = dav1d_pic->stride[1], - .component_size = {bytes * 8}, - .component_map = {1}, - }, { - // V plane - .type = PL_FMT_UNORM, - .width = width >> sub_x, - .height = height >> sub_y, - .pixel_stride = bytes, - .row_stride = dav1d_pic->stride[1], - .component_size = {bytes * 8}, - .component_map = {2}, - }, - }; - - bool ok = true; - - for (int i = 0; i < image->num_planes; i++) { - if (settings->zerocopy) { - const struct pl_buf *buf = dav1d_pic->allocator_data; - assert(buf); - data[i].buf = buf; - data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data; - } else { - data[i].pixels = dav1d_pic->data[i]; - } - - ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]); - } - - // Apply the correct chroma plane shift. This has to be done after pl_upload_plane -#if PL_API_VER >= 67 - pl_image_set_chroma_location(image, chroma_loc); -#else - pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y); - pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y); -#endif - - if (!ok) { + SDL_LockMutex(p->lock); + if (!pl_upload_dav1dpicture(p->gpu, &p->image, p->plane_tex, ¶ms)) { fprintf(stderr, "Failed uploading planes!\n"); - *image = (struct pl_image) {0}; + p->image = (struct pl_frame) {0}; + ret = -1; } - - SDL_UnlockMutex(rd_priv_ctx->lock); - return !ok; + SDL_UnlockMutex(p->lock); + return ret; } -// Align to power of 2 -#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) - -static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie) +static int placebo_alloc_pic(Dav1dPicture *const pic, void *cookie) { Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); - SDL_LockMutex(rd_priv_ctx->lock); - - const struct pl_gpu *gpu = rd_priv_ctx->gpu; - int ret = DAV1D_ERR(ENOMEM); - - // Copied from dav1d_default_picture_alloc - const int hbd = p->p.bpc > 8; - const int aligned_w = ALIGN2(p->p.w, 128); - const int aligned_h = ALIGN2(p->p.h, 128); - const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; - const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; - const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; - p->stride[0] = aligned_w << hbd; - p->stride[1] = has_chroma ? 
(aligned_w >> ss_hor) << hbd : 0; - - // Align strides up to multiples of the GPU performance hints - p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride); - p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride); - - // Aligning offsets to 4 also implicity aligns to the texel size (1 or 2) - size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4); - const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align); - const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align); - - // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment, - // even in the case that the driver gives us insane alignments - const size_t pic_size = y_sz + 2 * uv_sz; - const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4; - - // Validate size limitations - if (total_size > gpu->limits.max_xfer_size) { - printf("alloc of %zu bytes exceeds limits\n", total_size); - goto err; - } - - const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) { - .type = PL_BUF_TEX_TRANSFER, - .host_mapped = true, - .size = total_size, - .memory_type = PL_BUF_MEM_HOST, - .user_data = p, - }); - - if (!buf) { - printf("alloc of GPU mapped buffer failed\n"); - goto err; - } - - assert(buf->data); - uintptr_t base = (uintptr_t) buf->data, data[3]; - data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT); - data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT); - data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT); - - // Sanity check offset alignment for the sake of debugging - if (data[0] - base != ALIGN2(data[0] - base, off_align) || - data[1] - base != ALIGN2(data[1] - base, off_align) || - data[2] - base != ALIGN2(data[2] - base, off_align)) - { - printf("GPU buffer horribly misaligned, expect slowdown!\n"); - } - - p->allocator_data = (void *) buf; - p->data[0] = (void *) data[0]; - p->data[1] = (void *) data[1]; - p->data[2] = (void *) data[2]; - ret = 0; - // fall through -err: + SDL_LockMutex(rd_priv_ctx->lock); + int ret = pl_allocate_dav1dpicture(pic, rd_priv_ctx->gpu); SDL_UnlockMutex(rd_priv_ctx->lock); return ret; } @@ -673,11 +383,9 @@ { Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; assert(rd_priv_ctx != NULL); - assert(pic->allocator_data); SDL_LockMutex(rd_priv_ctx->lock); - const struct pl_gpu *gpu = rd_priv_ctx->gpu; - pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data); + pl_release_dav1dpicture(pic, rd_priv_ctx->gpu); SDL_UnlockMutex(rd_priv_ctx->lock); } @@ -690,10 +398,7 @@ .update_frame = placebo_upload_image, .alloc_pic = placebo_alloc_pic, .release_pic = placebo_release_pic, - -# if PL_API_VER >= 63 .supports_gpu_grain = 1, -# endif }; #else const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL }; @@ -706,12 +411,7 @@ .destroy_renderer = placebo_renderer_destroy, .render = placebo_render, .update_frame = placebo_upload_image, - .alloc_pic = placebo_alloc_pic, - .release_pic = placebo_release_pic, - -# if PL_API_VER >= 63 .supports_gpu_grain = 1, -# endif }; #else const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL }; diff -Nru dav1d-0.7.1/examples/dp_renderer_sdl.c dav1d-0.9.1/examples/dp_renderer_sdl.c --- dav1d-0.7.1/examples/dp_renderer_sdl.c 2020-06-21 11:48:54.952126300 +0000 +++ dav1d-0.9.1/examples/dp_renderer_sdl.c 2021-07-28 21:38:28.853851800 +0000 @@ -43,7 +43,7 @@ SDL_Texture *tex; } Dav1dPlayRendererPrivateContext; -static void *sdl_renderer_create() +static void *sdl_renderer_create(void) { SDL_Window *win = dp_create_sdl_window(0); if (win == NULL) diff -Nru 
dav1d-0.7.1/examples/meson.build dav1d-0.9.1/examples/meson.build --- dav1d-0.7.1/examples/meson.build 2020-06-21 11:48:54.952126300 +0000 +++ dav1d-0.9.1/examples/meson.build 2021-07-28 21:38:28.853851800 +0000 @@ -43,10 +43,10 @@ sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true) if sdl2_dependency.found() - dav1dplay_deps = [sdl2_dependency] + dav1dplay_deps = [sdl2_dependency, libm_dependency] dav1dplay_cflags = [] - placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false) + placebo_dependency = dependency('libplacebo', version: '>= 3.110.0', required: false) if placebo_dependency.found() dav1dplay_deps += placebo_dependency diff -Nru dav1d-0.7.1/include/common/attributes.h dav1d-0.9.1/include/common/attributes.h --- dav1d-0.7.1/include/common/attributes.h 2020-06-21 11:48:54.952126300 +0000 +++ dav1d-0.9.1/include/common/attributes.h 2021-07-28 21:38:28.853851800 +0000 @@ -31,6 +31,15 @@ #include "config.h" #include +#include + +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +#ifndef __has_feature +#define __has_feature(x) 0 +#endif #ifdef __GNUC__ #define ATTR_ALIAS __attribute__((may_alias)) @@ -92,9 +101,11 @@ */ #ifdef _MSC_VER #define NOINLINE __declspec(noinline) -#else /* !_MSC_VER */ +#elif __has_attribute(noclone) +#define NOINLINE __attribute__((noinline, noclone)) +#else #define NOINLINE __attribute__((noinline)) -#endif /* !_MSC_VER */ +#endif #ifdef __clang__ #define NO_SANITIZE(x) __attribute__((no_sanitize(x))) @@ -103,11 +114,11 @@ #endif #if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__)) +#undef assert #define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0) #elif defined(NDEBUG) && defined(_MSC_VER) +#undef assert #define assert __assume -#else -#include #endif #if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) @@ -116,8 +127,8 @@ # define dav1d_uninit(x) x #endif - #ifdef _MSC_VER - #include +#if defined(_MSC_VER) && !defined(__clang__) +#include static inline int ctz(const unsigned int mask) { unsigned long idx; @@ -159,8 +170,12 @@ } #endif /* !_MSC_VER */ -#ifndef __has_feature -#define __has_feature(x) 0 +#ifndef static_assert +#define CHECK_OFFSET(type, field, name) \ + struct check_##type##_##field { int x[(name == offsetof(type, field)) ? 1 : -1]; } +#else +#define CHECK_OFFSET(type, field, name) \ + static_assert(name == offsetof(type, field), #field) #endif #endif /* DAV1D_COMMON_ATTRIBUTES_H */ diff -Nru dav1d-0.7.1/include/common/frame.h dav1d-0.9.1/include/common/frame.h --- dav1d-0.7.1/include/common/frame.h 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/include/common/frame.h 2021-07-28 21:38:28.853851800 +0000 @@ -0,0 +1,45 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_COMMON_FRAME_H +#define DAV1D_COMMON_FRAME_H + +/* + * Checks whether Dav1dFrameType == INTER || == SWITCH + * Both are defined as odd numbers {1, 3} and therefore have the LSB set. + * See also: AV1 spec 6.8.2 + */ +#define IS_INTER_OR_SWITCH(frame_header) \ + ((frame_header)->frame_type & 1) + +/* + * Checks whether Dav1dFrameType == KEY || == INTRA + * See also: AV1 spec 6.8.2 + */ +#define IS_KEY_OR_INTRA(frame_header) \ + (!IS_INTER_OR_SWITCH(frame_header)) + +#endif /* DAV1D_COMMON_FRAME_H */ diff -Nru dav1d-0.7.1/include/common/mem.h dav1d-0.9.1/include/common/mem.h --- dav1d-0.7.1/include/common/mem.h 2020-06-21 11:48:54.952126300 +0000 +++ dav1d-0.9.1/include/common/mem.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,84 +0,0 @@ -/* - * Copyright © 2018, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef DAV1D_COMMON_MEM_H -#define DAV1D_COMMON_MEM_H - -#include - -#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN) -#include -#endif - -#include "common/attributes.h" - -/* - * Allocate align-byte aligned memory. The return value can be released - * by calling the dav1d_free_aligned() function. 
- */
-static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
-    assert(!(align & (align - 1)));
-#ifdef HAVE_POSIX_MEMALIGN
-    void *ptr;
-    if (posix_memalign(&ptr, align, sz)) return NULL;
-    return ptr;
-#elif defined(HAVE_ALIGNED_MALLOC)
-    return _aligned_malloc(sz, align);
-#elif defined(HAVE_MEMALIGN)
-    return memalign(align, sz);
-#else
-#error Missing aligned alloc implementation
-#endif
-}
-
-static inline void dav1d_free_aligned(void* ptr) {
-#ifdef HAVE_POSIX_MEMALIGN
-    free(ptr);
-#elif defined(HAVE_ALIGNED_MALLOC)
-    _aligned_free(ptr);
-#elif defined(HAVE_MEMALIGN)
-    free(ptr);
-#endif
-}
-
-static inline void dav1d_freep_aligned(void* ptr) {
-    void **mem = (void **) ptr;
-    if (*mem) {
-        dav1d_free_aligned(*mem);
-        *mem = NULL;
-    }
-}
-
-static inline void freep(void *ptr) {
-    void **mem = (void **) ptr;
-    if (*mem) {
-        free(*mem);
-        *mem = NULL;
-    }
-}
-
-#endif /* DAV1D_COMMON_MEM_H */
diff -Nru dav1d-0.7.1/include/dav1d/dav1d.h dav1d-0.9.1/include/dav1d/dav1d.h
--- dav1d-0.7.1/include/dav1d/dav1d.h 2020-06-21 11:48:54.952126300 +0000
+++ dav1d-0.9.1/include/dav1d/dav1d.h 2021-07-28 21:38:28.853851800 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
@@ -45,6 +45,7 @@
 #define DAV1D_MAX_FRAME_THREADS 256
 #define DAV1D_MAX_TILE_THREADS 64
+#define DAV1D_MAX_POSTFILTER_THREADS 256
 
 typedef struct Dav1dLogger {
     void *cookie; ///< Custom data to pass to the callback.
@@ -65,9 +66,10 @@
     int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
     int all_layers; ///< output all spatial layers of a scalable AV1 biststream
     unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
-    uint8_t reserved[32]; ///< reserved for future use
     Dav1dPicAllocator allocator; ///< Picture allocator callback.
     Dav1dLogger logger; ///< Logger callback.
+    int n_postfilter_threads;
+    uint8_t reserved[28]; ///< reserved for future use
 } Dav1dSettings;
 
 /**
@@ -103,7 +105,12 @@
 * @param buf The data to be parser.
 * @param sz Size of the data.
 *
- * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
+ * @return
+ *         0: Success, and out is filled with the parsed Sequence Header
+ *            OBU parameters.
+ *         DAV1D_ERR(ENOENT): No Sequence Header OBUs were found in the buffer.
+ *         other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in
+ *                                         arguments, and other errors during parsing.
 *
 * @note It is safe to feed this function data containing other OBUs than a
 *       Sequence Header, as they will simply be ignored. If there is more than
@@ -200,6 +207,35 @@
 */
 DAV1D_API void dav1d_flush(Dav1dContext *c);
 
+enum Dav1dEventFlags {
+    /**
+     * The last returned picture contains a reference to a new Sequence Header,
+     * either because it's the start of a new coded sequence, or the decoder was
+     * flushed before it was generated.
+     */
+    DAV1D_EVENT_FLAG_NEW_SEQUENCE = 1 << 0,
+    /**
+     * The last returned picture contains a reference to a Sequence Header with
+     * new operating parameters information for the current coded sequence.
+     */
+    DAV1D_EVENT_FLAG_NEW_OP_PARAMS_INFO = 1 << 1,
+};
+
+/**
+ * Fetch a combination of DAV1D_EVENT_FLAG_* event flags generated by the decoding
+ * process.
+ *
+ * @param c Input decoder instance.
+ * @param flags Where to write the flags.
+ *
+ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
+ *
+ * @note Calling this function will clear all the event flags currently stored in
+ *       the decoder.
+ *
+ */
+DAV1D_API int dav1d_get_event_flags(Dav1dContext *c, enum Dav1dEventFlags *flags);
+
 # ifdef __cplusplus
 }
 # endif
diff -Nru dav1d-0.7.1/include/dav1d/headers.h dav1d-0.9.1/include/dav1d/headers.h
--- dav1d-0.7.1/include/dav1d/headers.h 2020-06-21 11:48:54.952126300 +0000
+++ dav1d-0.9.1/include/dav1d/headers.h 2021-07-28 21:38:28.853851800 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018-2020, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
@@ -28,6 +28,7 @@
 #ifndef DAV1D_HEADERS_H
 #define DAV1D_HEADERS_H
 
+#include
 #include
 
 // Constants from Section 3. "Symbols and abbreviated terms"
@@ -95,9 +96,9 @@
     union {
         struct {
             int16_t alpha, beta, gamma, delta;
-        };
+        } p;
         int16_t abcd[4];
-    };
+    } u;
 } Dav1dWarpedMotionParams;
 
 enum Dav1dPixelLayout {
@@ -127,6 +128,7 @@
     DAV1D_COLOR_PRI_SMPTE431 = 11,
     DAV1D_COLOR_PRI_SMPTE432 = 12,
     DAV1D_COLOR_PRI_EBU3213 = 22,
+    DAV1D_COLOR_PRI_RESERVED = 255,
 };
 
 enum Dav1dTransferCharacteristics {
@@ -147,6 +149,7 @@
     DAV1D_TRC_SMPTE2084 = 16, ///< PQ
     DAV1D_TRC_SMPTE428 = 17,
     DAV1D_TRC_HLG = 18, ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
+    DAV1D_TRC_RESERVED = 255,
 };
 
 enum Dav1dMatrixCoefficients {
@@ -164,6 +167,7 @@
     DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
     DAV1D_MC_CHROMAT_CL = 13,
     DAV1D_MC_ICTCP = 14,
+    DAV1D_MC_RESERVED = 255,
 };
 
 enum Dav1dChromaSamplePosition {
diff -Nru dav1d-0.7.1/include/dav1d/meson.build dav1d-0.9.1/include/dav1d/meson.build
--- dav1d-0.7.1/include/dav1d/meson.build 2020-06-21 11:48:54.952126300 +0000
+++ dav1d-0.9.1/include/dav1d/meson.build 2021-07-28 21:38:28.853851800 +0000
@@ -31,11 +31,15 @@
 output: 'version.h',
 configuration: version_h_data)
 
+dav1d_api_headers = [
+    'common.h',
+    'data.h',
+    'dav1d.h',
+    'headers.h',
+    'picture.h',
+]
+
 # install headers
-install_headers('common.h',
-                'data.h',
-                'dav1d.h',
-                'headers.h',
-                'picture.h',
+install_headers(dav1d_api_headers,
                 version_h_target,
                 subdir : 'dav1d')
diff -Nru dav1d-0.7.1/include/dav1d/picture.h dav1d-0.9.1/include/dav1d/picture.h
--- dav1d-0.7.1/include/dav1d/picture.h 2020-06-21 11:48:54.952126300 +0000
+++ dav1d-0.9.1/include/dav1d/picture.h 2021-07-28 21:38:28.853851800 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018-2020, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
diff -Nru dav1d-0.7.1/include/meson.build dav1d-0.9.1/include/meson.build
--- dav1d-0.7.1/include/meson.build 2020-06-21 11:48:54.956126500 +0000
+++ dav1d-0.9.1/include/meson.build 2021-07-28 21:38:28.853851800 +0000
@@ -25,9 +25,7 @@
 # Revision file (vcs_version.h) generation
 dav1d_git_dir = join_paths(dav1d_src_root, '.git')
 rev_target = vcs_tag(command: [
-        'git', '--git-dir', dav1d_git_dir,
-        'describe', '--tags', '--long',
-        '--match', '?.*.*', '--always'
+        'git', '--git-dir', dav1d_git_dir, 'describe', '--long', '--always'
     ],
     input: 'vcs_version.h.in',
     output: 'vcs_version.h'
diff -Nru dav1d-0.7.1/meson.build dav1d-0.9.1/meson.build
--- dav1d-0.7.1/meson.build 2020-06-21 11:48:54.956126500 +0000
+++ dav1d-0.9.1/meson.build 2021-07-28 21:38:28.853851800 +0000
@@ -1,4 +1,4 @@
-# Copyright © 2018-2020, VideoLAN and dav1d authors
+# Copyright © 2018-2021, VideoLAN and dav1d authors
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -23,14 +23,14 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. project('dav1d', ['c'], - version: '0.7.1', + version: '0.9.1', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], - meson_version: '>= 0.47.0') + meson_version: '>= 0.49.0') -dav1d_soname_version = '4.0.2' +dav1d_soname_version = '5.1.1' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -62,7 +62,8 @@ # ASM option is_asm_enabled = (get_option('enable_asm') == true and - (host_machine.cpu_family().startswith('x86') or + (host_machine.cpu_family() == 'x86' or + (host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__') == '') or host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm') or host_machine.cpu() == 'ppc64le')) @@ -117,6 +118,17 @@ thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c')) rt_dependency = [] + + rc_version_array = meson.project_version().split('.') + winmod = import('windows') + rc_data = configuration_data() + rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0]) + rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1]) + rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2]) + rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) + rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) + rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) + rc_data.set('COPYRIGHT_YEARS', '2021') else thread_dependency = dependency('threads') thread_compat_dep = [] @@ -156,19 +168,21 @@ endif endif +libm_dependency = cc.find_library('m', required: false) + # Header checks -stdatomic_dependency = [] +stdatomic_dependencies = [] if not cc.check_header('stdatomic.h') if cc.get_id() == 'msvc' # we have a custom replacement for MSVC - stdatomic_dependency = declare_dependency( + stdatomic_dependencies += declare_dependency( include_directories : include_directories('include/compat/msvc'), ) elif cc.compiles('''int main() { int v = 0; return __atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST); }''', name : 'GCC-style atomics', args : test_args) - stdatomic_dependency = declare_dependency( + stdatomic_dependencies += declare_dependency( include_directories : include_directories('include/compat/gcc'), ) else @@ -176,6 +190,11 @@ endif endif +if host_machine.cpu_family().startswith('wasm') + # enable atomics + bulk-memory features + stdatomic_dependencies += thread_dependency.partial_dependency(compile_args: true) +endif + if cc.check_header('unistd.h') cdata.set('HAVE_UNISTD_H', 1) endif @@ -226,13 +245,14 @@ # Compiler flags that should be set # But when the compiler does not supports them # it is not an error and silently tolerated -if cc.get_id() != 'msvc' +if cc.get_argument_syntax() != 'msvc' optional_arguments += [ '-Wundef', '-Werror=vla', '-Wno-maybe-uninitialized', '-Wno-missing-field-initializers', '-Wno-unused-parameter', + '-Wstrict-prototypes', '-Werror=missing-prototypes', '-Wshorten-64-to-32', ] @@ -245,6 +265,7 @@ else optional_arguments += [ '-wd4028', # parameter different from declaration + '-wd4090', # broken with arrays of pointers '-wd4996' # use of POSIX functions ] endif @@ -313,8 +334,8 @@ cdata.set('STACK_ALIGNMENT', stack_alignment) endif -cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64') -cdata.set10('ARCH_ARM', 
host_machine.cpu_family().startswith('arm')) +cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64') +cdata.set10('ARCH_ARM', host_machine.cpu_family().startswith('arm') and host_machine.cpu() != 'arm64') if (is_asm_enabled and (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm'))) @@ -350,6 +371,7 @@ cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86') if host_machine.cpu_family().startswith('x86') + cdata_asm.set('private_prefix', 'dav1d') cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64') cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86') cdata_asm.set10('PIC', true) @@ -424,6 +446,28 @@ ]) endif +use_gaspp = false +if (is_asm_enabled and + (host_machine.cpu_family() == 'aarch64' or + host_machine.cpu_family().startswith('arm')) and + cc.get_argument_syntax() == 'msvc') + gaspp = find_program('gas-preprocessor.pl') + use_gaspp = true + gaspp_gen = generator(gaspp, + output: '@BASENAME@.obj', + arguments: [ + '-as-type', 'armasm', + '-arch', host_machine.cpu_family(), + '--', + host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm', + '-nologo', + '-I@0@'.format(dav1d_src_root), + '-I@0@/'.format(meson.current_build_dir()), + '@INPUT@', + '-c', + '-o', '@OUTPUT@' + ]) +endif # Generate config.h config_h_target = configure_file(output: 'config.h', configuration: cdata) diff -Nru dav1d-0.7.1/meson_options.txt dav1d-0.9.1/meson_options.txt --- dav1d-0.7.1/meson_options.txt 2020-06-21 11:48:54.956126500 +0000 +++ dav1d-0.9.1/meson_options.txt 2021-07-28 21:38:28.857851700 +0000 @@ -53,3 +53,7 @@ option('stack_alignment', type: 'integer', value: 0) + +option('xxhash_muxer', + type : 'feature', + value : 'auto') diff -Nru dav1d-0.7.1/NEWS dav1d-0.9.1/NEWS --- dav1d-0.7.1/NEWS 2020-06-21 11:48:54.948126300 +0000 +++ dav1d-0.9.1/NEWS 2021-07-28 21:38:28.849851600 +0000 @@ -1,3 +1,76 @@ +Changes for 0.9.1 'Golden Eagle': +--------------------------------- + +0.9.1 is a middle-size revision of dav1d, adding notably 10b acceleration for SSSE3: + - 10/12b SSSE3 optimizations for mc (avg, w_avg, mask, w_mask, emu_edge), + prep/put_bilin, prep/put_8tap, ipred (dc/h/v, paeth, smooth, pal, filter), wiener, + sgr (10b), warp8x8, deblock, film_grain, cfl_ac/pred for 32bit and 64bit x86 processors + - Film grain NEON for fguv 10/12b, fgy/fguv 8b and fgy/fguv 10/12 arm32 + - Fixes for filmgrain on ARM + - itx 10bit optimizations for 4x4/x8/x16, 8x4/x8/x16 for SSE4 + - Misc improvements on SSE2, SSE4 + + +Changes for 0.9.0 'Golden Eagle': +--------------------------------- + +0.9.0 is a major version of dav1d, adding notably 10b acceleration on x64. + +Details: + - x86 (64bit) AVX2 implementation of most 10b/12b functions, which should provide + a large boost for high-bitdepth decoding on modern x86 computers and servers. 
+ - ARM64 neon implementation of FilmGrain (4:2:0/4:2:2/4:4:4 8bit) + - New API to signal events happening during the decoding process + + +Changes for 0.8.2 'Eurasian hobby': +----------------------------------- + +0.8.2 is a middle-size update of the 0.8.0 branch: + - ARM32 optimizations for ipred and itx in 10/12bits, + completing the 10b/12b work on ARM64 and ARM32 + - Give the post-filters their own threads + - ARM64: rewrite the wiener functions + - Speed up coefficient decoding, 0.5%-3% global decoding gain + - x86 optimizations for CDEF_filter and wiener in 10/12bit + - x86: rewrite the SGR AVX2 asm + - x86: improve msac speed on SSE2+ machines + - ARM32: improve speed of ipred and warp + - ARM64: improve speed of ipred, cdef_dir, cdef_filter, warp_motion and itx16 + - ARM32/64: improve speed of looprestoration + - Add seeking, pausing to the player + - Update the player for rendering of 10b/12b + - Misc speed improvements and fixes on all platforms + - Add a xxh3 muxer in the dav1d application + + +Changes for 0.8.1 'Eurasian hobby': +----------------------------------- + +0.8.1 is a minor update on 0.8.0: + - Keep references to buffers valid after dav1d_close(). Fixes a regression + caused by the picture buffer pool added in 0.8.0. + - ARM32 optimizations for 10bit bitdepth for SGR + - ARM32 optimizations for 16bit bitdepth for blend/w_masl/emu_edge + - ARM64 optimizations for 10bit bitdepth for SGR + - x86 optimizations for wiener in SSE2/SSSE3/AVX2 + + +Changes for 0.8.0 'Eurasian hobby': +----------------------------------- + +0.8.0 is a major update for dav1d: + - Improve the performance by using a picture buffer pool; + The improvements can reach 10% on some cases on Windows. + - Support for Apple ARM Silicon + - ARM32 optimizations for 8bit bitdepth for ipred paeth, smooth, cfl + - ARM32 optimizations for 10/12/16bit bitdepth for mc_avg/mask/w_avg, + put/prep 8tap/bilin, wiener and CDEF filters + - ARM64 optimizations for cfl_ac 444 for all bitdepths + - x86 optimizations for MC 8-tap, mc_scaled in AVX2 + - x86 optimizations for CDEF in SSE and {put/prep}_{8tap/bilin} in SSSE3 + + Changes for 0.7.1 'Frigatebird': ------------------------------ diff -Nru dav1d-0.7.1/package/crossfiles/i686-linux32.meson dav1d-0.9.1/package/crossfiles/i686-linux32.meson --- dav1d-0.7.1/package/crossfiles/i686-linux32.meson 2020-06-21 11:48:54.956126500 +0000 +++ dav1d-0.9.1/package/crossfiles/i686-linux32.meson 2021-07-28 21:38:28.857851700 +0000 @@ -5,7 +5,7 @@ strip = 'strip' [properties] -c_link_args = ['-m32'] +c_link_args = ['-m32', '-Wl,-z,text'] c_args = ['-m32'] [host_machine] diff -Nru dav1d-0.7.1/README.md dav1d-0.9.1/README.md --- dav1d-0.7.1/README.md 2020-06-21 11:48:54.948126300 +0000 +++ dav1d-0.9.1/README.md 2021-07-28 21:38:28.849851600 +0000 @@ -1,4 +1,4 @@ -![dav1d logo](dav1d_logo.png) +![dav1d logo](doc/dav1d_logo.png) # dav1d @@ -30,17 +30,22 @@ 1. Complete C implementation of the decoder, 2. Provide a usable API, 3. Port to most platforms, -4. Make it fast on desktop, by writing asm for AVX-2 chips. +4. Make it fast on desktop, by writing asm for AVX2 chips. 5. Make it fast on mobile, by writing asm for ARMv8 chips, -6. Make it fast on older desktop, by writing asm for SSSE3+ chips. +6. Make it fast on older desktop, by writing asm for SSSE3+ chips, +7. Make high bit-depth fast on mobile, by writing asm for ARMv8 chips. +8. Make it fast on older mobile, by writing asm for ARMv7 chips, +9. 
Make high bit-depth fast on older mobile, by writing asm for ARMv7 chips, ### On-going -7. Make it fast on older mobiles, by writing asm for ARMv7 chips, -8. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list), -9. Accelerate for less common architectures, like PPC, SSE2 or AVX-512. +10. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list), +11. Accelerate for less common architectures, like PPC, SSE2 or AVX-512. +12. Make high bit-depth fast on desktop, by writing asm for AVX2 chips, ### After -10. Use more GPU, when possible. +13. Make high bit-depth fast on older desktop, by writing asm for SSSE3+ chips, +14. Use more GPU decoding, when possible. +15. Improve threading. # Contribute @@ -55,7 +60,7 @@ Notably, the codebase is in pure C and asm. -We are on IRC, on the **#dav1d** channel on *Freenode*. +We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [KiwiIRC Web Interface](https://kiwiirc.com/nextclient/#ircs://irc.libera.chat/#dav1d). See the [contributions document](CONTRIBUTING.md). @@ -130,7 +135,7 @@ ## I am not a developer. Can I help? -- Yes. We need testers, bug reporters, and documentation writers. +- Yes. We need testers, bug reporters and documentation writers. ## What about the AV1 patent license? @@ -142,3 +147,5 @@ - We do, but we don't have either the time or the knowledge. Therefore, patches and contributions welcome. +## Where can I find documentation? +- The current library documentation, built from master, can be found [here](https://videolan.videolan.me/dav1d/). diff -Nru dav1d-0.7.1/src/arm/32/cdef16.S dav1d-0.9.1/src/arm/32/cdef16.S --- dav1d-0.7.1/src/arm/32/cdef16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/cdef16.S 2021-07-28 21:38:28.857851700 +0000 @@ -0,0 +1,232 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "cdef_tmpl.S" + +// r1 = d0/q0 +// r2 = d2/q1 +.macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret + tst r6, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vldr s8, [\s1, #-4] + vld1.16 {\r1}, [\s1, :\align] + vldr s9, [\s1, #2*\w] + vldr s10, [\s2, #-4] + vld1.16 {\r2}, [\s2, :\align] + vldr s11, [\s2, #2*\w] + vstr s8, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s9, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s10, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s11, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vldr s8, [\s1, #-4] + vld1.16 {\r1}, [\s1, :\align] + vldr s9, [\s2, #-4] + vld1.16 {\r2}, [\s2, :\align] + vstr s8, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s9, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +2: + // !CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT + vld1.16 {\r1}, [\s1, :\align] + vldr s8, [\s1, #2*\w] + vld1.16 {\r2}, [\s2, :\align] + vldr s9, [\s2, #2*\w] + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s8, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s9, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride + b 3f +.endif + +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {\r1}, [\s1, :\align] + vld1.16 {\r2}, [\s2, :\align] + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + vstr s12, [r0, #-4] + vst1.16 {\r2}, [r0, :\align] + vstr s12, [r0, #2*\w] +.if \ret + pop {r4-r7,pc} +.else + add r0, r0, #2*\stride +.endif +3: +.endm + +// void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, +// ptrdiff_t src_stride, const pixel (*left)[2], +// const pixel *const top, int h, +// enum CdefEdgeFlags edges); + +// r1 = d0/q0 +// r2 = d2/q1 +.macro padding_func_16 w, stride, r1, r2, align +function cdef_padding\w\()_16bpc_neon, export=1 + push {r4-r7,lr} + ldrd r4, r5, [sp, #20] + ldr r6, [sp, #28] + vmov.i16 q3, #0x8000 + tst r6, #4 // CDEF_HAVE_TOP + bne 1f + // !CDEF_HAVE_TOP + sub r12, r0, #2*(2*\stride+2) + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + b 3f +1: + // CDEF_HAVE_TOP + add r7, r4, r2 + sub r0, r0, #2*(2*\stride) + pad_top_bot_16 r4, r7, \w, \stride, \r1, \r2, \align, 0 + + // Middle section +3: + tst r6, #1 // CDEF_HAVE_LEFT + beq 2f + // CDEF_HAVE_LEFT + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vld1.32 {d2[]}, [r3, :32]! + vldr s5, [r1, #2*\w] + vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s4, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s5, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.32 {d2[]}, [r3, :32]! 
+ vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s4, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + b 3f +2: + tst r6, #2 // CDEF_HAVE_RIGHT + beq 1f + // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT +0: + vldr s4, [r1, #2*\w] + vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s4, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 0b + b 3f +1: + // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT + vld1.16 {\r1}, [r1, :\align], r2 + subs r5, r5, #1 + vstr s12, [r0, #-4] + vst1.16 {\r1}, [r0, :\align] + vstr s12, [r0, #2*\w] + add r0, r0, #2*\stride + bgt 1b + +3: + tst r6, #8 // CDEF_HAVE_BOTTOM + bne 1f + // !CDEF_HAVE_BOTTOM + sub r12, r0, #4 + vmov.i16 q2, #0x8000 + vst1.16 {q2,q3}, [r12]! +.if \w == 8 + vst1.16 {q2,q3}, [r12]! +.endif + pop {r4-r7,pc} +1: + // CDEF_HAVE_BOTTOM + add r7, r1, r2 + pad_top_bot_16 r1, r7, \w, \stride, \r1, \r2, \align, 1 +endfunc +.endm + +padding_func_16 8, 16, q0, q1, 128 +padding_func_16 4, 8, d0, d2, 64 + +tables + +filter 8, 16 +filter 4, 16 + +find_dir 16 diff -Nru dav1d-0.7.1/src/arm/32/cdef.S dav1d-0.9.1/src/arm/32/cdef.S --- dav1d-0.7.1/src/arm/32/cdef.S 2020-06-21 11:48:54.956126500 +0000 +++ dav1d-0.9.1/src/arm/32/cdef.S 2021-07-28 21:38:28.857851700 +0000 @@ -27,6 +27,7 @@ #include "src/arm/asm.S" #include "util.S" +#include "cdef_tmpl.S" // n1 = s0/d0 // w1 = d0/q0 @@ -190,11 +191,9 @@ beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: - ldrh r12, [r3], #2 - vldr \n1, [r1] - vdup.16 d2, r12 + vld1.16 {d2[]}, [r3, :16]! ldrh r12, [r1, #\w] - add r1, r1, r2 + load_n_incr d0, r1, r2, \w subs r5, r5, #1 vmov.16 d2[1], r12 vmovl.u8 q0, d0 @@ -207,9 +206,8 @@ b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT - ldrh r12, [r3], #2 + vld1.16 {d2[]}, [r3, :16]! 
load_n_incr d0, r1, r2, \w - vdup.16 d2, r12 subs r5, r5, #1 vmovl.u8 q0, d0 vmovl.u8 q1, d2 @@ -327,230 +325,12 @@ padding_func_edged 8, 16, d0, 64 padding_func_edged 4, 8, s0, 32 -.macro dir_table w, stride -const directions\w - .byte -1 * \stride + 1, -2 * \stride + 2 - .byte 0 * \stride + 1, -1 * \stride + 2 - .byte 0 * \stride + 1, 0 * \stride + 2 - .byte 0 * \stride + 1, 1 * \stride + 2 - .byte 1 * \stride + 1, 2 * \stride + 2 - .byte 1 * \stride + 0, 2 * \stride + 1 - .byte 1 * \stride + 0, 2 * \stride + 0 - .byte 1 * \stride + 0, 2 * \stride - 1 -// Repeated, to avoid & 7 - .byte -1 * \stride + 1, -2 * \stride + 2 - .byte 0 * \stride + 1, -1 * \stride + 2 - .byte 0 * \stride + 1, 0 * \stride + 2 - .byte 0 * \stride + 1, 1 * \stride + 2 - .byte 1 * \stride + 1, 2 * \stride + 2 - .byte 1 * \stride + 0, 2 * \stride + 1 -endconst -.endm - -dir_table 8, 16 -dir_table 4, 8 - -const pri_taps - .byte 4, 2, 3, 3 -endconst - -.macro load_px d11, d12, d21, d22, w -.if \w == 8 - add r6, r2, r9, lsl #1 // x + off - sub r9, r2, r9, lsl #1 // x - off - vld1.16 {\d11,\d12}, [r6] // p0 - vld1.16 {\d21,\d22}, [r9] // p1 -.else - add r6, r2, r9, lsl #1 // x + off - sub r9, r2, r9, lsl #1 // x - off - vld1.16 {\d11}, [r6] // p0 - add r6, r6, #2*8 // += stride - vld1.16 {\d21}, [r9] // p1 - add r9, r9, #2*8 // += stride - vld1.16 {\d12}, [r6] // p0 - vld1.16 {\d22}, [r9] // p1 -.endif -.endm -.macro handle_pixel s1, s2, thresh_vec, shift, tap, min -.if \min - vmin.u16 q2, q2, \s1 - vmax.s16 q3, q3, \s1 - vmin.u16 q2, q2, \s2 - vmax.s16 q3, q3, \s2 -.endif - vabd.u16 q8, q0, \s1 // abs(diff) - vabd.u16 q11, q0, \s2 // abs(diff) - vshl.u16 q9, q8, \shift // abs(diff) >> shift - vshl.u16 q12, q11, \shift // abs(diff) >> shift - vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) - vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) - vsub.i16 q10, \s1, q0 // diff = p0 - px - vsub.i16 q13, \s2, q0 // diff = p1 - px - vneg.s16 q8, q9 // -clip - vneg.s16 q11, q12 // -clip - vmin.s16 q10, q10, q9 // imin(diff, clip) - vmin.s16 q13, q13, q12 // imin(diff, clip) - vdup.16 q9, \tap // taps[k] - vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip) - vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip) - vmla.i16 q1, q10, q9 // sum += taps[k] * constrain() - vmla.i16 q1, q13, q9 // sum += taps[k] * constrain() -.endm - -// void dav1d_cdef_filterX_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, -// const uint16_t *tmp, int pri_strength, -// int sec_strength, int dir, int damping, -// int h, size_t edges); -.macro filter_func w, pri, sec, min, suffix -function cdef_filter\w\suffix\()_neon - cmp r8, #0xf - beq cdef_filter\w\suffix\()_edged_neon -.if \pri - movrel_local r8, pri_taps - and r9, r3, #1 - add r8, r8, r9, lsl #1 -.endif - movrel_local r9, directions\w - add r5, r9, r5, lsl #1 - vmov.u16 d17, #15 - vdup.16 d16, r6 // damping - -.if \pri - vdup.16 q5, r3 // threshold -.endif -.if \sec - vdup.16 q7, r4 // threshold -.endif - vmov.16 d8[0], r3 - vmov.16 d8[1], r4 - vclz.i16 d8, d8 // clz(threshold) - vsub.i16 d8, d17, d8 // ulog2(threshold) - vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) - vneg.s16 d8, d8 // -shift -.if \sec - vdup.16 q6, d8[1] -.endif -.if \pri - vdup.16 q4, d8[0] -.endif - -1: -.if \w == 8 - vld1.16 {q0}, [r2, :128] // px -.else - add r12, r2, #2*8 - vld1.16 {d0}, [r2, :64] // px - vld1.16 {d1}, [r12, :64] // px -.endif - - vmov.u16 q1, #0 // sum -.if \min - vmov.u16 q2, q0 // min 
- vmov.u16 q3, q0 // max -.endif - - // Instead of loading sec_taps 2, 1 from memory, just set it - // to 2 initially and decrease for the second round. - // This is also used as loop counter. - mov lr, #2 // sec_taps[0] - -2: -.if \pri - ldrsb r9, [r5] // off1 - - load_px d28, d29, d30, d31, \w -.endif - -.if \sec - add r5, r5, #4 // +2*2 - ldrsb r9, [r5] // off2 -.endif - -.if \pri - ldrb r12, [r8] // *pri_taps - - handle_pixel q14, q15, q5, q4, r12, \min -.endif - -.if \sec - load_px d28, d29, d30, d31, \w - - add r5, r5, #8 // +2*4 - ldrsb r9, [r5] // off3 - - handle_pixel q14, q15, q7, q6, lr, \min - - load_px d28, d29, d30, d31, \w - - handle_pixel q14, q15, q7, q6, lr, \min +tables - sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; -.else - add r5, r5, #1 // r5 += 1 -.endif - subs lr, lr, #1 // sec_tap-- (value) -.if \pri - add r8, r8, #1 // pri_taps++ (pointer) -.endif - bne 2b - - vshr.s16 q14, q1, #15 // -(sum < 0) - vadd.i16 q1, q1, q14 // sum - (sum < 0) - vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 - vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4 -.if \min - vmin.s16 q0, q0, q3 - vmax.s16 q0, q0, q2 // iclip(px + .., min, max) -.endif - vmovn.u16 d0, q0 -.if \w == 8 - add r2, r2, #2*16 // tmp += tmp_stride - subs r7, r7, #1 // h-- - vst1.8 {d0}, [r0, :64], r1 -.else - vst1.32 {d0[0]}, [r0, :32], r1 - add r2, r2, #2*16 // tmp += 2*tmp_stride - subs r7, r7, #2 // h -= 2 - vst1.32 {d0[1]}, [r0, :32], r1 -.endif +filter 8, 8 +filter 4, 8 - // Reset pri_taps and directions back to the original point - sub r5, r5, #2 -.if \pri - sub r8, r8, #2 -.endif - - bgt 1b - vpop {q4-q7} - pop {r4-r9,pc} -endfunc -.endm - -.macro filter w -filter_func \w, pri=1, sec=0, min=0, suffix=_pri -filter_func \w, pri=0, sec=1, min=0, suffix=_sec -filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec - -function cdef_filter\w\()_8bpc_neon, export=1 - push {r4-r9,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #92] - ldrd r6, r7, [sp, #100] - ldr r8, [sp, #108] - cmp r3, #0 // pri_strength - bne 1f - b cdef_filter\w\()_sec_neon // only sec -1: - cmp r4, #0 // sec_strength - bne 1f - b cdef_filter\w\()_pri_neon // only pri -1: - b cdef_filter\w\()_pri_sec_neon // both pri and sec -endfunc -.endm - -filter 8 -filter 4 +find_dir 8 .macro load_px_8 d11, d12, d21, d22, w .if \w == 8 @@ -756,219 +536,3 @@ filter_8 8 filter_8 4 - -const div_table, align=4 - .short 840, 420, 280, 210, 168, 140, 120, 105 -endconst - -const alt_fact, align=4 - .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 -endconst - -// int dav1d_cdef_find_dir_8bpc_neon(const pixel *img, const ptrdiff_t stride, -// unsigned *const var) -function cdef_find_dir_8bpc_neon, export=1 - push {lr} - vpush {q4-q7} - sub sp, sp, #32 // cost - mov r3, #8 - vmov.u16 q1, #0 // q0-q1 sum_diag[0] - vmov.u16 q3, #0 // q2-q3 sum_diag[1] - vmov.u16 q5, #0 // q4-q5 sum_hv[0-1] - vmov.u16 q8, #0 // q6,d16 sum_alt[0] - // q7,d17 sum_alt[1] - vmov.u16 q9, #0 // q9,d22 sum_alt[2] - vmov.u16 q11, #0 - vmov.u16 q10, #0 // q10,d23 sum_alt[3] - - -.irpc i, 01234567 - vld1.8 {d30}, [r0, :64], r1 - vmov.u8 d31, #128 - vsubl.u8 q15, d30, d31 // img[x] - 128 - vmov.u16 q14, #0 - -.if \i == 0 - vmov q0, q15 // sum_diag[0] -.else - vext.8 q12, q14, q15, #(16-2*\i) - vext.8 q13, q15, q14, #(16-2*\i) - vadd.i16 q0, q0, q12 // sum_diag[0] - vadd.i16 q1, q1, q13 // sum_diag[0] -.endif - vrev64.16 q13, q15 - vswp d26, d27 // [-x] -.if \i == 0 - vmov q2, q13 // sum_diag[1] -.else - vext.8 q12, q14, q13, #(16-2*\i) - vext.8 q13, q13, q14, #(16-2*\i) - vadd.i16 q2, q2, q12 // 
sum_diag[1] - vadd.i16 q3, q3, q13 // sum_diag[1] -.endif - - vpadd.u16 d26, d30, d31 // [(x >> 1)] - vmov.u16 d27, #0 - vpadd.u16 d24, d26, d28 - vpadd.u16 d24, d24, d28 // [y] - vmov.u16 r12, d24[0] - vadd.i16 q5, q5, q15 // sum_hv[1] -.if \i < 4 - vmov.16 d8[\i], r12 // sum_hv[0] -.else - vmov.16 d9[\i-4], r12 // sum_hv[0] -.endif - -.if \i == 0 - vmov.u16 q6, q13 // sum_alt[0] -.else - vext.8 q12, q14, q13, #(16-2*\i) - vext.8 q14, q13, q14, #(16-2*\i) - vadd.i16 q6, q6, q12 // sum_alt[0] - vadd.i16 d16, d16, d28 // sum_alt[0] -.endif - vrev64.16 d26, d26 // [-(x >> 1)] - vmov.u16 q14, #0 -.if \i == 0 - vmov q7, q13 // sum_alt[1] -.else - vext.8 q12, q14, q13, #(16-2*\i) - vext.8 q13, q13, q14, #(16-2*\i) - vadd.i16 q7, q7, q12 // sum_alt[1] - vadd.i16 d17, d17, d26 // sum_alt[1] -.endif - -.if \i < 6 - vext.8 q12, q14, q15, #(16-2*(3-(\i/2))) - vext.8 q13, q15, q14, #(16-2*(3-(\i/2))) - vadd.i16 q9, q9, q12 // sum_alt[2] - vadd.i16 d22, d22, d26 // sum_alt[2] -.else - vadd.i16 q9, q9, q15 // sum_alt[2] -.endif -.if \i == 0 - vmov q10, q15 // sum_alt[3] -.elseif \i == 1 - vadd.i16 q10, q10, q15 // sum_alt[3] -.else - vext.8 q12, q14, q15, #(16-2*(\i/2)) - vext.8 q13, q15, q14, #(16-2*(\i/2)) - vadd.i16 q10, q10, q12 // sum_alt[3] - vadd.i16 d23, d23, d26 // sum_alt[3] -.endif -.endr - - vmov.u32 q15, #105 - - vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0] - vmlal.s16 q12, d9, d9 - vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1] - vmlal.s16 q13, d11, d11 - vadd.s32 d8, d24, d25 - vadd.s32 d9, d26, d27 - vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17) - vmul.i32 d8, d8, d30 // cost[2,6] *= 105 - - vrev64.16 q1, q1 - vrev64.16 q3, q3 - vext.8 q1, q1, q1, #10 // sum_diag[0][14-n] - vext.8 q3, q3, q3, #10 // sum_diag[1][14-n] - - vstr s16, [sp, #2*4] // cost[2] - vstr s17, [sp, #6*4] // cost[6] - - movrel_local r12, div_table - vld1.16 {q14}, [r12, :128] - - vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0] - vmull.s16 q12, d1, d1 - vmlal.s16 q5, d2, d2 - vmlal.s16 q12, d3, d3 - vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1] - vmull.s16 q1, d5, d5 - vmlal.s16 q0, d6, d6 - vmlal.s16 q1, d7, d7 - vmovl.u16 q13, d28 // div_table - vmovl.u16 q14, d29 - vmul.i32 q5, q5, q13 // cost[0] - vmla.i32 q5, q12, q14 - vmul.i32 q0, q0, q13 // cost[4] - vmla.i32 q0, q1, q14 - vadd.i32 d10, d10, d11 - vadd.i32 d0, d0, d1 - vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1 - - movrel_local r12, alt_fact - vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105 - - vstr s0, [sp, #0*4] // cost[0] - vstr s1, [sp, #4*4] // cost[4] - - vmovl.u16 q13, d29 // div_table[2*m+1] + 105 - vmovl.u16 q14, d30 - vmovl.u16 q15, d31 - -.macro cost_alt dest, s1, s2, s3, s4, s5, s6 - vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n] - vmull.s16 q2, \s2, \s2 - vmull.s16 q3, \s3, \s3 - vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n] - vmull.s16 q12, \s5, \s5 - vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here - vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact - vmla.i32 q1, q2, q14 - vmla.i32 q1, q3, q15 - vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact - vmla.i32 q5, q12, q14 - vmla.i32 q5, q6, q15 - vadd.i32 d2, d2, d3 - vadd.i32 d3, d10, d11 - vpadd.i32 \dest, d2, d3 // *cost_ptr -.endm - cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3] - cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7] - vstr s28, [sp, #1*4] // cost[1] - vstr s29, [sp, #3*4] // cost[3] - - mov r0, #0 // best_dir - vmov.32 r1, d0[0] // best_cost - mov r3, #1 // n - - vstr s30, [sp, #5*4] // cost[5] - vstr s31, [sp, #7*4] // cost[7] - - 
vmov.32 r12, d14[0] - -.macro find_best s1, s2, s3 -.ifnb \s2 - vmov.32 lr, \s2 -.endif - cmp r12, r1 // cost[n] > best_cost - itt gt - movgt r0, r3 // best_dir = n - movgt r1, r12 // best_cost = cost[n] -.ifnb \s2 - add r3, r3, #1 // n++ - cmp lr, r1 // cost[n] > best_cost - vmov.32 r12, \s3 - itt gt - movgt r0, r3 // best_dir = n - movgt r1, lr // best_cost = cost[n] - add r3, r3, #1 // n++ -.endif -.endm - find_best d14[0], d8[0], d14[1] - find_best d14[1], d0[1], d15[0] - find_best d15[0], d8[1], d15[1] - find_best d15[1] - - eor r3, r0, #4 // best_dir ^4 - ldr r12, [sp, r3, lsl #2] - sub r1, r1, r12 // best_cost - cost[best_dir ^ 4] - lsr r1, r1, #10 - str r1, [r2] // *var - - add sp, sp, #32 - vpop {q4-q7} - pop {pc} -endfunc diff -Nru dav1d-0.7.1/src/arm/32/cdef_tmpl.S dav1d-0.9.1/src/arm/32/cdef_tmpl.S --- dav1d-0.7.1/src/arm/32/cdef_tmpl.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/cdef_tmpl.S 2021-07-28 21:38:28.857851700 +0000 @@ -0,0 +1,515 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro dir_table w, stride +const directions\w + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 + .byte 1 * \stride + 0, 2 * \stride + 0 + .byte 1 * \stride + 0, 2 * \stride - 1 +// Repeated, to avoid & 7 + .byte -1 * \stride + 1, -2 * \stride + 2 + .byte 0 * \stride + 1, -1 * \stride + 2 + .byte 0 * \stride + 1, 0 * \stride + 2 + .byte 0 * \stride + 1, 1 * \stride + 2 + .byte 1 * \stride + 1, 2 * \stride + 2 + .byte 1 * \stride + 0, 2 * \stride + 1 +endconst +.endm + +.macro tables +dir_table 8, 16 +dir_table 4, 8 + +const pri_taps + .byte 4, 2, 3, 3 +endconst +.endm + +.macro load_px d11, d12, d21, d22, w +.if \w == 8 + add r6, r2, r9, lsl #1 // x + off + sub r9, r2, r9, lsl #1 // x - off + vld1.16 {\d11,\d12}, [r6] // p0 + vld1.16 {\d21,\d22}, [r9] // p1 +.else + add r6, r2, r9, lsl #1 // x + off + sub r9, r2, r9, lsl #1 // x - off + vld1.16 {\d11}, [r6] // p0 + add r6, r6, #2*8 // += stride + vld1.16 {\d21}, [r9] // p1 + add r9, r9, #2*8 // += stride + vld1.16 {\d12}, [r6] // p0 + vld1.16 {\d22}, [r9] // p1 +.endif +.endm +.macro handle_pixel s1, s2, thresh_vec, shift, tap, min +.if \min + vmin.u16 q2, q2, \s1 + vmax.s16 q3, q3, \s1 + vmin.u16 q2, q2, \s2 + vmax.s16 q3, q3, \s2 +.endif + vabd.u16 q8, q0, \s1 // abs(diff) + vabd.u16 q11, q0, \s2 // abs(diff) + vshl.u16 q9, q8, \shift // abs(diff) >> shift + vshl.u16 q12, q11, \shift // abs(diff) >> shift + vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) + vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) + vsub.i16 q10, \s1, q0 // diff = p0 - px + vsub.i16 q13, \s2, q0 // diff = p1 - px + vneg.s16 q8, q9 // -clip + vneg.s16 q11, q12 // -clip + vmin.s16 q10, q10, q9 // imin(diff, clip) + vmin.s16 q13, q13, q12 // imin(diff, clip) + vdup.16 q9, \tap // taps[k] + vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip) + vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip) + vmla.i16 q1, q10, q9 // sum += taps[k] * constrain() + vmla.i16 q1, q13, q9 // sum += taps[k] * constrain() +.endm + +// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, +// const uint16_t *tmp, int pri_strength, +// int sec_strength, int dir, int damping, +// int h, size_t edges); +.macro filter_func w, bpc, pri, sec, min, suffix +function cdef_filter\w\suffix\()_\bpc\()bpc_neon +.if \bpc == 8 + cmp r8, #0xf + beq cdef_filter\w\suffix\()_edged_neon +.endif +.if \pri +.if \bpc == 16 + clz r9, r9 + sub r9, r9, #24 // -bitdepth_min_8 + neg r9, r9 // bitdepth_min_8 +.endif + movrel_local r8, pri_taps +.if \bpc == 16 + lsr r9, r3, r9 // pri_strength >> bitdepth_min_8 + and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1 +.else + and r9, r3, #1 +.endif + add r8, r8, r9, lsl #1 +.endif + movrel_local r9, directions\w + add r5, r9, r5, lsl #1 + vmov.u16 d17, #15 + vdup.16 d16, r6 // damping + +.if \pri + vdup.16 q5, r3 // threshold +.endif +.if \sec + vdup.16 q7, r4 // threshold +.endif + vmov.16 d8[0], r3 + vmov.16 d8[1], r4 + vclz.i16 d8, d8 // clz(threshold) + vsub.i16 d8, d17, d8 // ulog2(threshold) + vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) + vneg.s16 d8, d8 // -shift +.if \sec + vdup.16 q6, d8[1] +.endif +.if \pri + vdup.16 q4, d8[0] +.endif + +1: +.if \w == 8 + vld1.16 {q0}, 
[r2, :128] // px +.else + add r12, r2, #2*8 + vld1.16 {d0}, [r2, :64] // px + vld1.16 {d1}, [r12, :64] // px +.endif + + vmov.u16 q1, #0 // sum +.if \min + vmov.u16 q2, q0 // min + vmov.u16 q3, q0 // max +.endif + + // Instead of loading sec_taps 2, 1 from memory, just set it + // to 2 initially and decrease for the second round. + // This is also used as loop counter. + mov lr, #2 // sec_taps[0] + +2: +.if \pri + ldrsb r9, [r5] // off1 + + load_px d28, d29, d30, d31, \w +.endif + +.if \sec + add r5, r5, #4 // +2*2 + ldrsb r9, [r5] // off2 +.endif + +.if \pri + ldrb r12, [r8] // *pri_taps + + handle_pixel q14, q15, q5, q4, r12, \min +.endif + +.if \sec + load_px d28, d29, d30, d31, \w + + add r5, r5, #8 // +2*4 + ldrsb r9, [r5] // off3 + + handle_pixel q14, q15, q7, q6, lr, \min + + load_px d28, d29, d30, d31, \w + + handle_pixel q14, q15, q7, q6, lr, \min + + sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; +.else + add r5, r5, #1 // r5 += 1 +.endif + subs lr, lr, #1 // sec_tap-- (value) +.if \pri + add r8, r8, #1 // pri_taps++ (pointer) +.endif + bne 2b + + vshr.s16 q14, q1, #15 // -(sum < 0) + vadd.i16 q1, q1, q14 // sum - (sum < 0) + vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 + vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4 +.if \min + vmin.s16 q0, q0, q3 + vmax.s16 q0, q0, q2 // iclip(px + .., min, max) +.endif +.if \bpc == 8 + vmovn.u16 d0, q0 +.endif +.if \w == 8 + add r2, r2, #2*16 // tmp += tmp_stride + subs r7, r7, #1 // h-- +.if \bpc == 8 + vst1.8 {d0}, [r0, :64], r1 +.else + vst1.16 {q0}, [r0, :128], r1 +.endif +.else +.if \bpc == 8 + vst1.32 {d0[0]}, [r0, :32], r1 +.else + vst1.16 {d0}, [r0, :64], r1 +.endif + add r2, r2, #2*16 // tmp += 2*tmp_stride + subs r7, r7, #2 // h -= 2 +.if \bpc == 8 + vst1.32 {d0[1]}, [r0, :32], r1 +.else + vst1.16 {d1}, [r0, :64], r1 +.endif +.endif + + // Reset pri_taps and directions back to the original point + sub r5, r5, #2 +.if \pri + sub r8, r8, #2 +.endif + + bgt 1b + vpop {q4-q7} + pop {r4-r9,pc} +endfunc +.endm + +.macro filter w, bpc +filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri +filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec +filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec + +function cdef_filter\w\()_\bpc\()bpc_neon, export=1 + push {r4-r9,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #92] + ldrd r6, r7, [sp, #100] +.if \bpc == 16 + ldrd r8, r9, [sp, #108] +.else + ldr r8, [sp, #108] +.endif + cmp r3, #0 // pri_strength + bne 1f + b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec +1: + cmp r4, #0 // sec_strength + bne 1f + b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri +1: + b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec +endfunc +.endm + +const div_table, align=4 + .short 840, 420, 280, 210, 168, 140, 120, 105 +endconst + +const alt_fact, align=4 + .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 +endconst + +.macro cost_alt dest, s1, s2, s3, s4, s5, s6 + vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n] + vmull.s16 q2, \s2, \s2 + vmull.s16 q3, \s3, \s3 + vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n] + vmull.s16 q12, \s5, \s5 + vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here + vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact + vmla.i32 q1, q2, q14 + vmla.i32 q1, q3, q15 + vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact + vmla.i32 q5, q12, q14 + vmla.i32 q5, q6, q15 + vadd.i32 d2, d2, d3 + vadd.i32 d3, d10, d11 + vpadd.i32 \dest, d2, d3 // *cost_ptr +.endm + +.macro find_best s1, s2, s3 +.ifnb \s2 + vmov.32 lr, \s2 +.endif + cmp r12, r1 // cost[n] > best_cost + itt gt + movgt 
r0, r3 // best_dir = n + movgt r1, r12 // best_cost = cost[n] +.ifnb \s2 + add r3, r3, #1 // n++ + cmp lr, r1 // cost[n] > best_cost + vmov.32 r12, \s3 + itt gt + movgt r0, r3 // best_dir = n + movgt r1, lr // best_cost = cost[n] + add r3, r3, #1 // n++ +.endif +.endm + +// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, +// unsigned *const var) +.macro find_dir bpc +function cdef_find_dir_\bpc\()bpc_neon, export=1 + push {lr} + vpush {q4-q7} +.if \bpc == 16 + clz r3, r3 // clz(bitdepth_max) + sub lr, r3, #24 // -bitdepth_min_8 +.endif + sub sp, sp, #32 // cost + mov r3, #8 + vmov.u16 q1, #0 // q0-q1 sum_diag[0] + vmov.u16 q3, #0 // q2-q3 sum_diag[1] + vmov.u16 q5, #0 // q4-q5 sum_hv[0-1] + vmov.u16 q8, #0 // q6,d16 sum_alt[0] + // q7,d17 sum_alt[1] + vmov.u16 q9, #0 // q9,d22 sum_alt[2] + vmov.u16 q11, #0 + vmov.u16 q10, #0 // q10,d23 sum_alt[3] + + +.irpc i, 01234567 +.if \bpc == 8 + vld1.8 {d30}, [r0, :64], r1 + vmov.u8 d31, #128 + vsubl.u8 q15, d30, d31 // img[x] - 128 +.else + vld1.16 {q15}, [r0, :128], r1 + vdup.16 q14, lr // -bitdepth_min_8 + vshl.u16 q15, q15, q14 + vmov.u16 q14, #128 + vsub.i16 q15, q15, q14 // img[x] - 128 +.endif + vmov.u16 q14, #0 + +.if \i == 0 + vmov q0, q15 // sum_diag[0] +.else + vext.8 q12, q14, q15, #(16-2*\i) + vext.8 q13, q15, q14, #(16-2*\i) + vadd.i16 q0, q0, q12 // sum_diag[0] + vadd.i16 q1, q1, q13 // sum_diag[0] +.endif + vrev64.16 q13, q15 + vswp d26, d27 // [-x] +.if \i == 0 + vmov q2, q13 // sum_diag[1] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q13, q13, q14, #(16-2*\i) + vadd.i16 q2, q2, q12 // sum_diag[1] + vadd.i16 q3, q3, q13 // sum_diag[1] +.endif + + vpadd.u16 d26, d30, d31 // [(x >> 1)] + vmov.u16 d27, #0 + vpadd.u16 d24, d26, d28 + vpadd.u16 d24, d24, d28 // [y] + vmov.u16 r12, d24[0] + vadd.i16 q5, q5, q15 // sum_hv[1] +.if \i < 4 + vmov.16 d8[\i], r12 // sum_hv[0] +.else + vmov.16 d9[\i-4], r12 // sum_hv[0] +.endif + +.if \i == 0 + vmov.u16 q6, q13 // sum_alt[0] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q14, q13, q14, #(16-2*\i) + vadd.i16 q6, q6, q12 // sum_alt[0] + vadd.i16 d16, d16, d28 // sum_alt[0] +.endif + vrev64.16 d26, d26 // [-(x >> 1)] + vmov.u16 q14, #0 +.if \i == 0 + vmov q7, q13 // sum_alt[1] +.else + vext.8 q12, q14, q13, #(16-2*\i) + vext.8 q13, q13, q14, #(16-2*\i) + vadd.i16 q7, q7, q12 // sum_alt[1] + vadd.i16 d17, d17, d26 // sum_alt[1] +.endif + +.if \i < 6 + vext.8 q12, q14, q15, #(16-2*(3-(\i/2))) + vext.8 q13, q15, q14, #(16-2*(3-(\i/2))) + vadd.i16 q9, q9, q12 // sum_alt[2] + vadd.i16 d22, d22, d26 // sum_alt[2] +.else + vadd.i16 q9, q9, q15 // sum_alt[2] +.endif +.if \i == 0 + vmov q10, q15 // sum_alt[3] +.elseif \i == 1 + vadd.i16 q10, q10, q15 // sum_alt[3] +.else + vext.8 q12, q14, q15, #(16-2*(\i/2)) + vext.8 q13, q15, q14, #(16-2*(\i/2)) + vadd.i16 q10, q10, q12 // sum_alt[3] + vadd.i16 d23, d23, d26 // sum_alt[3] +.endif +.endr + + vmov.u32 q15, #105 + + vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0] + vmlal.s16 q12, d9, d9 + vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1] + vmlal.s16 q13, d11, d11 + vadd.s32 d8, d24, d25 + vadd.s32 d9, d26, d27 + vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17) + vmul.i32 d8, d8, d30 // cost[2,6] *= 105 + + vrev64.16 q1, q1 + vrev64.16 q3, q3 + vext.8 q1, q1, q1, #10 // sum_diag[0][14-n] + vext.8 q3, q3, q3, #10 // sum_diag[1][14-n] + + vstr s16, [sp, #2*4] // cost[2] + vstr s17, [sp, #6*4] // cost[6] + + movrel_local r12, div_table + vld1.16 {q14}, [r12, :128] + + vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0] + vmull.s16 q12, 
d1, d1 + vmlal.s16 q5, d2, d2 + vmlal.s16 q12, d3, d3 + vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1] + vmull.s16 q1, d5, d5 + vmlal.s16 q0, d6, d6 + vmlal.s16 q1, d7, d7 + vmovl.u16 q13, d28 // div_table + vmovl.u16 q14, d29 + vmul.i32 q5, q5, q13 // cost[0] + vmla.i32 q5, q12, q14 + vmul.i32 q0, q0, q13 // cost[4] + vmla.i32 q0, q1, q14 + vadd.i32 d10, d10, d11 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1 + + movrel_local r12, alt_fact + vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105 + + vstr s0, [sp, #0*4] // cost[0] + vstr s1, [sp, #4*4] // cost[4] + + vmovl.u16 q13, d29 // div_table[2*m+1] + 105 + vmovl.u16 q14, d30 + vmovl.u16 q15, d31 + + cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3] + cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7] + vstr s28, [sp, #1*4] // cost[1] + vstr s29, [sp, #3*4] // cost[3] + + mov r0, #0 // best_dir + vmov.32 r1, d0[0] // best_cost + mov r3, #1 // n + + vstr s30, [sp, #5*4] // cost[5] + vstr s31, [sp, #7*4] // cost[7] + + vmov.32 r12, d14[0] + + find_best d14[0], d8[0], d14[1] + find_best d14[1], d0[1], d15[0] + find_best d15[0], d8[1], d15[1] + find_best d15[1] + + eor r3, r0, #4 // best_dir ^4 + ldr r12, [sp, r3, lsl #2] + sub r1, r1, r12 // best_cost - cost[best_dir ^ 4] + lsr r1, r1, #10 + str r1, [r2] // *var + + add sp, sp, #32 + vpop {q4-q7} + pop {pc} +endfunc +.endm diff -Nru dav1d-0.7.1/src/arm/32/film_grain16.S dav1d-0.9.1/src/arm/32/film_grain16.S --- dav1d-0.7.1/src/arm/32/film_grain16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/film_grain16.S 2021-07-28 21:38:28.857851700 +0000 @@ -0,0 +1,949 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 + +.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off + vmov.u16 r11, \src1[0+\off] + vmov.u16 r12, \src3[0+\off] + add r11, r11, r3 + vmov.u16 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u16 r11, \src3[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, [r12] + vmov.u16 r12, \src2[0+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u16 lr, \src4[0+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u16 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u16 r12, \src4[2+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0 + gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0 + gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, d2, d3, 0 + gather_interleaved d8, d9, d0, d1, d2, d3, 1 + pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, lsl #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH*2 // grain_lut stride + ldr r10, [sp, #124] // bitdepth_max + + eor r4, r4, #15 // 15 - scaling_shift + vdup.16 q6, r10 // bitdepth_max + clz r10, r10 + vdup.16 q13, r4 // 15 - scaling_shift + rsb r10, r10, #24 // bitdepth_min_8 + cmp r8, #0 + vdup.16 q12, r10 // bitdepth_min_8 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i16 q14, #16 + vmov.i16 q15, #235 + vshl.s16 q14, q14, q12 + vshl.s16 q15, q15, q12 + b 2f +1: + // no clip + vmov.i16 q14, #0 + vmov q15, q6 +2: + vshr.u16 q6, q6, #1 // grain_max + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #18 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + 
calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.16 d14, d24[0] + vdup.16 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + sub r2, r2, #32 // src_stride -= 32 + sub r9, r9, #32 // grain_stride -= 32 + bx r11 +endfunc + +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.16 {d0}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r6]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r8], r9 // grain_lut top old +.endif +.if \oy + vld1.16 {q4, q5}, [r6], r9 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r1, :128]! // src +.endif + vld1.16 {q8, q9}, [r5]! // grain_lut +.if !\ox && !\oy + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif +.if !\oy + vmvn.i16 q5, #0xf000 // 0x0fff +.endif + vld1.16 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vmlal.s16 q0, d16, d25 +.endif + +.if \oy +.if \ox + add r8, r8, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vmvn d0, d12 // grain_min + vqrshrn.s32 d4, q1, #5 + vmin.s16 d16, d16, d12 + vmin.s16 d4, d4, d12 + vmax.s16 d16, d16, d0 + vmax.s16 d4, d4, d0 +.endif + + vmull.s16 q0, d4, d14 + vmull.s16 q1, d5, d14 + vmull.s16 q2, d6, d14 + vmull.s16 q3, d7, d14 + vmlal.s16 q0, d16, d15 + vmlal.s16 q1, d17, d15 + vmlal.s16 q2, d18, d15 + vmlal.s16 q3, d19, d15 + vmull.s16 q8, d20, d15 + vmull.s16 q9, d21, d15 + vmull.s16 q10, d22, d15 + vmull.s16 q11, d23, d15 + vmlal.s16 q8, d8, d14 + vmlal.s16 q9, d9, d14 + vmlal.s16 q10, d10, d14 + vmlal.s16 q11, d11, d14 + vmvn q4, q6 // grain_min + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q6 + vmin.s16 q9, q1, q6 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 q10, q2, q6 + vmin.s16 q11, q3, q6 + vmax.s16 q8, q8, q4 + vmax.s16 q9, q9, q4 + vld1.16 {q2, q3}, [r1, :128], r2 // src + vmvn.i16 q5, #0xf000 // 0x0fff + vmax.s16 q10, q10, q4 + vmax.s16 q11, q11, q4 +.elseif \ox + vmvn d4, d12 // grain_min + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r1, :128]! // src + vmin.s16 d16, d16, d12 + vmax.s16 d16, d16, d4 + vld1.16 {q2, q3}, [r1, :128], r2 // src +.endif + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
+ vand q0, q0, q5 + vand q1, q1, q5 + vand q2, q2, q5 + vand q3, q3, q5 + + bl gather32_neon + +.if \ox || \oy + vpush {q6-q7} +.endif + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + +.if \ox || \oy + vpop {q6-q7} +.endif + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + + vst1.16 {q0, q1}, [r0, :128]! // dst + subs r7, r7, #1 +.if \oy + vdup.16 d14, d25[0] + vdup.16 d15, d25[1] +.endif + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r10, r11, [sp, #124] // uv, is_id + ldr r6, [sp, #136] // bitdepth_max + + clz r7, r6 + rsb r7, r7, #24 // bitdepth_min_8 + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset + vld1.16 {d30[]}, [r12] // uv_luma_mult + lsl r10, r10, r7 // uv_offset << bitdepth_min_8 + vld1.16 {d30[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + eor lr, lr, #15 // 15 - scaling_shift + + vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8 + + cmp r12, #0 + vdup.16 q13, lr // 15 - scaling_shift + + beq 1f + // clip + cmp r11, #0 + mov r8, #16 + mov r9, #240 + lsl r8, r8, r7 + lsl r9, r9, r7 + beq 2f + // is_id + mov r9, #235 + lsl r9, r9, r7 + b 2f +1: + // no clip + mov r8, #0 + mov r9, r6 // bitdepth_max +2: + vmov.16 d30[3], r6 // bitdepth_max + vdup.16 d31, r8 // clip_min + + mov r10, #GRAIN_WIDTH*2 // grain_lut stride + +.if \sy + mov r6, #23 + mov r7, #22 +.else + mov r6, #27 + mov r7, #17 +.endif + vmov.16 d31[1], r9 // clip_max + + ldrd r8, r9, [sp, #116] // offsets, h + + add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + vmov.16 d31[2], r6 // overlap y [0] + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, 
r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + vmov.16 d31[3], r7 // overlap y [1] + + add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + + vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. + bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + sub r7, r7, #32 // luma_stride -= 32 + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): + sub r2, r2, #32 // src_stride -= 32 + sub r10, r10, #32 // grain_stride -= 32 +.if \oy + mov r12, lr +.endif +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart): +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.16 {q2, q3}, [r8]! // grain_lut top +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5]! 
// grain_lut +.if \oy + vld1.16 {q4, q5}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif + vld1.16 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + add r4, r4, #32 + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + add r11, r11, #32 + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vmull.s16 q8, d20, d29 + vmull.s16 q9, d21, d29 + vmull.s16 q10, d22, d29 + vmull.s16 q11, d23, d29 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vmlal.s16 q8, d8, d28 + vmlal.s16 q9, d9, d28 + vmlal.s16 q10, d10, d28 + vmlal.s16 q11, d11, d28 + vqrshrn.s32 d0, q0, #5 + vqrshrn.s32 d1, q1, #5 + vqrshrn.s32 d2, q2, #5 + vqrshrn.s32 d3, q3, #5 + vqrshrn.s32 d4, q8, #5 + vqrshrn.s32 d5, q9, #5 + vqrshrn.s32 d6, q10, #5 + vqrshrn.s32 d7, q11, #5 + vmin.s16 q8, q0, q7 + vmin.s16 q9, q1, q7 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 q10, q2, q7 + vmin.s16 q11, q3, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 q10, q10, q6 + vmax.s16 q11, q11, q6 +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q4, q5}, [r1, :128]! // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d8, d29 + vmlal.s16 q7, d9, d29 + vmlal.s16 q0, d10, d29 + vmlal.s16 q1, d11, d29 + vld1.16 {q4, q5}, [r1, :128] // src + sub r1, r1, #32 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 + vmlal.s16 q0, d8, d29 + vmlal.s16 q1, d9, d29 + vmlal.s16 q2, d10, d29 + vmlal.s16 q3, d11, d29 + vdup.16 q14, d30[2] // uv_offset + vshrn.s32 d0, q0, #6 + vshrn.s32 d1, q1, #6 + vshrn.s32 d2, q2, #6 + vshrn.s32 d3, q3, #6 + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vadd.i16 q2, q0, q14 + vadd.i16 q3, q1, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmin.s16 q2, q2, q4 + vmin.s16 q3, q3, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 + vmax.s16 q2, q2, q5 + vmax.s16 q3, q3, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 + vand q2, q2, q14 + vand q3, q3, q14 +.endif + + bl gather32_neon + + vld1.16 {q0, q1}, [r1, :128]! 
// src + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vld1.16 {q2, q3}, [r1, :128], r2 // src + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + vshl.u16 q4, q4, q13 + vshl.u16 q5, q5, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + vqrdmulh.s16 q10, q10, q4 + vqrdmulh.s16 q11, q11, q5 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q0, q8 // *src + noise + vqadd.s16 q1, q1, q9 + vqadd.s16 q2, q2, q10 + vqadd.s16 q3, q3, q11 + +.if \oy + vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmax.s16 q2, q2, q4 + vmax.s16 q3, q3, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + vmin.s16 q2, q2, q5 + vmin.s16 q3, q3, q5 + + vst1.16 {q0, q1}, [r0, :128]! // dst + + subs r9, r9, #1 +.if \oy + vmov.32 d31[1], lr // new coeffs for overlap y +.endif + + vst1.16 {q2, q3}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.16 {d0}, [r4], r10 // grain_lut old +.endif +.if \ox && \oy + vld1.16 {d2}, [r11], r10 // grain_lut top old +.endif +.if \oy + vld1.16 {q2, q3}, [r8], r10 // grain_lut top +.endif +.if !\ox && !\oy + vld1.16 {q0, q1}, [r6, :128]! // luma +.endif + vld1.16 {q8, q9}, [r5], r10 // grain_lut +.if \oy + vdup.16 d28, d31[2] // overlap y coeff + vdup.16 d29, d31[3] // overlap y coeff +.endif +.if !\ox && !\oy + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.endif + +.if \ox + vdup.16 q7, d30[3] // bitdepth_max + vmull.s16 q0, d0, d24 + vshr.u16 q7, q7, #1 // grain_max + vmlal.s16 q0, d16, d25 + vmvn q6, q7 // grain_min +.endif + +.if \oy +.if \ox + vmull.s16 q1, d2, d24 + vmlal.s16 q1, d4, d25 + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d4, q1, #5 + vmin.s16 d4, d4, d14 + vmin.s16 d16, d16, d14 + vmax.s16 d4, d4, d12 + vmax.s16 d16, d16, d12 +.endif + + vmull.s16 q0, d4, d28 + vmull.s16 q1, d5, d28 + vmull.s16 q2, d6, d28 + vmull.s16 q3, d7, d28 +.if !\ox + vdup.16 q7, d30[3] // bitdepth_max +.endif + vmlal.s16 q0, d16, d29 + vmlal.s16 q1, d17, d29 + vmlal.s16 q2, d18, d29 + vmlal.s16 q3, d19, d29 +.if !\ox + vshr.u16 q7, q7, #1 // grain_max +.endif + vqrshrn.s32 d16, q0, #5 + vqrshrn.s32 d17, q1, #5 + vqrshrn.s32 d18, q2, #5 + vqrshrn.s32 d19, q3, #5 +.if !\ox + vmvn q6, q7 // grain_min +.endif + vld1.16 {q0, q1}, [r6, :128]! 
// luma + vmin.s16 q8, q8, q7 + vmin.s16 q9, q9, q7 + vmax.s16 q8, q8, q6 + vmax.s16 q9, q9, q6 + vld1.16 {q2, q3}, [r6, :128], r7 // luma +.elseif \ox + vqrshrn.s32 d16, q0, #5 + vld1.16 {q0, q1}, [r6, :128]! // luma + vmin.s16 d16, d16, d14 + vld1.16 {q2, q3}, [r6, :128], r7 // luma + vmax.s16 d16, d16, d12 +.endif + + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vrshr.u16 q0, q0, #1 + vrshr.u16 q1, q1, #1 +.if !\csfl + vdup.16 d28, d30[0] // uv_luma_mult + vld1.16 {q2, q3}, [r1, :128], r2 // src + vdup.16 d29, d30[1] // uv_mult + vmull.s16 q6, d0, d28 + vmull.s16 q7, d1, d28 + vmull.s16 q0, d2, d28 + vmull.s16 q1, d3, d28 + vmlal.s16 q6, d4, d29 + vmlal.s16 q7, d5, d29 + vmlal.s16 q0, d6, d29 + vmlal.s16 q1, d7, d29 + vshrn.s32 d12, q6, #6 + vshrn.s32 d13, q7, #6 + vshrn.s32 d14, q0, #6 + vshrn.s32 d15, q1, #6 + vdup.16 q14, d30[2] // uv_offset + vdup.16 q4, d30[3] // bitdepth_max + vmov.i16 q5, #0 + vadd.i16 q6, q6, q14 + vadd.i16 q7, q7, q14 + vmin.s16 q0, q6, q4 + vmin.s16 q1, q7, q4 + vmax.s16 q0, q0, q5 + vmax.s16 q1, q1, q5 +.else + vdup.16 q14, d30[3] // bitdepth_max + vld1.16 {q2, q3}, [r1, :128], r2 // src + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + vand q0, q0, q14 + vand q1, q1, q14 +.endif + + bl gather16_neon + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift) + vshl.u16 q7, q7, q13 + + vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15) + vqrdmulh.s16 q9, q9, q7 + + + vdup.16 q4, d31[0] // clip_min + vdup.16 q5, d31[1] // clip_max + + vqadd.s16 q0, q2, q8 // *src + noise + vqadd.s16 q1, q3, q9 + +.if \oy + // Swap the two last coefficients of d31, place them first in d28 + vrev64.16 d28, d31 +.endif + + vmax.s16 q0, q0, q4 + vmax.s16 q1, q1, q4 + vmin.s16 q0, q0, q5 + vmin.s16 q1, q1, q5 + + subs r9, r9, #1 +.if \oy + // Take the first two 16 bit coefficients of d28 and place them at the + // end of d31 + vtrn.32 d31, d28 +.endif + + vst1.16 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.7.1/src/arm/32/film_grain.S dav1d-0.9.1/src/arm/32/film_grain.S --- dav1d-0.7.1/src/arm/32/film_grain.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/film_grain.S 2021-07-28 21:38:28.857851700 +0000 @@ -0,0 +1,714 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 + +.macro gather_interleaved dst1, dst2, src1, src2, off + vmov.u8 r11, \src1[0+\off] + vmov.u8 r12, \src2[0+\off] + add r11, r11, r3 + vmov.u8 lr, \src1[2+\off] + add r12, r12, r3 + vld1.8 {\dst1[0+\off]}, [r11] + vmov.u8 r11, \src2[2+\off] + add lr, lr, r3 + vld1.8 {\dst2[0+\off]}, [r12] + vmov.u8 r12, \src1[4+\off] + add r11, r11, r3 + vld1.8 {\dst1[2+\off]}, [lr] + vmov.u8 lr, \src2[4+\off] + add r12, r12, r3 + vld1.8 {\dst2[2+\off]}, [r11] + vmov.u8 r11, \src1[6+\off] + add lr, lr, r3 + vld1.8 {\dst1[4+\off]}, [r12] + vmov.u8 r12, \src2[6+\off] + add r11, r11, r3 + vld1.8 {\dst2[4+\off]}, [lr] + add r12, r12, r3 + vld1.8 {\dst1[6+\off]}, [r11] + vld1.8 {\dst2[6+\off]}, [r12] +.endm + +.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4 + gather_interleaved \dst1, \dst3, \src1, \src3, 0 + gather_interleaved \dst1, \dst3, \src1, \src3, 1 + gather_interleaved \dst2, \dst4, \src2, \src4, 0 + gather_interleaved \dst2, \dst4, \src2, \src4, 1 +.endm + +function gather32_neon + push {r11-r12,lr} + gather d8, d9, d10, d11, d0, d1, d2, d3 + pop {r11-r12,pc} +endfunc + +function gather16_neon + push {r11-r12,lr} + gather_interleaved d8, d9, d0, d1, 0 + gather_interleaved d8, d9, d0, d1, 1 + pop {r11-r12,pc} +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut + ldrd r6, r7, [sp, #108] // offsets, h + ldr r8, [sp, #116] // clip + mov r9, #GRAIN_WIDTH // grain_lut stride + + neg r4, r4 + vdup.16 q13, r4 // -scaling_shift + cmp r8, #0 + + movrel_local r12, overlap_coeffs_0 + + beq 1f + // clip + vmov.i8 q14, #16 + vmov.i8 q15, #235 + b 2f +1: 
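+        // (Whichever path is taken, q14/q15 end up holding the lower/upper
+        //  clamp that vmax.u8/vmin.u8 apply to every finished output row in
+        //  the loops below.)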
+ // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + add r5, r5, #9 // grain_lut += 9 + add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r9 // grain_lut += grain_stride + + ldr r10, [r6, #8] // offsets[1][0] + calc_offset r10, r4, r10, 0, 0 + add_offset r4, r10, r4, r5, r9 + ldr r10, [r6, #4] // offsets[0][1] + calc_offset r10, r11, r10, 0, 0 + add_offset r11, r10, r11, r5, r9 + ldr r10, [r6, #12] // offsets[1][1] + calc_offset r10, r8, r10, 0, 0 + add_offset r8, r10, r8, r5, r9 + ldr r6, [r6] // offsets[0][0] + calc_offset r6, lr, r6, 0, 0 + add_offset r5, r6, lr, r5, r9 + + add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + ldr r10, [sp, #120] // type + adr r11, L(fgy_loop_tbl) + + tst r10, #1 + ldr r10, [r11, r10, lsl #2] + + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx + + add r11, r11, r10 + + beq 1f + // y overlap + vdup.8 d14, d24[0] + vdup.8 d15, d24[1] + mov r10, r7 // backup actual h + mov r7, #2 +1: + bx r11 +endfunc + +function fgy_loop_neon +L(fgy_loop_tbl): + .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB + .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB + +.macro fgy ox, oy +L(loop_\ox\oy): +1: +.if \ox + vld1.8 {d8}, [r4], r9 // grain_lut old +.endif +.if \oy + vld1.8 {q2, q3}, [r6], r9 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r8], r9 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r1, :128], r2 // src + vld1.8 {q10, q11}, [r5], r9 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d4, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d4, q5, #5 +.endif + + vmull.s8 q4, d20, d15 + vmull.s8 q5, d21, d15 + vmull.s8 q8, d22, d15 + vmull.s8 q9, d23, d15 + vmlal.s8 q4, d4, d14 + vmlal.s8 q5, d5, d14 + vmlal.s8 q8, d6, d14 + vmlal.s8 q9, d7, d14 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q8, #5 + vqrshrn.s16 d23, q9, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif + + bl gather32_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q2, d8 // scaling + vmovl.u8 q3, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q2 // scaling * grain + vmul.i16 q9, q9, q3 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r7, r7, #1 +.if \oy + vdup.8 d14, d25[0] + vdup.8 d15, d25[1] +.endif + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r10, #2 + sub r7, r10, #2 // restore actual remaining h + bgt L(loop_\ox\()0) +.endif + vpop {q4-q7} + pop {r4-r11,pc} +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const 
entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] // data, grain_lut + ldrd r6, r7, [sp, #108] // luma_row, luma_stride + ldrd r8, r9, [sp, #116] // offsets, h + ldrd r10, r11, [sp, #124] // uv, is_id + + // !csfl + add r10, r4, r10, lsl #2 // + 4*uv + add r12, r10, #FGD_UV_LUMA_MULT + add lr, r10, #FGD_UV_MULT + add r10, r10, #FGD_UV_OFFSET + vld1.16 {d4[]}, [r12] // uv_luma_mult + vld1.16 {d4[2]}, [r10] // uv_offset + vld1.16 {d4[1]}, [lr] // uv_mult + + ldr lr, [r4, #FGD_SCALING_SHIFT] + ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg lr, lr // -scaling_shift + + cmp r12, #0 + vdup.16 q13, lr // -scaling_shift + + beq 1f + // clip + cmp r11, #0 + vmov.i8 q14, #16 + vmov.i8 q15, #240 + beq 2f + // is_id + vmov.i8 q15, #235 + b 2f +1: + // no clip + vmov.i8 q14, #0 + vmov.i8 q15, #255 +2: + + mov r10, #GRAIN_WIDTH // grain_lut stride + + add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride + add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride +.else + add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride + add r5, r5, r10 // grain_lut += grain_stride +.endif + + ldr r12, [r8, #8] // offsets[1][0] + calc_offset r12, r4, r12, \sx, \sy + add_offset r4, r12, r4, r5, r10 + + ldr r12, [r8, #4] // offsets[0][1] + calc_offset r12, lr, r12, \sx, \sy + add_offset lr, r12, lr, r5, r10 + + ldr r12, [r8, #12] // offsets[1][1] + calc_offset r12, r11, r12, \sx, \sy + add_offset r11, r12, r11, r5, r10 + + ldr r8, [r8] // offsets[0][0] + calc_offset r8, r12, r8, \sx, \sy + add_offset r5, r8, r12, r5, r10 + + add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + movrel_local r12, overlap_coeffs_\sx + ldr lr, [sp, #132] // type + + vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs + + movrel_local r12, L(fguv_loop_sx\sx\()_tbl) +#if CONFIG_THUMB + // This uses movrel_local instead of adr above, because the target + // can be out of range for adr. But movrel_local leaves the thumb bit + // set on COFF (but probably wouldn't if building for thumb on ELF), + // thus try to clear the bit for robustness. 
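+        // (The "+ CONFIG_THUMB" term in each table entry re-adds the thumb
+        //  bit to the computed target, so the final "bx r12" below still
+        //  enters thumb code after the bit is cleared here.)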
+ bic r12, r12, #1 +#endif + + tst lr, #1 + ldr lr, [r12, lr, lsl #2] + + add r12, r12, lr + + beq 1f + // y overlap + sub lr, r9, #(2 >> \sy) // backup remaining h + mov r9, #(2 >> \sy) + +1: + +.if \sy + vmov.i8 d6, #23 + vmov.i8 d7, #22 +.else + vmov.i8 d6, #27 + vmov.i8 d7, #17 +.endif + +.if \sy + add r7, r7, r7 // luma_stride *= 2 +.endif + + bx r12 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +L(fguv_loop_sx0_tbl): + .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8, q9}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10, q11}, [r5], r10 // grain_lut + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmull.s8 q6, d22, d7 + vmull.s8 q7, d23, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vmlal.s8 q6, d18, d6 + vmlal.s8 q7, d19, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 + vqrshrn.s16 d22, q6, #5 + vqrshrn.s16 d23, q7, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if !\csfl + vld1.8 {q8, q9}, [r1, :128] // src + vmovl.u8 q4, d0 + vmovl.u8 q5, d1 + vmovl.u8 q6, d2 + vmovl.u8 q7, d3 + vmovl.u8 q0, d16 + vmovl.u8 q1, d17 + vmovl.u8 q8, d18 + vmovl.u8 q9, d19 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q6, q6, d4[0] + vmul.i16 q7, q7, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vmul.i16 q8, q8, d4[1] + vmul.i16 q9, q9, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vqadd.s16 q6, q6, q8 + vqadd.s16 q7, q7, q9 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vshr.s16 q6, q6, #6 + vshr.s16 q7, q7, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vadd.i16 q6, q6, q0 + vadd.i16 q7, q7, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 + vqmovun.s16 d2, q6 + vqmovun.s16 d3, q7 +.endif + + bl gather32_neon + + vld1.8 {q0, q1}, [r1, :128], r2 // src + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + vmovl.u8 q4, d10 + vmovl.u8 q5, d11 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + vmul.i16 q10, q10, q4 + vmul.i16 q11, q11, q5 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + vrshl.s16 q10, q10, q13 + vrshl.s16 q11, q11, q13 + + vaddw.u8 q8, q8, d0 // *src + noise + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + + vmax.u8 q0, q0, q14 + vmax.u8 q1, q1, q14 + vmin.u8 q0, q0, q15 + vmin.u8 q1, q1, q15 + + subs r9, r9, 
#1 +.if \oy + vdup.8 d6, d25[0] + vdup.8 d7, d25[1] +.endif + + vst1.8 {q0, q1}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function fguv_loop_sx1_neon +L(fguv_loop_sx1_tbl): + .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB + +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +.if \oy + mov r12, lr +.endif +1: +.if \ox + vld1.8 {d8}, [r4], r10 // grain_lut old +.endif +.if \oy + vld1.8 {q8}, [r8], r10 // grain_lut top +.endif +.if \ox && \oy + vld1.8 {d10}, [r11], r10 // grain_lut top old +.endif + vld1.8 {q0, q1}, [r6, :128], r7 // luma + vld1.8 {q10}, [r5], r10 // grain_lut + vld1.8 {q11}, [r1, :128], r2 // src + +.if \ox + vmull.s8 q4, d8, d24 + vmlal.s8 q4, d20, d25 +.endif + + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 +.if \oy +.if \ox + vmull.s8 q5, d10, d24 + vmlal.s8 q5, d16, d25 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d16, q5, #5 +.endif + + vmull.s8 q4, d20, d7 + vmull.s8 q5, d21, d7 + vmlal.s8 q4, d16, d6 + vmlal.s8 q5, d17, d6 + vqrshrn.s16 d20, q4, #5 + vqrshrn.s16 d21, q5, #5 +.elseif \ox + vqrshrn.s16 d20, q4, #5 +.endif +.if \csfl + vrshrn.u16 d0, q0, #1 + vrshrn.u16 d1, q1, #1 +.else + vrshr.u16 q4, q0, #1 + vrshr.u16 q5, q1, #1 + vmovl.u8 q0, d22 + vmovl.u8 q1, d23 + vmul.i16 q4, q4, d4[0] + vmul.i16 q5, q5, d4[0] + vmul.i16 q0, q0, d4[1] + vmul.i16 q1, q1, d4[1] + vqadd.s16 q4, q4, q0 + vqadd.s16 q5, q5, q1 + vdup.16 q0, d4[2] + vshr.s16 q4, q4, #6 + vshr.s16 q5, q5, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vqmovun.s16 d0, q4 + vqmovun.s16 d1, q5 +.endif + + bl gather16_neon + + vmovl.s8 q8, d20 // grain + vmovl.s8 q9, d21 + + vmovl.u8 q6, d8 // scaling + vmovl.u8 q7, d9 + + vmul.i16 q8, q8, q6 // scaling * grain + vmul.i16 q9, q9, q7 + + vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift) + vrshl.s16 q9, q9, q13 + + vaddw.u8 q8, q8, d22 // *src + noise + vaddw.u8 q9, q9, d23 + + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + + vmax.u8 q0, q0, q14 + vmin.u8 q0, q0, q15 + + subs r9, r9, #1 +.if \oy + vswp d6, d7 +.endif + vst1.8 {q0}, [r0, :128], r2 // dst + bgt 1b + +.if \oy + cmp r12, #0 + mov r9, r12 // restore actual remaining h + bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.7.1/src/arm/32/ipred16.S dav1d-0.9.1/src/arm/32/ipred16.S --- dav1d-0.7.1/src/arm/32/ipred16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/ipred16.S 2021-07-28 21:38:28.857851700 +0000 @@ -0,0 
+1,3254 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, B Krishnan Iyer + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height, +// const int bitdepth_max); +function ipred_dc_128_16bpc_neon, export=1 + push {r4, lr} + ldr r4, [sp, #8] + ldr r12, [sp, #24] + clz r3, r3 + adr r2, L(ipred_dc_128_tbl) + sub r3, r3, #25 + vdup.16 q0, r12 + ldr r3, [r2, r3, lsl #2] + add r12, r0, r1 + vrshr.u16 q0, q0, #1 + add r2, r2, r3 + lsl r1, r1, #1 + bx r2 + + .align 2 +L(ipred_dc_128_tbl): + .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4, pc} +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4, pc} +160: + vmov q1, q0 +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vmov q1, q0 + sub r1, r1, #32 +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vmov q1, q0 + sub r1, r1, #96 +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_16bpc_neon, export=1 + push {r4, lr} + ldr lr, [sp, #8] + clz r3, r3 + adr r4, L(ipred_v_tbl) + sub r3, r3, #25 + ldr r3, [r4, r3, lsl #2] + add r2, r2, #2 + add r4, r4, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r4 + + .align 2 +L(ipred_v_tbl): + .word 640f - L(ipred_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_v_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r2] +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs lr, lr, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4, pc} +80: + vld1.16 {q0}, [r2] +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4, pc} +160: + vld1.16 {q0, q1}, [r2] +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vld1.16 {q0, q1}, [r2]! + sub r1, r1, #32 + vld1.16 {q2, q3}, [r2] +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vld1.16 {q0, q1}, [r2]! + sub r1, r1, #96 + vld1.16 {q2, q3}, [r2]! + vld1.16 {q8, q9}, [r2]! + vld1.16 {q10, q11}, [r2]! +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128]! + vst1.16 {d4, d5, d6, d7}, [r12, :128]! + subs lr, lr, #2 + vst1.16 {d16, d17, d18, d19}, [r0, :128]! + vst1.16 {d16, d17, d18, d19}, [r12, :128]! 
+ vst1.16 {d20, d21, d22, d23}, [r0, :128], r1 + vst1.16 {d20, d21, d22, d23}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_h_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + sub r2, r2, #2 + mov lr, #-2 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_h_tbl): + .word 640f - L(ipred_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_h_tbl) + CONFIG_THUMB + .word 8f - L(ipred_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_h_tbl) + CONFIG_THUMB +40: + sub r2, r2, #6 + mov lr, #-8 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2], lr + vst1.16 {d3}, [r0, :64], r1 + vst1.16 {d2}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d1}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4-r5, pc} +8: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128], r1 + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128], r1 + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + sub r1, r1, #16 +16: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128]! + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + sub r1, r1, #48 +32: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128]! + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vst1.16 {q1}, [r12, :128]! + vst1.16 {q0}, [r0, :128]! + vst1.16 {q1}, [r12, :128]! + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #96 +64: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #2 + vld1.16 {d4[], d5[]}, [r2], lr + vmov q1, q0 + vmov q3, q2 + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! 
+ vst1.16 {q0, q1}, [r0, :128], r1 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_dc_top_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + add r2, r2, #2 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_top_tbl): + .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r2] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 d0, d0[0] +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4-r5, pc} +80: + vld1.16 {d0, d1}, [r2] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + vld1.16 {d0, d1, d2, d3}, [r2] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d4, d0, #4 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + vld1.16 {d0, d1, d2, d3}, [r2]! + vld1.16 {d4, d5, d6, d7}, [r2] + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d18, q0, #5 + vdup.16 q0, d18[0] + vdup.16 q1, d18[0] + sub r1, r1, #32 +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + vld1.16 {d0, d1, d2, d3}, [r2]! + vld1.16 {d4, d5, d6, d7}, [r2]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2]! + vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpadd.i32 d0, d0, d0 + vrshrn.i32 d18, q0, #6 + vdup.16 q0, d18[0] + vdup.16 q1, d18[0] + sub r1, r1, #96 +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + sub r2, r2, r4, lsl #1 + clz r3, r3 + clz lr, r4 + sub lr, lr, #25 + adr r5, L(ipred_dc_left_tbl) + sub r3, r3, #20 + ldr r3, [r5, r3, lsl #2] + ldr lr, [r5, lr, lsl #2] + add r3, r5, r3 + add r5, r5, lr + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_left_tbl): + .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + +L(ipred_dc_left_h4): + vld1.16 {d0}, [r2, :64] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w4): + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt L(ipred_dc_left_w4) + pop {r4-r5, pc} +L(ipred_dc_left_h8): + vld1.16 {d0, d1}, [r2, :128] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w8): + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt L(ipred_dc_left_w8) + pop {r4-r5, pc} +L(ipred_dc_left_h16): + vld1.16 {d0, d1, d2, d3}, [r2, :128] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w16): + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h32): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128] + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w32): + sub r1, r1, #32 + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h64): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2, :128]! 
+ vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpadd.i32 d0, d0, d0 + vrshrn.i32 d0, q0, #6 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w64): + sub r1, r1, #96 + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_16bpc_neon, export=1 + push {r4-r6, lr} + ldr r4, [sp, #16] + sub r2, r2, r4, lsl #1 + add lr, r3, r4 // width + height + clz r3, r3 + clz r12, r4 + vdup.32 q15, lr // width + height + adr r5, L(ipred_dc_tbl) + rbit lr, lr // rbit(width + height) + sub r3, r3, #20 // 25 leading bits, minus table offset 5 + sub r12, r12, #25 + clz lr, lr // ctz(width + height) + ldr r3, [r5, r3, lsl #2] + ldr r12, [r5, r12, lsl #2] + neg lr, lr // -ctz(width + height) + add r3, r5, r3 + add r5, r5, r12 + vshr.u32 q15, q15, #1 // (width + height) >> 1 + vdup.32 q14, lr // -ctz(width + height) + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_tbl): + .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB + +L(ipred_dc_h4): + vld1.16 {d0}, [r2, :64]! + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w4): + vld1.16 {d2}, [r2] + vadd.i32 d0, d0, d30 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #4 + vadd.i32 d0, d0, d2 + vshl.u32 d0, d0, d28 + beq 1f + // h = 8/16 + cmp r4, #16 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d0, d0, d24 + vshr.u32 d0, d0, #17 +1: + vdup.16 d0, d0[0] +2: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h8): + vld1.16 {d0, d1}, [r2, :128]! 
+ vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w8): + vld1.16 {d2, d3}, [r2] + vadd.i32 d0, d0, d30 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #8 + vadd.i32 d0, d0, d2 + vshl.u32 d0, d0, d28 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d0, d0, d24 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] +2: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h16): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w16): + vld1.16 {d2, d3, d4, d5}, [r2] + vadd.i32 d0, d0, d30 + vadd.i16 q1, q1, q2 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d1 + vpaddl.u16 d2, d2 + cmp r4, #16 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h32): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w32): + vld1.16 {d2, d3, d4, d5}, [r2]! + vadd.i32 d0, d0, d30 + vld1.16 {d16, d17, d18, d19}, [r2] + vadd.i16 q1, q1, q2 + vadd.i16 q8, q8, q9 + vadd.i16 q1, q1, q8 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #32 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + sub r1, r1, #32 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} +L(ipred_dc_h64): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2, :128]! + vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2, :128]! + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + add r2, r2, #2 + vpadd.i32 d0, d0, d0 + bx r3 +L(ipred_dc_w64): + vld1.16 {d2, d3, d4, d5}, [r2]! + vadd.i32 d0, d0, d30 + vld1.16 {d16, d17, d18, d19}, [r2]! + vadd.i16 q1, q1, q2 + vld1.16 {d20, d21, d22, d23}, [r2]! + vadd.i16 q8, q8, q9 + vld1.16 {d24, d25, d26, d27}, [r2]! 
+ vadd.i16 q10, q10, q11 + vadd.i16 q12, q12, q13 + vadd.i16 q1, q1, q8 + vadd.i16 q10, q10, q12 + vadd.i16 q1, q1, q10 + vadd.i16 d2, d2, d3 + vpaddl.u16 d2, d2 + vpadd.i32 d2, d2, d2 + cmp r4, #64 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 16/32 + cmp r4, #16 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + sub r1, r1, #96 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} +endfunc + +// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_16bpc_neon, export=1 + push {r4-r6, lr} + vpush {q4} + ldr r4, [sp, #32] + clz lr, r3 + adr r12, L(ipred_paeth_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r2] + add r6, r2, #2 + sub r2, r2, #4 + add r12, r12, lr + mov r5, #-4 + add lr, r0, r1 + lsl r1, r1, #1 + bx r12 + + .align 2 +L(ipred_paeth_tbl): + .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB + +40: + sub r2, r2, #4 + mov r5, #-8 + vld1.16 {d6}, [r6] + vsub.i16 d16, d6, d4 // top - topleft + vmov d7, d6 + vmov d17, d16 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r5 + vadd.i16 q9, q8, q0 // base + vadd.i16 q10, q8, q1 + vabd.s16 q11, q3, q9 // tdiff + vabd.s16 q12, q3, q10 + vabd.s16 q13, q2, q9 // tldiff + vabd.s16 q14, q2, q10 + vabd.s16 q9, q0, q9 // ldiff + vabd.s16 q10, q1, q10 + vmin.u16 q15, q11, q13 // min(tdiff, tldiff) + vmin.u16 q4, q12, q14 + vcge.u16 q11, q13, q11 // tldiff >= tdiff + vcge.u16 q12, q14, q12 + vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff + vcge.u16 q10, q4, q10 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d24}, [lr, :64], r1 + subs r4, r4, #4 + vst1.16 {d23}, [r0, :64], r1 + vst1.16 {d22}, [lr, :64], r1 + bgt 4b + vpop {q4} + pop {r4-r6, pc} +80: +160: +320: +640: + vld1.16 {q3}, [r6]! + mov r12, r3 + sub r1, r1, r3, lsl #1 +1: + vld2.16 {d0[], d2[]}, [r2, :32], r5 + vmov d1, d0 + vmov d3, d2 +2: + vsub.i16 q8, q3, q2 // top - topleft + vadd.i16 q9, q8, q0 // base + vadd.i16 q10, q8, q1 + vabd.s16 q11, q3, q9 // tdiff + vabd.s16 q12, q3, q10 + vabd.s16 q13, q2, q9 // tldiff + vabd.s16 q14, q2, q10 + vabd.s16 q9, q0, q9 // ldiff + vabd.s16 q10, q1, q10 + vmin.u16 q15, q11, q13 // min(tdiff, tldiff) + vmin.u16 q4, q12, q14 + vcge.u16 q11, q13, q11 // tldiff >= tdiff + vcge.u16 q12, q14, q12 + vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff + vcge.u16 q10, q4, q10 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + subs r3, r3, #8 + vst1.16 {q12}, [r0, :128]! + vst1.16 {q11}, [lr, :128]! + ble 8f + vld1.16 {q3}, [r6]! 
+ b 2b +8: + subs r4, r4, #2 + ble 9f + // End of horizontal loop, move pointers to next two rows + sub r6, r6, r12, lsl #1 + add r0, r0, r1 + add lr, lr, r1 + vld1.16 {q3}, [r6]! + mov r3, r12 + b 1b +9: + vpop {q4} + pop {r4-r6, pc} +endfunc + +// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_16bpc_neon, export=1 + push {r4-r10, lr} + ldr r4, [sp, #32] + movrel r10, X(sm_weights) + add r12, r10, r4 + add r10, r10, r3 + clz r9, r3 + adr r5, L(ipred_smooth_tbl) + sub lr, r2, r4, lsl #1 + sub r9, r9, #25 + ldr r9, [r5, r9, lsl #2] + vld1.16 {d4[], d5[]}, [lr] // bottom + add r8, r2, #2 + add r5, r5, r9 + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_tbl): + .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB + +40: + vld1.16 {d16}, [r8] // top + vld1.32 {d18[]}, [r10, :32] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vdup.16 q3, d16[3] // right + vsub.i16 q8, q8, q2 // top-bottom + vmovl.u8 q9, d18 // weights_hor + vadd.i16 d19, d4, d6 // bottom+right +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left + vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver + vshll.u16 q12, d19, #8 // (bottom+right)*256 + vshll.u16 q13, d19, #8 + vshll.u16 q14, d19, #8 + vshll.u16 q15, d19, #8 + vzip.32 d20, d21 // weights_ver + vzip.32 d22, d23 + vsub.i16 q1, q1, q3 // left-right + vsub.i16 q0, q0, q3 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d2, d18 // (left flipped) + vmlal.s16 q14, d1, d18 + vmlal.s16 q15, d0, d18 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d16, d21 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d16, d23 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d25}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d27}, [r6, :64], r1 + bgt 4b + pop {r4-r10, pc} +80: + vld1.16 {q8}, [r8] // top + vld1.8 {d18}, [r10, :64] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vdup.16 q3, d17[3] // right + vsub.i16 q8, q8, q2 // top-bottom + vmovl.u8 q9, d18 // weights_hor + vadd.i16 d3, d4, d6 // bottom+right +8: + vld2.16 {d0[], d1[]}, [r2, :32], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! 
// weights_ver + vshll.u16 q12, d3, #8 // (bottom+right)*256 + vshll.u16 q13, d3, #8 + vshll.u16 q14, d3, #8 + vshll.u16 q15, d3, #8 + vsub.i16 q0, q0, q3 // left-right + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d1, d19 // (left flipped) + vmlal.s16 q14, d0, d18 + vmlal.s16 q15, d0, d19 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d17, d20 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d17, d22 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + subs r4, r4, #2 + vst1.16 {q12}, [r0, :128], r1 + vst1.16 {q13}, [r6, :128], r1 + bgt 8b + pop {r4-r10, pc} +160: +320: +640: + add lr, r2, r3, lsl #1 + sub r2, r2, #4 + mov r7, #-4 + vld1.16 {d6[], d7[]}, [lr] // right + sub r1, r1, r3, lsl #1 + mov r9, r3 + vadd.i16 d3, d4, d6 // bottom+right + +1: + vld2.16 {d0[], d1[]}, [r2, :32], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver + vsub.i16 q0, q0, q3 // left-right + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 +2: + vld1.8 {d18}, [r10, :64]! // weights_hor + vld1.16 {q8}, [r8]! // top + vshll.u16 q12, d3, #8 // (bottom+right)*256 + vshll.u16 q13, d3, #8 + vmovl.u8 q9, d18 // weights_hor + vshll.u16 q14, d3, #8 + vshll.u16 q15, d3, #8 + vsub.i16 q8, q8, q2 // top-bottom + vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d1, d19 // (left flipped) + vmlal.s16 q14, d0, d18 + vmlal.s16 q15, d0, d19 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d17, d20 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d17, d22 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + subs r3, r3, #8 + vst1.16 {q12}, [r0, :128]! + vst1.16 {q13}, [r6, :128]! + bgt 2b + subs r4, r4, #2 + ble 9f + sub r8, r8, r9, lsl #1 + sub r10, r10, r9 + add r0, r0, r1 + add r6, r6, r1 + mov r3, r9 + b 1b +9: + pop {r4-r10, pc} +endfunc + +// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_16bpc_neon, export=1 + push {r4-r7, lr} + ldr r4, [sp, #20] + movrel r7, X(sm_weights) + add r7, r7, r4 + clz lr, r3 + adr r5, L(ipred_smooth_v_tbl) + sub r12, r2, r4, lsl #1 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r12] // bottom + add r2, r2, #2 + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_v_tbl): + .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + +40: + vld1.16 {d6}, [r2] // top + vsub.i16 d6, d6, d4 // top-bottom + vmov d7, d6 +4: + vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! 
// weights_ver + vzip.32 d16, d17 // weights_ver + vzip.32 d18, d19 + vshll.u8 q8, d16, #7 // weights_ver << 7 + vshll.u8 q9, d18, #7 + vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q11, q3, q9 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {d20}, [r0, :64], r1 + vst1.16 {d21}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d22}, [r0, :64], r1 + vst1.16 {d23}, [r6, :64], r1 + bgt 4b + pop {r4-r7, pc} +80: + vld1.16 {q3}, [r2] // top + vsub.i16 q3, q3, q2 // top-bottom +8: + vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver + vshll.u8 q8, d16, #7 // weights_ver << 7 + vshll.u8 q9, d18, #7 + vshll.u8 q10, d20, #7 + vshll.u8 q11, d22, #7 + vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q9, q3, q9 + vqrdmulh.s16 q10, q3, q10 + vqrdmulh.s16 q11, q3, q11 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {q8}, [r0, :128], r1 + vst1.16 {q9}, [r6, :128], r1 + subs r4, r4, #4 + vst1.16 {q10}, [r0, :128], r1 + vst1.16 {q11}, [r6, :128], r1 + bgt 8b + pop {r4-r7, pc} +160: +320: +640: + vpush {q4-q7} + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3, lsl #1 + mov r12, r3 + +1: + vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver + vshll.u8 q4, d8, #7 // weights_ver << 7 + vshll.u8 q5, d10, #7 + vshll.u8 q6, d12, #7 + vshll.u8 q7, d14, #7 +2: + vld1.16 {q0, q1}, [r2]! // top + vsub.i16 q0, q0, q2 // top-bottom + vsub.i16 q1, q1, q2 + vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q9, q1, q4 + vqrdmulh.s16 q10, q0, q5 + vqrdmulh.s16 q11, q1, q5 + vqrdmulh.s16 q12, q0, q6 + vqrdmulh.s16 q13, q1, q6 + vqrdmulh.s16 q14, q0, q7 + vqrdmulh.s16 q15, q1, q7 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q2 + vadd.i16 q14, q14, q2 + vadd.i16 q15, q15, q2 + subs r3, r3, #16 + vst1.16 {q8, q9}, [r0, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r5, :128]! + vst1.16 {q14, q15}, [lr, :128]! 
+ bgt 2b + subs r4, r4, #4 + ble 9f + sub r2, r2, r12, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r7, pc} +endfunc + +// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_16bpc_neon, export=1 + push {r4-r8, lr} + ldr r4, [sp, #24] + movrel r8, X(sm_weights) + add r8, r8, r3 + clz lr, r3 + adr r5, L(ipred_smooth_h_tbl) + add r12, r2, r3, lsl #1 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r12] // right + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_h_tbl): + .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[]}, [r8, :32] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vshll.u8 q3, d6, #7 // weights_hor << 7 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left + vsub.i16 q0, q0, q2 // left-right + vsub.i16 q1, q1, q2 + subs r4, r4, #4 + vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q0, q3 // (left flipped) + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vst1.16 {d17}, [r0, :64], r1 + vst1.16 {d16}, [r6, :64], r1 + vst1.16 {d19}, [r0, :64], r1 + vst1.16 {d18}, [r6, :64], r1 + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d6}, [r8, :64] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vshll.u8 q3, d6, #7 // weights_hor << 7 +8: + vld1.16 {d23}, [r2, :64], r7 // left + subs r4, r4, #4 + vsub.i16 d23, d23, d4 // left-right + vdup.16 q8, d23[3] // flip left + vdup.16 q9, d23[2] + vdup.16 q10, d23[1] + vdup.16 q11, d23[0] + vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q9, q3 + vqrdmulh.s16 q10, q10, q3 + vqrdmulh.s16 q11, q11, q3 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {q8}, [r0, :128], r1 + vst1.16 {q9}, [r6, :128], r1 + vst1.16 {q10}, [r0, :128], r1 + vst1.16 {q11}, [r6, :128], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: +640: + vpush {q4-q7} + sub r2, r2, #8 + mov r7, #-8 + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3, lsl #1 + mov r12, r3 + +1: + vld1.16 {d15}, [r2, :64], r7 // left + vsub.i16 d15, d15, d4 // left-right + vdup.16 q4, d15[3] // flip left + vdup.16 q5, d15[2] + vdup.16 q6, d15[1] + vdup.16 q7, d15[0] +2: + vld1.8 {q1}, [r8, :128]! // weights_hor + subs r3, r3, #16 + vshll.u8 q0, d2, #7 // weights_hor << 7 + vshll.u8 q1, d3, #7 + vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q1, q4 + vqrdmulh.s16 q10, q0, q5 + vqrdmulh.s16 q11, q1, q5 + vqrdmulh.s16 q12, q0, q6 + vqrdmulh.s16 q13, q1, q6 + vqrdmulh.s16 q14, q0, q7 + vqrdmulh.s16 q15, q1, q7 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q2 + vadd.i16 q14, q14, q2 + vadd.i16 q15, q15, q2 + vst1.16 {q8, q9}, [r0, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r5, :128]! + vst1.16 {q14, q15}, [lr, :128]! 
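+// NOTE (illustrative): ipred_smooth_h is the transposed counterpart of
+// smooth_v. Per pixel it conceptually computes
+//   dst[y][x] = right + (((left[y] - right) * weights_hor[x] + 128) >> 8)
+// with right = topleft[width] and weights_hor = sm_weights + width. The
+// left neighbours sit at negative offsets from topleft, so they are read
+// backwards and the lanes reversed again ("flip left") before the same
+// vqrdmulh trick folds the +128 and >> 8 into a single instruction.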
+ bgt 2b + subs r4, r4, #4 + ble 9f + sub r8, r8, r12 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height, +// const int bitdepth_max); +.macro filter_fn bpc +function ipred_filter_\bpc\()bpc_neon, export=1 + movw r12, #511 + ldrd r4, r5, [sp, #88] + and r5, r5, r12 // 511 + movrel r6, X(filter_intra_taps) + lsl r5, r5, #6 + add r6, r6, r5 + vld1.8 {d20, d21, d22, d23}, [r6, :128]! + clz lr, r3 + adr r5, L(ipred_filter\bpc\()_tbl) + vld1.8 {d27, d28, d29}, [r6, :64] + sub lr, lr, #26 + ldr lr, [r5, lr, lsl #2] + vmovl.s8 q8, d20 + vmovl.s8 q9, d21 + add r5, r5, lr + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + add r6, r0, r1 + lsl r1, r1, #1 + vmovl.s8 q12, d27 + vmovl.s8 q13, d28 + vmovl.s8 q14, d29 + mov r7, #-4 + vdup.16 q15, r8 + add r8, r2, #2 + sub r2, r2, #4 +.if \bpc == 10 + vmov.i16 q7, #0 +.endif + bx r5 + + .align 2 +L(ipred_filter\bpc\()_tbl): + .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r8] // top (0-3) +4: + vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) +.if \bpc == 10 + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vrshr.s16 q2, q2, #4 + vmax.s16 q2, q2, q7 +.else + vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) + vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s32 d4, q2, #4 + vqrshrun.s32 d5, q3, #4 +.endif + vmin.s16 q2, q2, q15 + subs r4, r4, #2 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r6, :64], r1 + vmov d0, d5 // move top from [4-7] to [0-3] + bgt 4b + vpop {q4-q7} + pop {r4-r8, pc} +80: + vld1.16 {q0}, [r8] // top (0-7) +8: + vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) +.if \bpc == 10 + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) + vrshr.s16 
q2, q2, #4 + vmax.s16 q2, q2, q7 + vmin.s16 q2, q2, q15 + vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) + vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6) + vrshr.s16 q3, q3, #4 + vmax.s16 q3, q3, q7 +.else + vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) + vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s32 d4, q2, #4 + vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1) + vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2) + vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3) + vqrshrun.s32 d5, q3, #4 + vmin.s16 q2, q2, q15 + vmlal.s16 q4, d24, d1[3] // p4(top[3]) * filter(4) + vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0) + vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5) + vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6) + vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1) + vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2) + vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3) + vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4) + vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0) + vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5) + vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d6, q4, #4 + vqrshrun.s32 d7, q5, #4 +.endif + vmin.s16 q3, q3, q15 + vswp d5, d6 + subs r4, r4, #2 + vst1.16 {q2}, [r0, :128], r1 + vmov q0, q3 + vst1.16 {q3}, [r6, :128], r1 + bgt 8b + vpop {q4-q7} + pop {r4-r8, pc} +160: +320: + sub r1, r1, r3, lsl #1 + mov lr, r3 + +1: + vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2) +2: + vld1.16 {q1, q2}, [r8]! 
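+// NOTE (illustrative): the filter intra predictor builds each 4x2 output
+// block from seven neighbours: p0 = topleft, p1..p4 = four top pixels,
+// p5/p6 = two left pixels, using signed 8-bit per-mode taps from
+// X(filter_intra_taps). Per output pixel i (tap indexing simplified here,
+// not the actual table layout):
+//
+//   int acc = 0;
+//   for (int j = 0; j < 7; j++)
+//       acc += taps[j][i] * p[j];          // p[] = { p0, ..., p6 }
+//   out[i] = iclip((acc + 8) >> 4, 0, bitdepth_max);
+//
+// Later blocks reuse the previous block's outputs as their top/left
+// neighbours. The 10 bpc expansion of this macro can accumulate in 16 bits
+// (vmul/vmla.i16, vrshr.s16 #4), while the 12 bpc one widens to 32 bits
+// (vmull/vmlal.s16, vqrshrun.s32 #4) to avoid overflow, which is
+// essentially the only difference between the two expansions.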
// top(0-15) +.if \bpc == 10 + vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) + vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) + vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) + + vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) + vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) + vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) + vrshr.s16 q3, q3, #4 + vmax.s16 q3, q3, q7 + vmin.s16 q3, q3, q15 + vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) + vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) + vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5) + vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6) + + vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) + vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) + vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) + vrshr.s16 q4, q4, #4 + vmax.s16 q4, q4, q7 + vmin.s16 q4, q4, q15 + vmov q0, q4 + vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) + vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) + vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) + + vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1) + vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2) + vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3) + vrshr.s16 q5, q5, #4 + vmax.s16 q5, q5, q7 + vmin.s16 q5, q5, q15 + vmov q0, q5 + vmov.u16 r12, d5[3] + vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4) + vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0) + vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6) + vmov.16 d0[2], r12 + subs r3, r3, #16 + vrshr.s16 q6, q6, #4 +.else + vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6) + vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4) + vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0) + vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5) + vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6) + vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1) + vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2) + vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3) + vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4) + vqrshrun.s32 d6, q3, #4 + vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1) + vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2) + vqrshrun.s32 d7, q4, #4 + vmin.s16 q3, q3, q15 + vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3) + vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4) + vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0) + vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5) + vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6) + vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1) + vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2) + vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3) + vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4) + vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0) + vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5) + vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d8, q5, #4 + vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1) + vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2) + vmlal.s16 q7, d22, d4[2] // 
p3(top[2]) * filter(3) + vqrshrun.s32 d9, q6, #4 + vmin.s16 q0, q4, q15 + vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4) + vmlal.s16 q7, d16, d3[3] // p0(topleft) * filter(0) + vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6) + vmin.s16 q4, q4, q15 + vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1) + vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2) + vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3) + vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4) + vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0) + vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d10, q7, #4 + vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1) + vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2) + vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3) + vqrshrun.s32 d11, q6, #4 + vmin.s16 q0, q5, q15 + vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4) + vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0) + vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6) + vmin.s16 q5, q5, q15 + vmov.u16 r12, d5[3] + vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1) + vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2) + vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3) + vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4) + vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0) + vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6) + vmov.16 d0[2], r12 + vqrshrun.s32 d12, q1, #4 + subs r3, r3, #16 + vqrshrun.s32 d13, q7, #4 +.endif + vswp q4, q5 +.if \bpc == 10 + vmax.s16 q6, q6, q7 +.endif + vswp d7, d10 + vmin.s16 q6, q6, q15 + + vswp d9, d12 + + vst1.16 {q3, q4}, [r0, :128]! + vst1.16 {q5, q6}, [r6, :128]! + ble 8f + vmov.u16 r12, d13[3] + vmov.16 d0[0], r12 + vmov.u16 r12, d9[3] + vmov.16 d0[1], r12 + b 2b +8: + subs r4, r4, #2 + + ble 9f + sub r8, r6, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc +.endm + +filter_fn 10 +filter_fn 12 + +function ipred_filter_16bpc_neon, export=1 + push {r4-r8, lr} + vpush {q4-q7} + movw r12, 0x3ff + ldr r8, [sp, #104] + cmp r8, r12 + ble ipred_filter_10bpc_neon + b ipred_filter_12bpc_neon +endfunc + +// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + vld1.16 {q14}, [r2, :128] + clz lr, r4 + adr r12, L(pal_pred_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vmov.i16 q15, #0x100 + add r12, r12, lr + add r2, r0, r1 + bx r12 + + .align 2 +L(pal_pred_tbl): + .word 640f - L(pal_pred_tbl) + CONFIG_THUMB + .word 320f - L(pal_pred_tbl) + CONFIG_THUMB + .word 160f - L(pal_pred_tbl) + CONFIG_THUMB + .word 80f - L(pal_pred_tbl) + CONFIG_THUMB + .word 40f - L(pal_pred_tbl) + CONFIG_THUMB + +40: + lsl r1, r1, #1 +4: + vld1.8 {q1}, [r3, :128]! + subs r5, r5, #4 + // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... 
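+// The restructuring is needed because vtbl.8 indexes bytes while the
+// palette entries in q14 are 16-bit: index n has to become the byte pair
+// (2*n, 2*n+1) selecting the low and high byte of pal[n]. The vadd.i16 of
+// q15 (0x0100 per lane) supplies the "+1" on the high byte after the vzip.
+// As a rough C model (illustrative only, stride in pixels):
+//
+//   for (int y = 0; y < h; y++, dst += stride)
+//       for (int x = 0; x < w; x++)
+//           dst[x] = pal[idx[y * w + x]];   // pal: 8 x uint16_t, idx < 8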
+ vadd.i8 q0, q1, q1 + vadd.i8 q1, q1, q1 + vzip.8 q0, q1 + vadd.i16 q0, q0, q15 + vadd.i16 q1, q1, q15 + vtbl.8 d0, {q14}, d0 + vtbl.8 d1, {q14}, d1 + vst1.16 {d0}, [r0, :64], r1 + vtbl.8 d2, {q14}, d2 + vst1.16 {d1}, [r2, :64], r1 + vtbl.8 d3, {q14}, d3 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d3}, [r2, :64], r1 + bgt 4b + pop {r4-r5, pc} +80: + lsl r1, r1, #1 +8: + vld1.8 {q1, q2}, [r3, :128]! + subs r5, r5, #4 + // Prefer doing the adds twice, instead of chaining a vmov after + // the add. + vadd.i8 q0, q1, q1 + vadd.i8 q1, q1, q1 + vadd.i8 q3, q2, q2 + vadd.i8 q2, q2, q2 + vzip.8 q0, q1 + vzip.8 q2, q3 + vadd.i16 q0, q0, q15 + vadd.i16 q1, q1, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q2, q2, q15 + vtbl.8 d1, {q14}, d1 + vadd.i16 q3, q3, q15 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vst1.16 {q0}, [r0, :128], r1 + vtbl.8 d6, {q14}, d6 + vst1.16 {q1}, [r2, :128], r1 + vtbl.8 d7, {q14}, d7 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r2, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + lsl r1, r1, #1 +16: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #4 + vld1.8 {q10, q11}, [r3, :128]! + vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128], r1 + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r2, :128], r1 + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r0, :128], r1 + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r2, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + lsl r1, r1, #1 + sub r1, r1, #32 +32: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #2 + vld1.8 {q10, q11}, [r3, :128]! + vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128]! + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r0, :128], r1 + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r2, :128]! + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r2, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #96 +64: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #1 + vld1.8 {q10, q11}, [r3, :128]! 
+ vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128]! + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r0, :128]! + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r0, :128]! + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r0, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_128_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + clz lr, r3 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_128_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vrshr.u16 q0, q15, #1 + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r12 + + .align 2 +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + +L(ipred_cfl_splat_w4): + vld1.16 {q8, q9}, [r5, :128]! + vmull.s16 q2, d16, d2 // diff = ac * alpha + vmull.s16 q3, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q8, #31 + vshr.s32 q13, q9, #31 + vadd.i32 q2, q2, q10 // diff + sign + vadd.i32 q3, q3, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q8, #6 + vrshrn.i32 d7, q9, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d6}, [r0, :64], r1 + vst1.16 {d7}, [r6, :64], r1 + bgt L(ipred_cfl_splat_w4) + pop {r4-r8, pc} +L(ipred_cfl_splat_w8): + vld1.16 {q8, q9}, [r5, :128]! 
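+// NOTE (illustrative): all of the cfl_splat loops apply the same per-pixel
+// operation; a rough C model (arithmetic >> assumed, not the dav1d
+// reference code) is
+//
+//   int diff = alpha * ac[x];                    // 32-bit product
+//   int v    = (diff + (diff >> 31) + 32) >> 6;  // == apply_sign((abs(diff) + 32) >> 6, diff)
+//   dst[x]   = iclip(dc + v, 0, bitdepth_max);
+//
+// Adding diff >> 31 (i.e. -1 for negative products) before the rounding
+// narrow is what lets vrshrn.i32 #6 match the abs/apply_sign formulation
+// exactly. The "sign = diff >> 15" comments are inherited from the 8 bpc
+// version; the products are 32-bit here, hence the #31 shifts. For the
+// _128 variant, dc is (bitdepth_max + 1) >> 1, the mid-grey value set up
+// by the vrshr.u16 q0, q15, #1 above.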
+ subs r4, r4, #2 + vmull.s16 q2, d16, d2 // diff = ac * alpha + vmull.s16 q3, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q8, #31 + vshr.s32 q13, q9, #31 + vadd.i32 q2, q2, q10 // diff + sign + vadd.i32 q3, q3, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q8, #6 + vrshrn.i32 d7, q9, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r6, :128], r1 + bgt L(ipred_cfl_splat_w8) + pop {r4-r8, pc} +L(ipred_cfl_splat_w16): + vpush {q4-q7} + add r12, r5, r3, lsl #1 + sub r1, r1, r3, lsl #1 + mov lr, r3 +1: + vld1.16 {q6, q7}, [r5, :128]! + vmull.s16 q2, d12, d2 // diff = ac * alpha + vld1.16 {q8, q9}, [r12, :128]! + vmull.s16 q3, d13, d3 + vmull.s16 q4, d14, d2 + vmull.s16 q5, d15, d3 + vmull.s16 q6, d16, d2 + vmull.s16 q7, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q4, #31 + vshr.s32 q13, q5, #31 + vadd.i32 q2, q2, q10 // diff + sign + vshr.s32 q10, q6, #31 + vadd.i32 q3, q3, q11 + vshr.s32 q11, q7, #31 + vadd.i32 q4, q4, q12 + vshr.s32 q12, q8, #31 + vadd.i32 q5, q5, q13 + vshr.s32 q13, q9, #31 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q4, #6 + vrshrn.i32 d7, q5, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vrshrn.i32 d8, q6, #6 + vrshrn.i32 d9, q7, #6 + vadd.i16 q3, q3, q0 + vrshrn.i32 d10, q8, #6 + vrshrn.i32 d11, q9, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmax.s16 q4, q4, q14 + vmax.s16 q5, q5, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vmin.s16 q4, q4, q15 + vmin.s16 q5, q5, q15 + subs r3, r3, #16 + vst1.16 {q2, q3}, [r0, :128]! + vst1.16 {q4, q5}, [r6, :128]! 
+ bgt 1b + subs r4, r4, #2 + add r5, r5, lr, lsl #1 + add r12, r12, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + bgt 1b + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_top_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + clz lr, r3 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_top_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vdup.16 q1, r6 // alpha + add r2, r2, #2 + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r12 + + .align 2 +L(ipred_cfl_top_tbl): + .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + +4: + vld1.16 {d0}, [r2] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) +8: + vld1.16 {q0}, [r2] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) +16: + vld1.16 {q2, q3}, [r2] + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +32: + vld1.16 {q8, q9}, [r2]! + vld1.16 {q10, q11}, [r2] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q8, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_left_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + sub r2, r2, r4, lsl #1 + clz lr, r3 + clz r8, r4 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_splat_tbl) + adr r7, L(ipred_cfl_left_tbl) + sub lr, lr, #26 + sub r8, r8, #26 + ldr lr, [r12, lr, lsl #2] + ldr r8, [r7, r8, lsl #2] + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r7, r7, r8 + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r7 + + .align 2 +L(ipred_cfl_left_tbl): + .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + +L(ipred_cfl_left_h4): + vld1.16 {d0}, [r2, :64] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h8): + vld1.16 {q0}, [r2, :128] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h16): + vld1.16 {q2, q3}, [r2, :128] + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h32): + vld1.16 {q8, q9}, [r2, :128]! 
+ vld1.16 {q10, q11}, [r2, :128] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q8, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + bx r12 +endfunc + +// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + sub r2, r2, r4, lsl #1 + add r8, r3, r4 // width + height + vdup.16 q1, r6 // alpha + clz lr, r3 + clz r6, r4 + vdup.32 d16, r8 // width + height + vdup.16 q15, r7 // bitdepth_max + adr r7, L(ipred_cfl_tbl) + rbit r8, r8 // rbit(width + height) + sub lr, lr, #22 // 26 leading bits, minus table offset 4 + sub r6, r6, #26 + clz r8, r8 // ctz(width + height) + ldr lr, [r7, lr, lsl #2] + ldr r6, [r7, r6, lsl #2] + neg r8, r8 // -ctz(width + height) + add r12, r7, lr + add r7, r7, r6 + vshr.u32 d16, d16, #1 // (width + height) >> 1 + vdup.32 d17, r8 // -ctz(width + height) + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r7 + + .align 2 +L(ipred_cfl_tbl): + .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB + +L(ipred_cfl_h4): + vld1.16 {d0}, [r2, :64]! + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w4): + vld1.16 {d1}, [r2] + vadd.i32 d0, d0, d16 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #4 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 8/16 + cmp r4, #16 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + vld1.16 {q0}, [r2, :128]! + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w8): + vld1.16 {q2}, [r2] + vadd.i32 d0, d0, d16 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #8 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + vld1.16 {q2, q3}, [r2, :128]! + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w16): + vld1.16 {q2, q3}, [r2] + vadd.i32 d0, d0, d16 + vadd.i16 q2, q2, q3 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #16 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + vld1.16 {q2, q3}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128]! 
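+// NOTE on the DC computation in the cfl_w* paths above: conceptually
+// dc = (sum + ((w + h) >> 1)) / (w + h). When w == h the divisor is a
+// power of two and the vshl by -ctz(w+h) finishes the job; otherwise
+// w + h is 3 << k or 5 << k, so the remaining /3 or /5 is done with a
+// fixed-point reciprocal, 0xAAAB ~= (1 << 17) / 3 and 0x6667 ~= (1 << 17) / 5,
+// followed by a >> 17. Worked example (w = 16, h = 4): 20 summed pixels,
+// shift right by 2, multiply by 0x6667, shift right by 17, i.e. a rounded
+// divide by 20.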
+ vadd.i16 q2, q2, q3 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q2, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w32): + vld1.16 {q2, q3}, [r2]! + vadd.i32 d0, d0, d16 + vld1.16 {q10, q11}, [r2]! + vadd.i16 q2, q2, q3 + vadd.i16 q10, q10, q11 + vadd.i16 q2, q2, q10 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #32 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_420_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_420_tbl): + .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #1 + subs r8, r8, #2 + vst1.16 {q0}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + bgt 1b + cmp r4, #0 + vmov d0, d1 + vmov d2, d1 + vmov d3, d1 +L(ipred_cfl_ac_420_w4_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + vadd.i32 q8, q8, q9 + vadd.i32 q10, q10, q11 + vadd.i32 q0, q8, q10 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] +6: // Subtract dc from ac + vld1.16 {q0, q1}, [r0, :128] + subs r6, r6, #4 + vsub.i16 q0, q0, q8 + vsub.i16 q1, q1, q8 + vst1.16 {q0, q1}, [r0, :128]! + bgt 6b + pop {r4-r8, pc} + +L(ipred_cfl_ac_420_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vld1.16 {q12, q13}, [r1, :128], r2 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q3 + vpadd.i16 d2, d24, d25 + vpadd.i16 d3, d26, d27 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
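+// NOTE (illustrative): the 4:2:0 AC path box-filters 2x2 luma pixels and
+// scales by 2 so that every chroma layout ends up with the luma average
+// times 8. Rough C model of the unpadded case (stride in pixels, cw/ch as
+// in the prototype above):
+//
+//   for (int y = 0; y < ch; y++, ypx += 2 * stride, ac += cw)
+//       for (int x = 0; x < cw; x++)
+//           ac[x] = (ypx[2*x] + ypx[2*x + 1] +
+//                    ypx[2*x + stride] + ypx[2*x + 1 + stride]) << 1;
+//
+// The q8-q11 accumulators keep a running sum of everything that is stored;
+// at the end the rounded average, sum >> log2sz, is subtracted from every
+// entry so the AC block is zero-mean.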
+ vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov q0, q1 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #1 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov q0, q1 + +L(ipred_cfl_ac_420_w8_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl r6, r6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr r7, L(ipred_cfl_ac_420_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_420_w16_tbl): + .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w16_wpad0): + sub r2, r2, #32 +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q12, q13}, [r12, :128]! + vld1.16 {q2, q3}, [r1, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vld1.16 {q12, q13}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vadd.i16 q2, q2, q12 + vadd.i16 q3, q3, q13 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): + sub r2, r2, #32 +1: // Copy and subsample input, padding 4 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q12, q13}, [r12, :128]! + vld1.16 {q2}, [r1, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vld1.16 {q12}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vadd.i16 q2, q2, q12 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vshl.i16 q0, q0, #1 + vshl.i16 d2, d2, #1 + subs r8, r8, #1 + vdup.16 d3, d2[3] + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q12, q13}, [r12, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vshl.i16 q0, q0, #1 + subs r8, r8, #1 + vdup.16 q1, d1[3] + vst1.16 {q0, q1}, [r0, :128]! 
+ vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q12}, [r12, :128], r2 + vadd.i16 q0, q0, q12 + vpadd.i16 d0, d0, d1 + vshl.i16 d0, d0, #1 + subs r8, r8, #1 + vdup.16 q1, d0[3] + vdup.16 d1, d0[3] + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl r6, r6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc + +// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_422_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_422_tbl): + .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vld1.16 {q12, q13}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d24, d24, d25 + vpadd.i16 d25, d26, d27 + vpadd.i16 d26, d4, d5 + vpadd.i16 d27, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q12, #2 + vshl.i16 q3, q13, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
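+// NOTE on the padding convention shared by these cfl_ac functions: w_pad
+// and h_pad are given in units of 4 pixels (hence the lsl r4, r4, #2).
+// Horizontal padding replicates the last computed lane across the missing
+// columns (the vdup.16 ...[3] patterns), and vertical padding re-stores the
+// last produced row h_pad*4 times; both are still fed into the q8-q11 sums
+// so the subtracted DC covers the full padded block. The wider variants
+// reuse the w4 subtract-DC tail by scaling r6: e.g. for an 8x8 block the
+// height is doubled to 16 and the tail removes the DC from 16 coefficients
+// per iteration, so 16/4 = 4 iterations cover all 64 entries.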
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vld1.16 {q12}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d24, d24, d25 + vpadd.i16 d25, d4, d5 + vshl.i16 q0, q0, #2 + vshl.i16 q12, q12, #2 + vdup.16 d7, d25[3] + vmov d6, d25 + vdup.16 d5, d24[3] + vmov d4, d24 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr r7, L(ipred_cfl_ac_422_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_422_w16_tbl): + .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w16_wpad0): + sub r2, r2, #32 +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r12, :128]! + vld1.16 {q12, q13}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d24, d25 + vpadd.i16 d3, d26, d27 + vld1.16 {q12, q13}, [r12, :128], r2 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d6, d24, d25 + vpadd.i16 d7, d26, d27 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): + sub r2, r2, #32 +1: // Copy and subsample input, padding 4 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r12, :128]! + vld1.16 {q12}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d24, d25 + vld1.16 {q12}, [r12, :128], r2 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d6, d24, d25 + vshl.i16 q0, q0, #2 + vshl.i16 d2, d2, #2 + vshl.i16 q2, q2, #2 + vshl.i16 d6, d6, #2 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
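+// NOTE (illustrative): the 4:2:2 path only averages horizontally, which is
+// why it uses vpadd on pairs and shifts by 2 instead of 1, keeping the same
+// "luma average times 8" scale as the 4:2:0 path so the shared hpad and
+// subtract-DC code above can be reused:
+//
+//   ac[x] = (ypx[2*x] + ypx[2*x + 1]) << 2;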
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q2, q2, #2 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #2 + vdup.16 q3, d1[3] + vdup.16 q1, d0[3] + vdup.16 d5, d1[3] + vmov d4, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) +endfunc + +// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_444_tbl) + sub r8, r8, #26 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_444_tbl): + .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d1}, [r12, :64], r2 + vld1.16 {d2}, [r1, :64], r2 + vld1.16 {d3}, [r12, :64], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cmp r3, #0 + bne L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q2, q2, #3 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr r7, L(ipred_cfl_ac_444_w32_tbl) + ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 + asr r2, r2, #1 + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_444_w32_tbl): + .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w32_wpad0): + sub r2, r2, #32 +1: // Copy and expand input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): + sub r2, r2, #32 +1: // Copy and expand input, padding 8 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q3, d5[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + vld1.16 {q0, q1}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q2, d3[3] + vdup.16 q3, d3[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
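+// NOTE (illustrative): the 4:4:4 path has no subsampling at all, so each
+// luma pixel is just scaled, ac[x] = ypx[x] << 3, again matching the
+// "average times 8" scale of the other layouts. For the 32-wide case each
+// iteration handles a single row (hence the asr r2, r2, #1 undoing the
+// earlier stride doubling), and w_pad can be 0, 2, 4 or 6, padding 8, 16
+// or 24 columns by replicating the last computed lane.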
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + vld1.16 {q0}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + subs r8, r8, #1 + vdup.16 q1, d1[3] + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q2, d1[3] + vdup.16 q3, d1[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + +L(ipred_cfl_ac_444_w32_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl r6, r6, #3 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc diff -Nru dav1d-0.7.1/src/arm/32/ipred.S dav1d-0.9.1/src/arm/32/ipred.S --- dav1d-0.7.1/src/arm/32/ipred.S 2020-06-21 11:48:54.956126500 +0000 +++ dav1d-0.9.1/src/arm/32/ipred.S 2021-07-28 21:38:28.857851700 +0000 @@ -1,6 +1,6 @@ /* * Copyright © 2018, VideoLAN and dav1d authors - * Copyright © 2019, Martin Storsjo + * Copyright © 2020, Martin Storsjo * Copyright © 2019, B Krishnan Iyer * All rights reserved. * @@ -40,8 +40,7 @@ adr r2, L(ipred_dc_128_tbl) sub r3, r3, #25 ldr r3, [r2, r3, lsl #2] - mov lr, #128 - vdup.8 q0, lr + vmov.i8 q0, #128 add r2, r2, r3 add r12, r0, r1 lsl r1, r1, #1 @@ -79,7 +78,7 @@ bgt 16b pop {r4, pc} 320: - vdup.8 q1, lr + vmov.i8 q1, #128 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 @@ -89,20 +88,18 @@ bgt 32b pop {r4, pc} 640: - vdup.8 q1, lr - vdup.8 q2, lr - vdup.8 q3, lr + vmov.i8 q1, #128 sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc @@ -132,7 +129,7 @@ .word 80f - L(ipred_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_v_tbl) + CONFIG_THUMB 40: - vld1.32 {d0[0]}, [r2] + vld1.32 {d0[]}, [r2] 4: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 @@ -215,7 +212,7 @@ .word 8f - L(ipred_h_tbl) + CONFIG_THUMB .word 4f - L(ipred_h_tbl) + CONFIG_THUMB 4: - vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr vst1.32 {d3[0]}, [r0, :32], r1 vst1.32 {d2[0]}, [r12, :32], r1 subs r4, r4, #4 @@ -224,7 +221,7 @@ bgt 4b pop {r4-r5, pc} 8: - vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr vst1.8 {d3}, [r0, :64], r1 vst1.8 {d2}, [r12, :64], r1 subs r4, r4, #4 @@ -401,19 +398,17 @@ vrshrn.u16 d18, q0, #6 vdup.8 q0, d18[0] vdup.8 q1, d18[0] - vdup.8 q2, d18[0] - vdup.8 q3, d18[0] sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
- vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc @@ -453,7 +448,7 @@ .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB L(ipred_dc_left_h4): - vld1.32 {d0[]}, [r2] + vld1.32 {d0[]}, [r2, :32] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #2 @@ -468,7 +463,7 @@ bgt L(ipred_dc_left_w4) pop {r4-r5, pc} L(ipred_dc_left_h8): - vld1.8 {d0}, [r2] + vld1.8 {d0}, [r2, :64] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 @@ -484,7 +479,7 @@ bgt L(ipred_dc_left_w8) pop {r4-r5, pc} L(ipred_dc_left_h16): - vld1.8 {d0, d1}, [r2] + vld1.8 {d0, d1}, [r2, :128] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 @@ -501,7 +496,7 @@ bgt L(ipred_dc_left_w16) pop {r4-r5, pc} L(ipred_dc_left_h32): - vld1.8 {d0, d1, d2, d3}, [r2] + vld1.8 {d0, d1, d2, d3}, [r2, :128] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 @@ -522,8 +517,8 @@ bgt 1b pop {r4-r5, pc} L(ipred_dc_left_h64): - vld1.8 {d0, d1, d2, d3}, [r2]! - vld1.8 {d4, d5, d6, d7}, [r2] + vld1.8 {d0, d1, d2, d3}, [r2, :128]! + vld1.8 {d4, d5, d6, d7}, [r2, :128] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 @@ -538,20 +533,18 @@ vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w64): - sub r1, r1, #32 vmov.8 q1, q0 - vmov.8 q2, q0 - vmov.8 q3, q0 + sub r1, r1, #32 1: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} endfunc @@ -568,7 +561,6 @@ clz r3, r3 clz r12, r4 vdup.16 q15, lr // width + height - mov r6, #0 adr r5, L(ipred_dc_tbl) rbit lr, lr // rbit(width + height) sub r3, r3, #20 // 25 leading bits, minus table offset 5 @@ -599,22 +591,21 @@ .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB L(ipred_dc_h4): - vld1.32 {d0[0]}, [r2]! + vld1.32 {d0[]}, [r2, :32]! vpaddl.u8 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w4): - add r2, r2, #1 - vld1.32 {d1[0]}, [r2] - vmov.32 d1[1], r6 + vld1.32 {d1[]}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d1, d1 vpadd.u16 d1, d1 - vpadd.u16 d1, d1 cmp r4, #4 vadd.s16 d0, d0, d1 vshl.u16 d0, d0, d28 - beq 1f // h = 8/16 + beq 1f + // h = 8/16 movw lr, #(0x3334/2) movw r5, #(0x5556/2) cmp r4, #16 @@ -634,13 +625,13 @@ pop {r4-r6, pc} L(ipred_dc_h8): - vld1.8 {d0}, [r2]! + vld1.8 {d0}, [r2, :64]! 
vpaddl.u8 d0, d0 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w8): - add r2, r2, #1 vld1.8 {d2}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d2, d2 @@ -649,13 +640,14 @@ cmp r4, #8 vadd.s16 d0, d0, d2 vshl.u16 d0, d0, d28 - beq 1f // h = 4/16/32 + beq 1f + // h = 4/16/32 cmp r4, #32 movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 - vdup.16 q12, lr + vdup.16 d24, lr vqdmulh.s16 d0, d0, d24 1: vdup.8 d0, d0[0] @@ -669,14 +661,14 @@ pop {r4-r6, pc} L(ipred_dc_h16): - vld1.8 {d0, d1}, [r2]! + vld1.8 {d0, d1}, [r2, :128]! vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w16): - add r2, r2, #1 vld1.8 {d2, d3}, [r2] vadd.s16 d0, d0, d30 vaddl.u8 q1, d2, d3 @@ -686,13 +678,14 @@ cmp r4, #16 vadd.s16 d0, d0, d2 vshl.u16 d0, d0, d28 - beq 1f // h = 4/8/32/64 + beq 1f + // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 - vdup.16 q12, lr + vdup.16 d24, lr vqdmulh.s16 d0, d0, d24 1: vdup.8 q0, d0[0] @@ -706,37 +699,35 @@ pop {r4-r6, pc} L(ipred_dc_h32): - vld1.8 {d0, d1, d2, d3}, [r2]! + vld1.8 {d0, d1, d2, d3}, [r2, :128]! vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w32): - add r2, r2, #1 vld1.8 {d2, d3, d4, d5}, [r2] vadd.s16 d0, d0, d30 - vaddl.u8 q2, d4, d5 - vadd.u16 d4, d4, d5 vaddl.u8 q1, d2, d3 + vaddl.u8 q2, d4, d5 + vadd.u16 q1, q1, q2 vadd.u16 d2, d2, d3 - vpadd.u16 d4, d4 vpadd.u16 d2, d2 - vpadd.u16 d4, d4 vpadd.u16 d2, d2 cmp r4, #32 - vadd.s16 d0, d0, d4 vadd.s16 d0, d0, d2 vshl.u16 d4, d0, d28 - beq 1f // h = 8/16/64 + beq 1f + // h = 8/16/64 cmp r4, #8 movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 - vdup.16 q12, lr + vdup.16 d24, lr vqdmulh.s16 d4, d4, d24 1: vdup.8 q0, d4[0] @@ -751,9 +742,9 @@ pop {r4-r6, pc} L(ipred_dc_h64): - vld1.8 {d0, d1, d2, d3}, [r2]! + vld1.8 {d0, d1, d2, d3}, [r2, :128]! vaddl.u8 q0, d0, d1 - vld1.8 {d4, d5, d6, d7}, [r2]! + vld1.8 {d4, d5, d6, d7}, [r2, :128]! vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 @@ -762,10 +753,10 @@ vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w64): - add r2, r2, #1 vld1.8 {d2, d3, d4, d5}, [r2]! vadd.s16 d0, d0, d30 vaddl.u8 q2, d4, d5 @@ -791,11 +782,11 @@ vadd.s16 d0, d0, d2 vadd.s16 d0, d0, d3 vshl.u16 d18, d0, d28 - beq 1f // h = 16/32 + beq 1f + // h = 16/32 movw lr, #(0x5556/2) movt lr, #(0x3334/2) - mov r5, r4 - and r5, r5, #31 + and r5, r4, #31 lsr lr, lr, r5 vdup.16 d30, lr vqdmulh.s16 d18, d18, d30 @@ -803,19 +794,2144 @@ sub r1, r1, #32 vdup.8 q0, d18[0] vdup.8 q1, d18[0] - vdup.8 q2, d18[0] - vdup.8 q3, d18[0] 2: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
- vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} endfunc +// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_8bpc_neon, export=1 + push {r4-r8, lr} + ldr r4, [sp, #24] + clz lr, r3 + adr r5, L(ipred_paeth_tbl) + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.8 {d4[], d5[]}, [r2] + add r8, r2, #1 + sub r2, r2, #4 + add r5, r5, lr + mov r7, #-4 + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_paeth_tbl): + .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[], d7[]}, [r8] + vsubl.u8 q8, d6, d4 // top - topleft +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 + vzip.32 d0, d1 + vzip.32 d2, d3 + vaddw.u8 q9, q8, d0 + vaddw.u8 q10, q8, d2 + vqmovun.s16 d18, q9 // base + vqmovun.s16 d19, q10 + vmov d1, d2 + vabd.u8 q10, q3, q9 // tdiff + vabd.u8 q11, q2, q9 // tldiff + vabd.u8 q9, q0, q9 // ldiff + vmin.u8 q12, q10, q11 // min(tdiff, tldiff) + vcge.u8 q10, q11, q10 // tldiff >= tdiff + vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff + vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft + vbit q10, q0, q9 // ldiff <= min ? left : ... + vst1.32 {d21[1]}, [r0, :32], r1 + vst1.32 {d21[0]}, [r6, :32], r1 + subs r4, r4, #4 + vst1.32 {d20[1]}, [r0, :32], r1 + vst1.32 {d20[0]}, [r6, :32], r1 + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d6}, [r8] + vsubl.u8 q8, d6, d4 // top - topleft + vmov d7, d6 +8: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 + vaddw.u8 q9, q8, d0 + vaddw.u8 q10, q8, d1 + vaddw.u8 q11, q8, d2 + vaddw.u8 q12, q8, d3 + vqmovun.s16 d18, q9 // base + vqmovun.s16 d19, q10 + vqmovun.s16 d20, q11 + vqmovun.s16 d21, q12 + vabd.u8 q11, q3, q9 // tdiff + vabd.u8 q12, q3, q10 + vabd.u8 q13, q2, q9 // tldiff + vabd.u8 q14, q2, q10 + vabd.u8 q10, q1, q10 // ldiff + vabd.u8 q9, q0, q9 + vmin.u8 q15, q12, q14 // min(tdiff, tldiff) + vcge.u8 q12, q14, q12 // tldiff >= tdiff + vmin.u8 q14, q11, q13 // min(tdiff, tldiff) + vcge.u8 q11, q13, q11 // tldiff >= tdiff + vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff + vcge.u8 q9, q14, q9 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + vst1.8 {d25}, [r0, :64], r1 + vst1.8 {d24}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d23}, [r0, :64], r1 + vst1.8 {d22}, [r6, :64], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: +640: + vld1.8 {d6}, [r8]! 
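The ldiff/tdiff/tldiff comments in ipred_paeth above implement the usual Paeth selection: compute base = left + top - topleft and pick whichever neighbour is closest to it, preferring left, then top. A per-pixel scalar sketch (hypothetical helper name; the NEON works on a saturated base and four rows at once, but the selection is the same idea):

    #include <stdint.h>
    #include <stdlib.h>

    static uint8_t paeth_sketch(uint8_t left, uint8_t top, uint8_t topleft) {
        const int base   = left + top - topleft;
        const int ldiff  = abs(base - left);      /* == |top  - topleft| */
        const int tdiff  = abs(base - top);       /* == |left - topleft| */
        const int tldiff = abs(base - topleft);
        if (ldiff <= tdiff && ldiff <= tldiff)
            return left;
        return tdiff <= tldiff ? top : topleft;
    }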
+ mov r12, r3 + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3 +1: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 +2: + vsubl.u8 q8, d6, d4 // top - topleft + vmov d7, d6 + vaddw.u8 q9, q8, d0 + vaddw.u8 q10, q8, d1 + vaddw.u8 q11, q8, d2 + vaddw.u8 q12, q8, d3 + vqmovun.s16 d18, q9 // base + vqmovun.s16 d19, q10 + vqmovun.s16 d20, q11 + vqmovun.s16 d21, q12 + vabd.u8 q11, q3, q9 // tdiff + vabd.u8 q12, q3, q10 + vabd.u8 q13, q2, q9 // tldiff + vabd.u8 q14, q2, q10 + vabd.u8 q10, q1, q10 // ldiff + vabd.u8 q9, q0, q9 + vmin.u8 q15, q12, q14 // min(tdiff, tldiff) + vcge.u8 q12, q14, q12 // tldiff >= tdiff + vmin.u8 q14, q11, q13 // min(tdiff, tldiff) + vcge.u8 q11, q13, q11 // tldiff >= tdiff + vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff + vcge.u8 q9, q14, q9 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + subs r3, r3, #8 + vst1.8 {d25}, [r0, :64]! + vst1.8 {d24}, [r6, :64]! + vst1.8 {d23}, [r5, :64]! + vst1.8 {d22}, [lr, :64]! + ble 8f + vld1.8 {d6}, [r8]! + b 2b +8: + subs r4, r4, #4 + ble 9f + // End of horizontal loop, move pointers to next four rows + sub r8, r8, r12 + add r0, r0, r1 + add r6, r6, r1 + vld1.8 {d6}, [r8]! + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + pop {r4-r8, pc} +endfunc + +// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_8bpc_neon, export=1 + push {r4-r10, lr} + ldr r4, [sp, #32] + movrel r10, X(sm_weights) + add r12, r10, r4 + add r10, r10, r3 + clz r9, r3 + adr r5, L(ipred_smooth_tbl) + sub lr, r2, r4 + sub r9, r9, #25 + ldr r9, [r5, r9, lsl #2] + vld1.8 {d4[]}, [lr] // bottom + add r8, r2, #1 + add r5, r5, r9 + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_tbl): + .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB + +40: + vld1.32 {d16[]}, [r8] // top + vld1.32 {d18[]}, [r10, :32] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vdup.8 q3, d16[3] // right + vsubl.u8 q8, d16, d4 // top-bottom + vmovl.u8 q9, d18 // weights_hor +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left + vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! 
// weights_ver + vshll.i8 q12, d6, #8 // right*256 + vshll.i8 q13, d6, #8 + vzip.32 d1, d0 // left, flipped + vzip.32 d3, d2 + vzip.32 d20, d21 // weights_ver + vzip.32 d22, d23 + vshll.i8 q14, d4, #8 // bottom*256 + vshll.i8 q15, d4, #8 + vsubl.u8 q0, d1, d6 // left-right + vsubl.u8 q1, d3, d6 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q0, q9 // (left flipped) + vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q15, q8, q11 + vhadd.u16 q12, q12, q14 + vhadd.u16 q13, q13, q15 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + vst1.32 {d24[0]}, [r0, :32], r1 + vst1.32 {d24[1]}, [r6, :32], r1 + subs r4, r4, #4 + vst1.32 {d25[0]}, [r0, :32], r1 + vst1.32 {d25[1]}, [r6, :32], r1 + bgt 4b + pop {r4-r10, pc} +80: + vld1.8 {d16}, [r8] // top + vld1.8 {d18}, [r10, :64] // weights_hor + sub r2, r2, #2 + mov r7, #-2 + vdup.8 q3, d16[7] // right + vsubl.u8 q8, d16, d4 // top-bottom + vmovl.u8 q9, d18 // weights_hor +8: + vld2.8 {d0[], d1[]}, [r2, :16], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver + vshll.i8 q12, d6, #8 // right*256 + vshll.i8 q13, d6, #8 + vshll.i8 q14, d4, #8 // bottom*256 + vshll.i8 q15, d4, #8 + vsubl.u8 q1, d0, d6 // left-right (left flipped) + vsubl.u8 q0, d1, d6 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q1, q9 + vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q15, q8, q11 + vhadd.u16 q12, q12, q14 + vhadd.u16 q13, q13, q15 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + subs r4, r4, #2 + vst1.8 {d24}, [r0, :64], r1 + vst1.8 {d25}, [r6, :64], r1 + bgt 8b + pop {r4-r10, pc} +160: +320: +640: + add lr, r2, r3 + sub r2, r2, #2 + mov r7, #-2 + vld1.8 {d6[], d7[]}, [lr] // right + sub r1, r1, r3 + mov r9, r3 + +1: + vld2.8 {d0[], d1[]}, [r2, :16], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver + vsubl.u8 q1, d0, d6 // left-right (left flipped) + vsubl.u8 q0, d1, d6 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 +2: + vld1.8 {d16}, [r8]! // top + vld1.8 {d18}, [r10, :64]! // weights_hor + vshll.i8 q12, d6, #8 // right*256 + vshll.i8 q13, d6, #8 + vmovl.u8 q9, d18 // weights_hor + vshll.i8 q14, d4, #8 // bottom*256 + vshll.i8 q15, d4, #8 + vsubl.u8 q8, d16, d4 // top-bottom + vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q1, q9 + vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q15, q8, q11 + vhadd.u16 q12, q12, q14 + vhadd.u16 q13, q13, q15 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + subs r3, r3, #8 + vst1.8 {d24}, [r0, :64]! + vst1.8 {d25}, [r6, :64]! 
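The weights_hor/weights_ver comments above describe the two-directional SMOOTH blend: the top edge is faded towards the bottom-most left pixel ("bottom") and the left edge towards the right-most top pixel ("right"), using the sm_weights table. Per pixel, roughly (hypothetical helper; weights on a 0..256 scale):

    #include <stdint.h>

    static uint8_t smooth_px_sketch(int top, int bottom, int left, int right,
                                    int w_ver, int w_hor) {
        const int ver = w_ver * top  + (256 - w_ver) * bottom;  /* "bottom*256 + (top-bottom)*weights_ver" */
        const int hor = w_hor * left + (256 - w_hor) * right;   /* "right*256 + (left-right)*weights_hor"  */
        /* The NEON folds the final average into vhadd + vrshrn #8; for 8 bpc
         * inputs the result always fits in a byte. */
        return (uint8_t)((ver + hor + 256) >> 9);
    }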
+ bgt 2b + subs r4, r4, #2 + ble 9f + sub r8, r8, r9 + sub r10, r10, r9 + add r0, r0, r1 + add r6, r6, r1 + mov r3, r9 + b 1b +9: + pop {r4-r10, pc} +endfunc + +// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_8bpc_neon, export=1 + push {r4-r7, lr} + ldr r4, [sp, #20] + movrel r7, X(sm_weights) + add r7, r7, r4 + clz lr, r3 + adr r5, L(ipred_smooth_v_tbl) + sub r12, r2, r4 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.8 {d4[]}, [r12] // bottom + add r2, r2, #1 + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_v_tbl): + .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[]}, [r2] // top + vsubl.u8 q3, d6, d4 // top-bottom +4: + vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver + vshll.i8 q10, d4, #8 // bottom*256 + vshll.i8 q11, d4, #8 + vzip.32 d16, d17 // weights_ver + vzip.32 d18, d19 + vmovl.u8 q8, d16 // weights_ver + vmovl.u8 q9, d18 + subs r4, r4, #4 + vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q11, q3, q9 + vrshrn.i16 d20, q10, #8 + vrshrn.i16 d21, q11, #8 + vst1.32 {d20[0]}, [r0, :32], r1 + vst1.32 {d20[1]}, [r6, :32], r1 + vst1.32 {d21[0]}, [r0, :32], r1 + vst1.32 {d21[1]}, [r6, :32], r1 + bgt 4b + pop {r4-r7, pc} +80: + vld1.8 {d6}, [r2] // top + vsubl.u8 q3, d6, d4 // top-bottom +8: + vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver + vshll.i8 q12, d4, #8 // bottom*256 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vmovl.u8 q8, d16 // weights_ver + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q13, q3, q9 + vmla.i16 q14, q3, q10 + vmla.i16 q15, q3, q11 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + vrshrn.i16 d26, q14, #8 + vrshrn.i16 d27, q15, #8 + vst1.8 {d24}, [r0, :64], r1 + vst1.8 {d25}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d26}, [r0, :64], r1 + vst1.8 {d27}, [r6, :64], r1 + bgt 8b + pop {r4-r7, pc} +160: +320: +640: + vpush {q4-q7} + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3 + mov r12, r3 + +1: + vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver + vmovl.u8 q4, d8 // weights_ver + vmovl.u8 q5, d10 + vmovl.u8 q6, d12 + vmovl.u8 q7, d14 +2: + vld1.8 {q3}, [r2]! // top + vshll.i8 q8, d4, #8 // bottom*256 + vshll.i8 q9, d4, #8 + vshll.i8 q10, d4, #8 + vshll.i8 q11, d4, #8 + vsubl.u8 q0, d6, d4 // top-bottom + vsubl.u8 q1, d7, d4 + vshll.i8 q12, d4, #8 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q9, q1, q4 + vmla.i16 q10, q0, q5 + vmla.i16 q11, q1, q5 + vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver + vmla.i16 q13, q1, q6 + vmla.i16 q14, q0, q7 + vmla.i16 q15, q1, q7 + vrshrn.i16 d16, q8, #8 + vrshrn.i16 d17, q9, #8 + vrshrn.i16 d18, q10, #8 + vrshrn.i16 d19, q11, #8 + vrshrn.i16 d20, q12, #8 + vrshrn.i16 d21, q13, #8 + vrshrn.i16 d22, q14, #8 + vrshrn.i16 d23, q15, #8 + subs r3, r3, #16 + vst1.8 {q8}, [r0, :128]! + vst1.8 {q9}, [r6, :128]! 
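SMOOTH_V (above) and SMOOTH_H (further below) keep only one of the two blends, rounded at 8 fractional bits instead of 9. Sketched under the same assumptions:

    #include <stdint.h>

    static uint8_t smooth_v_px_sketch(int top, int bottom, int w_ver) {
        return (uint8_t)((w_ver * top + (256 - w_ver) * bottom + 128) >> 8);
    }

    static uint8_t smooth_h_px_sketch(int left, int right, int w_hor) {
        return (uint8_t)((w_hor * left + (256 - w_hor) * right + 128) >> 8);
    }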
+ vst1.8 {q10}, [r5, :128]! + vst1.8 {q11}, [lr, :128]! + bgt 2b + subs r4, r4, #4 + ble 9f + sub r2, r2, r12 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r7, pc} +endfunc + +// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_8bpc_neon, export=1 + push {r4-r8, lr} + ldr r4, [sp, #24] + movrel r8, X(sm_weights) + add r8, r8, r3 + clz lr, r3 + adr r5, L(ipred_smooth_h_tbl) + add r12, r2, r3 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.8 {d4[]}, [r12] // right + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_h_tbl): + .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[]}, [r8, :32] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vmovl.u8 q3, d6 // weights_hor +4: + vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left + vshll.i8 q8, d4, #8 // right*256 + vshll.i8 q9, d4, #8 + vzip.32 d3, d2 // left, flipped + vzip.32 d1, d0 + vsubl.u8 q1, d3, d4 // left-right + vsubl.u8 q0, d1, d4 + subs r4, r4, #4 + vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor + vmla.i16 q9, q0, q3 + vrshrn.i16 d16, q8, #8 + vrshrn.i16 d17, q9, #8 + vst1.32 {d16[0]}, [r0, :32], r1 + vst1.32 {d16[1]}, [r6, :32], r1 + vst1.32 {d17[0]}, [r0, :32], r1 + vst1.32 {d17[1]}, [r6, :32], r1 + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d6}, [r8, :64] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vmovl.u8 q3, d6 // weights_hor +8: + vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left + vshll.i8 q12, d4, #8 // right*256 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vsubl.u8 q11, d22, d4 // left-right + vsubl.u8 q10, d20, d4 + vsubl.u8 q9, d18, d4 + vsubl.u8 q8, d16, d4 + vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor + vmla.i16 q13, q10, q3 // (left flipped) + vmla.i16 q14, q9, q3 + vmla.i16 q15, q8, q3 + vrshrn.i16 d24, q12, #8 + vrshrn.i16 d25, q13, #8 + vrshrn.i16 d26, q14, #8 + vrshrn.i16 d27, q15, #8 + vst1.8 {d24}, [r0, :64], r1 + vst1.8 {d25}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d26}, [r0, :64], r1 + vst1.8 {d27}, [r6, :64], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: +640: + vpush {q4-q7} + sub r2, r2, #4 + mov r7, #-4 + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3 + mov r12, r3 + +1: + vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left + vsubl.u8 q4, d8, d4 // left-right + vsubl.u8 q5, d10, d4 + vsubl.u8 q6, d12, d4 + vsubl.u8 q7, d14, d4 +2: + vld1.8 {q1}, [r8, :128]! 
// weights_hor + vshll.i8 q8, d4, #8 // right*256 + vshll.i8 q9, d4, #8 + vshll.i8 q10, d4, #8 + vshll.i8 q11, d4, #8 + vmovl.u8 q0, d2 // weights_hor + vmovl.u8 q1, d3 + vshll.i8 q12, d4, #8 + vshll.i8 q13, d4, #8 + vshll.i8 q14, d4, #8 + vshll.i8 q15, d4, #8 + vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor + vmla.i16 q9, q7, q1 // (left flipped) + vmla.i16 q10, q6, q0 + vmla.i16 q11, q6, q1 + vmla.i16 q12, q5, q0 + vmla.i16 q13, q5, q1 + vmla.i16 q14, q4, q0 + vmla.i16 q15, q4, q1 + vrshrn.i16 d16, q8, #8 + vrshrn.i16 d17, q9, #8 + vrshrn.i16 d18, q10, #8 + vrshrn.i16 d19, q11, #8 + vrshrn.i16 d20, q12, #8 + vrshrn.i16 d21, q13, #8 + vrshrn.i16 d22, q14, #8 + vrshrn.i16 d23, q15, #8 + subs r3, r3, #16 + vst1.8 {q8}, [r0, :128]! + vst1.8 {q9}, [r6, :128]! + vst1.8 {q10}, [r5, :128]! + vst1.8 {q11}, [lr, :128]! + bgt 2b + subs r4, r4, #4 + ble 9f + sub r8, r8, r12 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height); +function ipred_filter_8bpc_neon, export=1 + push {r4-r8, lr} + movw r12, #511 + ldrd r4, r5, [sp, #24] + and r5, r5, r12 // 511 + movrel r6, X(filter_intra_taps) + lsl r5, r5, #6 + add r6, r6, r5 + vld1.8 {d20, d21, d22, d23}, [r6, :128]! + clz lr, r3 + adr r5, L(ipred_filter_tbl) + vld1.8 {d27, d28, d29}, [r6, :64] + sub lr, lr, #26 + ldr lr, [r5, lr, lsl #2] + vmovl.s8 q8, d20 + vmovl.s8 q9, d21 + add r5, r5, lr + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + add r6, r0, r1 + lsl r1, r1, #1 + vmovl.s8 q12, d27 + vmovl.s8 q13, d28 + vmovl.s8 q14, d29 + add r8, r2, #1 + sub r2, r2, #2 + mov r7, #-2 + bx r5 + + .align 2 +L(ipred_filter_tbl): + .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB + .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB + .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB + .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB + +40: + vld1.32 {d0[]}, [r8] // top (0-3) + vmovl.u8 q0, d0 // top (0-3) +4: + vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmovl.u8 q1, d2 // left (0-1) + topleft (2) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s16 d4, q2, #4 + subs r4, r4, #2 + vst1.32 {d4[0]}, [r0, :32], r1 + vmovl.u8 q0, d4 + vst1.32 {d4[1]}, [r6, :32], r1 + vmov d0, d1 // move top from [4-7] to [0-3] + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d0}, [r8] // top (0-7) + vmovl.u8 q0, d0 // top (0-7) +8: + vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmovl.u8 q1, d2 // left (0-1) + topleft (2) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d4, q2, #4 + vmovl.u8 
q1, d4 // first block, in 16 bit + vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) + vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6) + vqrshrun.s16 d5, q3, #4 + vzip.32 d4, d5 + subs r4, r4, #2 + vst1.8 {d4}, [r0, :64], r1 + vmovl.u8 q0, d5 + vst1.8 {d5}, [r6, :64], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: + vpush {q4-q5} + sub r1, r1, r3 + mov lr, r3 + +1: + vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2) + vmovl.u8 q0, d0 // left (0-1) + topleft (2) +2: + vld1.8 {q2}, [r8]! // top(0-15) + vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) + vmovl.u8 q1, d4 // top(0-7) + vmovl.u8 q2, d5 // top(8-15) + vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) + vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) + vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) + + vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) + vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) + vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d6, q3, #4 + vmovl.u8 q0, d6 // first block, in 16 bit + vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) + vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) + vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6) + + vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) + vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) + vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d7, q4, #4 + vmovl.u8 q0, d7 // second block, in 16 bit + vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) + vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) + vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) + + vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1) + vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2) + vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3) + vqrshrun.s16 d8, q5, #4 + vmovl.u8 q0, d8 // third block, in 16 bit + vmov.u8 r12, d5[6] + vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4) + vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0) + vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6) + vmov.8 d0[4], r12 + + subs r3, r3, #16 + vqrshrun.s16 d9, q15, #4 + + vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]! + vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]! + ble 8f + vmov.u8 r12, d9[7] + vmov.8 d0[0], r12 + vmov.u8 r12, d9[3] + vmov.8 d0[2], r12 + b 2b +8: + subs r4, r4, #2 + + ble 9f + sub r8, r6, lr + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + b 1b +9: + vpop {q4-q5} + pop {r4-r8, pc} +endfunc + +// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_8bpc_neon, export=1 + push {r4-r5, lr} + ldrd r4, r5, [sp, #12] + vld1.16 {q0}, [r2, :128] + clz lr, r4 + adr r12, L(pal_pred_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vmovn.i16 d0, q0 + add r12, r12, lr + add r2, r0, r1 + bx r12 + + .align 2 +L(pal_pred_tbl): + .word 640f - L(pal_pred_tbl) + CONFIG_THUMB + .word 320f - L(pal_pred_tbl) + CONFIG_THUMB + .word 160f - L(pal_pred_tbl) + CONFIG_THUMB + .word 80f - L(pal_pred_tbl) + CONFIG_THUMB + .word 40f - L(pal_pred_tbl) + CONFIG_THUMB + +40: + lsl r1, r1, #1 +4: + vld1.8 {q1}, [r3, :128]! 
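Before the pal_pred body continues below, a note on ipred_filter above: the p0..p6 comments describe a 7-tap combination of the top-left pixel, four top pixels and two left pixels per output position, rounded by 4 bits and clamped. A hedged per-pixel sketch (the in-memory layout of filter_intra_taps is not reproduced here; taps simply stands for the seven signed coefficients selected for one output pixel):

    #include <stdint.h>

    /* p[0] = topleft, p[1..4] = top[0..3], p[5] = left[0], p[6] = left[1],
     * matching the p0..p6 naming in the comments above. */
    static uint8_t filter_intra_px_sketch(const int8_t taps[7], const uint8_t p[7]) {
        int acc = 0;
        for (int i = 0; i < 7; i++)
            acc += taps[i] * p[i];
        acc = (acc + 8) >> 4;                       /* vqrshrun.s16 #4 */
        return acc < 0 ? 0 : acc > 255 ? 255 : (uint8_t)acc;
    }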
+ subs r5, r5, #4 + vtbl.8 d2, {d0}, d2 + vtbl.8 d3, {d0}, d3 + vst1.32 {d2[0]}, [r0, :32], r1 + vst1.32 {d2[1]}, [r2, :32], r1 + vst1.32 {d3[0]}, [r0, :32], r1 + vst1.32 {d3[1]}, [r2, :32], r1 + bgt 4b + pop {r4-r5, pc} +80: + lsl r1, r1, #1 +8: + vld1.8 {q1, q2}, [r3, :128]! + subs r5, r5, #4 + vtbl.8 d2, {d0}, d2 + vtbl.8 d3, {d0}, d3 + vst1.8 {d2}, [r0, :64], r1 + vtbl.8 d4, {d0}, d4 + vst1.8 {d3}, [r2, :64], r1 + vtbl.8 d5, {d0}, d5 + vst1.8 {d4}, [r0, :64], r1 + vst1.8 {d5}, [r2, :64], r1 + bgt 8b + pop {r4-r5, pc} +160: + lsl r1, r1, #1 +16: + vld1.8 {q8, q9}, [r3, :128]! + subs r5, r5, #4 + vld1.8 {q10, q11}, [r3, :128]! + vtbl.8 d16, {d0}, d16 + vtbl.8 d17, {d0}, d17 + vtbl.8 d18, {d0}, d18 + vtbl.8 d19, {d0}, d19 + vtbl.8 d20, {d0}, d20 + vtbl.8 d21, {d0}, d21 + vst1.8 {q8}, [r0, :128], r1 + vtbl.8 d22, {d0}, d22 + vst1.8 {q9}, [r2, :128], r1 + vtbl.8 d23, {d0}, d23 + vst1.8 {q10}, [r0, :128], r1 + vst1.8 {q11}, [r2, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + lsl r1, r1, #1 +32: + vld1.8 {q8, q9}, [r3, :128]! + subs r5, r5, #2 + vld1.8 {q10, q11}, [r3, :128]! + vtbl.8 d16, {d0}, d16 + vtbl.8 d17, {d0}, d17 + vtbl.8 d18, {d0}, d18 + vtbl.8 d19, {d0}, d19 + vtbl.8 d20, {d0}, d20 + vtbl.8 d21, {d0}, d21 + vst1.8 {q8, q9}, [r0, :128], r1 + vtbl.8 d22, {d0}, d22 + vtbl.8 d23, {d0}, d23 + vst1.8 {q10, q11}, [r2, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #32 +64: + vld1.8 {q8, q9}, [r3, :128]! + subs r5, r5, #1 + vld1.8 {q10, q11}, [r3, :128]! + vtbl.8 d16, {d0}, d16 + vtbl.8 d17, {d0}, d17 + vtbl.8 d18, {d0}, d18 + vtbl.8 d19, {d0}, d19 + vtbl.8 d20, {d0}, d20 + vtbl.8 d21, {d0}, d21 + vst1.8 {q8, q9}, [r0, :128]! + vtbl.8 d22, {d0}, d22 + vtbl.8 d23, {d0}, d23 + vst1.8 {q10, q11}, [r0, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_128_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz lr, r3 + adr r12, L(ipred_cfl_128_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vmov.i16 q0, #128 // dc + vdup.i16 q1, r6 // alpha + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r12 + + .align 2 +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + +L(ipred_cfl_splat_w4): + vld1.16 {q2, q3}, [r5, :128]! + vmul.i16 q2, q2, q1 // diff = ac * alpha + vmul.i16 q3, q3, q1 + vshr.s16 q8, q2, #15 // sign = diff >> 15 + vshr.s16 q9, q3, #15 + vadd.i16 q2, q2, q8 // diff + sign + vadd.i16 q3, q3, q9 + vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshr.s16 q3, q3, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign()) + vqmovun.s16 d5, q3 + vst1.32 {d4[0]}, [r0, :32], r1 + vst1.32 {d4[1]}, [r6, :32], r1 + subs r4, r4, #4 + vst1.32 {d5[0]}, [r0, :32], r1 + vst1.32 {d5[1]}, [r6, :32], r1 + bgt L(ipred_cfl_splat_w4) + pop {r4-r8, pc} +L(ipred_cfl_splat_w8): + vld1.16 {q8, q9}, [r5, :128]! + vld1.16 {q10, q11}, [r5, :128]! 
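The "diff = ac * alpha" comments in ipred_cfl_splat_w4 above spell out the CfL reconstruction: scale the zero-mean luma AC values by the signed alpha, round by 6 bits with the sign correction, add the chroma DC and clamp. Per pixel, roughly (hypothetical helper):

    #include <stdint.h>

    static uint8_t cfl_px_sketch(int dc, int ac, int alpha) {
        const int diff = alpha * ac;                      /* vmul.i16                    */
        const int sign = diff >> 31;                      /* "diff >> 15" on int16 lanes */
        const int px   = dc + ((diff + sign + 32) >> 6);  /* rounding vrshr.s16 #6       */
        return px < 0 ? 0 : px > 255 ? 255 : (uint8_t)px; /* vqmovun.s16                 */
    }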
+ vmul.i16 q8, q8, q1 // diff = ac * alpha + vmul.i16 q9, q9, q1 + vmul.i16 q10, q10, q1 + vmul.i16 q11, q11, q1 + vshr.s16 q12, q8, #15 // sign = diff >> 15 + vshr.s16 q13, q9, #15 + vshr.s16 q14, q10, #15 + vshr.s16 q15, q11, #15 + vadd.i16 q8, q8, q12 // diff + sign + vadd.i16 q9, q9, q13 + vadd.i16 q10, q10, q14 + vadd.i16 q11, q11, q15 + vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshr.s16 q9, q9, #6 + vrshr.s16 q10, q10, #6 + vrshr.s16 q11, q11, #6 + vadd.i16 q8, q8, q0 // dc + apply_sign() + vadd.i16 q9, q9, q0 + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q0 + vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 + vst1.8 {d16}, [r0, :64], r1 + vst1.8 {d17}, [r6, :64], r1 + subs r4, r4, #4 + vst1.8 {d18}, [r0, :64], r1 + vst1.8 {d19}, [r6, :64], r1 + bgt L(ipred_cfl_splat_w8) + pop {r4-r8, pc} +L(ipred_cfl_splat_w16): + add r12, r5, r3, lsl #1 + sub r1, r1, r3 + mov lr, r3 +1: + vld1.16 {q8, q9}, [r5, :128]! + vmul.i16 q8, q8, q1 // diff = ac * alpha + vld1.16 {q10, q11}, [r12, :128]! + vmul.i16 q9, q9, q1 + vmul.i16 q10, q10, q1 + vmul.i16 q11, q11, q1 + vshr.s16 q12, q8, #15 // sign = diff >> 15 + vshr.s16 q13, q9, #15 + vshr.s16 q14, q10, #15 + vshr.s16 q15, q11, #15 + vadd.i16 q8, q8, q12 // diff + sign + vadd.i16 q9, q9, q13 + vadd.i16 q10, q10, q14 + vadd.i16 q11, q11, q15 + vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshr.s16 q9, q9, #6 + vrshr.s16 q10, q10, #6 + vrshr.s16 q11, q11, #6 + vadd.i16 q8, q8, q0 // dc + apply_sign() + vadd.i16 q9, q9, q0 + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q0 + vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 + subs r3, r3, #16 + vst1.16 {q8}, [r0, :128]! + vst1.16 {q9}, [r6, :128]! 
+ bgt 1b + subs r4, r4, #2 + add r5, r5, lr, lsl #1 + add r12, r12, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + bgt 1b + pop {r4-r8, pc} +endfunc + +// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_top_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz lr, r3 + adr r12, L(ipred_cfl_top_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vdup.16 q1, r6 // alpha + add r2, r2, #1 + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r12 + + .align 2 +L(ipred_cfl_top_tbl): + .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + +4: + vld1.32 {d0[]}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) +8: + vld1.8 {d0}, [r2] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) +16: + vld1.8 {q0}, [r2] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +32: + vld1.8 {q2, q3}, [r2] + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q2, q3 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #5 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_left_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + sub r2, r2, r4 + clz lr, r3 + clz r8, r4 + adr r12, L(ipred_cfl_splat_tbl) + adr r7, L(ipred_cfl_left_tbl) + sub lr, lr, #26 + sub r8, r8, #26 + ldr lr, [r12, lr, lsl #2] + ldr r8, [r7, r8, lsl #2] + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r7, r7, r8 + add r6, r0, r1 + lsl r1, r1, #1 + bx r7 + + .align 2 +L(ipred_cfl_left_tbl): + .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + +L(ipred_cfl_left_h4): + vld1.32 {d0[]}, [r2, :32] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h8): + vld1.8 {d0}, [r2, :64] + vpaddl.u8 d0, d0 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h16): + vld1.8 {q0}, [r2, :128] + vaddl.u8 q0, d0, d1 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h32): + vld1.8 {q2, q3}, [r2, :128] + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.u16 q0, q2, q3 + vadd.u16 d0, d0, d1 + vpadd.u16 d0, d0 + vpadd.u16 d0, d0 + vrshr.u16 d0, d0, #5 + vdup.16 q0, d0[0] + bx r12 +endfunc + +// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_8bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + sub r2, r2, r4 + add r8, r3, r4 // width + height 
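The CfL entry points around this point (cfl_128, cfl_top and cfl_left above, plus the combined ipred_cfl whose prologue starts just above) only differ in how the DC value is formed before branching into the shared splat code: 128 for cfl_128, the rounded average of the top edge for cfl_top, of the left edge for cfl_left, and of both edges for ipred_cfl, the latter using the same shift-plus-reciprocal trick as ipred_dc. For example (hypothetical helper, same assumed edge layout):

    #include <stdint.h>

    static int cfl_dc_top_sketch(const uint8_t *topleft, int w) {
        int sum = 0;
        for (int x = 0; x < w; x++)
            sum += topleft[1 + x];
        return (sum + (w >> 1)) / w;   /* w is a power of two, i.e. the vrshr above */
    }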
+ vdup.16 q1, r6 // alpha + clz lr, r3 + clz r6, r4 + vdup.16 d16, r8 // width + height + adr r7, L(ipred_cfl_tbl) + rbit r8, r8 // rbit(width + height) + sub lr, lr, #22 // 26 leading bits, minus table offset 4 + sub r6, r6, #26 + clz r8, r8 // ctz(width + height) + ldr lr, [r7, lr, lsl #2] + ldr r6, [r7, r6, lsl #2] + neg r8, r8 // -ctz(width + height) + add r12, r7, lr + add r7, r7, r6 + vshr.u16 d16, d16, #1 // (width + height) >> 1 + vdup.16 d17, r8 // -ctz(width + height) + add r6, r0, r1 + lsl r1, r1, #1 + bx r7 + + .align 2 +L(ipred_cfl_tbl): + .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB + +L(ipred_cfl_h4): + vld1.32 {d0[]}, [r2, :32]! + vpaddl.u8 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w4): + vld1.32 {d1[]}, [r2] + vadd.i16 d0, d0, d16 + vpaddl.u8 d1, d1 + vpadd.u16 d1, d1 + cmp r4, #4 + vadd.i16 d0, d0, d1 + vshl.u16 d0, d0, d17 + beq 1f + // h = 8/16 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + cmp r4, #16 + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + vld1.8 {d0}, [r2, :64]! + vpaddl.u8 d0, d0 + vpadd.i16 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w8): + vld1.8 {d1}, [r2] + vadd.i16 d0, d0, d16 + vpaddl.u8 d1, d1 + vpadd.i16 d1, d1 + vpadd.i16 d1, d1 + cmp r4, #8 + vadd.i16 d0, d0, d1 + vshl.u16 d0, d0, d17 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + vld1.8 {q0}, [r2, :128]! + vaddl.u8 q0, d0, d1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w16): + vld1.8 {q2}, [r2] + vadd.i16 d0, d0, d16 + vaddl.u8 q2, d4, d5 + vadd.i16 d4, d4, d5 + vpadd.i16 d4, d4 + vpadd.i16 d4, d4 + cmp r4, #16 + vadd.i16 d0, d0, d4 + vshl.u16 d0, d0, d17 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + vld1.8 {q2, q3}, [r2, :128]! 
+ vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0 + add r2, r2, #1 + vpadd.i16 d0, d0 + bx r12 +L(ipred_cfl_w32): + vld1.8 {q2, q3}, [r2] + vadd.i16 d0, d0, d16 + vaddl.u8 q2, d4, d5 + vaddl.u8 q3, d6, d7 + vadd.i16 q2, q2, q3 + vadd.i16 d4, d4, d5 + vpadd.i16 d4, d4 + vpadd.i16 d4, d4 + cmp r4, #32 + vadd.i16 d0, d0, d4 + vshl.u16 d0, d0, d17 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #(0x3334/2) + movw r8, #(0x5556/2) + it ne + movne lr, r8 + vdup.16 d18, lr + vqdmulh.s16 d0, d0, d18 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_8bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_420_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_420_tbl): + .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d2}, [r12, :64], r2 + vld1.8 {d1}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.i16 q0, q0, q1 + vshl.i16 q0, q0, #1 + subs r8, r8, #2 + vst1.16 {q0}, [r0, :128]! + vadd.i16 q8, q8, q0 + bgt 1b + cmp r4, #0 + vmov d0, d1 + vmov d2, d1 + vmov d3, d1 +L(ipred_cfl_ac_420_w4_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q8, q8, q1 + bgt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + vadd.i16 q0, q8, q9 + vadd.i16 q1, q10, q11 + vpaddl.u16 q0, q0 + vpaddl.u16 q1, q1 + vadd.i32 q0, q1 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] +L(ipred_cfl_ac_420_w4_subtract_dc): +6: // Subtract dc from ac + vld1.16 {q0, q1}, [r0, :128] + subs r6, r6, #4 + vsub.i16 q0, q0, q8 + vsub.i16 q1, q1, q8 + vst1.16 {q0, q1}, [r0, :128]! + bgt 6b + pop {r4-r8, pc} + +L(ipred_cfl_ac_420_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q2, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
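The cfl_ac_420 code above averages 2x2 luma blocks, keeps the values at 3 fractional bits (the << 1 after the two pairwise adds), accumulates a running sum, and finally subtracts the rounded block mean in calc_subtract_dc. A scalar model with the w_pad/h_pad edge replication left out (hypothetical helper; the real ac buffer is padded to the full block width):

    #include <stdint.h>
    #include <stddef.h>

    static void cfl_ac_420_sketch(int16_t *ac, const uint8_t *ypx,
                                  ptrdiff_t stride, int cw, int ch) {
        int sum = 0, log2sz = 0;
        for (int n = cw * ch; n > 1; n >>= 1)
            log2sz++;                               /* cw and ch are powers of two */
        for (int y = 0; y < ch; y++, ypx += 2 * stride) {
            for (int x = 0; x < cw; x++) {
                const uint8_t *p = ypx + 2 * x;
                const int v = (p[0] + p[1] + p[stride] + p[stride + 1]) << 1;
                ac[y * cw + x] = (int16_t)v;
                sum += v;
            }
        }
        const int avg = (sum + (1 << (log2sz - 1))) >> log2sz;  /* as in calc_subtract_dc */
        for (int i = 0; i < cw * ch; i++)
            ac[i] -= (int16_t)avg;
    }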
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov q0, q1 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d2}, [r12, :64], r2 + vld1.16 {d1}, [r1, :64], r2 + vld1.16 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vadd.i16 q0, q0, q1 + vshl.i16 q0, q0, #1 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov q0, q1 + +L(ipred_cfl_ac_420_w8_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl r6, r6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr r7, L(ipred_cfl_ac_420_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_420_w16_tbl): + .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w16_wpad0): +1: // Copy and subsample input, without padding + vld1.8 {q0, q1}, [r1, :128], r2 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q12, q13}, [r1, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q12, q12 + vpaddl.u8 q13, q13 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + vshl.i16 q2, q12, #1 + vshl.i16 q3, q13, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): +1: // Copy and subsample input, padding 4 + vldr d2, [r1, #16] + vld1.8 {q0}, [r1, :128], r2 + vldr d6, [r12, #16] + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 d2, d2 + vldr d26, [r1, #16] + vpaddl.u8 q0, q0 + vld1.8 {q12}, [r1, :128], r2 + vpaddl.u8 d6, d6 + vldr d30, [r12, #16] + vpaddl.u8 q2, q2 + vld1.8 {q14}, [r12, :128], r2 + vpaddl.u8 d26, d26 + vpaddl.u8 q12, q12 + vpaddl.u8 d30, d30 + vpaddl.u8 q14, q14 + vadd.i16 d2, d2, d6 + vadd.i16 q0, q0, q2 + vadd.i16 d26, d26, d30 + vadd.i16 q12, q12, q14 + vshl.i16 d2, d2, #1 + vshl.i16 q0, q0, #1 + vshl.i16 d6, d26, #1 + vshl.i16 q2, q12, #1 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vshl.i16 q0, q0, #1 + vshl.i16 q2, q2, #1 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d4}, [r1, :64], r2 + vpaddl.u8 q0, q0 + vld1.8 {d5}, [r12, :64], r2 + vpaddl.u8 q2, q2 + vadd.i16 d0, d0, d1 + vadd.i16 d4, d4, d5 + vshl.i16 d0, d0, #1 + vshl.i16 d4, d4, #1 + vdup.16 q1, d0[3] + vdup.16 q3, d4[3] + vdup.16 d1, d0[3] + vdup.16 d5, d4[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl r6, r6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc + +// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_8bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_422_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_422_tbl): + .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d2}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q1}, [r12, :128], r2 + vld1.8 {q2}, [r1, :128], r2 + vpaddl.u8 q0, q0 + vld1.8 {q3}, [r12, :128], r2 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vld1.8 {d2}, [r1, :64], r2 + vld1.8 {d3}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vdup.16 d7, d3[3] + vmov d6, d3 + vdup.16 d5, d2[3] + vmov d4, d2 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr r7, L(ipred_cfl_ac_422_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_422_w16_tbl): + .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w16_wpad0): +1: // Copy and subsample input, without padding + vld1.8 {q0, q1}, [r1, :128], r2 + vld1.8 {q2, q3}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q1, q1 + vpaddl.u8 q2, q2 + vpaddl.u8 q3, q3 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): +1: // Copy and subsample input, padding 4 + vldr d2, [r1, #16] + vld1.8 {q0}, [r1, :128], r2 + vldr d6, [r12, #16] + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 d2, d2 + vpaddl.u8 q0, q0 + vpaddl.u8 d6, d6 + vpaddl.u8 q2, q2 + vshl.i16 d2, d2, #2 + vshl.i16 q0, q0, #2 + vshl.i16 d6, d6, #2 + vshl.i16 q2, q2, #2 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.8 {q0}, [r1, :128], r2 + vld1.8 {q2}, [r12, :128], r2 + vpaddl.u8 q0, q0 + vpaddl.u8 q2, q2 + vshl.i16 q0, q0, #2 + vshl.i16 q2, q2, #2 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! 
+ vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d1}, [r12, :64], r2 + vpaddl.u8 q0, q0 + vshl.i16 q0, q0, #2 + vdup.16 q3, d1[3] + vdup.16 q1, d0[3] + vdup.16 d5, d1[3] + vmov d4, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) +endfunc + +// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_8bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_444_tbl) + sub r8, r8, #26 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_444_tbl): + .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + vld1.32 {d0[]}, [r1, :32], r2 + vld1.32 {d0[1]}, [r12, :32], r2 + vld1.32 {d2[]}, [r1, :32], r2 + vld1.32 {d2[1]}, [r12, :32], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q1, d2, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d2}, [r12, :64], r2 + vld1.16 {d4}, [r1, :64], r2 + vshll.u8 q0, d0, #3 + vld1.16 {d6}, [r12, :64], r2 + vshll.u8 q1, d2, #3 + vshll.u8 q2, d4, #3 + vshll.u8 q3, d6, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cmp r3, #0 + bne L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + vld1.8 {q1}, [r1, :128], r2 + vld1.8 {q3}, [r12, :128], r2 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q2, d6, #3 + vshll.u8 q3, d7, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d4}, [r12, :64], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q2, d4, #3 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
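The 4:2:2 and 4:4:4 AC paths above mostly reuse the 4:2:0 padding and subtract-DC tails and only change how each sample is loaded and scaled, so every chroma layout ends up with 3 fractional bits. Sketched per sample (hypothetical helpers):

    #include <stdint.h>

    static int16_t cfl_ac_422_sample_sketch(const uint8_t *ypx, int x) {
        return (int16_t)((ypx[2 * x] + ypx[2 * x + 1]) << 2);  /* horizontal pair, vshl #2 */
    }

    static int16_t cfl_ac_444_sample_sketch(const uint8_t *ypx, int x) {
        return (int16_t)(ypx[x] << 3);                         /* straight copy, vshll #3  */
    }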
+ vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr r7, L(ipred_cfl_ac_444_w32_tbl) + ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_444_w32_tbl): + .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w32_wpad0): +1: // Copy and expand input, without padding + vld1.8 {q2, q3}, [r1, :128], r2 + vld1.8 {q13, q14}, [r12, :128], r2 + vshll.u8 q0, d4, #3 + vshll.u8 q1, d5, #3 + vshll.u8 q2, d6, #3 + vshll.u8 q3, d7, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vshll.u8 q0, d28, #3 + vshll.u8 q1, d29, #3 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): +1: // Copy and expand input, padding 8 + vldr d4, [r1, #16] + vld1.8 {q1}, [r1, :128], r2 + vldr d28, [r12, #16] + vld1.8 {q13}, [r12, :128], r2 + vshll.u8 q2, d4, #3 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vshll.u8 q0, d28, #3 + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vdup.16 q1, d1[3] + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + vld1.8 {q1}, [r1, :128], r2 + vld1.8 {q13}, [r12, :128], r2 + vshll.u8 q0, d2, #3 + vshll.u8 q1, d3, #3 + vshll.u8 q12, d26, #3 + vshll.u8 q13, d27, #3 + vdup.16 q2, d3[3] + vdup.16 q3, d3[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vdup.16 q0, d27[3] + vdup.16 q1, d27[3] + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + vld1.8 {d0}, [r1, :64], r2 + vld1.8 {d24}, [r12, :64], r2 + vshll.u8 q0, d0, #3 + vshll.u8 q12, d24, #3 + subs r8, r8, #2 + vdup.16 q1, d1[3] + vdup.16 q2, d1[3] + vdup.16 q3, d1[3] + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q8, q8, q0 + vadd.i16 q9, q9, q1 + vdup.16 q13, d25[3] + vdup.16 q0, d25[3] + vdup.16 q1, d25[3] + vst1.16 {q2, q3}, [r0, :128]! + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q3 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! 
+ vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 1b + cmp r4, #0 + +L(ipred_cfl_ac_444_w32_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #1 + vst1.16 {q12, q13}, [r0, :128]! + vadd.i16 q8, q8, q12 + vadd.i16 q9, q9, q13 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q10, q10, q0 + vadd.i16 q11, q11, q1 + bgt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl r6, r6, #3 + // Aggregate the sums, with wider intermediates earlier than in + // ipred_cfl_ac_420_w8_calc_subtract_dc. + vpaddl.u16 q0, q8 + vpaddl.u16 q1, q9 + vpaddl.u16 q2, q10 + vpaddl.u16 q3, q11 + vadd.i32 q0, q0, q1 + vadd.i32 q2, q2, q3 + vadd.i32 q0, q0, q2 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] + b L(ipred_cfl_ac_420_w4_subtract_dc) +endfunc diff -Nru dav1d-0.7.1/src/arm/32/itx16.S dav1d-0.9.1/src/arm/32/itx16.S --- dav1d-0.7.1/src/arm/32/itx16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/itx16.S 2021-07-28 21:38:28.861851700 +0000 @@ -0,0 +1,3428 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// r0-r3 external parameters +// r4 function pointer to first transform +// r5 function pointer to second transform +// r6 output parameter for helper function +// r7 input parameter for helper function +// r8 input stride for helper function +// r9 scratch variable for helper functions +// r10-r11 pointer to list of eob thresholds, eob threshold value, +// scratch variables within helper functions (backed up) + +// The SIMD registers most often use the following layout: +// d0-d3 multiplication coefficients +// d4-d7 scratch registers +// d8-d15 unused in some transforms, used for scratch registers in others +// d16-v31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +// A macro for cases where a thumb mov can express the constant in one +// instruction, while arm mode requires two separate movw+movt pairs. +.macro mov_const reg, val +#if CONFIG_THUMB + mov.w \reg, #\val +#else + movw \reg, #((\val) & 0xffff) + movt \reg, #(((\val) >> 16) & 0xffff) +#endif +.endm + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + .int -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro vmul_vmla d0, s0, s1, c0, c1 + vmul.i32 \d0, \s0, \c0 + vmla.i32 \d0, \s1, \c1 +.endm + +.macro vmul_vmls d0, s0, s1, c0, c1 + vmul.i32 \d0, \s0, \c0 + vmls.i32 \d0, \s1, \c1 +.endm + +.macro scale_input c, 
r0, r1, r2 r3, r4, r5, r6, r7 + vqrdmulh.s32 \r0, \r0, \c + vqrdmulh.s32 \r1, \r1, \c +.ifnb \r2 + vqrdmulh.s32 \r2, \r2, \c + vqrdmulh.s32 \r3, \r3, \c +.endif +.ifnb \r4 + vqrdmulh.s32 \r4, \r4, \c + vqrdmulh.s32 \r5, \r5, \c + vqrdmulh.s32 \r6, \r6, \c + vqrdmulh.s32 \r7, \r7, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.ifnb \load + vld1.16 {\load}, [\src, :128], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + vqadd.s16 \adddst, \adddst, \addsrc +.endif +.ifnb \max + vmax.s16 \max, \max, q6 +.endif +.ifnb \min + vmin.s16 \min, \min, q7 +.endif +.ifnb \store + vst1.16 {\store}, [\dst, :128], r1 +.endif +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + load_add_store q0, q8, , , , , , \dst, \src, \shiftbits + load_add_store q1, q9, , , , , , \dst, \src, \shiftbits + load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits + load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits + load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits + load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits + load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits + load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits + load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits + load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits + load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits + load_add_store , , , , , q15, q14, \dst, \src, \shiftbits + load_add_store , , , , , , q15, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + load_add_store q0, q8, , , , , , \dst, \src, \shiftbits + load_add_store q1, q9, , , , , , \dst, \src, \shiftbits + load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits + load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits + load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits + load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits + load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits + load_add_store , , , , , q11, q10, \dst, \src, \shiftbits + load_add_store , , , , , , q11, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load1, load2, shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4 +.ifnb \load1 + vld1.16 {\load1}, [\src, :64], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \load2 + vld1.16 {\load2}, [\src, :64], r1 +.endif +.ifnb \addsrc + vqadd.s16 \adddst, \adddst, \addsrc +.endif +.ifnb \max + vmax.s16 \max, \max, q6 +.endif +.ifnb \store1 + vst1.16 {\store1}, [\dst, :64], r1 +.endif +.ifnb \min + vmin.s16 \min, \min, q7 +.endif +.ifnb \store2 + vst1.16 {\store2}, [\dst, :64], r1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src + load_add_store4 d2, d3, q9, , , , , , , \dst, \src + load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src + load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src + load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src + load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src + load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src + load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, 
\dst, \src + load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src + load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src + load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src + load_add_store4 , , , , , , q15, d28, d29, \dst, \src + load_add_store4 , , , , , , , d30, d31, \dst, \src +.endm +.macro load_add_store_4x8 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits + load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits + load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits + load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits + load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits + load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits + load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits + load_add_store4 , , , , , , q11, d20, d21, \dst, \src, \shiftbits + load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits +.endm +.macro load_add_store_4x4 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits + load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits + load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits + load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits + load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits + load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits +.endm + +.macro idct_dc w, h, shift + cmp r3, #0 + bne 1f + vmov.i16 q14, #0 + mov_const r12, 2896*8*(1<<16) + vld1.32 {d24[], d25[]}, [r2, :32] + vdup.32 d0, r12 + vqrdmulh.s32 q13, q12, d0[0] + vst1.32 {d28[0]}, [r2, :32] +.if (\w == 2*\h) || (2*\w == \h) + vqrdmulh.s32 q13, q13, d0[0] +.endif +.if \shift > 0 + vqrshrn.s32 d24, q13, #\shift + vqrshrn.s32 d25, q13, #\shift +.else + vqmovn.s32 d24, q13 + vqmovn.s32 d25, q13 +.endif + vqrdmulh.s16 q12, q12, d0[1] + mov r3, #\h + vrshr.s16 q12, q12, #4 + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {d0}, [r0, :64], r1 + vld1.16 {d1}, [r0, :64], r1 + vld1.16 {d2}, [r0, :64], r1 + vld1.16 {d3}, [r0, :64], r1 + subs r3, r3, #4 + vqadd.s16 q0, q0, q12 + sub r0, r0, r1, lsl #2 + vqadd.s16 q1, q1, q12 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmin.s16 q0, q0, q15 + vst1.16 {d0}, [r0, :64], r1 + vmin.s16 q1, q1, q15 + vst1.16 {d1}, [r0, :64], r1 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d3}, [r0, :64], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w8_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0}, [r0, :128], r1 + subs r3, r3, #4 + vld1.16 {q1}, [r0, :128], r1 + vqadd.s16 q0, q0, q12 + vld1.16 {q2}, [r0, :128], r1 + vqadd.s16 q1, q1, q12 + vld1.16 {q3}, [r0, :128], r1 + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + sub r0, r0, r1, lsl #2 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vst1.16 {q0}, [r0, :128], r1 + vmin.s16 q2, q2, q15 + vst1.16 {q1}, [r0, :128], r1 + vmin.s16 q3, q3, q15 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w16_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128], r1 + subs r3, r3, #2 + vld1.16 {q2, q3}, [r0, :128], r1 + vqadd.s16 q0, q0, q12 + vqadd.s16 q1, q1, q12 + vqadd.s16 q2, q2, q12 + 
vqadd.s16 q3, q3, q12 + sub r0, r0, r1, lsl #1 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vst1.16 {q0, q1}, [r0, :128], r1 + vmin.s16 q3, q3, q15 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w32_neon + sub r1, r1, #32 + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.16 {q2, q3}, [r0, :128] + vqadd.s16 q0, q0, q12 + vqadd.s16 q1, q1, q12 + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + sub r0, r0, #32 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vst1.16 {q0, q1}, [r0, :128]! + vmin.s16 q3, q3, q15 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w64_neon + sub r1, r1, #96 + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.16 {q2, q3}, [r0, :128]! + vqadd.s16 q0, q0, q12 + vld1.16 {q8, q9}, [r0, :128]! + vqadd.s16 q1, q1, q12 + vld1.16 {q10, q11}, [r0, :128] + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + vqadd.s16 q8, q8, q12 + vqadd.s16 q9, q9, q12 + vqadd.s16 q10, q10, q12 + vqadd.s16 q11, q11, q12 + sub r0, r0, #96 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmax.s16 q8, q8, q14 + vmax.s16 q9, q9, q14 + vmax.s16 q10, q10, q14 + vmax.s16 q11, q11, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vmin.s16 q8, q8, q15 + vst1.16 {q0, q1}, [r0, :128]! + vmin.s16 q9, q9, q15 + vst1.16 {q2, q3}, [r0, :128]! + vmin.s16 q10, q10, q15 + vst1.16 {q8, q9}, [r0, :128]! + vmin.s16 q11, q11, q15 + vst1.16 {q10, q11}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +.macro iwht4 + vadd.i32 q8, q8, q9 + vsub.i32 q13, q10, q11 + vsub.i32 q12, q8, q13 + vshr.s32 q12, q12, #1 + vsub.i32 q10, q12, q9 + vsub.i32 q9, q12, q11 + vadd.i32 q11, q13, q10 + vsub.i32 q8, q8, q9 +.endm + +.macro idct_4s_x4 r0, r1, r2, r3 + vmul_vmla q4, \r1, \r3, d1[1], d1[0] + vmul_vmla q2, \r0, \r2, d0[0], d0[0] + vmul_vmls q3, \r1, \r3, d1[0], d1[1] + vmul_vmls q5, \r0, \r2, d0[0], d0[0] + vrshr.s32 q4, q4, #12 + vrshr.s32 q2, q2, #12 + vrshr.s32 q3, q3, #12 + vrshr.s32 q5, q5, #12 + vqadd.s32 \r0, q2, q4 + vqsub.s32 \r3, q2, q4 + vqadd.s32 \r1, q5, q3 + vqsub.s32 \r2, q5, q3 +.endm + +.macro idct_2s_x4 r0, r1, r2, r3 + vmul_vmla d6, \r1, \r3, d1[1], d1[0] + vmul_vmla d4, \r0, \r2, d0[0], d0[0] + vmul_vmls d5, \r1, \r3, d1[0], d1[1] + vmul_vmls d7, \r0, \r2, d0[0], d0[0] + vrshr.s32 d6, d6, #12 + vrshr.s32 d4, d4, #12 + vrshr.s32 d5, d5, #12 + vrshr.s32 d7, d7, #12 + vqadd.s32 \r0, d4, d6 + vqsub.s32 \r3, d4, d6 + vqadd.s32 \r1, d7, d5 + vqsub.s32 \r2, d7, d5 +.endm + +function inv_dct_4s_x4_neon + movrel_local r12, idct_coeffs + vld1.32 {d0, d1}, [r12, :128] + idct_4s_x4 q8, q9, q10, q11 + bx lr +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel_local r12, iadst4_coeffs + vld1.32 {d0, d1}, [r12, :128] + + vsub.i32 q1, q8, q10 + vmul.i32 q2, q8, d0[0] + vmla.i32 q2, q10, d0[1] + vmla.i32 q2, q11, d1[0] + vmul.i32 q4, q9, d1[1] + vadd.i32 q1, q1, q11 + vmul.i32 q3, q8, d1[0] + vmls.i32 q3, q10, d0[0] + vmls.i32 q3, q11, d0[1] + + vadd.i32 \o3, q2, q3 + vmul.i32 \o2, q1, d1[1] + vadd.i32 \o0, q2, q4 + vadd.i32 \o1, q3, q4 + vsub.i32 \o3, \o3, q4 + + vrshr.s32 \o0, \o0, #12 + vrshr.s32 \o2, \o2, #12 + vrshr.s32 \o1, \o1, #12 + vrshr.s32 \o3, \o3, #12 
+.endm + +function inv_adst_4s_x4_neon + iadst_4x4 q8, q9, q10, q11 + bx lr +endfunc + +function inv_flipadst_4s_x4_neon + iadst_4x4 q11, q10, q9, q8 + bx lr +endfunc + +function inv_identity_4s_x4_neon + mov r12, #0 + movt r12, #(5793-4096)*8 + vdup.32 d0, r12 + vqrdmulh.s32 q1, q8, d0[0] + vqrdmulh.s32 q2, q9, d0[0] + vqrdmulh.s32 q3, q10, d0[0] + vqrdmulh.s32 q4, q11, d0[0] + vqadd.s32 q8, q8, q1 + vqadd.s32 q9, q9, q2 + vqadd.s32 q10, q10, q3 + vqadd.s32 q11, q11, q4 + bx lr +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q4-q5} + vmov.i16 q14, #0 + vmov.i16 q15, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.32 {q14, q15}, [r2, :128]! + vshr.s16 q8, q8, #2 + vld1.32 {q10, q11}, [r2, :128] + vshr.s16 q9, q9, #2 + vshr.s16 q10, q10, #2 + vshr.s16 q11, q11, #2 + + iwht4 + + vst1.32 {q14, q15}, [r2, :128] + transpose_4x4s q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + + iwht4 + + vld1.16 {d0}, [r0, :64], r1 + vqmovn.s32 d16, q8 + vld1.16 {d1}, [r0, :64], r1 + vqmovn.s32 d17, q9 + vld1.16 {d2}, [r0, :64], r1 + vqmovn.s32 d18, q10 + vld1.16 {d3}, [r0, :64], r1 + vqmovn.s32 d19, q11 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! + vld1.32 {q10, q11}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + blx r5 + + vld1.16 {d0}, [r0, :64], r1 + vld1.16 {d1}, [r0, :64], r1 + vrshr.s16 q8, q8, #4 + vld1.16 {d2}, [r0, :64], r1 + vrshr.s16 q9, q9, #4 + vld1.16 {d3}, [r0, :64], r1 + +L(itx_4x4_end): + vmvn.i16 q15, #0xfc00 // 0x3ff + sub r0, r0, r1, lsl #2 + vqadd.s16 q8, q8, q0 + vqadd.s16 q9, q9, q1 + vmax.s16 q8, q8, q14 + vmax.s16 q9, q9, q14 + vmin.s16 q8, q8, q15 + vmin.s16 q9, q9, q15 + vst1.16 {d16}, [r0, :64], r1 + vst1.16 {d17}, [r0, :64], r1 + vst1.16 {d18}, [r0, :64], r1 + vst1.16 {d19}, [r0, :64], r1 + + vpop {q4-q5} + pop {r4-r5,pc} +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q4-q5} + +.ifc \txfm1\()_\txfm2, dct_dct + cmp r3, #0 + bne 1f + vmov.i16 q14, #0 + mov_const r12, 2896*8*(1<<16) + vld1.32 {d16[], d17[]}, [r2, :32] + vdup.32 d4, r12 + vst1.32 {d28[0]}, [r2, :32] + vqrdmulh.s32 q8, q8, d4[0] + vld1.16 {d0}, [r0, :64], r1 + vqmovn.s32 d20, q8 + vqmovn.s32 d21, q8 + vld1.16 {d1}, [r0, :64], r1 + vqrdmulh.s16 q10, q10, d4[1] + vld1.16 {d2}, [r0, :64], r1 + vrshr.s16 q8, q10, #4 + vld1.16 {d3}, [r0, :64], r1 + vrshr.s16 q9, q10, #4 + b L(itx_4x4_end) +1: +.endif + movrel_local r4, inv_\txfm1\()_4s_x4_neon + movrel r5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4s_x4 \r0, \r2, \r4, \r6 + + vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a + vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a + vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a + 
vrshr.s32 \r1, q2, #12 // t4a + vrshr.s32 \r7, q4, #12 // t7a + vrshr.s32 \r3, q6, #12 // t5a + vrshr.s32 \r5, q7, #12 // t6a + + vqadd.s32 q2, \r1, \r3 // t4 + vqsub.s32 \r1, \r1, \r3 // t5a + vqadd.s32 q3, \r7, \r5 // t7 + vqsub.s32 \r3, \r7, \r5 // t6a + + vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5 + vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6 + vrshr.s32 q4, q4, #12 // t5 + vrshr.s32 q5, q6, #12 // t6 + + vqsub.s32 \r7, \r0, q3 // out7 + vqadd.s32 \r0, \r0, q3 // out0 + vqadd.s32 \r1, \r2, q5 // out1 + vqsub.s32 q6, \r2, q5 // out6 + vqadd.s32 \r2, \r4, q4 // out2 + vqsub.s32 \r5, \r4, q4 // out5 + vqadd.s32 \r3, \r6, q2 // out3 + vqsub.s32 \r4, \r6, q2 // out4 + vmov \r6, q6 // out6 +.endm + +.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_2s_x4 \r0, \r2, \r4, \r6 + + vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a + vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a + vmul_vmla d7, \r5, \r3, d3[1], d3[0] // -> t6a + vrshr.s32 \r1, d4, #12 // t4a + vrshr.s32 \r7, d5, #12 // t7a + vrshr.s32 \r3, d6, #12 // t5a + vrshr.s32 \r5, d7, #12 // t6a + + vqadd.s32 d4, \r1, \r3 // t4 + vqsub.s32 \r1, \r1, \r3 // t5a + vqadd.s32 d5, \r7, \r5 // t7 + vqsub.s32 \r3, \r7, \r5 // t6a + + vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5 + vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6 + vrshr.s32 d6, d6, #12 // t5 + vrshr.s32 d7, d7, #12 // t6 + + vqsub.s32 \r7, \r0, d5 // out7 + vqadd.s32 \r0, \r0, d5 // out0 + vqadd.s32 \r1, \r2, d7 // out1 + vqsub.s32 d7, \r2, d7 // out6 + vqadd.s32 \r2, \r4, d6 // out2 + vqsub.s32 \r5, \r4, d6 // out5 + vqadd.s32 \r3, \r6, d4 // out3 + vqsub.s32 \r4, \r6, d4 // out4 + vmov \r6, d7 // out6 +.endm + +function inv_dct_4s_x8_neon + movrel_local r12, idct_coeffs + vld1.32 {q0, q1}, [r12, :128] + idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 + bx lr +endfunc + +.macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + movrel_local r12, iadst8_coeffs + vld1.32 {q0, q1}, [r12, :128]! 
+ + vmul_vmla q2, q15, q8, d0[0], d0[1] + vmul_vmls q3, q15, q8, d0[1], d0[0] + vmul_vmla q4, q13, q10, d1[0], d1[1] + vrshr.s32 q8, q2, #12 // t0a + vrshr.s32 q15, q3, #12 // t1a + vmul_vmls q5, q13, q10, d1[1], d1[0] + vmul_vmla q6, q11, q12, d2[0], d2[1] + vrshr.s32 q10, q4, #12 // t2a + vrshr.s32 q13, q5, #12 // t3a + vmul_vmls q7, q11, q12, d2[1], d2[0] + vmul_vmla q2, q9, q14, d3[0], d3[1] + vrshr.s32 q12, q6, #12 // t4a + vrshr.s32 q11, q7, #12 // t5a + vmul_vmls q3, q9, q14, d3[1], d3[0] + vrshr.s32 q14, q2, #12 // t6a + vrshr.s32 q9, q3, #12 // t7a + + vld1.32 {q0}, [r12] + + vqadd.s32 q2, q8, q12 // t0 + vqsub.s32 q3, q8, q12 // t4 + vqadd.s32 q4, q15, q11 // t1 + vqsub.s32 q5, q15, q11 // t5 + vqadd.s32 q6, q10, q14 // t2 + vqsub.s32 q7, q10, q14 // t6 + vqadd.s32 q10, q13, q9 // t3 + vqsub.s32 q11, q13, q9 // t7 + + vmul_vmla q8, q3, q5, d1[1], d1[0] + vmul_vmls q12, q3, q5, d1[0], d1[1] + vmul_vmls q14, q11, q7, d1[1], d1[0] + + vrshr.s32 q3, q8, #12 // t4a + vrshr.s32 q5, q12, #12 // t5a + + vmul_vmla q8, q11, q7, d1[0], d1[1] + + vrshr.s32 q7, q14, #12 // t6a + vrshr.s32 q11, q8, #12 // t7a + + vqadd.s32 \r0, q2, q6 // out0 + vqsub.s32 q2, q2, q6 // t2 + vqadd.s32 \r7, q4, q10 // out7 + vqsub.s32 q4, q4, q10 // t3 + vqneg.s32 \r7, \r7 // out7 + + vqadd.s32 \r1, q3, q7 // out1 + vqsub.s32 q3, q3, q7 // t6 + vqadd.s32 \r6, q5, q11 // out6 + vqsub.s32 q5, q5, q11 // t7 + vqneg.s32 \r1, \r1 // out1 + + vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12) + vmul_vmls q6, q2, q4, d0[0], d0[0] // -> out4 (q12 or q11) + vmul_vmls q12, q3, q5, d0[0], d0[0] // -> out5 (q13 or q10) + vrshr.s32 q2, q10, #12 // out3 + vmul_vmla q10, q3, q5, d0[0], d0[0] // -> out2 (q10 or q13) + vrshr.s32 q3, q12, #12 // out5 + vrshr.s32 \r2, q10, #12 // out2 (q10 or q13) + vrshr.s32 \r4, q6, #12 // out4 (q12 or q11) + + vqneg.s32 \r3, q2 // out3 + vqneg.s32 \r5, q3 // out5 +.endm + +function inv_adst_4s_x8_neon + iadst_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 + bx lr +endfunc + +function inv_flipadst_4s_x8_neon + iadst_4s_x8 q15, q14, q13, q12, q11, q10, q9, q8 + bx lr +endfunc + +function inv_identity_4s_x8_neon + vqshl.s32 q8, q8, #1 + vqshl.s32 q9, q9, #1 + vqshl.s32 q10, q10, #1 + vqshl.s32 q11, q11, #1 + vqshl.s32 q12, q12, #1 + vqshl.s32 q13, q13, #1 + vqshl.s32 q14, q14, #1 + vqshl.s32 q15, q15, #1 + bx lr +endfunc + +function inv_txfm_add_8x8_neon + vmov.i32 q0, #0 + mov r7, #8*4 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q12, #1 + vqrshrn.s32 d18, q9, #1 + vqrshrn.s32 d19, q13, #1 + vqrshrn.s32 d20, q10, #1 + vqrshrn.s32 d21, q14, #1 + vqrshrn.s32 d22, q11, #1 + vqrshrn.s32 d23, q15, #1 + + cmp r3, r10 + transpose_4x8h q8, q9, q10, q11 + + blt 1f + + sub r2, r2, r7, lsl #3 + vpush {q8-q11} + + add r2, r2, #16 + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + blx r4 + + vqrshrn.s32 d31, q15, #1 + vqrshrn.s32 d30, q11, #1 + vqrshrn.s32 d29, q14, #1 + vqrshrn.s32 d28, q10, #1 + vqrshrn.s32 d27, q13, #1 + vqrshrn.s32 d26, q9, #1 + vqrshrn.s32 d25, q12, #1 + vqrshrn.s32 d24, q8, #1 + vpop {q8-q11} + + transpose_4x8h q12, q13, q14, q15 + + b 2f + +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 + vmov.i16 q14, #0 + vmov.i16 q15, #0 + +2: + blx r5 + + load_add_store_8x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function 
inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + push {r4-r5,r7,r10,lr} + vpush {q4-q7} + mov r10, #\eob_half + movrel_local r4, inv_\txfm1\()_4s_x8_neon + movrel r5, X(inv_\txfm2\()_8h_x8_neon) + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vdup.32 d4, r12 + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q12, q13}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q14, q15}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + + scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15 + + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + vqmovn.s32 d20, q12 + vqmovn.s32 d21, q13 + vqmovn.s32 d22, q14 + vqmovn.s32 d23, q15 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + vswp d17, d20 + vswp d19, d21 + vswp d18, d20 + vswp d21, d22 + + blx r5 + + load_add_store_8x4 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +function inv_txfm_add_4x8_neon + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + cmp r3, r10 + mov r7, #32 + blt 1f + + add r2, r2, #16 + vdup.32 d2, r12 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + scale_input d2[0], q8, q9, q10, q11 + sub r2, r2, r7, lsl #2 + + blx r4 + + sub r2, r2, #16 + + vqmovn.s32 d24, q8 + vqmovn.s32 d25, q9 + vqmovn.s32 d26, q10 + vqmovn.s32 d27, q11 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + b 2f + +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 + +2: + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + vdup.32 d2, r12 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + scale_input d2[0], q8, q9, q10, q11 + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + vmov q10, q12 + vmov q11, q13 + + blx r5 + + load_add_store_4x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + push {r4-r5,r7,r10,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon +.if \w == 4 + mov r10, #\eob_half +.endif + movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, 
\h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + +function inv_dct_2s_x16_neon + movrel_local r12, idct_coeffs + vld1.32 {q0, q1}, [r12, :128]! + + idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30 + + vld1.32 {q0, q1}, [r12, :128] + sub r12, r12, #32 + + vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a + vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a + vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a + vrshr.s32 d17, d4, #12 // t8a + vrshr.s32 d31, d5, #12 // t15a + vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a + vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a + vrshr.s32 d23, d6, #12 // t9a + vrshr.s32 d25, d4, #12 // t14a + vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a + vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a + vrshr.s32 d21, d5, #12 // t10a + vrshr.s32 d27, d6, #12 // t13a + vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a + vrshr.s32 d19, d4, #12 // t11a + vrshr.s32 d29, d5, #12 // t12a + + vld1.32 {q0}, [r12, :128] + + vqsub.s32 d4, d17, d23 // t9 + vqadd.s32 d17, d17, d23 // t8 + vqsub.s32 d5, d31, d25 // t14 + vqadd.s32 d31, d31, d25 // t15 + vqsub.s32 d23, d19, d21 // t10 + vqadd.s32 d19, d19, d21 // t11 + vqadd.s32 d25, d29, d27 // t12 + vqsub.s32 d29, d29, d27 // t13 + + vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a + vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a + vrshr.s32 d21, d6, #12 // t9a + vrshr.s32 d27, d7, #12 // t14a + + vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a + vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a + vrshr.s32 d29, d6, #12 // t13a + vneg.s32 d7, d7 + vrshr.s32 d23, d7, #12 // t10a + + vqsub.s32 d4, d17, d19 // t11a + vqadd.s32 d17, d17, d19 // t8a + vqsub.s32 d5, d31, d25 // t12a + vqadd.s32 d31, d31, d25 // t15a + vqadd.s32 d19, d21, d23 // t9 + vqsub.s32 d21, d21, d23 // t10 + vqsub.s32 d25, d27, d29 // t13 + vqadd.s32 d27, d27, d29 // t14 + + vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11 + vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12 + vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a + + vrshr.s32 d6, d6, #12 // t11 + vrshr.s32 d7, d7, #12 // t12 + vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a + vrshr.s32 d4, d4, #12 // t10a + vrshr.s32 d5, d5, #12 // t13a + + vqadd.s32 d8, d16, d31 // out0 + vqsub.s32 d31, d16, d31 // out15 + vmov d16, d8 + vqadd.s32 d23, d30, d17 // out7 + vqsub.s32 d9, d30, d17 // out8 + vqadd.s32 d17, d18, d27 // out1 + vqsub.s32 d30, d18, d27 // out14 + vqadd.s32 d18, d20, d5 // out2 + vqsub.s32 d29, d20, d5 // out13 + vqadd.s32 d5, d28, d19 // out6 + vqsub.s32 d25, d28, d19 // out9 + vqadd.s32 d19, d22, d7 // out3 + vqsub.s32 d28, d22, d7 // out12 + vqadd.s32 d20, d24, d6 // out4 + vqsub.s32 d27, d24, d6 // out11 + vqadd.s32 d21, d26, d4 // out5 + vqsub.s32 d26, d26, d4 // out10 + vmov d24, d9 + vmov d22, d5 + + bx lr +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel_local r12, iadst16_coeffs + vld1.32 {q0, q1}, [r12, :128]! 
+ + vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0 + vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1 + vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2 + vrshr.s32 d16, d4, #12 // t0 + vrshr.s32 d31, d6, #12 // t1 + vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3 + vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4 + vrshr.s32 d18, d8, #12 // t2 + vrshr.s32 d29, d4, #12 // t3 + vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5 + vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6 + vrshr.s32 d20, d6, #12 // t4 + vrshr.s32 d27, d8, #12 // t5 + vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7 + vld1.32 {q0, q1}, [r12, :128] + movrel_local r12, idct_coeffs + vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8 + vrshr.s32 d22, d4, #12 // t6 + vrshr.s32 d25, d6, #12 // t7 + vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9 + vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10 + vrshr.s32 d23, d8, #12 // t8 + vrshr.s32 d24, d4, #12 // t9 + vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11 + vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12 + vrshr.s32 d21, d6, #12 // t10 + vrshr.s32 d26, d8, #12 // t11 + vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13 + vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14 + vrshr.s32 d19, d4, #12 // t12 + vrshr.s32 d28, d6, #12 // t13 + vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15 + vrshr.s32 d17, d8, #12 // t14 + vrshr.s32 d30, d4, #12 // t15 + + vld1.32 {q0, q1}, [r12, :128] + + vqsub.s32 d5, d16, d23 // t8a + vqadd.s32 d16, d16, d23 // t0a + vqsub.s32 d7, d31, d24 // t9a + vqadd.s32 d31, d31, d24 // t1a + vqadd.s32 d23, d18, d21 // t2a + vqsub.s32 d18, d18, d21 // t10a + vqadd.s32 d24, d29, d26 // t3a + vqsub.s32 d29, d29, d26 // t11a + vqadd.s32 d21, d20, d19 // t4a + vqsub.s32 d20, d20, d19 // t12a + vqadd.s32 d26, d27, d28 // t5a + vqsub.s32 d27, d27, d28 // t13a + vqadd.s32 d19, d22, d17 // t6a + vqsub.s32 d22, d22, d17 // t14a + vqadd.s32 d28, d25, d30 // t7a + vqsub.s32 d25, d25, d30 // t15a + + vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8 + vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9 + vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10 + vrshr.s32 d17, d4, #12 // t8 + vrshr.s32 d30, d6, #12 // t9 + vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11 + vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12 + vrshr.s32 d18, d8, #12 // t10 + vrshr.s32 d29, d4, #12 // t11 + vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13 + vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14 + vrshr.s32 d27, d6, #12 // t12 + vrshr.s32 d20, d8, #12 // t13 + vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15 + vrshr.s32 d25, d4, #12 // t14 + vrshr.s32 d22, d6, #12 // t15 + + vqsub.s32 d2, d16, d21 // t4 + vqadd.s32 d16, d16, d21 // t0 + vqsub.s32 d3, d31, d26 // t5 + vqadd.s32 d31, d31, d26 // t1 + vqadd.s32 d21, d23, d19 // t2 + vqsub.s32 d23, d23, d19 // t6 + vqadd.s32 d26, d24, d28 // t3 + vqsub.s32 d24, d24, d28 // t7 + vqadd.s32 d19, d17, d27 // t8a + vqsub.s32 d17, d17, d27 // t12a + vqadd.s32 d28, d30, d20 // t9a + vqsub.s32 d30, d30, d20 // t13a + vqadd.s32 d27, d18, d25 // t10a + vqsub.s32 d18, d18, d25 // t14a + vqadd.s32 d20, d29, d22 // t11a + vqsub.s32 d29, d29, d22 // t15a + + vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a + vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a + vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a + vrshr.s32 d22, d4, #12 // t4a + vrshr.s32 d25, d6, #12 // t5a + vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a + vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12 + vrshr.s32 d24, d8, #12 // t6a + vrshr.s32 d23, d4, #12 // t7a + vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13 + vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14 + 
vrshr.s32 d17, d6, #12 // t12 + vmul_vmla d6, d29, d18, d1[0], d1[1] // -> t15 + vrshr.s32 d29, d8, #12 // t13 + vrshr.s32 d30, d4, #12 // t14 + vrshr.s32 d18, d6, #12 // t15 + + vqsub.s32 d2, d16, d21 // t2a +.ifc \o0, d16 + vqadd.s32 \o0, d16, d21 // out0 + vqsub.s32 d21, d31, d26 // t3a + vqadd.s32 \o15,d31, d26 // out15 +.else + vqadd.s32 d4, d16, d21 // out0 + vqsub.s32 d21, d31, d26 // t3a + vqadd.s32 \o15,d31, d26 // out15 + vmov \o0, d4 +.endif + vqneg.s32 \o15, \o15 // out15 + + vqsub.s32 d3, d29, d18 // t15a + vqadd.s32 \o13,d29, d18 // out13 + vqadd.s32 \o2, d17, d30 // out2 + vqsub.s32 d26, d17, d30 // t14a + vqneg.s32 \o13,\o13 // out13 + + vqadd.s32 \o1, d19, d27 // out1 + vqsub.s32 d27, d19, d27 // t10 + vqadd.s32 \o14,d28, d20 // out14 + vqsub.s32 d20, d28, d20 // t11 + vqneg.s32 \o1, \o1 // out1 + + vqadd.s32 \o3, d22, d24 // out3 + vqsub.s32 d22, d22, d24 // t6 + vqadd.s32 \o12,d25, d23 // out12 + vqsub.s32 d23, d25, d23 // t7 + vqneg.s32 \o3, \o3 // out3 + + vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) + vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) + vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) + + vrshr.s32 d24, d24, #12 // out8 + vrshr.s32 d4, d4, #12 // out7 + vrshr.s32 d5, d6, #12 // out5 + vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) + vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) + vrshr.s32 d26, d8, #12 // out10 + + vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) + vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) + vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) + + vrshr.s32 \o4, d2, #12 // out4 + vrshr.s32 d7, d6, #12 // out9 + vrshr.s32 d6, d8, #12 // out11 + vrshr.s32 \o6, d22, #12 // out6 + +.ifc \o8, d23 + vmov \o8, d24 + vmov \o10,d26 +.endif + + vqneg.s32 \o7, d4 // out7 + vqneg.s32 \o5, d5 // out5 + vqneg.s32 \o11,d6 // out11 + vqneg.s32 \o9, d7 // out9 +.endm + +function inv_adst_2s_x16_neon + iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_2s_x16_neon + iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_2s_x16_neon + mov r12, #0 + movt r12, #2*(5793-4096)*8 + vdup.32 d0, r12 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q1, \i, d0[0] + vqadd.s32 \i, \i, \i + vqadd.s32 \i, \i, q1 +.endr + bx lr +endfunc + +.macro identity_8x4_shift1 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q2, \i, \c + vrshr.s32 q2, q2, #1 + vqadd.s32 \i, \i, q2 +.endr +.endm + +.macro identity_8x4 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q2, \i, \c + vqadd.s32 \i, \i, \i + vqadd.s32 \i, \i, q2 +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x2_neon + push {lr} + vmov.i32 d7, #0 +.if \scale + mov_const r12, 2896*8*(1<<16) + vdup.32 d1, r12 +.endif +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr +.if \scale + scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + blx r4 + vqrshrn.s32 d16, q8, #\shift + vqrshrn.s32 d17, q9, #\shift + vqrshrn.s32 d18, q10, #\shift + vqrshrn.s32 d19, q11, #\shift + vqrshrn.s32 d20, q12, #\shift + vqrshrn.s32 d21, q13, #\shift + vqrshrn.s32 d22, q14, #\shift + vqrshrn.s32 d23, q15, #\shift + vuzp.16 q8, q9 + vuzp.16 q10, q11 + +.irp i, q8, q10, q9, q11 + vst1.16 {\i}, 
[r6, :128]! +.endr + + pop {pc} +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_4x16_neon + push {lr} +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + blx r5 + load_add_store_4x16 r6, r7 + pop {pc} +endfunc + +function inv_txfm_add_16x16_neon + sub_sp_align 512 + ldrh r11, [r10], #2 +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 14 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + bl inv_txfm_horz_16x2_neon +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_16x16 + .short 3, 10, 21, 36, 55, 78, 105, 256 +endconst + +const eob_16x16_identity + .short 2, 4, 6, 8, 10, 12, 14, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + push {r4-r11,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_2s_x16_neon + movrel r5, X(inv_\txfm2\()_4h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_16x16 +.else + movrel_local r10, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_16x16_identity +.else + movrel_local r10, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +function inv_txfm_add_16x4_neon + cmp r3, r10 + mov r11, #16 + blt 1f + + add r6, r2, #8 + vmov.i32 d4, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r6, :64] + vst1.32 {d4}, [r6, :64], r11 +.endr + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + vuzp.16 q8, q9 + mov r6, sp + vuzp.16 q10, q11 + vpush {q8-q11} + + b 2f + +1: + vmov.i16 q8, #0 + vmov.i16 q9, #0 + mov r6, sp + vpush {q8-q9} + vpush {q8-q9} + +2: + vmov.i32 d4, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r2, :64] + vst1.32 {d4}, [r2, :64], r11 +.endr + + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + vuzp.16 q8, q9 + mov r6, sp + vuzp.16 q10, q11 + + vmov q12, q10 + vmov q13, q11 + + vpop {q10-q11} + blx r5 + mov r6, r0 + load_add_store_8x4 r6, r7 + + vpop {q10-q11} + vmov q8, q12 + vmov q9, q13 + blx r5 + add r6, r0, #16 + load_add_store_8x4 r6, r7 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_4x16_neon + ldrh r9, [r10, #4] + + mov r11, #64 + cmp r3, r9 + ldrh r9, [r10, #2] + blt 1f + + add r6, r2, #48 + vmov.i32 
q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d28, q8, #1 + vqrshrn.s32 d29, q9, #1 + vqrshrn.s32 d30, q10, #1 + vqrshrn.s32 d31, q11, #1 + transpose_4x4h q14, q15, d28, d29, d30, d31 + + b 2f +1: + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + cmp r3, r9 + ldrh r9, [r10] + blt 1f + + add r6, r2, #32 + vmov.i32 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d24, q8, #1 + vqrshrn.s32 d25, q9, #1 + vqrshrn.s32 d26, q10, #1 + vqrshrn.s32 d27, q11, #1 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + b 2f +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 +2: + cmp r3, r9 + blt 1f + + add r6, r2, #16 + vmov.i32 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + b 2f +1: + vmov.i16 q8, #0 + vmov.i16 q9, #0 +2: + vmov.i16 q2, #0 + vpush {q8-q9} +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r2, :128] + vst1.16 {q2}, [r2, :128], r11 +.endr + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + vpop {q10-q11} + + blx r5 + + load_add_store_4x16 r0, r6 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2, eob_16x4 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 4 + movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon + movrel r5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_4x16 +.else + movrel_local r10, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_4x16_identity2 +.else + movrel_local r10, eob_4x16 +.endif +.endif +.else + mov r10, #\eob_16x4 + movrel_local r4, inv_\txfm1\()_2s_x\w\()_neon + movrel r5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 3 +def_fn_416 \w, \h, identity, identity, 3 +def_fn_416 \w, \h, dct, adst, 3 +def_fn_416 \w, \h, dct, flipadst, 3 +def_fn_416 \w, \h, dct, identity, 2 +def_fn_416 \w, \h, adst, dct, 3 +def_fn_416 \w, \h, adst, adst, 3 +def_fn_416 \w, \h, adst, flipadst, 3 +def_fn_416 \w, \h, flipadst, dct, 3 +def_fn_416 \w, \h, flipadst, adst, 3 +def_fn_416 \w, \h, flipadst, flipadst, 3 +def_fn_416 \w, \h, identity, dct, 2 +def_fn_416 \w, \h, adst, identity, 2 +def_fn_416 \w, \h, flipadst, identity, 2 +def_fn_416 \w, \h, identity, adst, 2 +def_fn_416 \w, \h, identity, flipadst, 2 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +function inv_txfm_add_16x8_neon + sub_sp_align 256 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(8 - \i) + cmp r3, r11 + blt 1f +.if \i < 6 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #8*4 + bl inv_txfm_horz_scale_16x2_neon +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b +3: + +.irp i, 0, 8 + add r7, sp, #(\i*2) + mov r8, #32 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128], r8 +.endr + blx r5 + + add r6, r0, #(\i*2) + load_add_store_8x8 r6, r7 +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_8x16_neon + add r10, r10, #2 + sub_sp_align 256 + ldrh r11, [r10], #4 + +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*8*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #4 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + + mov_const r12, 2896*8*(1<<16) + vmov.i32 q2, #0 + vdup.32 d0, r12 + +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\j}, [r7, :128] + vst1.32 {q2}, [r7, :128], r8 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 +.irp j, d16, d20, d17, d21, d18, d22, d19, d23 + vst1.16 {\j}, [r6, :64]! +.endr +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: + +.irp i, 0, 4 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #16 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_8x16 + .short 3, 10, 21, 43, 59, 75, 91, 128 +endconst + +const eob_8x16_identity1 + .short 2, 4, 6, 64, 80, 96, 112, 128 +endconst + +const eob_8x16_identity2 + .short 2, 4, 6, 8, 10, 12, 14, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 8 + movrel_local r4, inv_\txfm1\()_4s_x8_neon + movrel r5, X(inv_\txfm2\()_4h_x16_neon) +.else + movrel_local r4, inv_\txfm1\()_2s_x16_neon + movrel r5, X(inv_\txfm2\()_8h_x8_neon) +.endif +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_8x16 +.else + movrel_local r10, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_8x16_identity2 +.else + movrel_local r10, eob_8x16 +.endif +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_2s_x16_neon + movrel_local r12, idct_coeffs, 4*16 + vld1.32 {q0, q1}, [r12, :128]! 
+ + vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a + vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a + vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a + vrshr.s32 d16, d4, #12 // t16a + vrshr.s32 d31, d6, #12 // t31a + vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a + vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a + vrshr.s32 d24, d8, #12 // t17a + vrshr.s32 d23, d4, #12 // t30a + vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a + vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a + vrshr.s32 d20, d6, #12 // t18a + vrshr.s32 d27, d8, #12 // t29a + vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a + vld1.32 {q0, q1}, [r12, :128] + sub r12, r12, #4*24 + vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a + vrshr.s32 d28, d4, #12 // t19a + vrshr.s32 d19, d6, #12 // t28a + vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a + vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a + vrshr.s32 d18, d8, #12 // t20a + vrshr.s32 d29, d4, #12 // t27a + vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a + vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a + vrshr.s32 d26, d6, #12 // t21a + vrshr.s32 d21, d8, #12 // t26a + vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a + vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a + vrshr.s32 d22, d4, #12 // t22a + vrshr.s32 d25, d6, #12 // t25a + vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a + vrshr.s32 d30, d8, #12 // t23a + vrshr.s32 d17, d4, #12 // t24a + + vld1.32 {q0, q1}, [r12, :128] + + vqsub.s32 d5, d16, d24 // t17 + vqadd.s32 d16, d16, d24 // t16 + vqsub.s32 d7, d31, d23 // t30 + vqadd.s32 d31, d31, d23 // t31 + vqsub.s32 d24, d28, d20 // t18 + vqadd.s32 d28, d28, d20 // t19 + vqadd.s32 d23, d18, d26 // t20 + vqsub.s32 d18, d18, d26 // t21 + vqsub.s32 d20, d30, d22 // t22 + vqadd.s32 d30, d30, d22 // t23 + vqadd.s32 d26, d17, d25 // t24 + vqsub.s32 d17, d17, d25 // t25 + vqsub.s32 d22, d29, d21 // t26 + vqadd.s32 d29, d29, d21 // t27 + vqadd.s32 d25, d19, d27 // t28 + vqsub.s32 d19, d19, d27 // t29 + + vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a + vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a + vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a + vrshr.s32 d21, d4, #12 // t17a + vrshr.s32 d27, d6, #12 // t30a + vneg.s32 d8, d8 // -> t18a + vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a + vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a + vrshr.s32 d19, d8, #12 // t18a + vrshr.s32 d24, d5, #12 // t29a + vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a + vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a + vrshr.s32 d22, d4, #12 // t21a + vrshr.s32 d18, d6, #12 // t26a + vneg.s32 d8, d8 // -> t22a + vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a + vrshr.s32 d17, d8, #12 // t22a + vrshr.s32 d20, d5, #12 // t25a + + vqsub.s32 d2, d27, d24 // t29 + vqadd.s32 d27, d27, d24 // t30 + vqsub.s32 d3, d21, d19 // t18 + vqadd.s32 d21, d21, d19 // t17 + vqsub.s32 d24, d16, d28 // t19a + vqadd.s32 d16, d16, d28 // t16a + vqsub.s32 d19, d30, d23 // t20a + vqadd.s32 d30, d30, d23 // t23a + vqsub.s32 d28, d17, d22 // t21 + vqadd.s32 d17, d17, d22 // t22 + vqadd.s32 d23, d26, d29 // t24a + vqsub.s32 d26, d26, d29 // t27a + vqadd.s32 d22, d20, d18 // t25 + vqsub.s32 d20, d20, d18 // t26 + vqsub.s32 d29, d31, d25 // t28a + vqadd.s32 d31, d31, d25 // t31a + + vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a + vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a + vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19 + vrshr.s32 d18, d4, #12 // t18a + vrshr.s32 d25, d6, #12 // t29a + vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28 + vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20 + vrshr.s32 d29, d8, #12 // t19 + 
vrshr.s32 d24, d5, #12 // t28 + vneg.s32 d4, d4 // -> t20 + vmul_vmls d6, d26, d19, d1[0], d1[1] // -> t27 + vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a + vrshr.s32 d26, d4, #12 // t20 + vrshr.s32 d19, d6, #12 // t27 + vneg.s32 d8, d8 // -> t21a + vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a + vrshr.s32 d20, d8, #12 // t21a + vrshr.s32 d28, d5, #12 // t26a + + vqsub.s32 d2, d16, d30 // t23 + vqadd.s32 d16, d16, d30 // t16 = out16 + vqsub.s32 d3, d31, d23 // t24 + vqadd.s32 d31, d31, d23 // t31 = out31 + vqsub.s32 d23, d21, d17 // t22a + vqadd.s32 d17, d21, d17 // t17a = out17 + vqadd.s32 d30, d27, d22 // t30a = out30 + vqsub.s32 d21, d27, d22 // t25a + vqsub.s32 d27, d18, d20 // t21 + vqadd.s32 d18, d18, d20 // t18 = out18 + vqadd.s32 d4, d29, d26 // t19a = out19 + vqsub.s32 d26, d29, d26 // t20a + vqadd.s32 d29, d25, d28 // t29 = out29 + vqsub.s32 d25, d25, d28 // t26 + vqadd.s32 d28, d24, d19 // t28a = out28 + vqsub.s32 d24, d24, d19 // t27a + vmov d19, d4 // out19 + + vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20 + vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27 + vrshr.s32 d20, d4, #12 // t20 + vrshr.s32 d22, d6, #12 // t27 + + vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a + vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a + vmov d27, d22 // t27 + vrshr.s32 d26, d4, #12 // t26a + + vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22 + vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25 + vrshr.s32 d21, d6, #12 // t21a + vrshr.s32 d22, d24, #12 // t22 + vrshr.s32 d25, d4, #12 // t25 + + vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a + vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a + vrshr.s32 d23, d4, #12 // t23a + vrshr.s32 d24, d6, #12 // t24a + + bx lr +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x2_neon + push {lr} + vmov.i32 d7, #0 + lsl r8, r8, #1 +.if \scale + mov_const r12, 2896*8*(1<<16) + vdup.32 d0, r12 +.endif + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 +.if \scale + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct_2s_x16_neon + vtrn.32 d16, d17 + vtrn.32 d18, d19 + vtrn.32 d20, d21 + vtrn.32 d22, d23 + vtrn.32 d24, d25 + vtrn.32 d26, d27 + vtrn.32 d28, d29 + vtrn.32 d30, d31 + +.macro store1 r0, r1, r2, r3 + vst1.16 {\r0}, [r6, :64]! + vst1.16 {\r1}, [r6, :64]! + vst1.16 {\r2}, [r6, :64]! + vst1.16 {\r3}, [r6, :64]! +.endm + store1 d16, d18, d20, d22 + store1 d24, d26, d28, d30 + store1 d17, d19, d21, d23 + store1 d25, d27, d29, d31 +.purgem store1 + sub r6, r6, #64*2 + + vmov.i32 d7, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in d0[1] + scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct32_odd_2s_x16_neon + vtrn.32 d31, d30 + vtrn.32 d29, d28 + vtrn.32 d27, d26 + vtrn.32 d25, d24 + vtrn.32 d23, d22 + vtrn.32 d21, d20 + vtrn.32 d19, d18 + vtrn.32 d17, d16 +.macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift + vld1.32 {q0, q1}, [r6, :128]! 
+ vld1.32 {q2, q3}, [r6, :128] + sub r6, r6, #32 + vqsub.s32 d15, d0, \r0 + vqadd.s32 d0, d0, \r0 + vqsub.s32 d14, d1, \r1 + vqadd.s32 d1, d1, \r1 + vqsub.s32 d13, d2, \r2 + vqadd.s32 d2, d2, \r2 + vqsub.s32 d12, d3, \r3 + vqadd.s32 d3, d3, \r3 + vqsub.s32 d11, d4, \r4 + vqadd.s32 d4, d4, \r4 + vqsub.s32 d10, d5, \r5 + vqadd.s32 d5, d5, \r5 + vqsub.s32 d9, d6, \r6 + vqadd.s32 d6, d6, \r6 + vqsub.s32 d8, d7, \r7 + vqadd.s32 d7, d7, \r7 + vqrshrn.s32 d0, q0, #\shift + vqrshrn.s32 d1, q1, #\shift + vqrshrn.s32 d2, q2, #\shift + vqrshrn.s32 d3, q3, #\shift + vqrshrn.s32 d4, q4, #\shift + vqrshrn.s32 d5, q5, #\shift + vqrshrn.s32 d6, q6, #\shift + vqrshrn.s32 d7, q7, #\shift + vrev32.16 q2, q2 + vrev32.16 q3, q3 + vst1.16 {q0, q1}, [r6, :128]! + vst1.16 {q2, q3}, [r6, :128]! +.endm + + store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift + store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift +.purgem store2 + pop {pc} +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_4x32_neon + push {r10-r11,lr} + lsl r8, r8, #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + + bl X(inv_dct_4h_x16_neon) + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vst1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + sub r7, r7, r8, lsr #1 + bl X(inv_dct32_odd_4h_x16_neon) + + neg r9, r8 + mov r10, r6 + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + vld1.16 {d4}, [r7, :64], \stride + vld1.16 {d0}, [r10, :64], r1 + vld1.16 {d5}, [r7, :64], \stride + vld1.16 {d1}, [r10, :64], r1 + \op\().s16 d4, d4, \r0 + vld1.16 {d6}, [r7, :64], \stride + vld1.16 {d2}, [r10, :64], r1 + \op\().s16 d5, d5, \r1 + vld1.16 {d3}, [r10, :64], r1 + vrshr.s16 q2, q2, #4 + \op\().s16 d6, d6, \r2 + vld1.16 {d7}, [r7, :64], \stride + vqadd.s16 q0, q0, q2 + \op\().s16 d7, d7, \r3 + vmax.s16 q0, q0, q6 + vrshr.s16 q3, q3, #4 + vmin.s16 q0, q0, q7 + vqadd.s16 q1, q1, q3 + vst1.16 {d0}, [r6, :64], r1 + vmax.s16 q1, q1, q6 + vst1.16 {d1}, [r6, :64], r1 + vmin.s16 q1, q1, q7 + vst1.16 {d2}, [r6, :64], r1 + vst1.16 {d3}, [r6, :64], r1 +.endm + combine d31, d30, d29, d28, vqadd, r8 + combine d27, d26, d25, d24, vqadd, r8 + combine d23, d22, d21, d20, vqadd, r8 + combine d19, d18, d17, d16, vqadd, r8 + sub r7, r7, r8 + combine d16, d17, d18, d19, vqsub, r9 + combine d20, d21, d22, d23, vqsub, r9 + combine d24, d25, d26, d27, vqsub, r9 + combine d28, d29, d30, d31, vqsub, r9 +.purgem combine + + pop {r10-r11,pc} +endfunc + +const eob_32x32 + .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024 +endconst + +const eob_16x32 + .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512 +endconst + +const eob_16x32_shortside + .short 3, 10, 21, 36, 55, 78, 105, 512 +endconst + +const eob_8x32 + .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + push {r4-r7,lr} + vpush {q6-q7} + movrel_local r5, eob_32x32, 2 + + mov r6, #4*32 +1: + mov r12, #0 + movrel_local r4, eob_32x32, 6 +2: + vmov.i32 q0, #0 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + 
vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r6 +.endr + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q12 + vqmovn.s32 d18, q9 + vqmovn.s32 d19, q13 + vqmovn.s32 d20, q10 + vqmovn.s32 d21, q14 + vqmovn.s32 d22, q11 + vqmovn.s32 d23, q15 + transpose_4x8h q8, q9, q10, q11 + + load_add_store_8x4 r0, r7, shiftbits=2 + ldrh lr, [r4], #8 + sub r0, r0, r1, lsl #2 + cmp r3, lr + add r0, r0, #2*8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12, lsl #1 + add r0, r0, r1, lsl #2 + mls r2, r6, r12, r2 + add r2, r2, #4*4 + b 1b +9: + vpop {q6-q7} + pop {r4-r7,pc} +endfunc + +.macro shift_8_regs op, shift +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + push {r4-r9,lr} + vpush {q6-q7} + mov r9, #0 + mov_const r8, 2896*8*(1<<16) + movt r9, #2*(5793-4096)*8 + movrel_local r5, eob_16x32\hshort, 2 + + mov r6, #4*\h +1: + mov r12, #0 + movrel_local r4, eob_16x32\wshort, 6 +2: + vdup.i32 d0, r8 + vmov.i32 q1, #0 + vmov.32 d0[1], r9 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q1}, [r2, :128], r6 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + +.if \w == 16 + // 16x32 + identity_8x4_shift1 d0[1] +.else + // 32x16 + shift_8_regs vqshl.s32, 1 + identity_8x4 d0[1] +.endif + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q12 + vqmovn.s32 d18, q9 + vqmovn.s32 d19, q13 + vqmovn.s32 d20, q10 + vqmovn.s32 d21, q14 + vqmovn.s32 d22, q11 + vqmovn.s32 d23, q15 + transpose_4x8h q8, q9, q10, q11 + +.if \w == 16 + load_add_store_8x4 r0, r7, shiftbits=2 +.else + load_add_store_8x4 r0, r7, shiftbits=4 +.endif + ldrh lr, [r4], #8 + sub r0, r0, r1, lsl #2 + cmp r3, lr + add r0, r0, #2*8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12, lsl #1 + add r0, r0, r1, lsl #2 + mls r2, r6, r12, r2 + add r2, r2, #4*4 + b 1b +9: + vpop {q6-q7} + pop {r4-r9,pc} +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q6-q7} + movrel_local r4, eob_8x32, 2 + + mov r12, #4*\h +1: + ldrh lr, [r4], #4 +.if \w == 8 + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r12 +.endr + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q12, #1 + vqrshrn.s32 d18, q9, #1 + vqrshrn.s32 d19, q13, #1 + vqrshrn.s32 d20, q10, #1 + vqrshrn.s32 d21, q14, #1 + vqrshrn.s32 d22, q11, #1 + vqrshrn.s32 d23, q15, #1 + + transpose_4x8h q8, q9, q10, q11 + + cmp r3, lr + load_add_store_8x4 r0, r5, shiftbits=2 + blt 9f + sub r2, r2, r12, lsl #3 + add r2, r2, #4*4 +.else + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q10, q11}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q12, q13}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q14, q15}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q10 + vqmovn.s32 d20, q9 + vqmovn.s32 d21, q11 + vqmovn.s32 d18, q12 + vqmovn.s32 d19, q14 + vqmovn.s32 d22, q13 + vqmovn.s32 d23, q15 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + + cmp r3, lr + load_add_store_4x8 r0, r5, shiftbits=3 + blt 9f + sub r0, r0, r1, lsl #3 + add r0, r0, #2*4 +.endif + b 1b + +9: + vpop 
{q6-q7} + pop {r4-r5,pc} +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 2048 + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, sp, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 2048 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r4, inv_dct_2s_x16_neon + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, sp, #(\i*16*2) + add r7, r2, #(\i*4) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #4*32 + bl inv_txfm_horz_scale_16x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #16*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 1024 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel r5, X(inv_dct_4h_x16_neon) + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*4) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 14 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #4*16 + bl inv_txfm_horz_scale_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 1024 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + + movrel_local r10, eob_8x32, 2 + + mov r8, #4*32 + mov r9, #32 + mov r6, sp +1: + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r8 +.endr + ldrh r11, [r10], #4 + sub r2, r2, r8, lsl #3 + sub r9, r9, #4 + add r2, r2, #4*4 + + bl inv_dct_4s_x8_neon + + vqrshrn.s32 d16, q8, #2 + vqrshrn.s32 d18, q9, #2 + vqrshrn.s32 d20, q10, #2 + vqrshrn.s32 d22, q11, #2 + vqrshrn.s32 d17, q12, #2 + vqrshrn.s32 d19, q13, #2 + vqrshrn.s32 d21, q14, #2 + vqrshrn.s32 d23, q15, #2 + + transpose_4x8h q8, q9, q10, q11 + + vst1.16 {q8, q9}, [r6, :128]! + cmp r3, r11 + vst1.16 {q10, q11}, [r6, :128]! + + bge 1b + cmp r9, #0 + beq 3f + + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r9, r9, #4 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #8*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + push {r4-r11,lr} + vpush {q4-q7} + movrel_local r10, eob_8x32 + sub_sp_align 512 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*4) +.if \i > 0 + cmp r3, r11 + mov r8, #(8 - \i) + blt 1f +.if \i < 6 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #8*4 + bl inv_txfm_horz_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: + mov r8, #2*32 + mov r9, #0 +1: + add r6, r0, r9, lsl #1 + add r7, sp, r9, lsl #1 // #(\i*2) + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r7, :128], r8 +.endr + add r9, r9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp r9, #32 + + load_add_store_8x8 r6, r7 + + blt 1b + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + vld1.32 {q0, q1}, [r12, :128]! + + vqrdmulh.s32 d23, d16, d0[1] // t63a + vqrdmulh.s32 d16, d16, d0[0] // t32a + vqrdmulh.s32 d22, d17, d1[0] // t62a + vqrdmulh.s32 d17, d17, d1[1] // t33a + vqrdmulh.s32 d21, d18, d2[1] // t61a + vqrdmulh.s32 d18, d18, d2[0] // t34a + vqrdmulh.s32 d20, d19, d3[0] // t60a + vqrdmulh.s32 d19, d19, d3[1] // t35a + + vld1.32 {q0}, [r12, :128]! + + vqadd.s32 d24, d16, d17 // t32 + vqsub.s32 d25, d16, d17 // t33 + vqsub.s32 d26, d19, d18 // t34 + vqadd.s32 d27, d19, d18 // t35 + vqadd.s32 d28, d20, d21 // t60 + vqsub.s32 d29, d20, d21 // t61 + vqsub.s32 d30, d23, d22 // t62 + vqadd.s32 d31, d23, d22 // t63 + + vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a + vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a + vneg.s32 d4, d4 // t34a + vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a + vrshr.s32 d26, d4, #12 // t34a + vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a + vrshr.s32 d29, d6, #12 // t61a + vrshr.s32 d25, d8, #12 // t33a + vrshr.s32 d30, d4, #12 // t62a + + vqadd.s32 d16, d24, d27 // t32a + vqsub.s32 d19, d24, d27 // t35a + vqadd.s32 d17, d25, d26 // t33 + vqsub.s32 d18, d25, d26 // t34 + vqsub.s32 d20, d31, d28 // t60a + vqadd.s32 d23, d31, d28 // t63a + vqsub.s32 d21, d30, d29 // t61 + vqadd.s32 d22, d30, d29 // t62 + + vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a + vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a + vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60 + vrshr.s32 d21, d4, #12 // t61a + vrshr.s32 d18, d6, #12 // t34a + vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35 + vrshr.s32 d20, d8, #12 // t60 + vrshr.s32 d19, d4, #12 // t35 + + vst1.32 {d16, d17, d18, d19}, [r6, :128]! + vst1.32 {d20, d21, d22, d23}, [r6, :128]! 
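+        // Eight of the t32-t63 intermediates are now stored at [r6]; the
+        // dct64 functions defined below call this routine four times (once
+        // per group of four inputs) before inv_dct64_step2_neon combines
+        // the groups.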
+ + bx lr +endfunc + +function inv_dct64_step2_neon + movrel_local r12, idct_coeffs + vld1.32 {q0}, [r12, :128] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + vldr d16, [r6, #4*2*0] // t32a + vldr d17, [r9, #4*2*8] // t39a + vldr d18, [r9, #4*2*0] // t63a + vldr d19, [r6, #4*2*8] // t56a + vldr d20, [r6, #4*2*16] // t40a + vldr d21, [r9, #4*2*24] // t47a + vldr d22, [r9, #4*2*16] // t55a + vldr d23, [r6, #4*2*24] // t48a + + vqadd.s32 d24, d16, d17 // t32 + vqsub.s32 d25, d16, d17 // t39 + vqadd.s32 d26, d18, d19 // t63 + vqsub.s32 d27, d18, d19 // t56 + vqsub.s32 d28, d21, d20 // t40 + vqadd.s32 d29, d21, d20 // t47 + vqadd.s32 d30, d23, d22 // t48 + vqsub.s32 d31, d23, d22 // t55 + + vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a + vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a + vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a + vrshr.s32 d25, d4, #12 // t56a + vrshr.s32 d27, d6, #12 // t39a + vneg.s32 d8, d8 // t40a + vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a + vrshr.s32 d31, d8, #12 // t40a + vrshr.s32 d28, d4, #12 // t55a + + vqadd.s32 d16, d24, d29 // t32a + vqsub.s32 d19, d24, d29 // t47a + vqadd.s32 d17, d27, d31 // t39 + vqsub.s32 d18, d27, d31 // t40 + vqsub.s32 d20, d26, d30 // t48a + vqadd.s32 d23, d26, d30 // t63a + vqsub.s32 d21, d25, d28 // t55 + vqadd.s32 d22, d25, d28 // t56 + + vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a + vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a + vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47 + vrshr.s32 d18, d4, #12 // t40a + vrshr.s32 d21, d6, #12 // t55a + vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48 + vrshr.s32 d19, d8, #12 // t47 + vrshr.s32 d20, d4, #12 // t48 + + vstr d16, [r6, #4*2*0] // t32a + vstr d17, [r9, #4*2*0] // t39 + vstr d18, [r6, #4*2*8] // t40a + vstr d19, [r9, #4*2*8] // t47 + vstr d20, [r6, #4*2*16] // t48 + vstr d21, [r9, #4*2*16] // t55a + vstr d22, [r6, #4*2*24] // t56 + vstr d23, [r9, #4*2*24] // t63a + + add r6, r6, #4*2 + sub r9, r9, #4*2 + cmp r6, r9 + blt 1b + bx lr +endfunc + +.macro load8 src, strd, zero, clear +.irp i, d16, d17, d18, d19, d20, d21, d22, d23 +.if \clear + vld1.32 {\i}, [\src, :64] + vst1.32 {\zero}, [\src, :64], \strd +.else + vld1.32 {\i}, [\src, :64], \strd +.endif +.endr +.endm + +.macro store16 dst + vst1.32 {q8, q9}, [\dst, :128]! + vst1.32 {q10, q11}, [\dst, :128]! + vst1.32 {q12, q13}, [\dst, :128]! + vst1.32 {q14, q15}, [\dst, :128]! 
+.endm + +.macro clear_upper8 +.irp i, q12, q13, q14, q15 + vmov.i32 \i, #0 +.endr +.endm + +.macro vmov_if reg, val, cond +.if \cond + vmov.i32 \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + mov_const \gpr, \val + vdup.32 \reg, \gpr +.endif +.endm + +.macro vst1_if regs, dst, dstalign, cond +.if \cond + vst1.32 \regs, \dst, \dstalign +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_2s_x64_neon + mov r6, sp + + push {r10-r11,lr} + + lsl r8, r8, #2 + + movdup_if d0, r12, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + add r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct_2s_x16_neon + + store16 r6 + + movdup_if d0, r12, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + lsr r8, r8, #1 + sub r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct32_odd_2s_x16_neon + + add r10, r6, #8*15 + sub r6, r6, #8*16 + + mov r9, #-8 + +.macro store_addsub r0, r1, r2, r3 + vld1.32 {d2}, [r6, :64]! + vld1.32 {d3}, [r6, :64]! + vqadd.s32 d6, d2, \r0 + vqsub.s32 \r0, d2, \r0 + vld1.32 {d4}, [r6, :64]! + vqadd.s32 d7, d3, \r1 + vqsub.s32 \r1, d3, \r1 + vld1.32 {d5}, [r6, :64]! + vqadd.s32 d2, d4, \r2 + sub r6, r6, #8*4 + vqsub.s32 \r2, d4, \r2 + vst1.32 {d6}, [r6, :64]! + vst1.32 {\r0}, [r10, :64], r9 + vqadd.s32 d3, d5, \r3 + vqsub.s32 \r3, d5, \r3 + vst1.32 {d7}, [r6, :64]! + vst1.32 {\r1}, [r10, :64], r9 + vst1.32 {d2}, [r6, :64]! + vst1.32 {\r2}, [r10, :64], r9 + vst1.32 {d3}, [r6, :64]! 
+ vst1.32 {\r3}, [r10, :64], r9 +.endm + store_addsub d31, d30, d29, d28 + store_addsub d27, d26, d25, d24 + store_addsub d23, d22, d21, d20 + store_addsub d19, d18, d17, d16 +.purgem store_addsub + + add r6, r6, #2*4*16 + + movrel_local r12, idct64_coeffs + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + add r9, r7, r8, lsl #4 // offset 16 + add r10, r7, r8, lsl #3 // offset 8 + sub r9, r9, r8 // offset 15 + sub r11, r10, r8 // offset 7 + vld1.32 {d16}, [r7, :64] // in1 (offset 0) + vld1.32 {d17}, [r9, :64] // in31 (offset 15) + vld1.32 {d18}, [r10, :64] // in17 (offset 8) + vld1.32 {d19}, [r11, :64] // in15 (offset 7) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + add r7, r7, r8, lsl #2 // offset 4 + sub r9, r9, r8, lsl #2 // offset 11 + sub r10, r7, r8 // offset 3 + add r11, r9, r8 // offset 12 + vld1.32 {d16}, [r10, :64] // in7 (offset 3) + vld1.32 {d17}, [r11, :64] // in25 (offset 12) + vld1.32 {d18}, [r9, :64] // in23 (offset 11) + vld1.32 {d19}, [r7, :64] // in9 (offset 4) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + sub r10, r10, r8, lsl #1 // offset 1 + sub r9, r9, r8, lsl #1 // offset 9 + add r10, r10, r8 // offset 2 + add r9, r9, r8 // offset 10 + add r7, r7, r8 // offset 5 + add r11, r11, r8 // offset 13 + vld1.32 d16, [r10, :64] // in5 (offset 2) + vld1.32 d17, [r11, :64] // in27 (offset 13) + vld1.32 d18, [r9, :64] // in21 (offset 10) + vld1.32 d19, [r7, :64] // in11 (offset 5) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + sub r10, r10, r8 // offset 1 + sub r9, r9, r8 // offset 9 + add r11, r11, r8 // offset 14 + add r7, r7, r8 // offset 6 + vld1.32 d16, [r10, :64] // in3 (offset 1) + vld1.32 d17, [r11, :64] // in29 (offset 14) + vld1.32 d18, [r9, :64] // in19 (offset 9) + vld1.32 d19, [r7, :64] // in13 (offset 6) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + + sub r6, r6, #2*4*32 + add r9, r6, #2*4*7 + + bl inv_dct64_step2_neon + + pop {r10-r11,pc} +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + +function inv_txfm_horz_dct_64x2_neon + vdup.32 q4, r9 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, #2*56 + + push {r10-r11,lr} + + mov r10, #2*64 + mov r11, #-2*4*4 + +1: + vld1.32 {d16, d17, d18, d19}, [r7, :128]! + vld1.32 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.32 {d20, d21, d22, d23}, [r7, :128]! 
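+        // r7 walks forward through the low half of the 64 coefficients while
+        // r8 (stride r11 = -2*4*4) walks backward through the high half, so
+        // each iteration pairs coefficient n with coefficient 63-n for the
+        // add/sub in store_addsub below.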
+ vld1.32 {d24, d25, d26, d27}, [r8, :128], r11 + vtrn.32 d16, d17 + vtrn.32 d18, d19 + vtrn.32 d20, d21 + vtrn.32 d22, d23 + vtrn.32 d31, d30 + vtrn.32 d29, d28 + vtrn.32 d27, d26 + vtrn.32 d25, d24 + +.macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7 + vqsub.s32 d7, \src0, \src1 + vqsub.s32 d6, \src2, \src3 + vqsub.s32 d5, \src4, \src5 + vqsub.s32 d4, \src6, \src7 + vqadd.s32 d0, \src0, \src1 + vqadd.s32 d1, \src2, \src3 + vqadd.s32 d2, \src4, \src5 + vqadd.s32 d3, \src6, \src7 + vrshl.s32 q3, q3, q4 + vrshl.s32 q2, q2, q4 + vrshl.s32 q0, q0, q4 + vrshl.s32 q1, q1, q4 + vqmovn.s32 d7, q3 + vqmovn.s32 d6, q2 + vqmovn.s32 d0, q0 + vqmovn.s32 d1, q1 + vrev32.16 q3, q3 + vst1.16 {q0}, [r6, :128], r10 + vst1.16 {q3}, [r9, :128], r10 +.endm + store_addsub d16, d31, d18, d29, d20, d27, d22, d25 + store_addsub d17, d30, d19, d28, d21, d26, d23, d24 +.purgem store_addsub + sub r6, r6, r10, lsl #1 + sub r9, r9, r10, lsl #1 + add r6, r6, #16 + sub r9, r9, #16 + + cmp r7, r8 + blt 1b + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_vert_dct_4x64_neon + lsl r8, r8, #1 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, r1, lsl #6 + sub r9, r9, r1 + + push {r10-r11,lr} + + neg r10, r1 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! + vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + vld1.16 {d0}, [r6, :64], r1 + vld1.16 {d1}, [r9, :64], r10 + vqadd.s16 d4, \src0, \src1 + vld1.16 {d2}, [r6, :64] + vqsub.s16 d5, \src0, \src1 + vld1.16 {d3}, [r9, :64] + vqadd.s16 d6, \src2, \src3 + vqsub.s16 d7, \src2, \src3 + sub r6, r6, r1 + sub r9, r9, r10 + vrshr.s16 q2, q2, #4 + vrshr.s16 q3, q3, #4 + vqadd.s16 q2, q2, q0 + vqadd.s16 q3, q3, q1 + vmax.s16 q2, q2, q6 + vmax.s16 q3, q3, q6 + vmin.s16 q2, q2, q7 + vmin.s16 q3, q3, q7 + vst1.16 {d4}, [r6, :64], r1 + vst1.16 {d5}, [r9, :64], r10 + vst1.16 {d6}, [r6, :64], r1 + vst1.16 {d7}, [r9, :64], r10 +.endm + add_dest_addsub d16, d31, d17, d30 + add_dest_addsub d18, d29, d19, d28 + add_dest_addsub d20, d27, d21, d26 + add_dest_addsub d22, d25, d23, d24 +.purgem add_dest_addsub + cmp r7, r8 + blt 1b + + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_dct_clear_2s_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r7, r5, #(\i*2) + mov r8, #64*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_dct_clear_scale_2s_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-1 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i*2) + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 32*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_scale_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r7, r5, #(\i*2) + mov r8, #32*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 32*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*16*2+64*4*2 + add r4, sp, #64*4*2 + + movrel_local r10, eob_16x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, r4, #(\i*64*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + bl inv_txfm_dct_clear_2s_x64_neon + add r6, r4, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 8 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: + movrel r5, X(inv_dct_4h_x16_neon) +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i*2) + add r7, r4, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 64*16*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 16*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + + movrel_local r4, inv_dct_2s_x16_neon +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*16*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_16x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r7, r5, #(\i*2) + mov r8, #16*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 16*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.7.1/src/arm/32/itx.S dav1d-0.9.1/src/arm/32/itx.S --- dav1d-0.7.1/src/arm/32/itx.S 2020-06-21 11:48:54.956126500 +0000 +++ dav1d-0.9.1/src/arm/32/itx.S 2021-07-28 21:38:28.861851700 +0000 @@ -706,7 +706,7 @@ vrshrn_8h \r14, \r15, q4, q5, #12 // t7a vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a vrshrn_8h \r6, \r7, q6, q7, #12 // t5a - vrshrn_8h \r10, \r11, q2, q3, #12 // taa + vrshrn_8h \r10, \r11, q2, q3, #12 // t6a vqadd.s16 q2, \q1, \q3 // t4 vqsub.s16 \q1, \q1, \q3 // t5a @@ -1173,7 +1173,7 @@ vrshrn.i32 d6, q3, #12 // t11 vrshrn.i32 d7, q4, #12 // t12 - vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t10a + vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a vrshrn.i32 d4, q2, #12 // t10a vrshrn.i32 d5, q4, #12 // t13a @@ -1480,53 +1480,6 @@ pop {pc} endfunc -.macro sub_sp_align space -#if CONFIG_THUMB - mov r7, sp - and r7, r7, #15 -#else - and r7, sp, #15 -#endif - sub sp, sp, r7 - // Now the stack is aligned, store the amount of adjustment back - // on the stack, as we don't want to waste a register as frame - // pointer. - str r7, [sp, #-16]! -#ifdef _WIN32 -.if \space > 8192 - // Here, we'd need to touch two (or more) pages while decrementing - // the stack pointer. 
- .error "sub_sp_align doesn't support values over 8K at the moment" -.elseif \space > 4096 - sub r7, sp, #4096 - ldr r12, [r7] - sub r7, r7, #(\space - 4096) - mov sp, r7 -.else - sub sp, sp, #\space -.endif -#else -.if \space >= 4096 - sub sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - sub sp, sp, #(\space)%4096 -.endif -#endif -.endm - -.macro add_sp_align space -.if \space >= 4096 - add sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - add sp, sp, #(\space)%4096 -.endif - ldr r7, [sp], #16 - // Add back the original stack adjustment - add sp, sp, r7 -.endm - function inv_txfm_add_16x16_neon sub_sp_align 512 ldrh r11, [r10], #2 @@ -3248,8 +3201,10 @@ mov r8, #(32 - \i) cmp r3, r11 blt 1f +.if \i < 28 ldrh r11, [r10], #2 .endif +.endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_horz_scale_dct_32x4_neon @@ -3304,7 +3259,7 @@ add r6, r4, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x4_neon -.if \i < 8 +.if \i < 12 ldrh r11, [r10], #2 .endif .endr @@ -3353,8 +3308,10 @@ mov r8, #(32 - \i) cmp r3, r11 blt 1f +.if \i < 28 ldrh r11, [r10], #2 .endif +.endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_horz_16x4_neon diff -Nru dav1d-0.7.1/src/arm/32/loopfilter16.S dav1d-0.9.1/src/arm/32/loopfilter16.S --- dav1d-0.7.1/src/arm/32/loopfilter16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/loopfilter16.S 2021-07-28 21:38:28.861851700 +0000 @@ -0,0 +1,859 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_4_wd\wd\()_neon + vabd.u16 d0, d22, d23 // abs(p1 - p0) + vabd.u16 d1, d25, d24 // abs(q1 - q0) + vabd.u16 d2, d23, d24 // abs(p0 - q0) + vabd.u16 d3, d22, d25 // abs(p1 - q1) +.if \wd >= 6 + vabd.u16 d4, d21, d22 // abs(p2 - p1) + vabd.u16 d5, d26, d25 // abs(q2 - q1) +.endif +.if \wd >= 8 + vabd.u16 d6, d20, d21 // abs(p3 - p2) + vabd.u16 d7, d27, d26 // abs(q3 - q3) +.endif +.if \wd >= 6 + vmax.u16 d4, d4, d5 +.endif + vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2 +.if \wd >= 8 + vmax.u16 d6, d6, d7 +.endif + vshr.u16 d3, d3, #1 +.if \wd >= 8 + vmax.u16 d4, d4, d6 +.endif + vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) + vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + vmax.u16 d4, d0, d4 + vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I +.else + vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + vand d1, d1, d2 // fm && wd >= 4 (implicit) +.if \wd >= 6 + vmov d14, d1 // fm && wd > 4 (implicit) +.endif +.if \wd >= 16 + vmov d15, d1 // fm && wd == 16 (implicit) +.endif + + vmov r10, r11, d1 + orrs r10, r10, r11 + beq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + vmov.i16 d10, #1 + vabd.u16 d2, d21, d23 // abs(p2 - p0) + vabd.u16 d3, d22, d23 // abs(p1 - p0) + vabd.u16 d4, d25, d24 // abs(q1 - q0) + vabd.u16 d5, d26, d24 // abs(q2 - q0) + vdup.16 d9, r9 // bitdepth_min_8 +.if \wd >= 8 + vabd.u16 d6, d20, d23 // abs(p3 - p0) + vabd.u16 d7, d27, d24 // abs(q3 - q0) +.endif + vmax.u16 d2, d2, d3 + vmax.u16 d4, d4, d5 +.if \wd >= 8 + vmax.u16 d6, d6, d7 +.endif + vmax.u16 d2, d2, d4 + vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8 +.if \wd >= 8 + vmax.u16 d2, d2, d6 +.endif + +.if \wd == 16 + vabd.u16 d3, d17, d23 // abs(p6 - p0) + vabd.u16 d4, d18, d23 // abs(p5 - p0) + vabd.u16 d5, d19, d23 // abs(p4 - p0) +.endif + vcge.u16 d2, d10, d2 // flat8in +.if \wd == 16 + vabd.u16 d6, d28, d24 // abs(q4 - q0) + vabd.u16 d7, d29, d24 // abs(q5 - q0) + vabd.u16 d8, d30, d24 // abs(q6 - q0) +.endif + vand d14, d2, d14 // flat8in && fm && wd > 4 + vbic d1, d1, d14 // fm && wd >= 4 && !flat8in +.if \wd == 16 + vmax.u16 d3, d3, d4 + vmax.u16 d5, d5, d6 +.endif + vmov r10, r11, d1 +.if \wd == 16 + vmax.u16 d7, d7, d8 + vmax.u16 d3, d3, d5 + vmax.u16 d3, d3, d7 + vcge.u16 d3, d10, d3 // flat8out +.endif + orrs r10, r10, r11 +.if \wd == 16 + vand d15, d15, d3 // flat8out && fm && wd == 16 + vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 + vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out +.endif + beq 1f // skip wd == 4 case +.endif + + vdup.16 d3, r8 // bitdepth_max + vsub.u16 d2, d22, d25 // p1 - q1 + vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1 + vcgt.u16 d0, d0, d12 // hev + vmvn d9, d3 // - 128 * (1 << bitdepth_min_8) + vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1) + vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1) + vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) + vsub.u16 d2, d24, d23 + vmov.i16 d6, #3 + vbic d0, d1, d0 // (fm && wd >= 4 && !hev) + vmul.i16 d2, d2, d6 + vmov.i16 d7, #4 + vadd.i16 d2, d2, d4 + vmin.s16 d2, d2, d3 // f = iclip_diff() + vmax.s16 d2, d2, d9 // f = iclip_diff() + vqadd.s16 d4, d7, d2 // f + 4 + vqadd.s16 d5, d6, d2 // f + 3 + vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1) + vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1) + vshr.s16 d4, d4, #3 // f1 + vshr.s16 d5, d5, #3 // f2 + 
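+        // The wd == 4 filter is applied from here on: p0 += f2 and q0 -= f1,
+        // and for positions without hev p1/q1 are adjusted by (f1 + 1) >> 1,
+        // with every result clipped to [0, bitdepth_max].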
vmov.i16 d9, #0 + vdup.16 d3, r8 // bitdepth_max + vqadd.s16 d2, d23, d5 // p0 + f2 + vqsub.s16 d6, d24, d4 // q0 - f1 + vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1 + vmin.s16 d2, d2, d3 // out p0 = iclip_pixel() + vmin.s16 d6, d6, d3 // out q0 = iclip_pixel() + vmax.s16 d2, d2, d9 // out p0 = iclip_pixel() + vmax.s16 d6, d6, d9 // out q0 = iclip_pixel() + vbit d23, d2, d1 // if (fm && wd >= 4) + vbit d24, d6, d1 // if (fm && wd >= 4) + vqadd.s16 d2, d22, d4 // p1 + f + vqsub.s16 d6, d25, d4 // q1 - f + vmin.s16 d2, d2, d3 // out p1 = iclip_pixel() + vmin.s16 d6, d6, d3 // out q1 = iclip_pixel() + vmax.s16 d2, d2, d9 // out p1 = iclip_pixel() + vmax.s16 d6, d6, d9 // out q1 = iclip_pixel() + vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) + vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 2f // skip if there's no flat8in + + vadd.i16 d0, d21, d21 // p2 * 2 + vadd.i16 d2, d21, d22 // p2 + p1 + vadd.i16 d4, d22, d23 // p1 + p0 + vadd.i16 d6, d23, d24 // p0 + q0 + vadd.i16 d8, d0, d2 + vadd.i16 d10, d4, d6 + vadd.i16 d12, d24, d25 // q0 + q1 + vadd.i16 d8, d8, d10 + vsub.i16 d12, d12, d0 + vadd.i16 d10, d25, d26 // q1 + q2 + vrshr.u16 d0, d8, #3 // out p1 + + vadd.i16 d8, d8, d12 + vsub.i16 d10, d10, d2 + vadd.i16 d12, d26, d26 // q2 + q2 + vrshr.u16 d1, d8, #3 // out p0 + + vadd.i16 d8, d8, d10 + vsub.i16 d12, d12, d4 + vrshr.u16 d2, d8, #3 // out q0 + + vbit d22, d0, d14 // p1 if (flat8in) + vadd.i16 d8, d8, d12 + vbit d23, d1, d14 // p0 if (flat8in) + vrshr.u16 d3, d8, #3 // out q1 + vbit d24, d2, d14 // q0 if (flat8in) + vbit d25, d3, d14 // q1 if (flat8in) +.elseif \wd >= 8 + vmov r10, r11, d14 + orrs r10, r10, r11 +.if \wd == 8 + beq 8f // skip if there's no flat8in +.else + beq 2f // skip if there's no flat8in +.endif + + vadd.i16 d0, d20, d21 // p3 + p2 + vadd.i16 d2, d22, d25 // p1 + q1 + vadd.i16 d4, d20, d22 // p3 + p1 + vadd.i16 d6, d23, d26 // p0 + q2 + vadd.i16 d8, d0, d0 // 2 * (p3 + p2) + vadd.i16 d9, d23, d24 // p0 + q0 + vadd.i16 d8, d8, d4 // + p3 + p1 + vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2 + vadd.i16 d8, d8, d9 // + p0 + q0 + vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1 + vrshr.u16 d10, d8, #3 // out p2 + + vadd.i16 d8, d8, d2 + vadd.i16 d0, d20, d23 // p3 + p0 + vadd.i16 d2, d24, d27 // q0 + q3 + vrshr.u16 d11, d8, #3 // out p1 + + vadd.i16 d8, d8, d6 + vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0 + vadd.i16 d4, d21, d24 // p2 + q0 + vadd.i16 d6, d25, d27 // q1 + q3 + vrshr.u16 d12, d8, #3 // out p0 + + vadd.i16 d8, d8, d2 + vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0 + vadd.i16 d0, d22, d25 // p1 + q1 + vadd.i16 d2, d26, d27 // q2 + q3 + vrshr.u16 d13, d8, #3 // out q0 + + vadd.i16 d8, d8, d6 + vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1 + vrshr.u16 d0, d8, #3 // out q1 + + vadd.i16 d8, d8, d2 + + vbit d21, d10, d14 + vbit d22, d11, d14 + vbit d23, d12, d14 + vrshr.u16 d1, d8, #3 // out q2 + vbit d24, d13, d14 + vbit d25, d0, d14 + vbit d26, d1, d14 +.endif +2: +.if \wd == 16 + vmov r10, r11, d15 + orrs r10, r10, r11 + bne 1f // check if flat8out is needed + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + vadd.i16 d2, d17, d17 // p6 + p6 + vadd.i16 d4, d17, d18 // p6 + p5 + vadd.i16 d6, d17, d19 // p6 + p4 + vadd.i16 d8, d17, d20 // p6 + p3 + vadd.i16 d12, d2, d4 + vadd.i16 d10, d6, d8 + vadd.i16 d6, d17, d21 // p6 + p2 + vadd.i16 d12, d12, d10 + vadd.i16 d8, d17, d22 // p6 + p1 + vadd.i16 d10, d18, 
d23 // p5 + p0 + vadd.i16 d6, d6, d8 + vadd.i16 d8, d19, d24 // p4 + q0 + vadd.i16 d12, d12, d6 + vadd.i16 d10, d10, d8 + vadd.i16 d6, d20, d25 // p3 + q1 + vadd.i16 d12, d12, d10 + vsub.i16 d6, d6, d2 + vadd.i16 d2, d21, d26 // p2 + q2 + vrshr.u16 d0, d12, #4 // out p5 + vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1) + vsub.i16 d2, d2, d4 + vadd.i16 d4, d22, d27 // p1 + q3 + vadd.i16 d6, d17, d19 // p6 + p4 + vrshr.u16 d1, d12, #4 // out p4 + vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2) + vsub.i16 d4, d4, d6 + vadd.i16 d6, d23, d28 // p0 + q4 + vadd.i16 d8, d17, d20 // p6 + p3 + vrshr.u16 d2, d12, #4 // out p3 + vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3) + vsub.i16 d6, d6, d8 + vadd.i16 d8, d24, d29 // q0 + q5 + vadd.i16 d4, d17, d21 // p6 + p2 + vrshr.u16 d3, d12, #4 // out p2 + vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4) + vsub.i16 d8, d8, d4 + vadd.i16 d6, d25, d30 // q1 + q6 + vadd.i16 d10, d17, d22 // p6 + p1 + vrshr.u16 d4, d12, #4 // out p1 + vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5) + vsub.i16 d6, d6, d10 + vadd.i16 d8, d26, d30 // q2 + q6 + vbif d0, d18, d15 // out p5 + vadd.i16 d10, d18, d23 // p5 + p0 + vrshr.u16 d5, d12, #4 // out p0 + vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6) + vsub.i16 d8, d8, d10 + vadd.i16 d10, d27, d30 // q3 + q6 + vbif d1, d19, d15 // out p4 + vadd.i16 d18, d19, d24 // p4 + q0 + vrshr.u16 d6, d12, #4 // out q0 + vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6) + vsub.i16 d10, d10, d18 + vadd.i16 d8, d28, d30 // q4 + q6 + vbif d2, d20, d15 // out p3 + vadd.i16 d18, d20, d25 // p3 + q1 + vrshr.u16 d7, d12, #4 // out q1 + vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6) + vsub.i16 d18, d8, d18 + vadd.i16 d10, d29, d30 // q5 + q6 + vbif d3, d21, d15 // out p2 + vadd.i16 d20, d21, d26 // p2 + q2 + vrshr.u16 d8, d12, #4 // out q2 + vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6) + vsub.i16 d10, d10, d20 + vadd.i16 d18, d30, d30 // q6 + q6 + vbif d4, d22, d15 // out p1 + vadd.i16 d20, d22, d27 // p1 + q3 + vrshr.u16 d9, d12, #4 // out q3 + vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6) + vsub.i16 d18, d18, d20 + vbif d5, d23, d15 // out p0 + vrshr.u16 d10, d12, #4 // out q4 + vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6) + vrshr.u16 d11, d12, #4 // out q5 + vbif d6, d24, d15 // out q0 + vbif d7, d25, d15 // out q1 + vbif d8, d26, d15 // out q2 + vbif d9, d27, d15 // out q3 + vbif d10, d28, d15 // out q4 + vbif d11, d29, d15 // out q5 +.endif + + bx lr +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + bx r6 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + bx r7 +.endif +9: + // Return directly without writing back any pixels + bx r12 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_4_wd16 + adr r6, 7f + CONFIG_THUMB + adr r7, 8f + CONFIG_THUMB + bl lpf_4_wd16_neon +.endm + +.macro lpf_4_wd8 + adr r7, 8f + CONFIG_THUMB + bl lpf_4_wd8_neon +.endm + +.macro lpf_4_wd6 + bl lpf_4_wd6_neon +.endm + +.macro lpf_4_wd4 + bl lpf_4_wd4_neon +.endm + +function lpf_v_4_4_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + + lpf_4_wd4 + + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 
+endfunc + +function lpf_h_4_4_neon + mov r12, lr + sub r10, r0, #4 + add r0, r10, r1, lsl #1 + vld1.16 {d22}, [r10], r1 + vld1.16 {d24}, [r0], r1 + vld1.16 {d23}, [r10], r1 + vld1.16 {d25}, [r0], r1 + add r0, r0, #4 + + transpose_4x4h q11, q12, d22, d23, d24, d25 + + lpf_4_wd4 + + sub r10, r0, r1, lsl #2 + sub r10, r10, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_6_4_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + + lpf_4_wd6 + + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_6_4_neon + mov r12, lr + sub r10, r0, #8 + vld1.16 {d20}, [r10, :64], r1 + vld1.16 {d24}, [r0, :64], r1 + vld1.16 {d21}, [r10, :64], r1 + vld1.16 {d25}, [r0, :64], r1 + vld1.16 {d22}, [r10, :64], r1 + vld1.16 {d26}, [r0, :64], r1 + vld1.16 {d23}, [r10, :64], r1 + vld1.16 {d27}, [r0, :64], r1 + + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + lpf_4_wd6 + + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_8_4_neon + mov r12, lr + sub r10, r0, r1, lsl #2 + vld1.16 {d20}, [r10, :64], r1 // p3 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d26}, [r0, :64], r1 // q2 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d27}, [r0, :64], r1 // q3 + sub r0, r0, r1, lsl #2 + + lpf_4_wd8 + + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vst1.16 {d21}, [r10, :64], r1 // p2 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d25}, [r0, :64], r1 // q1 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_8_4_neon + mov r12, lr + sub r10, r0, #8 + vld1.16 {d20}, [r10, :64], r1 + vld1.16 {d24}, [r0, :64], r1 + vld1.16 {d21}, [r10, :64], r1 + vld1.16 {d25}, [r0, :64], r1 + vld1.16 {d22}, [r10, :64], r1 + vld1.16 {d26}, [r0, :64], r1 + vld1.16 {d23}, [r10, :64], r1 + vld1.16 {d27}, [r0, :64], r1 + + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + lpf_4_wd8 + + sub r0, r0, r1, lsl #2 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + sub r10, r0, #8 + + vst1.16 {d20}, [r10, :64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d21}, [r10, :64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d22}, [r10, :64], r1 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d23}, [r10, :64], r1 + vst1.16 
{d27}, [r0, :64], r1 + bx r12 +8: + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_16_4_neon + mov r12, lr + + sub r10, r0, r1, lsl #3 + add r10, r10, r1 + vld1.16 {d17}, [r10, :64], r1 // p6 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d18}, [r10, :64], r1 // p5 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d19}, [r10, :64], r1 // p4 + vld1.16 {d26}, [r0, :64], r1 // q2 + vld1.16 {d20}, [r10, :64], r1 // p3 + vld1.16 {d27}, [r0, :64], r1 // q3 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d28}, [r0, :64], r1 // q4 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d29}, [r0, :64], r1 // q5 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d30}, [r0, :64], r1 // q6 + sub r0, r0, r1, lsl #3 + add r0, r0, r1 + + lpf_4_wd16 + + sub r10, r0, r1, lsl #2 + sub r10, r10, r1, lsl #1 + vst1.16 {d0}, [r10, :64], r1 // p5 + vst1.16 {d6}, [r0, :64], r1 // q0 + vst1.16 {d1}, [r10, :64], r1 // p4 + vst1.16 {d7}, [r0, :64], r1 // q1 + vst1.16 {d2}, [r10, :64], r1 // p3 + vst1.16 {d8}, [r0, :64], r1 // q2 + vst1.16 {d3}, [r10, :64], r1 // p2 + vst1.16 {d9}, [r0, :64], r1 // q3 + vst1.16 {d4}, [r10, :64], r1 // p1 + vst1.16 {d10}, [r0, :64], r1 // q4 + vst1.16 {d5}, [r10, :64], r1 // p0 + vst1.16 {d11}, [r0, :64], r1 // q5 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + bx r12 +7: + sub r10, r0, r1 + sub r10, r10, r1, lsl #1 + vst1.16 {d21}, [r10, :64], r1 // p2 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d25}, [r0, :64], r1 // q1 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_16_4_neon + mov r12, lr + sub r10, r0, #16 + sub r0, r0, #8 + vld1.16 {d16}, [r10, :64], r1 + vld1.16 {d20}, [r0, :64], r1 + vld1.16 {d17}, [r10, :64], r1 + vld1.16 {d21}, [r0, :64], r1 + vld1.16 {d18}, [r10, :64], r1 + vld1.16 {d22}, [r0, :64], r1 + vld1.16 {d19}, [r10, :64], r1 + vld1.16 {d23}, [r0, :64], r1 + sub r10, r10, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r10, r10, #16 + add r0, r0, #16 + vld1.16 {d24}, [r10, :64], r1 + vld1.16 {d28}, [r0, :64], r1 + vld1.16 {d25}, [r10, :64], r1 + vld1.16 {d29}, [r0, :64], r1 + vld1.16 {d26}, [r10, :64], r1 + vld1.16 {d30}, [r0, :64], r1 + vld1.16 {d27}, [r10, :64], r1 + vld1.16 {d31}, [r0, :64], r1 + sub r0, r0, #8 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + + lpf_4_wd16 + + sub r0, r0, r1, lsl #2 + transpose_4x4h q8, q0, d16, d17, d0, d1 + transpose_4x4h q1, q2, d2, d3, d4, d5 + transpose_4x4h q3, q4, d6, d7, d8, d9 + transpose_4x4h q5, q15, d10, d11, d30, d31 + sub r10, r0, #16 + sub r0, r0, #8 + + vst1.16 {d16}, [r10, :64], r1 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d17}, [r10, :64], r1 + vst1.16 {d3}, [r0, :64], r1 + vst1.16 {d0}, [r10, :64], r1 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d1}, [r10, :64], r1 + vst1.16 {d5}, [r0, :64], r1 + sub r10, r10, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r10, r10, #16 + add r0, r0, #16 + vst1.16 {d6}, [r10, :64], r1 + 
vst1.16 {d10}, [r0, :64], r1 + vst1.16 {d7}, [r10, :64], r1 + vst1.16 {d11}, [r0, :64], r1 + vst1.16 {d8}, [r10, :64], r1 + vst1.16 {d30}, [r0, :64], r1 + vst1.16 {d9}, [r10, :64], r1 + vst1.16 {d31}, [r0, :64], r1 + sub r0, r0, #8 + + bx r12 + +7: + sub r0, r0, r1, lsl #2 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + sub r10, r0, #8 + + vst1.16 {d20}, [r10, :64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d21}, [r10, :64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d22}, [r10, :64], r1 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d23}, [r10, :64], r1 + vst1.16 {d27}, [r0, :64], r1 + bx r12 +8: + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w, +// const int bitdepth_max) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded + sub sp, sp, #8 + clz r9, r8 + rsb r9, r9, #24 // bitdepth_min_8 + ldrd r6, r7, [r2] // vmask[0], vmask[1] +.ifc \type, y + ldr r2, [r2, #8] // vmask[2] +.endif + add r5, r5, #128 // Move to sharp part of lut +.ifc \type, y + orr r7, r7, r2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub r4, r3, r4, lsl #2 +.else + sub r3, r3, #4 + lsl r4, r4, #2 +.endif + orr r6, r6, r7 // vmask[0] |= vmask[1] + +1: + tst r6, #0x01 + strd r6, r7, [sp] +.ifc \dir, v + ldrb r10, [r4], #4 + ldrb r11, [r3], #4 +.else + ldrb r10, [r3] + ldrb r11, [r3, #4] + add r3, r3, r4 +.endif + beq 7f // if (!(vm & bits)) continue; + + orrs r12, r10, r11 + vdup.16 d31, r9 // bitdepth_min_8 + beq 7f // if (!(l[0][0] | l[offset][0])) continue; + cmp r11, #0 // Check for nonzero values in l[0][0] + ldrb r6, [r5], #8 // sharp[0] + it eq + moveq r11, r10 // if (!l[0][0]) L = l[offset][0] + ldrb r12, [r5] // sharp[1] + lsr r6, r11, r6 // L >> sharp[0] + sub r5, r5, #8 + cmp r12, r6 + lsr r10, r11, #4 // H + add r11, r11, #2 // L + 2 + it lt + movlt r6, r12 // imin(L >> sharp[0], sharp[1]) + add r11, r11, r11 // 2*(L + 2) + cmp r6, #1 + lsl r10, r10, r9 // H << bitdepth_min_8 + it lt + movlt r6, #1 // imax(imin(), 1) = limit = I + vdup.16 d12, r10 // H << bitdepth_min_8 + add r11, r11, r6 // 2*(L + 2) + limit = E + lsl r6, r6, r9 // I << bitdepth_min_8 + lsl r11, r11, r9 // E << bitdepth_min_8 + vdup.16 d11, r6 // I << bitdepth_min_8 + vdup.16 d10, r11 // E << bitdepth_min_8 + +.ifc \type, y + tst r2, #0x01 + beq 2f + // wd16 + bl lpf_\dir\()_16_4_neon + b 8f +2: +.endif + tst r7, #0x01 + beq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_4_neon +.else + // wd6 + bl lpf_\dir\()_6_4_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_4_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment r0. + // If the whole function is skipped, increment it here instead. 
+ add r0, r0, r1, lsl #2 +.else +7: +.endif +8: + ldrd r6, r7, [sp] +.ifc \type, y + lsr r2, r2, #1 // vmask[2] >>= 1 +.endif +.ifc \dir, v + add r0, r0, #8 +.else + // For dir h, r0 is returned incremented +.endif + lsrs r6, r6, #1 // vmask[0] >>= 1 + lsr r7, r7, #1 // vmask[1] >>= 1 + bne 1b + + add sp, sp, #8 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv diff -Nru dav1d-0.7.1/src/arm/32/loopfilter.S dav1d-0.9.1/src/arm/32/loopfilter.S --- dav1d-0.7.1/src/arm/32/loopfilter.S 2020-06-21 11:48:54.960126400 +0000 +++ dav1d-0.9.1/src/arm/32/loopfilter.S 2021-07-28 21:38:28.861851700 +0000 @@ -515,7 +515,7 @@ lpf_8_wd8 sub r10, r0, r1, lsl #1 - sub r10, r10, r1 + sub r10, r10, r1 vst1.8 {d21}, [r10, :64], r1 // p2 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d22}, [r10, :64], r1 // p1 @@ -783,11 +783,11 @@ vld1.8 {d6[]}, [r5] // sharp[1] sub r5, r5, #8 vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0] + vtst.32 d2, d1, d2 // L != 0 vmul.i32 d1, d1, d4 // L .ifc \type, y vdup.32 d15, r2 // vmask[2] .endif - vtst.32 d2, d1, d2 // L != 0 vdup.32 d14, r7 // vmask[1] vmov r10, r11, d2 orrs r10, r10, r11 diff -Nru dav1d-0.7.1/src/arm/32/looprestoration16.S dav1d-0.9.1/src/arm/32/looprestoration16.S --- dav1d-0.7.1/src/arm/32/looprestoration16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/looprestoration16.S 2021-07-28 21:38:28.861851700 +0000 @@ -0,0 +1,801 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], +// const pixel *src, ptrdiff_t stride, +// const int16_t fh[7], const intptr_t w, +// int h, enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + ldr r8, [sp, #116] // bitdepth_max + vld1.16 {q0}, [r4, :128] + clz r8, r8 + vmov.i32 q14, #1 + sub r9, r8, #38 // -(bitdepth + 6) + sub r8, r8, #25 // -round_bits_h + neg r9, r9 // bitdepth + 6 + vdup.32 q1, r9 + vdup.32 q13, r8 // -round_bits_h + vmov.i16 q15, #8192 + vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6) + mov r8, r5 + // Calculate mid_stride + add r10, r5, #7 + bic r10, r10, #7 + lsl r10, r10, #1 + + // Set up pointers for reading/writing alternate rows + add r12, r0, r10 + lsl r10, r10, #1 + add lr, r2, r3 + lsl r3, r3, #1 + + // Subtract the aligned width from mid_stride + add r11, r5, #7 + bic r11, r11, #7 + sub r10, r10, r11, lsl #1 + + // Subtract the number of pixels read from the source stride + add r11, r11, #8 + sub r3, r3, r11, lsl #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r1, #0 + bne 0f + // left == NULL + sub r2, r2, #6 + sub lr, lr, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r3, r3, #6 + + +1: // Loop vertically + vld1.16 {q2, q3}, [r2]! + vld1.16 {q4, q5}, [lr]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r1, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d3}, [r1]! + // Move r2/lr back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub r2, r2, #6 + sub lr, lr, #6 + vld1.16 {d13}, [r1]! + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill q1 with the leftmost pixel + // and shift q2/q3 to have 3x the first pixel at the front. + vdup.16 q1, d4[0] + vdup.16 q6, d8[0] + // Move r2 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub r2, r2, #6 + sub lr, lr, #6 + vext.8 q3, q2, q3, #10 + vext.8 q2, q1, q2, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + +2: + + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub r9, r5, #14 + lsl r9, r9, #1 + ldrh r11, [r2, r9] + ldrh r9, [lr, r9] + // Fill q11/q12 with the right padding pixel + vdup.16 q11, r11 + vdup.16 q12, r9 +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+3 pixels valid in q2-q3. 
For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel_local r4, right_ext_mask, -6 + sub r4, r4, r5, lsl #1 + vld1.8 {q9, q10}, [r4] + + vbit q2, q11, q9 + vbit q3, q11, q10 + vbit q4, q12, q9 + vbit q5, q12, q10 + +4: // Loop horizontally + vext.8 q7, q2, q3, #4 + vext.8 q8, q2, q3, #8 + vext.8 q6, q2, q3, #2 + vext.8 q9, q2, q3, #10 + vadd.i16 q8, q8, q7 + vadd.i16 q9, q9, q6 + vext.8 q6, q2, q3, #12 + vext.8 q7, q2, q3, #6 + vadd.i16 q2, q2, q6 + vmull.s16 q6, d14, d0[3] + vmlal.s16 q6, d16, d1[0] + vmlal.s16 q6, d18, d1[1] + vmlal.s16 q6, d4, d1[2] + vmull.s16 q7, d15, d0[3] + vmlal.s16 q7, d17, d1[0] + vmlal.s16 q7, d19, d1[1] + vmlal.s16 q7, d5, d1[2] + + vext.8 q8, q4, q5, #4 + vext.8 q10, q4, q5, #8 + vext.8 q9, q4, q5, #2 + vext.8 q2, q4, q5, #10 + vadd.i16 q10, q10, q8 + vadd.i16 q2, q2, q9 + vext.8 q8, q4, q5, #12 + vext.8 q9, q4, q5, #6 + vadd.i16 q4, q4, q8 + vmull.s16 q8, d18, d0[3] + vmlal.s16 q8, d20, d1[0] + vmlal.s16 q8, d4, d1[1] + vmlal.s16 q8, d8, d1[2] + vmull.s16 q9, d19, d0[3] + vmlal.s16 q9, d21, d1[0] + vmlal.s16 q9, d5, d1[1] + vmlal.s16 q9, d9, d1[2] + + vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q14 + vadd.i32 q8, q8, q14 + vadd.i32 q9, q9, q14 + vrshl.s32 q6, q6, q13 + vrshl.s32 q7, q7, q13 + vrshl.s32 q8, q8, q13 + vrshl.s32 q9, q9, q13 + vqmovun.s32 d12, q6 + vqmovun.s32 d13, q7 + vqmovun.s32 d14, q8 + vqmovun.s32 d15, q9 + vmin.u16 q6, q6, q10 + vmin.u16 q7, q7, q10 + vsub.i16 q6, q6, q15 + vsub.i16 q7, q7, q15 + subs r5, r5, #8 + vst1.16 {q6}, [r0, :128]! + vst1.16 {q7}, [r12, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q2, q3 + vmov q4, q5 + vld1.16 {q3}, [r2]! + vld1.16 {q5}, [lr]! + bne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r10 + add r12, r12, r10 + add r2, r2, r3 + add lr, lr, r3 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, +// const int16_t *mid, int w, int h, +// const int16_t fv[7], enum LrEdgeFlags edges, +// ptrdiff_t mid_stride, const int bitdepth_max); +function wiener_filter_v_16bpc_neon, export=1 + push {r4-r7,lr} + vpush {q4-q5} + ldrd r4, r5, [sp, #52] + ldrd r6, r7, [sp, #60] + ldr lr, [sp, #68] // bitdepth_max + vld1.16 {q0}, [r5, :128] + vdup.16 q5, lr + clz lr, lr + sub lr, lr, #11 // round_bits_v + vdup.32 q4, lr + mov lr, r4 + vneg.s32 q4, q4 // -round_bits_v + + // Calculate the number of rows to move back when looping vertically + mov r12, r4 + tst r6, #4 // LR_HAVE_TOP + beq 0f + sub r2, r2, r7, lsl #1 + add r12, r12, #2 +0: + tst r6, #8 // LR_HAVE_BOTTOM + beq 1f + add r12, r12, #2 + +1: // Start of horizontal loop; start one vertical filter slice. + // Load rows into q8-q11 and pad properly. + tst r6, #4 // LR_HAVE_TOP + vld1.16 {q8}, [r2, :128], r7 + beq 2f + // LR_HAVE_TOP + vld1.16 {q10}, [r2, :128], r7 + vmov q9, q8 + vld1.16 {q11}, [r2, :128], r7 + b 3f +2: // !LR_HAVE_TOP + vmov q9, q8 + vmov q10, q8 + vmov q11, q8 + +3: + cmp r4, #4 + blt 5f + // Start filtering normally; fill in q12-q14 with unique rows. 
+ vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vld1.16 {q14}, [r2, :128], r7 + +4: +.macro filter compare + subs r4, r4, #1 + // Interleaving the mul/mla chains actually hurts performance + // significantly on Cortex A53, thus keeping mul/mla tightly + // chained like this. + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d20, d0[2] + vmlal.s16 q2, d22, d0[3] + vmlal.s16 q2, d24, d1[0] + vmlal.s16 q2, d26, d1[1] + vmlal.s16 q2, d28, d1[2] + vmull.s16 q3, d17, d0[0] + vmlal.s16 q3, d19, d0[1] + vmlal.s16 q3, d21, d0[2] + vmlal.s16 q3, d23, d0[3] + vmlal.s16 q3, d25, d1[0] + vmlal.s16 q3, d27, d1[1] + vmlal.s16 q3, d29, d1[2] + vrshl.s32 q2, q2, q4 // round_bits_v + vrshl.s32 q3, q3, q4 + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q5 // bitdepth_max + vst1.16 {q2}, [r0, :128], r1 +.if \compare + cmp r4, #4 +.else + ble 9f +.endif + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 + vmov q11, q12 + vmov q12, q13 + vmov q13, q14 +.endm + filter 1 + blt 7f + vld1.16 {q14}, [r2, :128], r7 + b 4b + +5: // Less than 4 rows in total; not all of q12-q13 are filled yet. + tst r6, #8 // LR_HAVE_BOTTOM + beq 6f + // LR_HAVE_BOTTOM + cmp r4, #2 + // We load at least 2 rows in all cases. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + bgt 53f // 3 rows in total + beq 52f // 2 rows in total +51: // 1 row in total, q11 already loaded, load edge into q12-q14. + vmov q13, q12 + b 8f +52: // 2 rows in total, q11 already loaded, load q12 with content data + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vmov q15, q14 + b 8f +53: + // 3 rows in total, q11 already loaded, load q12 and q13 with content + // and 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f + +6: + // !LR_HAVE_BOTTOM + cmp r4, #2 + bgt 63f // 3 rows in total + beq 62f // 2 rows in total +61: // 1 row in total, q11 already loaded, pad that into q12-q14. + vmov q12, q11 + vmov q13, q11 + vmov q14, q11 + b 8f +62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. + vld1.16 {q12}, [r2, :128], r7 + vmov q13, q12 + vmov q14, q12 + vmov q15, q12 + b 8f +63: + // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. + vld1.16 {q12}, [r2, :128], r7 + vld1.16 {q13}, [r2, :128], r7 + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + b 8f + +7: + // All registers up to q13 are filled already, 3 valid rows left. + // < 4 valid rows left; fill in padding and filter the last + // few rows. + tst r6, #8 // LR_HAVE_BOTTOM + beq 71f + // LR_HAVE_BOTTOM; load 2 rows of edge. + vld1.16 {q14}, [r2, :128], r7 + vld1.16 {q15}, [r2, :128], r7 + vmov q1, q15 + b 8f +71: + // !LR_HAVE_BOTTOM, pad 3 rows + vmov q14, q13 + vmov q15, q13 + vmov q1, q13 + +8: // At this point, all registers up to q14-q15,q1 are loaded with + // edge/padding (depending on how many rows are left). + filter 0 // This branches to 9f when done + vmov q14, q15 + vmov q15, q1 + b 8b + +9: // End of one vertical slice. + subs r3, r3, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. 
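In the same spirit, a hedged scalar model of one pixel of wiener_filter_v_16bpc_neon: a 7-tap filter down the columns of the intermediate buffer, a rounding shift by round_bits_v (11 for 10-bit, 9 for 12-bit), and a clamp to the pixel range. How the ±8192 re-centring from the horizontal pass is compensated is left to the caller's choice of fv and is not modelled here.

    #include <stddef.h>
    #include <stdint.h>

    static inline int iclip(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    // mid points at the centre sample of its column; rows -3..3 must be valid.
    static uint16_t wiener_v_sample(const int16_t *mid, ptrdiff_t mid_stride,
                                    const int16_t fv[7], int round_bits_v,
                                    int bitdepth_max)
    {
        int32_t sum = 0;
        for (int k = 0; k < 7; k++)
            sum += (int32_t)fv[k] * mid[(ptrdiff_t)(k - 3) * mid_stride];
        const int v = (sum + (1 << (round_bits_v - 1))) >> round_bits_v; // vrshl
        return (uint16_t)iclip(v, 0, bitdepth_max);   // vqmovun + vmin (q5)
    }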
+ mls r0, r1, lr, r0 + mls r2, r7, r12, r2 + add r0, r0, #16 + add r2, r2, #16 + mov r4, lr + b 1b + +0: + vpop {q4-q5} + pop {r4-r7,pc} +.purgem filter +endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add lr, r5, #7 + bic lr, lr, #7 + sub r9, r9, lr, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Subtract the number of pixels read from the input from the stride + add lr, lr, #8 + sub r4, r4, lr, lsl #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #4 + sub r12, r12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #4 + + +1: // Loop vertically + vld1.16 {q0, q1}, [r3]! + vld1.16 {q4, q5}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d5}, [r2]! + // Move r3/r12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub r3, r3, #4 + sub r12, r12, #4 + vld1.16 {d13}, [r2]! + vext.8 q1, q0, q1, #12 + vext.8 q0, q2, q0, #12 + vext.8 q5, q4, q5, #12 + vext.8 q4, q6, q4, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill q2 with the leftmost pixel + // and shift q0 to have 2x the first byte at the front. + vdup.16 q2, d0[0] + vdup.16 q6, d8[0] + // Move r3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub r3, r3, #4 + sub r12, r12, #4 + vext.8 q1, q0, q1, #12 + vext.8 q0, q2, q0, #12 + vext.8 q5, q4, q5, #12 + vext.8 q4, q6, q4, #12 + +2: + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 2 + 1) + lsl lr, lr, #1 + ldrh r11, [r3, lr] + ldrh lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.16 q14, r11 + vdup.16 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #10 + bge 4f // If w >= 10, all used input pixels are valid + + // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in q0/1.h[w] onwards + movrel_local lr, right_ext_mask + sub lr, lr, r5, lsl #1 + vld1.8 {q12, q13}, [lr] + + vbit q0, q14, q12 + vbit q1, q14, q13 + vbit q4, q15, q12 + vbit q5, q15, q13 + +4: // Loop horizontally + vext.8 q8, q0, q1, #2 + vext.8 q10, q4, q5, #2 + vext.8 q9, q0, q1, #4 + vext.8 q11, q4, q5, #4 + vadd.i16 q2, q0, q8 + vadd.i16 q3, q4, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 + + vmull.u16 q6, d0, d0 + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d18, d18 + vmull.u16 q12, d8, d8 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d22, d22 + vmull.u16 q7, d1, d1 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmull.u16 q13, d9, d9 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 + subs r5, r5, #8 + vst1.16 {q2}, [r1, :128]! + vst1.16 {q3}, [r11, :128]! + vst1.32 {q6, q7}, [r0, :128]! + vst1.32 {q12, q13}, [r10, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q0, q1 + vmov q4, q5 + vld1.16 {q1}, [r3]! + vld1.16 {q5}, [r12]! + + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + add lr, r5, #7 + bic lr, lr, #7 + sub r9, r9, lr, lsl #1 + add lr, lr, #8 + sub r4, r4, lr, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #6 + sub r12, r12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #6 + +1: // Loop vertically + vld1.16 {q0, q1}, [r3]! + vld1.16 {q4, q5}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d5}, [r2]! + // Move r3/r12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub r3, r3, #6 + sub r12, r12, #6 + vld1.16 {d13}, [r2]! + vext.8 q1, q0, q1, #10 + vext.8 q0, q2, q0, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill q2 with the leftmost pixel + // and shift q0 to have 3x the first pixel at the front. + vdup.16 q2, d0[0] + vdup.16 q6, d8[0] + // Move r3 back to account for the last 3 pixels we loaded before, + // which we shifted out. 
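The right_ext_mask table plus the vbit sequence above replace the old per-width branch tables for right-edge padding: 32 zero bytes followed by 32 0xff bytes are addressed at an offset derived from w (scaled by 2 for 16-bit lanes, and adjusted by the -6/-2/-1 margins of the various filters), so every lane past the last valid pixel picks up an all-ones mask and is overwritten with the replicated edge pixel. A byte-level model of the idea; the names here are illustrative, not dav1d API:

    #include <stdint.h>

    // 32 zero bytes followed by 32 0xff bytes, mirroring right_ext_mask_buf /
    // right_ext_mask above.
    static const uint8_t right_ext_mask_tbl[64] = {
        [32] = 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
               0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
               0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
               0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    };

    // Emulates "vbit dst, pad, mask": destination bytes are replaced by the
    // padding byte wherever the mask is 0xff. Requires 0 <= n_valid <= 32.
    static void pad_right_edge(uint8_t lanes[32], int n_valid, uint8_t pad_byte)
    {
        const uint8_t *mask = &right_ext_mask_tbl[32 - n_valid];
        for (int i = 0; i < 32; i++)
            lanes[i] = (uint8_t)((lanes[i] & ~mask[i]) | (pad_byte & mask[i]));
    }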
+ sub r3, r3, #6 + sub r12, r12, #6 + vext.8 q1, q0, q1, #10 + vext.8 q0, q2, q0, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + +2: + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 3 + 1) + lsl lr, lr, #1 + ldrh r11, [r3, lr] + ldrh lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.16 q14, r11 + vdup.16 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel_local lr, right_ext_mask, -2 + sub lr, lr, r5, lsl #1 + vld1.8 {q12, q13}, [lr] + + vbit q0, q14, q12 + vbit q1, q14, q13 + vbit q4, q15, q12 + vbit q5, q15, q13 + +4: // Loop horizontally + vext.8 q8, q0, q1, #2 + vext.8 q10, q4, q5, #2 + vext.8 q9, q0, q1, #4 + vext.8 q11, q4, q5, #4 + vadd.i16 q2, q0, q8 + vadd.i16 q3, q4, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 + + vmull.u16 q6, d0, d0 + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d18, d18 + vmull.u16 q12, d8, d8 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d22, d22 + vmull.u16 q7, d1, d1 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmull.u16 q13, d9, d9 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 + + vext.8 q8, q0, q1, #6 + vext.8 q10, q4, q5, #6 + vext.8 q9, q0, q1, #8 + vext.8 q11, q4, q5, #8 + vadd.i16 q2, q2, q8 + vadd.i16 q3, q3, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 + + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d1, d1 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d9, d9 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 + + subs r5, r5, #8 + vst1.16 {q2}, [r1, :128]! + vst1.16 {q3}, [r11, :128]! + vst1.32 {q6, q7}, [r0, :128]! + vst1.32 {q12, q13}, [r10, :128]! + + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q0, q1 + vmov q4, q5 + vld1.16 {q1}, [r3]! + vld1.16 {q5}, [r12]! + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +sgr_funcs 16 diff -Nru dav1d-0.7.1/src/arm/32/looprestoration_common.S dav1d-0.9.1/src/arm/32/looprestoration_common.S --- dav1d-0.7.1/src/arm/32/looprestoration_common.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/looprestoration_common.S 2021-07-28 21:38:28.861851700 +0000 @@ -0,0 +1,453 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #2 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 1f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Sum all h+2 lines with the main loop + add lr, lr, #2 +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q8-q13 and q0-q2 taking top + // padding into consideration. + tst r4, #4 // LR_HAVE_TOP + vld1.32 {q8, q9}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q1}, [r6, :128], r8 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q10, q8 + vmov q11, q9 + vmov q1, q0 + vmov q12, q8 + vmov q13, q9 + vmov q2, q0 + +3: + subs r3, r3, #1 +.macro add3 + vadd.i32 q8, q8, q10 + vadd.i32 q9, q9, q11 + vadd.i16 q0, q0, q1 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i16 q0, q0, q2 + vst1.32 {q8, q9}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + vmov q10, q12 + vmov q11, q13 + vmov q1, q2 + ble 4f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3b + +4: + tst r4, #8 // LR_HAVE_BOTTOM + bne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + add3 + +5: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. 
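Taken together, the sgr_box3_h function earlier in this patch and sgr_box3_v above compute, for every cell of the restoration unit, the sum and the sum of squares of a 3x3 pixel box; the box5 pair does the same for a 5x5 box. A plain scalar model of the quantity being produced, assuming an already edge-extended source:

    #include <stddef.h>
    #include <stdint.h>

    // Sum and sum of squares of a (2r+1) x (2r+1) box centred on (x, y),
    // r = 1 for box3 and r = 2 for box5. The NEON code splits this into a
    // horizontal pass (per-row sums) and a vertical pass (summing rows),
    // and handles the LR_HAVE_* edge flags instead of pre-extending src.
    static void box_sum(const uint16_t *src, ptrdiff_t stride, int x, int y,
                        int r, int32_t *sumsq, int32_t *sum)
    {
        int32_t a = 0, b = 0;
        for (int dy = -r; dy <= r; dy++)
            for (int dx = -r; dx <= r; dx++) {
                const int32_t px = src[(ptrdiff_t)(y + dy) * stride + (x + dx)];
                a += px * px;
                b += px;
            }
        *sumsq = a;          // stored in the 32-bit sumsq plane
        *sum   = b;          // kept in 16-bit lanes by the assembly
    }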
+ // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + pop {r4-r9,pc} +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + push {r4-r9,lr} + vpush {q5-q7} + ldr r4, [sp, #76] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #8 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 0f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Handle h+2 lines with the main loop + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub r3, r3, #1 // Handle h-1 lines with the main loop +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q6-q15 and q0-q3,q5 taking top + // padding into consideration. + tst r4, #4 // LR_HAVE_TOP + vld1.32 {q6, q7}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vmov q10, q6 + vmov q11, q7 + vmov q2, q0 + vmov q12, q6 + vmov q13, q7 + vmov q3, q0 + +3: + cmp r3, #0 + beq 4f + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + +3: + // Start of vertical loop + subs r3, r3, #2 +.macro add5 + vadd.i32 q6, q6, q8 + vadd.i32 q7, q7, q9 + vadd.i16 q0, q0, q1 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i16 q0, q0, q2 + vadd.i32 q6, q6, q12 + vadd.i32 q7, q7, q13 + vadd.i16 q0, q0, q3 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q15 + vadd.i16 q0, q0, q5 + vst1.32 {q6, q7}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add5 +.macro shift2 + vmov q6, q10 + vmov q7, q11 + vmov q0, q2 + vmov q8, q12 + vmov q9, q13 + vmov q1, q3 + vmov q10, q14 + vmov q11, q15 + vmov q2, q5 +.endm + shift2 + add r0, r0, r7 + add r1, r1, r8 + ble 5f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + add5 + b 6f + +5: + tst r4, #8 // LR_HAVE_BOTTOM + bne 6f + // !LR_HAVE_BOTTOM + cmp r3, #0 + bne 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + // Pad the past-edge row from the last content row. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + // The last two rows are already padded properly here. 
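The LR_HAVE_TOP / LR_HAVE_BOTTOM branches in sgr_box3_v and sgr_box5_v above amount to re-using the nearest available row of partial sums when the unit touches the frame edge. A conceptual (not literal) model of that behaviour, assuming the valid rows are [first_row, last_row]:

    #include <stddef.h>
    #include <stdint.h>

    // The assembly never computes such an index; it duplicates whole rows of
    // registers instead, but the effect on the 5-row column sum is the same.
    static inline int clamp_row(int y, int lo, int hi) {
        return y < lo ? lo : y > hi ? hi : y;
    }

    static int32_t box5_col_sum(const int32_t *col, ptrdiff_t stride,
                                int y, int first_row, int last_row)
    {
        int32_t s = 0;
        for (int dy = -2; dy <= 2; dy++)
            s += col[(ptrdiff_t)clamp_row(y + dy, first_row, last_row) * stride];
        return s;
    }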
+ add5 + b 6f + +5: + // r3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + vmov q12, q10 + vmov q13, q11 + vmov q3, q2 + vmov q14, q10 + vmov q15, q11 + vmov q5, q2 + add5 + add r0, r0, r7 + add r1, r1, r8 + b 6f + +6: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + vpop {q5-q7} + pop {r4-r9,pc} +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength, +// const int bitdepth_max); +function sgr_calc_ab1_neon, export=1 + push {r4-r7,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #84] + add r3, r3, #2 // h += 2 + clz r6, r5 + vmov.i32 q15, #9 // n + movw r5, #455 + mov lr, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + push {r4-r7,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #84] + add r3, r3, #3 // h += 3 + clz r6, r5 + asr r3, r3, #1 // h /= 2 + vmov.i32 q15, #25 // n + mov r5, #164 + mov lr, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + movrel r12, X(sgr_x_by_x) + sub r6, r6, #24 // -bitdepth_min_8 + vld1.8 {q8, q9}, [r12, :128]! + add r7, r6, r6 // -2*bitdepth_min_8 + vmov.i8 q11, #5 + vmov.i8 d10, #55 // idx of last 5 + vld1.8 {q10}, [r12, :128] + vmov.i8 d11, #72 // idx of last 4 + vmov.i8 d12, #101 // idx of last 3 + vmov.i8 d13, #169 // idx of last 2 + vmov.i8 d14, #254 // idx of last 1 + vmov.i8 d15, #32 // elements consumed in first vtbl + add r2, r2, #2 // w += 2 + add r12, r2, #7 + bic r12, r12, #7 // aligned w + sub r12, lr, r12 // increment between rows + vdup.32 q12, r4 + sub r0, r0, #(4*(SUM_STRIDE)) + sub r1, r1, #(2*(SUM_STRIDE)) + mov r4, r2 // backup of w + vsub.i8 q8, q8, q11 + vsub.i8 q9, q9, q11 + vsub.i8 q10, q10, q11 +1: + vld1.32 {q0, q1}, [r0, :128] // a + vld1.16 {q2}, [r1, :128] // b + vdup.32 q13, r7 // -2*bitdepth_min_8 + vdup.16 q14, r6 // -bitdepth_min_8 + subs r2, r2, #8 + vrshl.s32 q0, q0, q13 + vrshl.s32 q1, q1, q13 + vrshl.s16 q4, q2, q14 + vmul.i32 q0, q0, q15 // a * n + vmul.i32 q1, q1, q15 // a * n + vmull.u16 q3, d8, d8 // b * b + vmull.u16 q4, d9, d9 // b * b + vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) + vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) + vmul.i32 q0, q0, q12 // p * s + vmul.i32 q1, q1, q12 // p * s + vqshrn.u32 d0, q0, #16 + vqshrn.u32 d1, q1, #16 + vqrshrn.u16 d0, q0, #4 // imin(z, 255) + + vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 + vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 + vtbl.8 d1, {q8, q9}, d0 + vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 + vsub.i8 d9, d0, d15 // indices for vtbx + vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 + vadd.i8 d2, d2, d3 + vtbx.8 d1, {q10}, d9 + vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 + vadd.i8 d6, d6, d7 + vadd.i8 d8, d8, d22 + vadd.i8 d2, d2, d6 + vadd.i8 d1, d1, d8 + vadd.i8 d1, d1, d2 + vmovl.u8 q0, d1 // x + + vmov.i16 q13, #256 + vdup.32 q14, r5 // one_by_x + + vmull.u16 q1, d0, d4 // x * BB[i] + vmull.u16 q2, d1, d5 // x * BB[i] + vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x + vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x + vrshr.s32 q1, q1, #12 // AA[i] + vrshr.s32 q2, q2, #12 // 
AA[i] + vsub.i16 q0, q13, q0 // 256 - x + + vst1.32 {q1, q2}, [r0, :128]! + vst1.16 {q0}, [r1, :128]! + bgt 1b + + subs r3, r3, #1 + ble 0f + add r0, r0, r12, lsl #2 + add r1, r1, r12, lsl #1 + mov r2, r4 + b 1b +0: + vpop {q4-q7} + pop {r4-r7,pc} +endfunc diff -Nru dav1d-0.7.1/src/arm/32/looprestoration.S dav1d-0.9.1/src/arm/32/looprestoration.S --- dav1d-0.7.1/src/arm/32/looprestoration.S 2020-06-21 11:48:54.960126400 +0000 +++ dav1d-0.9.1/src/arm/32/looprestoration.S 2021-07-28 21:38:28.861851700 +0000 @@ -28,46 +28,50 @@ #include "src/arm/asm.S" #include "util.S" +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, -// const int16_t fh[7], const intptr_t w, +// const int16_t fh[8], intptr_t w, // int h, enum LrEdgeFlags edges); function wiener_filter_h_8bpc_neon, export=1 push {r4-r11,lr} - vpush {q4} - ldrd r4, r5, [sp, #52] - ldrd r6, r7, [sp, #60] + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] mov r8, r5 - vld1.16 {q0}, [r4] + vld1.16 {q0}, [r4, :128] movw r9, #(1 << 14) - (1 << 2) - vdup.16 q14, r9 - vmov.s16 q15, #2048 + vdup.16 q14, r9 + vmov.s16 q15, #2048 // Calculate mid_stride add r10, r5, #7 bic r10, r10, #7 lsl r10, r10, #1 - // Clear the last unused element of q0, to allow filtering a single - // pixel with one plain vmul+vpadd. - mov r12, #0 - vmov.16 d1[3], r12 - // Set up pointers for reading/writing alternate rows add r12, r0, r10 lsl r10, r10, #1 add lr, r2, r3 lsl r3, r3, #1 - // Subtract the width from mid_stride - sub r10, r10, r5, lsl #1 - - // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. - cmp r5, #8 - add r11, r5, #13 + // Subtract the aligned width from mid_stride + add r11, r5, #7 bic r11, r11, #7 - bge 1f - mov r11, #16 -1: + sub r10, r10, r11, lsl #1 + + // Subtract the number of pixels read from the source stride + add r11, r11, #8 sub r3, r3, r11 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -108,8 +112,8 @@ 0: // !LR_HAVE_LEFT, fill q1 with the leftmost byte // and shift q2 to have 3x the first byte at the front. - vdup.8 q1, d4[0] - vdup.8 q8, d18[0] + vdup.8 q1, d4[0] + vdup.8 q8, d18[0] // Move r2 back to account for the last 3 bytes we loaded before, // which we shifted out. sub r2, r2, #3 @@ -127,52 +131,60 @@ bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub r9, r5, #14 + sub r9, r5, #14 ldrb r11, [r2, r9] ldrb r9, [lr, r9] // Fill q12/q13 with the right padding pixel - vdup.8 d24, r11 - vdup.8 d26, r9 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 + vdup.16 q12, r11 + vdup.16 q13, r9 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid - cmp r5, #7 - bge 5f // If w >= 7, we can filter 4 pixels - b 6f + + // 1 <= w < 11, w+3 pixels valid in q1-q2. 
For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel_local r4, right_ext_mask, -6 + sub r4, r4, r5, lsl #1 + vld1.8 {q10, q11}, [r4] + + vbit q1, q12, q10 + vbit q2, q12, q11 + vbit q8, q13, q10 + vbit q9, q13, q11 4: // Loop horizontally -.macro filter_8 - // This is tuned as some sort of compromise between Cortex A7, A8, - // A9 and A53. - vmul.s16 q3, q1, d0[0] - vext.8 q10, q1, q2, #2 vext.8 q11, q1, q2, #4 - vmla.s16 q3, q10, d0[1] - vmla.s16 q3, q11, d0[2] - vext.8 q10, q1, q2, #6 - vext.8 q11, q1, q2, #8 - vmla.s16 q3, q10, d0[3] - vmla.s16 q3, q11, d1[0] - vext.8 q10, q1, q2, #10 - vext.8 q11, q1, q2, #12 - vmla.s16 q3, q10, d1[1] - vmla.s16 q3, q11, d1[2] + vext.8 q5, q1, q2, #8 + vext.8 q10, q1, q2, #2 + vext.8 q6, q1, q2, #10 + vext.8 q7, q1, q2, #12 + vext.8 q4, q1, q2, #6 + vadd.i16 q5, q5, q11 + vadd.i16 q6, q6, q10 + vadd.i16 q7, q7, q1 + vmul.s16 q3, q4, d0[3] + vmla.s16 q3, q5, d1[0] + vmla.s16 q3, q6, d1[1] + vmla.s16 q3, q7, d1[2] - vmul.s16 q10, q8, d0[0] - vext.8 q11, q8, q9, #2 vext.8 q4, q8, q9, #4 - vmla.s16 q10, q11, d0[1] - vmla.s16 q10, q4, d0[2] - vext.8 q11, q8, q9, #6 - vext.8 q4, q8, q9, #8 - vmla.s16 q10, q11, d0[3] - vmla.s16 q10, q4, d1[0] - vext.8 q11, q8, q9, #10 + vext.8 q6, q8, q9, #8 + vext.8 q11, q8, q9, #2 + vext.8 q7, q8, q9, #10 + vadd.i16 q6, q6, q4 vext.8 q4, q8, q9, #12 - vmla.s16 q10, q11, d1[1] + vext.8 q5, q8, q9, #6 + vadd.i16 q7, q7, q11 + vadd.i16 q4, q4, q8 + vmul.s16 q10, q5, d0[3] + vmla.s16 q10, q6, d1[0] + vmla.s16 q10, q7, d1[1] vmla.s16 q10, q4, d1[2] vext.8 q1, q1, q2, #6 @@ -187,12 +199,10 @@ vshr.s16 q10, q10, #3 vadd.s16 q3, q3, q15 vadd.s16 q10, q10, q15 -.endm - filter_8 + subs r5, r5, #8 vst1.16 {q3}, [r0, :128]! vst1.16 {q10}, [r12, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q1, q2 @@ -204,152 +214,6 @@ bne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -5: // Filter 4 pixels, 7 <= w < 11 -.macro filter_4 - vmul.s16 d6, d2, d0[0] - vext.8 q10, q1, q2, #2 - vext.8 q11, q1, q2, #4 - vmla.s16 d6, d20, d0[1] - vmla.s16 d6, d22, d0[2] - vext.8 q10, q1, q2, #6 - vext.8 q11, q1, q2, #8 - vmla.s16 d6, d20, d0[3] - vmla.s16 d6, d22, d1[0] - vext.8 q10, q1, q2, #10 - vext.8 q11, q1, q2, #12 - vmla.s16 d6, d20, d1[1] - vmla.s16 d6, d22, d1[2] - - vmul.s16 d20, d16, d0[0] - vext.8 q11, q8, q9, #2 - vext.8 q4, q8, q9, #4 - vmla.s16 d20, d22, d0[1] - vmla.s16 d20, d8, d0[2] - vext.8 q11, q8, q9, #6 - vext.8 q4, q8, q9, #8 - vmla.s16 d20, d22, d0[3] - vmla.s16 d20, d8, d1[0] - vext.8 q11, q8, q9, #10 - vext.8 q4, q8, q9, #12 - vmla.s16 d20, d22, d1[1] - vmla.s16 d20, d8, d1[2] - - vext.8 q11, q1, q2, #6 - vshl.s16 d22, d22, #7 - vsub.s16 d22, d22, d28 - vqadd.s16 d6, d6, d22 - vext.8 q11, q8, q9, #6 - vshl.s16 d22, d22, #7 - vsub.s16 d22, d22, d28 - vqadd.s16 d20, d20, d22 - vshr.s16 d6, d6, #3 - vshr.s16 d20, d20, #3 - vadd.s16 d6, d6, d30 - vadd.s16 d20, d20, d30 -.endm - filter_4 - vst1.16 {d6}, [r0, :64]! - vst1.16 {d20}, [r12, :64]! - - subs r5, r5, #4 // 3 <= w < 7 - vext.8 q1, q1, q2, #8 - vext.8 q2, q2, q2, #8 - vext.8 q8, q8, q9, #8 - vext.8 q9, q9, q9, #8 - -6: // Pad the right edge and filter the last few pixels. 
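Stepping back to sgr_calc_ab1_neon / sgr_calc_ab2_neon in looprestoration_common.S above: they turn each pair of box sums into the a/b coefficients of the self-guided filter. A scalar sketch of the per-cell arithmetic, following the comments in that vector code (sgr_x_by_x is the existing dav1d table; n is 9 for the 3x3 pass and 25 for the 5x5 pass; one_by_x is 455 and 164 respectively, as loaded into r5; s is the strength):

    #include <stdint.h>

    // a = box sum of squares, b = box sum; both are updated in place, which
    // mirrors how the vector code overwrites the two planes.
    static void calc_ab_cell(int *a, int *b, int n, unsigned s,
                             int bitdepth_min_8,
                             const uint8_t sgr_x_by_x[256], unsigned one_by_x)
    {
        // Scale the sums back towards 8-bit range (no-op for 8 bpc input);
        // this is the vrshl by -2*bitdepth_min_8 / -bitdepth_min_8 above.
        const int as = (*a + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
        const int bs = (*b + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;

        const int pp = as * n - bs * bs;
        const unsigned p = pp > 0 ? (unsigned)pp : 0;       // imax(a*n - b*b, 0)
        const unsigned z = (p * s + (1 << 19)) >> 20;
        const unsigned x = sgr_x_by_x[z > 255 ? 255 : z];   // imin(z, 255)

        // The planes swap roles on output: the 32-bit plane receives the
        // final coefficient, the 16-bit plane receives 256 - x.
        *a = (int)((x * (unsigned)*b * one_by_x + (1 << 11)) >> 12);
        *b = 256 - (int)x;
    }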
- // w < 7, w+3 pixels valid in q1-q2 - cmp r5, #5 - blt 7f - bgt 8f - // w == 5, 8 pixels valid in q1, q2 invalid - vmov q2, q12 - vmov q9, q13 - b 88f - -7: // 1 <= w < 5, 4-7 pixels valid in q1 - sub r9, r5, #1 - // r9 = (pixels valid - 4) - adr r11, L(variable_shift_tbl) - ldr r9, [r11, r9, lsl #2] - add r11, r11, r9 - vmov q2, q12 - vmov q9, q13 - bx r11 - - .align 2 -L(variable_shift_tbl): - .word 44f - L(variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(variable_shift_tbl) + CONFIG_THUMB - .word 66f - L(variable_shift_tbl) + CONFIG_THUMB - .word 77f - L(variable_shift_tbl) + CONFIG_THUMB - -44: // 4 pixels valid in d2/d16, fill d3/d17 with padding. - vmov d3, d4 - vmov d17, d18 - b 88f - // Shift q1 right, shifting out invalid pixels, - // shift q1 left to the original offset, shifting in padding pixels. -55: // 5 pixels valid - vext.8 q1, q1, q1, #10 - vext.8 q1, q1, q2, #6 - vext.8 q8, q8, q8, #10 - vext.8 q8, q8, q9, #6 - b 88f -66: // 6 pixels valid - vext.8 q1, q1, q1, #12 - vext.8 q1, q1, q2, #4 - vext.8 q8, q8, q8, #12 - vext.8 q8, q8, q9, #4 - b 88f -77: // 7 pixels valid - vext.8 q1, q1, q1, #14 - vext.8 q1, q1, q2, #2 - vext.8 q8, q8, q8, #14 - vext.8 q8, q8, q9, #2 - b 88f - -8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2 - vext.8 q2, q2, q2, #2 - vext.8 q2, q2, q12, #14 - vext.8 q9, q9, q9, #2 - vext.8 q9, q9, q13, #14 - -88: - // w < 7, q1-q2 padded properly - cmp r5, #4 - blt 888f - - // w >= 4, filter 4 pixels - filter_4 - vst1.16 {d6}, [r0, :64]! - vst1.16 {d20}, [r12, :64]! - subs r5, r5, #4 // 0 <= w < 4 - vext.8 q1, q1, q2, #8 - vext.8 q8, q8, q9, #8 - beq 9f -888: // 1 <= w < 4, filter 1 pixel at a time - vmul.s16 q3, q1, q0 - vmul.s16 q10, q8, q0 - vpadd.s16 d6, d6, d7 - vpadd.s16 d7, d20, d21 - vdup.16 d24, d2[3] - vpadd.s16 d6, d6, d7 - vdup.16 d25, d16[3] - vpadd.s16 d6, d6, d6 - vtrn.16 d24, d25 - vshl.s16 d24, d24, #7 - vsub.s16 d24, d24, d28 - vqadd.s16 d6, d6, d24 - vshr.s16 d6, d6, #3 - vadd.s16 d6, d6, d30 - vst1.s16 {d6[0]}, [r0, :16]! - vst1.s16 {d6[1]}, [r12, :16]! - subs r5, r5, #1 - vext.8 q1, q1, q2, #2 - vext.8 q8, q8, q9, #2 - bgt 888b - 9: subs r6, r6, #2 ble 0f @@ -361,26 +225,21 @@ mov r5, r8 b 1b 0: - vpop {q4} + vpop {q4-q7} pop {r4-r11,pc} -.purgem filter_8 -.purgem filter_4 endfunc // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, // const int16_t *mid, int w, int h, -// const int16_t fv[7], enum LrEdgeFlags edges, +// const int16_t fv[8], enum LrEdgeFlags edges, // ptrdiff_t mid_stride); function wiener_filter_v_8bpc_neon, export=1 push {r4-r7,lr} - ldrd r4, r5, [sp, #20] - ldrd r6, r7, [sp, #28] + vpush {q4-q6} + ldrd r4, r5, [sp, #68] + ldrd r6, r7, [sp, #76] mov lr, r4 - vmov.s16 q1, #0 - mov r12, #128 - vld1.16 {q0}, [r5] - vmov.s16 d2[3], r12 - vadd.s16 q0, q0, q1 + vld1.16 {q0}, [r5, :128] // Calculate the number of rows to move back when looping vertically mov r12, r4 @@ -422,24 +281,21 @@ // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
- vmull.s16 q2, d16, d0[0] - vmlal.s16 q2, d18, d0[1] - vmlal.s16 q2, d20, d0[2] - vmlal.s16 q2, d22, d0[3] - vmlal.s16 q2, d24, d1[0] - vmlal.s16 q2, d26, d1[1] - vmlal.s16 q2, d28, d1[2] - vmull.s16 q3, d17, d0[0] - vmlal.s16 q3, d19, d0[1] - vmlal.s16 q3, d21, d0[2] - vmlal.s16 q3, d23, d0[3] - vmlal.s16 q3, d25, d1[0] - vmlal.s16 q3, d27, d1[1] - vmlal.s16 q3, d29, d1[2] - vqrshrun.s32 d4, q2, #11 - vqrshrun.s32 d5, q3, #11 + vadd.i16 q4, q10, q12 + vadd.i16 q5, q9, q13 + vadd.i16 q6, q8, q14 + vmull.s16 q2, d22, d0[3] + vmlal.s16 q2, d8, d1[0] + vmlal.s16 q2, d10, d1[1] + vmlal.s16 q2, d12, d1[2] + vmull.s16 q3, d23, d0[3] + vmlal.s16 q3, d9, d1[0] + vmlal.s16 q3, d11, d1[1] + vmlal.s16 q3, d13, d1[2] + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 vqmovun.s16 d4, q2 - vst1.8 {d4}, [r0], r1 + vst1.8 {d4}, [r0, :64], r1 .if \compare cmp r4, #4 .else @@ -473,7 +329,7 @@ 52: // 2 rows in total, q11 already loaded, load q12 with content data // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 - vmov q15, q14 + vmov q15, q14 b 8f 53: // 3 rows in total, q11 already loaded, load q12 and q13 with content @@ -544,149 +400,15 @@ b 1b 0: + vpop {q4-q6} pop {r4-r7,pc} .purgem filter endfunc -// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride, -// const pixel *src, int w, int h); -function copy_narrow_8bpc_neon, export=1 - push {r4,lr} - ldr r4, [sp, #8] - adr r12, L(copy_narrow_tbl) - ldr r3, [r12, r3, lsl #2] - add r12, r12, r3 - bx r12 - - .align 2 -L(copy_narrow_tbl): - .word 0 - .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB - -10: - add r3, r0, r1 - lsl r1, r1, #1 -18: - subs r4, r4, #8 - blt 110f - vld1.8 {d0}, [r2, :64]! - vst1.8 {d0[0]}, [r0], r1 - vst1.8 {d0[1]}, [r3], r1 - vst1.8 {d0[2]}, [r0], r1 - vst1.8 {d0[3]}, [r3], r1 - vst1.8 {d0[4]}, [r0], r1 - vst1.8 {d0[5]}, [r3], r1 - vst1.8 {d0[6]}, [r0], r1 - vst1.8 {d0[7]}, [r3], r1 - ble 0f - b 18b -110: - add r4, r4, #8 - asr r1, r1, #1 -11: - subs r4, r4, #1 - vld1.8 {d0[]}, [r2]! - vst1.8 {d0[0]}, [r0], r1 - bgt 11b -0: - pop {r4,pc} - -20: - add r3, r0, r1 - lsl r1, r1, #1 -24: - subs r4, r4, #4 - blt 210f - vld1.16 {d0}, [r2, :64]! - vst1.16 {d0[0]}, [r0, :16], r1 - vst1.16 {d0[1]}, [r3, :16], r1 - vst1.16 {d0[2]}, [r0, :16], r1 - vst1.16 {d0[3]}, [r3, :16], r1 - ble 0f - b 24b -210: - add r4, r4, #4 - asr r1, r1, #1 -22: - subs r4, r4, #1 - vld1.16 {d0[]}, [r2]! - vst1.16 {d0[0]}, [r0], r1 - bgt 22b -0: - pop {r4,pc} - -30: - ldrh r3, [r2] - ldrb r12, [r2, #2] - add r2, r2, #3 - subs r4, r4, #1 - strh r3, [r0] - strb r12, [r0, #2] - add r0, r0, r1 - bgt 30b - pop {r4,pc} - -40: - add r3, r0, r1 - lsl r1, r1, #1 -42: - subs r4, r4, #2 - blt 41f - vld1.8 {d0}, [r2, :64]! 
- vst1.32 {d0[0]}, [r0, :32], r1 - vst1.32 {d0[1]}, [r3, :32], r1 - ble 0f - b 42b -41: - vld1.32 {d0[]}, [r2] - vst1.32 {d0[0]}, [r0] -0: - pop {r4,pc} - -50: - ldr r3, [r2] - ldrb r12, [r2, #4] - add r2, r2, #5 - subs r4, r4, #1 - str r3, [r0] - strb r12, [r0, #4] - add r0, r0, r1 - bgt 50b - pop {r4,pc} - -60: - ldr r3, [r2] - ldrh r12, [r2, #4] - add r2, r2, #6 - subs r4, r4, #1 - str r3, [r0] - strh r12, [r0, #4] - add r0, r0, r1 - bgt 60b - pop {r4,pc} - -70: - ldr r3, [r2] - ldrh r12, [r2, #4] - ldrb lr, [r2, #6] - add r2, r2, #7 - subs r4, r4, #1 - str r3, [r0] - strh r12, [r0, #4] - strb lr, [r0, #6] - add r0, r0, r1 - bgt 70b - pop {r4,pc} -endfunc - #define SUM_STRIDE (384+16) +#include "looprestoration_tmpl.S" + // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, @@ -707,25 +429,15 @@ mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - tst r7, #2 // LR_HAVE_RIGHT - bne 0f - // !LR_HAVE_RIGHT - add lr, r5, #3 - bic lr, lr, #3 - b 1f -0: add lr, r5, #7 bic lr, lr, #7 -1: sub r9, r9, lr, lsl #1 // Store the width for the vertical loop mov r8, r5 // Subtract the number of pixels read from the input from the stride - add lr, r5, #14 - bic lr, lr, #7 + add lr, lr, #8 sub r4, r4, lr // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -785,7 +497,7 @@ bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub lr, r5, #(2 + 16 - 2 + 1) + sub lr, r5, #(2 + 16 - 2 + 1) ldrb r11, [r3, lr] ldrb lr, [r12, lr] // Fill q14/q15 with the right padding pixel @@ -794,34 +506,30 @@ // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #10 bge 4f // If w >= 10, all used input pixels are valid - cmp r5, #6 - bge 5f // If w >= 6, we can filter 4 pixels - b 6f -4: // Loop horizontally -.macro vaddl_u16_n dst1, dst2, src1, src2, src3, src4, w - vaddl.u16 \dst1, \src1, \src3 -.if \w > 4 - vaddl.u16 \dst2, \src2, \src4 -.endif -.endm -.macro vaddw_u16_n dst1, dst2, src1, src2, w - vaddw.u16 \dst1, \dst1, \src1 -.if \w > 4 - vaddw.u16 \dst2, \dst2, \src2 -.endif -.endm -.macro vadd_i32_n dst1, dst2, src1, src2, w - vadd.i32 \dst1, \dst1, \src1 -.if \w > 4 - vadd.i32 \dst2, \dst2, \src2 -.endif -.endm + // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in q0/4.b[w] onwards + movrel_local lr, right_ext_mask + sub lr, lr, r5 + vld1.8 {q13}, [lr] + + vbit q0, q14, q13 + vbit q4, q15, q13 -.macro add3 w + // Update the precalculated squares + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + +4: // Loop horizontally vext.8 d16, d0, d1, #1 vext.8 d17, d0, d1, #2 vext.8 d18, d8, d9, #1 @@ -836,19 +544,22 @@ vext.8 q10, q5, q6, #2 vext.8 q11, q5, q6, #4 - vaddl_u16_n q12, q13, d2, d3, d16, d17, \w - vaddw_u16_n q12, q13, d18, d19, \w + vaddl.u16 q12, d2, d16 + vaddl.u16 q13, d3, d17 + vaddw.u16 q12, q12, d18 + vaddw.u16 q13, q13, d19 - vaddl_u16_n q8, q9, d10, d11, d20, d21, \w - vaddw_u16_n q8, q9, d22, d23, \w -.endm - add3 8 + vaddl.u16 q8, d10, d20 + vaddl.u16 q9, d11, d21 + vaddw.u16 q8, q8, d22 + vaddw.u16 q9, q9, d23 + + subs r5, r5, #8 vst1.16 {q3}, [r1, :128]! vst1.16 {q7}, [r11, :128]! vst1.32 {q12, q13}, [r0, :128]! vst1.32 {q8, q9}, [r10, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vld1.8 {d6}, [r3]! @@ -863,86 +574,6 @@ bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 6 <= w < 10 - add3 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q8}, [r10, :128]! - - subs r5, r5, #4 // 2 <= w < 6 - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - -6: // Pad the right edge and produce the last few pixels. - // 2 <= w < 6, 2-5 pixels valid in q0 - sub lr, r5, #2 - // lr = (pixels valid - 2) - adr r11, L(box3_variable_shift_tbl) - ldr lr, [r11, lr, lsl #2] - add r11, r11, lr - bx r11 - - .align 2 -L(box3_variable_shift_tbl): - .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB - - // Shift q0 right, shifting out invalid pixels, - // shift q0 left to the original offset, shifting in padding pixels. -22: // 2 pixels valid - vext.8 q0, q0, q0, #2 - vext.8 q4, q4, q4, #2 - vext.8 q0, q0, q14, #14 - vext.8 q4, q4, q15, #14 - b 88f -33: // 3 pixels valid - vext.8 q0, q0, q0, #3 - vext.8 q4, q4, q4, #3 - vext.8 q0, q0, q14, #13 - vext.8 q4, q4, q15, #13 - b 88f -44: // 4 pixels valid - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - vext.8 q0, q0, q14, #12 - vext.8 q4, q4, q15, #12 - b 88f -55: // 5 pixels valid - vext.8 q0, q0, q0, #5 - vext.8 q4, q4, q4, #5 - vext.8 q0, q0, q14, #11 - vext.8 q4, q4, q15, #11 - -88: - // Restore r11 after using it for a temporary value above - add r11, r1, #(2*SUM_STRIDE) - vmull.u8 q1, d0, d0 - vmull.u8 q2, d1, d1 - vmull.u8 q5, d8, d8 - vmull.u8 q6, d9, d9 - - add3 4 - subs r5, r5, #4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q8}, [r10, :128]! - ble 9f - vext.8 q0, q0, q0, #4 - vext.8 q1, q1, q2, #8 - vext.8 q4, q4, q4, #4 - vext.8 q5, q5, q6, #8 - // Only one needed pixel left, but do a normal 4 pixel - // addition anyway - add3 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q8}, [r10, :128]! - 9: subs r6, r6, #2 ble 0f @@ -958,7 +589,6 @@ 0: vpop {q4-q7} pop {r4-r11,pc} -.purgem add3 endfunc // void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, @@ -981,23 +611,11 @@ mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. 
- // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - // Subtract the number of pixels read from the input from the stride. - tst r7, #2 // LR_HAVE_RIGHT - bne 0f - // !LR_HAVE_RIGHT - add lr, r5, #3 - bic lr, lr, #3 - add r8, r5, #13 - b 1f -0: add lr, r5, #7 bic lr, lr, #7 - add r8, r5, #15 -1: sub r9, r9, lr, lsl #1 - bic r8, r8, #7 - sub r4, r4, r8 + add lr, lr, #8 + sub r4, r4, lr // Store the width for the vertical loop mov r8, r5 @@ -1058,7 +676,7 @@ bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub lr, r5, #(2 + 16 - 3 + 1) + sub lr, r5, #(2 + 16 - 3 + 1) ldrb r11, [r3, lr] ldrb lr, [r12, lr] // Fill q14/q15 with the right padding pixel @@ -1067,15 +685,31 @@ // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid - cmp r5, #7 - bge 5f // If w >= 7, we can produce 4 pixels - b 6f + + // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel_local lr, right_ext_mask, -1 + sub lr, lr, r5 + vld1.8 {q13}, [lr] + + vbit q0, q14, q13 + vbit q4, q15, q13 + + // Update the precalculated squares + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 4: // Loop horizontally -.macro add5 w vext.8 d16, d0, d1, #1 vext.8 d17, d0, d1, #2 vext.8 d18, d0, d1, #3 @@ -1097,35 +731,33 @@ vext.8 q9, q1, q2, #4 vext.8 q10, q1, q2, #6 vext.8 q11, q1, q2, #8 - vaddl_u16_n q12, q13, d2, d3, d16, d17, \w - vaddl_u16_n q8, q9, d18, d19, d20, d21, \w - vaddw_u16_n q12, q13, d22, d23, \w - vadd_i32_n q12, q13, q8, q9, \w + vaddl.u16 q12, d2, d16 + vaddl.u16 q13, d3, d17 + vaddl.u16 q8, d18, d20 + vaddl.u16 q9, d19, d21 + vaddw.u16 q12, q12, d22 + vaddw.u16 q13, q13, d23 + vadd.i32 q12, q12, q8 + vadd.i32 q13, q13, q9 vext.8 q8, q5, q6, #2 vext.8 q9, q5, q6, #4 vext.8 q10, q5, q6, #6 vext.8 q11, q5, q6, #8 -.if \w > 4 - vaddl_u16_n q1, q5, d10, d11, d16, d17, 8 - vaddl_u16_n q8, q9, d18, d19, d20, d21, 8 - vaddw_u16_n q1, q5, d22, d23, 8 + vaddl.u16 q1, d10, d16 + vaddl.u16 q5, d11, d17 + vaddl.u16 q8, d18, d20 + vaddl.u16 q9, d19, d21 + vaddw.u16 q1, q1, d22 + vaddw.u16 q5, q5, d23 vadd.i32 q10, q1, q8 vadd.i32 q11, q5, q9 -.else - // Can't clobber q1/q5 if only doing 4 pixels - vaddl.u16 q8, d10, d16 - vaddl.u16 q9, d18, d20 - vaddw.u16 q8, q8, d22 - vadd.i32 q10, q8, q9 -.endif -.endm - add5 8 + + subs r5, r5, #8 vst1.16 {q3}, [r1, :128]! vst1.16 {q7}, [r11, :128]! vst1.32 {q12, q13}, [r0, :128]! vst1.32 {q10, q11}, [r10, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vld1.8 {d6}, [r3]! @@ -1139,98 +771,6 @@ bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 7 <= w < 11 - add5 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q10}, [r10, :128]! - - subs r5, r5, #4 // 3 <= w < 7 - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - -6: // Pad the right edge and produce the last few pixels. 
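A note on the deletions in these hunks: the old code carried separate 4-pixel and variable-shift tail paths for narrow widths, all of which disappear because the masked right-edge padding lets every horizontal pass run in full vector-width steps. Schematically (kernel8 is a placeholder for any of the 8-wide kernels, not a dav1d symbol):

    #include <stddef.h>
    #include <stdint.h>

    // With the padding applied up front, lanes at or beyond w already hold
    // the replicated edge pixel, so the loop needs no narrow tail; the extra
    // outputs are simply never consumed.
    static void run_padded(int16_t *dst, const int16_t *src, int w,
                           void (*kernel8)(int16_t *dst, const int16_t *src))
    {
        for (int x = 0; x < w; x += 8)
            kernel8(dst + x, src + x);
    }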
- // w < 7, w+1 pixels valid in q0/q4 - sub lr, r5, #1 - // lr = pixels valid - 2 - adr r11, L(box5_variable_shift_tbl) - ldr lr, [r11, lr, lsl #2] - add r11, r11, lr - bx r11 - - .align 2 -L(box5_variable_shift_tbl): - .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB - - // Shift q0 right, shifting out invalid pixels, - // shift q0 left to the original offset, shifting in padding pixels. -22: // 2 pixels valid - vext.8 q0, q0, q0, #2 - vext.8 q4, q4, q4, #2 - vext.8 q0, q0, q14, #14 - vext.8 q4, q4, q15, #14 - b 88f -33: // 3 pixels valid - vext.8 q0, q0, q0, #3 - vext.8 q4, q4, q4, #3 - vext.8 q0, q0, q14, #13 - vext.8 q4, q4, q15, #13 - b 88f -44: // 4 pixels valid - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - vext.8 q0, q0, q14, #12 - vext.8 q4, q4, q15, #12 - b 88f -55: // 5 pixels valid - vext.8 q0, q0, q0, #5 - vext.8 q4, q4, q4, #5 - vext.8 q0, q0, q14, #11 - vext.8 q4, q4, q15, #11 - b 88f -66: // 6 pixels valid - vext.8 q0, q0, q0, #6 - vext.8 q4, q4, q4, #6 - vext.8 q0, q0, q14, #10 - vext.8 q4, q4, q15, #10 - b 88f -77: // 7 pixels valid - vext.8 q0, q0, q0, #7 - vext.8 q4, q4, q4, #7 - vext.8 q0, q0, q14, #9 - vext.8 q4, q4, q15, #9 - -88: - // Restore r11 after using it for a temporary value above - add r11, r1, #(2*SUM_STRIDE) - vmull.u8 q1, d0, d0 - vmull.u8 q2, d1, d1 - vmull.u8 q5, d8, d8 - vmull.u8 q6, d9, d9 - - add5 4 - subs r5, r5, #4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q10}, [r10, :128]! - ble 9f - vext.8 q0, q0, q0, #4 - vext.8 q1, q1, q2, #8 - vext.8 q4, q4, q4, #4 - vext.8 q5, q5, q6, #8 - add5 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q10}, [r10, :128]! - 9: subs r6, r6, #2 ble 0f @@ -1246,865 +786,6 @@ 0: vpop {q4-q7} pop {r4-r11,pc} -.purgem add5 -endfunc - -// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_v_neon, export=1 - push {r4-r9,lr} - ldr r4, [sp, #28] - add r12, r3, #2 // Number of output rows to move back - mov lr, r3 // Number of input rows to move back - add r2, r2, #2 // Actual summed width - mov r7, #(4*SUM_STRIDE) // sumsq stride - mov r8, #(2*SUM_STRIDE) // sum stride - sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride - sub r1, r1, #(2*SUM_STRIDE) // sum -= stride - - tst r4, #4 // LR_HAVE_TOP - beq 0f - // If have top, read from row -2. - sub r5, r0, #(4*SUM_STRIDE) - sub r6, r1, #(2*SUM_STRIDE) - add lr, lr, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add r5, r0, #(4*SUM_STRIDE) - add r6, r1, #(2*SUM_STRIDE) -1: - - tst r4, #8 // LR_HAVE_BOTTOM - beq 1f - // LR_HAVE_BOTTOM - add r3, r3, #2 // Sum all h+2 lines with the main loop - add lr, lr, #2 -1: - mov r9, r3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into q8-q13 and q0-q2 taking top - // padding into consideration. 
- tst r4, #4 // LR_HAVE_TOP - vld1.32 {q8, q9}, [r5, :128], r7 - vld1.16 {q0}, [r6, :128], r8 - beq 2f - // LR_HAVE_TOP - vld1.32 {q10, q11}, [r5, :128], r7 - vld1.16 {q1}, [r6, :128], r8 - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q2}, [r6, :128], r8 - b 3f -2: // !LR_HAVE_TOP - vmov q10, q8 - vmov q11, q9 - vmov q1, q0 - vmov q12, q8 - vmov q13, q9 - vmov q2, q0 - -3: - subs r3, r3, #1 -.macro add3 - vadd.i32 q8, q8, q10 - vadd.i32 q9, q9, q11 - vadd.i16 q0, q0, q1 - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i16 q0, q0, q2 - vst1.32 {q8, q9}, [r0, :128], r7 - vst1.16 {q0}, [r1, :128], r8 -.endm - add3 - vmov q8, q10 - vmov q9, q11 - vmov q0, q1 - vmov q10, q12 - vmov q11, q13 - vmov q1, q2 - ble 4f - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q2}, [r6, :128], r8 - b 3b - -4: - tst r4, #8 // LR_HAVE_BOTTOM - bne 5f - // !LR_HAVE_BOTTOM - // Produce two more rows, extending the already loaded rows. - add3 - vmov q8, q10 - vmov q9, q11 - vmov q0, q1 - add3 - -5: // End of one vertical slice. - subs r2, r2, #8 - ble 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - mls r5, r7, lr, r5 - mls r6, r8, lr, r6 - // Output pointers - mls r0, r7, r12, r0 - mls r1, r8, r12, r1 - add r0, r0, #32 - add r1, r1, #16 - add r5, r5, #32 - add r6, r6, #16 - mov r3, r9 - b 1b - -0: - pop {r4-r9,pc} -.purgem add3 -endfunc - -// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_v_neon, export=1 - push {r4-r9,lr} - vpush {q5-q7} - ldr r4, [sp, #76] - add r12, r3, #2 // Number of output rows to move back - mov lr, r3 // Number of input rows to move back - add r2, r2, #8 // Actual summed width - mov r7, #(4*SUM_STRIDE) // sumsq stride - mov r8, #(2*SUM_STRIDE) // sum stride - sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride - sub r1, r1, #(2*SUM_STRIDE) // sum -= stride - - tst r4, #4 // LR_HAVE_TOP - beq 0f - // If have top, read from row -2. - sub r5, r0, #(4*SUM_STRIDE) - sub r6, r1, #(2*SUM_STRIDE) - add lr, lr, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add r5, r0, #(4*SUM_STRIDE) - add r6, r1, #(2*SUM_STRIDE) -1: - - tst r4, #8 // LR_HAVE_BOTTOM - beq 0f - // LR_HAVE_BOTTOM - add r3, r3, #2 // Handle h+2 lines with the main loop - add lr, lr, #2 - b 1f -0: - // !LR_HAVE_BOTTOM - sub r3, r3, #1 // Handle h-1 lines with the main loop -1: - mov r9, r3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into q6-q15 and q0-q3,q5 taking top - // padding into consideration. 
- tst r4, #4 // LR_HAVE_TOP - vld1.32 {q6, q7}, [r5, :128], r7 - vld1.16 {q0}, [r6, :128], r8 - beq 2f - // LR_HAVE_TOP - vld1.32 {q10, q11}, [r5, :128], r7 - vld1.16 {q2}, [r6, :128], r8 - vmov q8, q6 - vmov q9, q7 - vmov q1, q0 - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q3}, [r6, :128], r8 - b 3f -2: // !LR_HAVE_TOP - vmov q8, q6 - vmov q9, q7 - vmov q1, q0 - vmov q10, q6 - vmov q11, q7 - vmov q2, q0 - vmov q12, q6 - vmov q13, q7 - vmov q3, q0 - -3: - cmp r3, #0 - beq 4f - vld1.32 {q14, q15}, [r5, :128], r7 - vld1.16 {q5}, [r6, :128], r8 - -3: - // Start of vertical loop - subs r3, r3, #2 -.macro add5 - vadd.i32 q6, q6, q8 - vadd.i32 q7, q7, q9 - vadd.i16 q0, q0, q1 - vadd.i32 q6, q6, q10 - vadd.i32 q7, q7, q11 - vadd.i16 q0, q0, q2 - vadd.i32 q6, q6, q12 - vadd.i32 q7, q7, q13 - vadd.i16 q0, q0, q3 - vadd.i32 q6, q6, q14 - vadd.i32 q7, q7, q15 - vadd.i16 q0, q0, q5 - vst1.32 {q6, q7}, [r0, :128], r7 - vst1.16 {q0}, [r1, :128], r8 -.endm - add5 -.macro shift2 - vmov q6, q10 - vmov q7, q11 - vmov q0, q2 - vmov q8, q12 - vmov q9, q13 - vmov q1, q3 - vmov q10, q14 - vmov q11, q15 - vmov q2, q5 -.endm - shift2 - add r0, r0, r7 - add r1, r1, r8 - ble 5f - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q3}, [r6, :128], r8 - vld1.32 {q14, q15}, [r5, :128], r7 - vld1.16 {q5}, [r6, :128], r8 - b 3b - -4: - // h == 1, !LR_HAVE_BOTTOM. - // Pad the last row with the only content row, and add. - vmov q14, q12 - vmov q15, q13 - vmov q5, q3 - add5 - shift2 - add r0, r0, r7 - add r1, r1, r8 - add5 - b 6f - -5: - tst r4, #8 // LR_HAVE_BOTTOM - bne 6f - // !LR_HAVE_BOTTOM - cmp r3, #0 - bne 5f - // The intended three edge rows left; output the one at h-2 and - // the past edge one at h. - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q3}, [r6, :128], r8 - // Pad the past-edge row from the last content row. - vmov q14, q12 - vmov q15, q13 - vmov q5, q3 - add5 - shift2 - add r0, r0, r7 - add r1, r1, r8 - // The last two rows are already padded properly here. - add5 - b 6f - -5: - // r3 == -1, two rows left, output one. - // Pad the last two rows from the mid one. - vmov q12, q10 - vmov q13, q11 - vmov q3, q2 - vmov q14, q10 - vmov q15, q11 - vmov q5, q2 - add5 - add r0, r0, r7 - add r1, r1, r8 - b 6f - -6: // End of one vertical slice. - subs r2, r2, #8 - ble 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - mls r5, r7, lr, r5 - mls r6, r8, lr, r6 - // Output pointers - mls r0, r7, r12, r0 - mls r1, r8, r12, r1 - add r0, r0, #32 - add r1, r1, #16 - add r5, r5, #32 - add r6, r6, #16 - mov r3, r9 - b 1b - -0: - vpop {q5-q7} - pop {r4-r9,pc} -.purgem add5 -endfunc - -// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); -// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); -function sgr_calc_ab1_neon, export=1 - push {r4-r5,lr} - vpush {q4-q7} - ldr r4, [sp, #76] - add r3, r3, #2 // h += 2 - vmov.i32 q15, #9 // n - movw r5, #455 - mov lr, #SUM_STRIDE - b sgr_calc_ab_neon -endfunc - -function sgr_calc_ab2_neon, export=1 - push {r4-r5,lr} - vpush {q4-q7} - ldr r4, [sp, #76] - add r3, r3, #3 // h += 3 - asr r3, r3, #1 // h /= 2 - vmov.i32 q15, #25 // n - mov r5, #164 - mov lr, #(2*SUM_STRIDE) -endfunc - -function sgr_calc_ab_neon - movrel r12, X(sgr_x_by_x) - vld1.8 {q8, q9}, [r12, :128]! 
- vmov.i8 q11, #5 - vmov.i8 d10, #55 // idx of last 5 - vld1.8 {q10}, [r12, :128] - vmov.i8 d11, #72 // idx of last 4 - vmov.i8 d12, #101 // idx of last 3 - vmov.i8 d13, #169 // idx of last 2 - vmov.i8 d14, #254 // idx of last 1 - vmov.i8 d15, #32 // elements consumed in first vtbl - add r2, r2, #2 // w += 2 - add r12, r2, #7 - bic r12, r12, #7 // aligned w - sub r12, lr, r12 // increment between rows - vmov.i16 q13, #256 - vdup.32 q12, r4 - vdup.32 q14, r5 // one_by_x - sub r0, r0, #(4*(SUM_STRIDE)) - sub r1, r1, #(2*(SUM_STRIDE)) - mov r4, r2 // backup of w - vsub.i8 q8, q8, q11 - vsub.i8 q9, q9, q11 - vsub.i8 q10, q10, q11 -1: - subs r2, r2, #8 - vld1.32 {q0, q1}, [r0, :128] // a - vld1.16 {q2}, [r1, :128] // b - vmul.i32 q0, q0, q15 // a * n - vmul.i32 q1, q1, q15 // a * n - vmull.u16 q3, d4, d4 // b * b - vmull.u16 q4, d5, d5 // b * b - vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) - vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) - vmul.i32 q0, q0, q12 // p * s - vmul.i32 q1, q1, q12 // p * s - vqshrn.u32 d0, q0, #16 - vqshrn.u32 d1, q1, #16 - vqrshrn.u16 d0, q0, #4 // imin(z, 255) - - vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 - vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 - vtbl.8 d1, {q8, q9}, d0 - vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 - vsub.i8 d9, d0, d15 // indices for vtbx - vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 - vadd.i8 d2, d2, d3 - vtbx.8 d1, {q10}, d9 - vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 - vadd.i8 d6, d6, d7 - vadd.i8 d8, d8, d22 - vadd.i8 d2, d2, d6 - vadd.i8 d1, d1, d8 - vadd.i8 d1, d1, d2 - vmovl.u8 q0, d1 // x - - vmull.u16 q1, d0, d4 // x * BB[i] - vmull.u16 q2, d1, d5 // x * BB[i] - vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x - vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x - vrshr.s32 q1, q1, #12 // AA[i] - vrshr.s32 q2, q2, #12 // AA[i] - vsub.i16 q0, q13, q0 // 256 - x - - vst1.32 {q1, q2}, [r0, :128]! - vst1.16 {q0}, [r1, :128]! - bgt 1b - - subs r3, r3, #1 - ble 0f - add r0, r0, r12, lsl #2 - add r1, r1, r12, lsl #1 - mov r2, r4 - b 1b -0: - vpop {q4-q7} - pop {r4-r5,pc} -endfunc - -#define FILTER_OUT_STRIDE 384 - -// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter1_8bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] - ldr r6, [sp, #108] - sub r7, r3, #(4*SUM_STRIDE) - add r8, r3, #(4*SUM_STRIDE) - sub r9, r4, #(2*SUM_STRIDE) - add r10, r4, #(2*SUM_STRIDE) - mov r11, #SUM_STRIDE - mov r12, #FILTER_OUT_STRIDE - add lr, r5, #3 - bic lr, lr, #3 // Aligned width - sub r2, r2, lr - sub r12, r12, lr - sub r11, r11, lr - sub r11, r11, #4 // We read 4 extra elements from both a and b - mov lr, r5 - vmov.i16 q14, #3 - vmov.i32 q15, #3 -1: - vld1.16 {q0}, [r9]! - vld1.16 {q1}, [r4]! - vld1.16 {q2}, [r10]! - vld1.32 {q8, q9}, [r7]! - vld1.32 {q10, q11}, [r3]! - vld1.32 {q12, q13}, [r8]! 
- -2: - subs r5, r5, #4 - vext.8 d6, d0, d1, #2 // -stride - vext.8 d7, d2, d3, #2 // 0 - vext.8 d8, d4, d5, #2 // +stride - vext.8 d9, d0, d1, #4 // +1-stride - vext.8 d10, d2, d3, #4 // +1 - vext.8 d11, d4, d5, #4 // +1+stride - vadd.i16 d2, d2, d6 // -1, -stride - vadd.i16 d7, d7, d8 // 0, +stride - vadd.i16 d0, d0, d9 // -1-stride, +1-stride - vadd.i16 d2, d2, d7 - vadd.i16 d4, d4, d11 // -1+stride, +1+stride - vadd.i16 d2, d2, d10 // +1 - vadd.i16 d0, d0, d4 - - vext.8 q3, q8, q9, #4 // -stride - vshl.i16 d2, d2, #2 - vext.8 q4, q8, q9, #8 // +1-stride - vext.8 q5, q10, q11, #4 // 0 - vext.8 q6, q10, q11, #8 // +1 - vmla.i16 d2, d0, d28 // * 3 -> a - vadd.i32 q3, q3, q10 // -stride, -1 - vadd.i32 q8, q8, q4 // -1-stride, +1-stride - vadd.i32 q5, q5, q6 // 0, +1 - vadd.i32 q8, q8, q12 // -1+stride - vadd.i32 q3, q3, q5 - vext.8 q7, q12, q13, #4 // +stride - vext.8 q10, q12, q13, #8 // +1+stride - vld1.32 {d24[0]}, [r1]! // src - vadd.i32 q3, q3, q7 // +stride - vadd.i32 q8, q8, q10 // +1+stride - vshl.i32 q3, q3, #2 - vmla.i32 q3, q8, q15 // * 3 -> b - vmovl.u8 q12, d24 // src - vmov d0, d1 - vmlal.u16 q3, d2, d24 // b + a * src - vmov d2, d3 - vrshrn.i32 d6, q3, #9 - vmov d4, d5 - vst1.16 {d6}, [r0]! - - ble 3f - vmov q8, q9 - vmov q10, q11 - vmov q12, q13 - vld1.16 {d1}, [r9]! - vld1.16 {d3}, [r4]! - vld1.16 {d5}, [r10]! - vld1.32 {q9}, [r7]! - vld1.32 {q11}, [r3]! - vld1.32 {q13}, [r8]! - b 2b - -3: - subs r6, r6, #1 - ble 0f - mov r5, lr - add r0, r0, r12, lsl #1 - add r1, r1, r2 - add r3, r3, r11, lsl #2 - add r7, r7, r11, lsl #2 - add r8, r8, r11, lsl #2 - add r4, r4, r11, lsl #1 - add r9, r9, r11, lsl #1 - add r10, r10, r11, lsl #1 - b 1b -0: - vpop {q4-q7} - pop {r4-r11,pc} endfunc -// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter2_8bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] - ldr r6, [sp, #108] - add r7, r3, #(4*(SUM_STRIDE)) - sub r3, r3, #(4*(SUM_STRIDE)) - add r8, r4, #(2*(SUM_STRIDE)) - sub r4, r4, #(2*(SUM_STRIDE)) - mov r9, #(2*SUM_STRIDE) - mov r10, #FILTER_OUT_STRIDE - add r11, r5, #7 - bic r11, r11, #7 // Aligned width - sub r2, r2, r11 - sub r10, r10, r11 - sub r9, r9, r11 - sub r9, r9, #4 // We read 4 extra elements from a - sub r12, r9, #4 // We read 8 extra elements from b - mov lr, r5 - -1: - vld1.16 {q0, q1}, [r4]! - vld1.16 {q2, q3}, [r8]! - vld1.32 {q8, q9}, [r3]! - vld1.32 {q11, q12}, [r7]! - vld1.32 {q10}, [r3]! - vld1.32 {q13}, [r7]! - -2: - vmov.i16 q14, #5 - vmov.i16 q15, #6 - subs r5, r5, #8 - vext.8 q4, q0, q1, #4 // +1-stride - vext.8 q5, q2, q3, #4 // +1+stride - vext.8 q6, q0, q1, #2 // -stride - vext.8 q7, q2, q3, #2 // +stride - vadd.i16 q0, q0, q4 // -1-stride, +1-stride - vadd.i16 q5, q2, q5 // -1+stride, +1+stride - vadd.i16 q2, q6, q7 // -stride, +stride - vadd.i16 q0, q0, q5 - - vext.8 q4, q8, q9, #8 // +1-stride - vext.8 q5, q9, q10, #8 - vext.8 q6, q11, q12, #8 // +1+stride - vext.8 q7, q12, q13, #8 - vmul.i16 q0, q0, q14 // * 5 - vmla.i16 q0, q2, q15 // * 6 - vadd.i32 q4, q4, q8 // -1-stride, +1-stride - vadd.i32 q5, q5, q9 - vadd.i32 q6, q6, q11 // -1+stride, +1+stride - vadd.i32 q7, q7, q12 - vadd.i32 q4, q4, q6 - vadd.i32 q5, q5, q7 - vext.8 q6, q8, q9, #4 // -stride - vext.8 q7, q9, q10, #4 - vext.8 q8, q11, q12, #4 // +stride - vext.8 q11, q12, q13, #4 - - vld1.8 {d4}, [r1]! 
- - vmov.i32 q14, #5 - vmov.i32 q15, #6 - - vadd.i32 q6, q6, q8 // -stride, +stride - vadd.i32 q7, q7, q11 - vmul.i32 q4, q4, q14 // * 5 - vmla.i32 q4, q6, q15 // * 6 - vmul.i32 q5, q5, q14 // * 5 - vmla.i32 q5, q7, q15 // * 6 - - vmovl.u8 q2, d4 - vmlal.u16 q4, d0, d4 // b + a * src - vmlal.u16 q5, d1, d5 // b + a * src - vmov q0, q1 - vrshrn.i32 d8, q4, #9 - vrshrn.i32 d9, q5, #9 - vmov q2, q3 - vst1.16 {q4}, [r0]! - - ble 3f - vmov q8, q10 - vmov q11, q13 - vld1.16 {q1}, [r4]! - vld1.16 {q3}, [r8]! - vld1.32 {q9, q10}, [r3]! - vld1.32 {q12, q13}, [r7]! - b 2b - -3: - subs r6, r6, #1 - ble 0f - mov r5, lr - add r0, r0, r10, lsl #1 - add r1, r1, r2 - add r3, r3, r9, lsl #2 - add r7, r7, r9, lsl #2 - add r4, r4, r12, lsl #1 - add r8, r8, r12, lsl #1 - - vld1.32 {q8, q9}, [r3]! - vld1.16 {q0, q1}, [r4]! - vld1.32 {q10}, [r3]! - - vmov.i16 q12, #5 - vmov.i16 q13, #6 - -4: - subs r5, r5, #8 - vext.8 q3, q0, q1, #4 // +1 - vext.8 q2, q0, q1, #2 // 0 - vadd.i16 q0, q0, q3 // -1, +1 - - vext.8 q4, q8, q9, #4 // 0 - vext.8 q5, q9, q10, #4 - vext.8 q6, q8, q9, #8 // +1 - vext.8 q7, q9, q10, #8 - vmul.i16 q2, q2, q13 // * 6 - vmla.i16 q2, q0, q12 // * 5 -> a - vld1.8 {d22}, [r1]! - vadd.i32 q8, q8, q6 // -1, +1 - vadd.i32 q9, q9, q7 - vmovl.u8 q11, d22 - vmul.i32 q4, q4, q15 // * 6 - vmla.i32 q4, q8, q14 // * 5 -> b - vmul.i32 q5, q5, q15 // * 6 - vmla.i32 q5, q9, q14 // * 5 -> b - - vmlal.u16 q4, d4, d22 // b + a * src - vmlal.u16 q5, d5, d23 - vmov q0, q1 - vrshrn.i32 d8, q4, #8 - vrshrn.i32 d9, q5, #8 - vmov q8, q10 - vst1.16 {q4}, [r0]! - - ble 5f - vld1.16 {q1}, [r4]! - vld1.32 {q9, q10}, [r3]! - b 4b - -5: - subs r6, r6, #1 - ble 0f - mov r5, lr - sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started - sub r4, r4, r11, lsl #1 - add r0, r0, r10, lsl #1 - add r1, r1, r2 - sub r3, r3, #16 - sub r4, r4, #16 - b 1b -0: - vpop {q4-q7} - pop {r4-r11,pc} -endfunc - -// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int w, const int h, -// const int wt); -function sgr_weighted1_8bpc_neon, export=1 - push {r4-r9,lr} - ldrd r4, r5, [sp, #28] - ldrd r6, r7, [sp, #36] - ldr r8, [sp, #44] - vdup.16 d31, r7 - cmp r6, #2 - add r9, r0, r1 - add r12, r2, r3 - add lr, r4, #2*FILTER_OUT_STRIDE - mov r7, #(4*FILTER_OUT_STRIDE) - lsl r1, r1, #1 - lsl r3, r3, #1 - add r8, r5, #7 - bic r8, r8, #7 // Aligned width - sub r1, r1, r8 - sub r3, r3, r8 - sub r7, r7, r8, lsl #1 - mov r8, r5 - blt 2f -1: - vld1.8 {d0}, [r2]! - vld1.8 {d16}, [r12]! - vld1.16 {q1}, [r4]! - vld1.16 {q9}, [lr]! - subs r5, r5, #8 - vshll.u8 q0, d0, #4 // u - vshll.u8 q8, d16, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vsub.i16 q9, q9, q8 // t1 - u - vshll.u16 q2, d0, #7 // u << 7 - vshll.u16 q3, d1, #7 // u << 7 - vshll.u16 q10, d16, #7 // u << 7 - vshll.u16 q11, d17, #7 // u << 7 - vmlal.s16 q2, d2, d31 // v - vmlal.s16 q3, d3, d31 // v - vmlal.s16 q10, d18, d31 // v - vmlal.s16 q11, d19, d31 // v - vrshrn.i32 d4, q2, #11 - vrshrn.i32 d5, q3, #11 - vrshrn.i32 d20, q10, #11 - vrshrn.i32 d21, q11, #11 - vqmovun.s16 d4, q2 - vqmovun.s16 d20, q10 - vst1.8 {d4}, [r0]! - vst1.8 {d20}, [r9]! - bgt 1b - - sub r6, r6, #2 - cmp r6, #1 - blt 0f - mov r5, r8 - add r0, r0, r1 - add r9, r9, r1 - add r2, r2, r3 - add r12, r12, r3 - add r4, r4, r7 - add lr, lr, r7 - beq 2f - b 1b - -2: - vld1.8 {d0}, [r2]! - vld1.16 {q1}, [r4]! 
- subs r5, r5, #8 - vshll.u8 q0, d0, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vshll.u16 q2, d0, #7 // u << 7 - vshll.u16 q3, d1, #7 // u << 7 - vmlal.s16 q2, d2, d31 // v - vmlal.s16 q3, d3, d31 // v - vrshrn.i32 d4, q2, #11 - vrshrn.i32 d5, q3, #11 - vqmovun.s16 d2, q2 - vst1.8 {d2}, [r0]! - bgt 2b -0: - pop {r4-r9,pc} -endfunc - -// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int16_t *t2, -// const int w, const int h, -// const int16_t wt[2]); -function sgr_weighted2_8bpc_neon, export=1 - push {r4-r11,lr} - ldrd r4, r5, [sp, #36] - ldrd r6, r7, [sp, #44] - ldr r8, [sp, #52] - cmp r7, #2 - add r10, r0, r1 - add r11, r2, r3 - add r12, r4, #2*FILTER_OUT_STRIDE - add lr, r5, #2*FILTER_OUT_STRIDE - vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] - mov r8, #4*FILTER_OUT_STRIDE - lsl r1, r1, #1 - lsl r3, r3, #1 - add r9, r6, #7 - bic r9, r9, #7 // Aligned width - sub r1, r1, r9 - sub r3, r3, r9 - sub r8, r8, r9, lsl #1 - mov r9, r6 - blt 2f -1: - vld1.8 {d0}, [r2]! - vld1.8 {d16}, [r11]! - vld1.16 {q1}, [r4]! - vld1.16 {q9}, [r12]! - vld1.16 {q2}, [r5]! - vld1.16 {q10}, [lr]! - subs r6, r6, #8 - vshll.u8 q0, d0, #4 // u - vshll.u8 q8, d16, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vsub.i16 q2, q2, q0 // t2 - u - vsub.i16 q9, q9, q8 // t1 - u - vsub.i16 q10, q10, q8 // t2 - u - vshll.u16 q3, d0, #7 // u << 7 - vshll.u16 q0, d1, #7 // u << 7 - vshll.u16 q11, d16, #7 // u << 7 - vshll.u16 q8, d17, #7 // u << 7 - vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) - vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) - vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) - vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) - vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u) - vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) - vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) - vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) - vrshrn.i32 d6, q3, #11 - vrshrn.i32 d7, q0, #11 - vrshrn.i32 d22, q11, #11 - vrshrn.i32 d23, q8, #11 - vqmovun.s16 d6, q3 - vqmovun.s16 d22, q11 - vst1.8 {d6}, [r0]! - vst1.8 {d22}, [r10]! - bgt 1b - - subs r7, r7, #2 - cmp r7, #1 - blt 0f - mov r6, r9 - add r0, r0, r1 - add r10, r10, r1 - add r2, r2, r3 - add r11, r11, r3 - add r4, r4, r8 - add r12, r12, r8 - add r5, r5, r8 - add lr, lr, r8 - beq 2f - b 1b - -2: - vld1.8 {d0}, [r2]! - vld1.16 {q1}, [r4]! - vld1.16 {q2}, [r5]! - subs r6, r6, #8 - vshll.u8 q0, d0, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vsub.i16 q2, q2, q0 // t2 - u - vshll.u16 q3, d0, #7 // u << 7 - vshll.u16 q0, d1, #7 // u << 7 - vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) - vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) - vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) - vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) - vrshrn.i32 d6, q3, #11 - vrshrn.i32 d7, q0, #11 - vqmovun.s16 d6, q3 - vst1.8 {d6}, [r0]! - bgt 1b -0: - pop {r4-r11,pc} -endfunc +sgr_funcs 8 diff -Nru dav1d-0.7.1/src/arm/32/looprestoration_tmpl.S dav1d-0.9.1/src/arm/32/looprestoration_tmpl.S --- dav1d-0.7.1/src/arm/32/looprestoration_tmpl.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/looprestoration_tmpl.S 2021-07-28 21:38:28.861851700 +0000 @@ -0,0 +1,600 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" + +#define FILTER_OUT_STRIDE 384 + +.macro sgr_funcs bpc +// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter1_\bpc\()bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + sub r7, r3, #(4*SUM_STRIDE) + add r8, r3, #(4*SUM_STRIDE) + sub r9, r4, #(2*SUM_STRIDE) + add r10, r4, #(2*SUM_STRIDE) + mov r11, #SUM_STRIDE + mov r12, #FILTER_OUT_STRIDE + add lr, r5, #3 + bic lr, lr, #3 // Aligned width +.if \bpc == 8 + sub r2, r2, lr +.else + sub r2, r2, lr, lsl #1 +.endif + sub r12, r12, lr + sub r11, r11, lr + sub r11, r11, #4 // We read 4 extra elements from both a and b + mov lr, r5 + vmov.i16 q14, #3 + vmov.i32 q15, #3 +1: + vld1.16 {q0}, [r9, :128]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r10, :128]! + vld1.32 {q8, q9}, [r7, :128]! + vld1.32 {q10, q11}, [r3, :128]! + vld1.32 {q12, q13}, [r8, :128]! + +2: + subs r5, r5, #4 + vext.8 d6, d0, d1, #2 // -stride + vext.8 d7, d2, d3, #2 // 0 + vext.8 d8, d4, d5, #2 // +stride + vext.8 d9, d0, d1, #4 // +1-stride + vext.8 d10, d2, d3, #4 // +1 + vext.8 d11, d4, d5, #4 // +1+stride + vadd.i16 d2, d2, d6 // -1, -stride + vadd.i16 d7, d7, d8 // 0, +stride + vadd.i16 d0, d0, d9 // -1-stride, +1-stride + vadd.i16 d2, d2, d7 + vadd.i16 d4, d4, d11 // -1+stride, +1+stride + vadd.i16 d2, d2, d10 // +1 + vadd.i16 d0, d0, d4 + + vext.8 q3, q8, q9, #4 // -stride + vshl.i16 d2, d2, #2 + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q10, q11, #4 // 0 + vext.8 q6, q10, q11, #8 // +1 + vmla.i16 d2, d0, d28 // * 3 -> a + vadd.i32 q3, q3, q10 // -stride, -1 + vadd.i32 q8, q8, q4 // -1-stride, +1-stride + vadd.i32 q5, q5, q6 // 0, +1 + vadd.i32 q8, q8, q12 // -1+stride + vadd.i32 q3, q3, q5 + vext.8 q7, q12, q13, #4 // +stride + vext.8 q10, q12, q13, #8 // +1+stride +.if \bpc == 8 + vld1.32 {d24[0]}, [r1, :32]! // src +.else + vld1.16 {d24}, [r1, :64]! // src +.endif + vadd.i32 q3, q3, q7 // +stride + vadd.i32 q8, q8, q10 // +1+stride + vshl.i32 q3, q3, #2 + vmla.i32 q3, q8, q15 // * 3 -> b +.if \bpc == 8 + vmovl.u8 q12, d24 // src +.endif + vmov d0, d1 + vmlal.u16 q3, d2, d24 // b + a * src + vmov d2, d3 + vrshrn.i32 d6, q3, #9 + vmov d4, d5 + vst1.16 {d6}, [r0]! 
+ + ble 3f + vmov q8, q9 + vmov q10, q11 + vmov q12, q13 + vld1.16 {d1}, [r9, :64]! + vld1.16 {d3}, [r4, :64]! + vld1.16 {d5}, [r10, :64]! + vld1.32 {q9}, [r7, :128]! + vld1.32 {q11}, [r3, :128]! + vld1.32 {q13}, [r8, :128]! + b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r12, lsl #1 + add r1, r1, r2 + add r3, r3, r11, lsl #2 + add r7, r7, r11, lsl #2 + add r8, r8, r11, lsl #2 + add r4, r4, r11, lsl #1 + add r9, r9, r11, lsl #1 + add r10, r10, r11, lsl #1 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter2_\bpc\()bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + add r7, r3, #(4*(SUM_STRIDE)) + sub r3, r3, #(4*(SUM_STRIDE)) + add r8, r4, #(2*(SUM_STRIDE)) + sub r4, r4, #(2*(SUM_STRIDE)) + mov r9, #(2*SUM_STRIDE) + mov r10, #FILTER_OUT_STRIDE + add r11, r5, #7 + bic r11, r11, #7 // Aligned width +.if \bpc == 8 + sub r2, r2, r11 +.else + sub r2, r2, r11, lsl #1 +.endif + sub r10, r10, r11 + sub r9, r9, r11 + sub r9, r9, #4 // We read 4 extra elements from a + sub r12, r9, #4 // We read 8 extra elements from b + mov lr, r5 + +1: + vld1.16 {q0, q1}, [r4, :128]! + vld1.16 {q2, q3}, [r8, :128]! + vld1.32 {q8, q9}, [r3, :128]! + vld1.32 {q11, q12}, [r7, :128]! + vld1.32 {q10}, [r3, :128]! + vld1.32 {q13}, [r7, :128]! + +2: + vmov.i16 q14, #5 + vmov.i16 q15, #6 + subs r5, r5, #8 + vext.8 q4, q0, q1, #4 // +1-stride + vext.8 q5, q2, q3, #4 // +1+stride + vext.8 q6, q0, q1, #2 // -stride + vext.8 q7, q2, q3, #2 // +stride + vadd.i16 q0, q0, q4 // -1-stride, +1-stride + vadd.i16 q5, q2, q5 // -1+stride, +1+stride + vadd.i16 q2, q6, q7 // -stride, +stride + vadd.i16 q0, q0, q5 + + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q9, q10, #8 + vext.8 q6, q11, q12, #8 // +1+stride + vext.8 q7, q12, q13, #8 + vmul.i16 q0, q0, q14 // * 5 + vmla.i16 q0, q2, q15 // * 6 + vadd.i32 q4, q4, q8 // -1-stride, +1-stride + vadd.i32 q5, q5, q9 + vadd.i32 q6, q6, q11 // -1+stride, +1+stride + vadd.i32 q7, q7, q12 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q7 + vext.8 q6, q8, q9, #4 // -stride + vext.8 q7, q9, q10, #4 + vext.8 q8, q11, q12, #4 // +stride + vext.8 q11, q12, q13, #4 + +.if \bpc == 8 + vld1.8 {d4}, [r1, :64]! +.else + vld1.8 {q2}, [r1, :128]! +.endif + + vmov.i32 q14, #5 + vmov.i32 q15, #6 + + vadd.i32 q6, q6, q8 // -stride, +stride + vadd.i32 q7, q7, q11 + vmul.i32 q4, q4, q14 // * 5 + vmla.i32 q4, q6, q15 // * 6 + vmul.i32 q5, q5, q14 // * 5 + vmla.i32 q5, q7, q15 // * 6 + +.if \bpc == 8 + vmovl.u8 q2, d4 +.endif + vmlal.u16 q4, d0, d4 // b + a * src + vmlal.u16 q5, d1, d5 // b + a * src + vmov q0, q1 + vrshrn.i32 d8, q4, #9 + vrshrn.i32 d9, q5, #9 + vmov q2, q3 + vst1.16 {q4}, [r0, :128]! + + ble 3f + vmov q8, q10 + vmov q11, q13 + vld1.16 {q1}, [r4, :128]! + vld1.16 {q3}, [r8, :128]! + vld1.32 {q9, q10}, [r3, :128]! + vld1.32 {q12, q13}, [r7, :128]! + b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r10, lsl #1 + add r1, r1, r2 + add r3, r3, r9, lsl #2 + add r7, r7, r9, lsl #2 + add r4, r4, r12, lsl #1 + add r8, r8, r12, lsl #1 + + vld1.32 {q8, q9}, [r3, :128]! + vld1.16 {q0, q1}, [r4, :128]! + vld1.32 {q10}, [r3, :128]! 
+ + vmov.i16 q12, #5 + vmov.i16 q13, #6 + +4: + subs r5, r5, #8 + vext.8 q3, q0, q1, #4 // +1 + vext.8 q2, q0, q1, #2 // 0 + vadd.i16 q0, q0, q3 // -1, +1 + + vext.8 q4, q8, q9, #4 // 0 + vext.8 q5, q9, q10, #4 + vext.8 q6, q8, q9, #8 // +1 + vext.8 q7, q9, q10, #8 + vmul.i16 q2, q2, q13 // * 6 + vmla.i16 q2, q0, q12 // * 5 -> a +.if \bpc == 8 + vld1.8 {d22}, [r1, :64]! +.else + vld1.16 {q11}, [r1, :128]! +.endif + vadd.i32 q8, q8, q6 // -1, +1 + vadd.i32 q9, q9, q7 +.if \bpc == 8 + vmovl.u8 q11, d22 +.endif + vmul.i32 q4, q4, q15 // * 6 + vmla.i32 q4, q8, q14 // * 5 -> b + vmul.i32 q5, q5, q15 // * 6 + vmla.i32 q5, q9, q14 // * 5 -> b + + vmlal.u16 q4, d4, d22 // b + a * src + vmlal.u16 q5, d5, d23 + vmov q0, q1 + vrshrn.i32 d8, q4, #8 + vrshrn.i32 d9, q5, #8 + vmov q8, q10 + vst1.16 {q4}, [r0, :128]! + + ble 5f + vld1.16 {q1}, [r4, :128]! + vld1.32 {q9, q10}, [r3, :128]! + b 4b + +5: + subs r6, r6, #1 + ble 0f + mov r5, lr + sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started + sub r4, r4, r11, lsl #1 + add r0, r0, r10, lsl #1 + add r1, r1, r2 + sub r3, r3, #16 + sub r4, r4, #16 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int w, const int h, +// const int wt, const int bitdepth_max); +function sgr_weighted1_\bpc\()bpc_neon, export=1 + push {r4-r9,lr} + ldrd r4, r5, [sp, #28] + ldrd r6, r7, [sp, #36] +.if \bpc == 16 + ldr r8, [sp, #44] +.endif + vdup.16 d31, r7 + cmp r6, #2 +.if \bpc == 16 + vdup.16 q14, r8 +.endif + add r9, r0, r1 + add r12, r2, r3 + add lr, r4, #2*FILTER_OUT_STRIDE + mov r7, #(4*FILTER_OUT_STRIDE) + lsl r1, r1, #1 + lsl r3, r3, #1 + add r8, r5, #7 + bic r8, r8, #7 // Aligned width +.if \bpc == 8 + sub r1, r1, r8 + sub r3, r3, r8 +.else + sub r1, r1, r8, lsl #1 + sub r3, r3, r8, lsl #1 +.endif + sub r7, r7, r8, lsl #1 + mov r8, r5 + blt 2f +1: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r12, :64]! +.else + vld1.16 {q0}, [r2, :128]! + vld1.16 {q8}, [r12, :128]! +.endif + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [lr, :128]! + subs r5, r5, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u +.else + vshl.i16 q0, q0, #4 // u + vshl.i16 q8, q8, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q9, q9, q8 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vshll.u16 q10, d16, #7 // u << 7 + vshll.u16 q11, d17, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v + vmlal.s16 q10, d18, d31 // v + vmlal.s16 q11, d19, d31 // v +.if \bpc == 8 + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vrshrn.i32 d20, q10, #11 + vrshrn.i32 d21, q11, #11 + vqmovun.s16 d4, q2 + vqmovun.s16 d20, q10 + vst1.8 {d4}, [r0, :64]! + vst1.8 {d20}, [r9, :64]! +.else + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vqrshrun.s32 d20, q10, #11 + vqrshrun.s32 d21, q11, #11 + vmin.u16 q2, q2, q14 + vmin.u16 q10, q10, q14 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q10}, [r9, :128]! +.endif + bgt 1b + + sub r6, r6, #2 + cmp r6, #1 + blt 0f + mov r5, r8 + add r0, r0, r1 + add r9, r9, r1 + add r2, r2, r3 + add r12, r12, r3 + add r4, r4, r7 + add lr, lr, r7 + beq 2f + b 1b + +2: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! +.else + vld1.16 {q0}, [r2, :128]! +.endif + vld1.16 {q1}, [r4, :128]! 
+ subs r5, r5, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u +.else + vshl.i16 q0, q0, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v +.if \bpc == 8 + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vqmovun.s16 d2, q2 + vst1.8 {d2}, [r0, :64]! +.else + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vmin.u16 q2, q2, q14 + vst1.16 {q2}, [r0, :128]! +.endif + bgt 2b +0: + pop {r4-r9,pc} +endfunc + +// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int16_t *t2, +// const int w, const int h, +// const int16_t wt[2], const int bitdepth_max); +function sgr_weighted2_\bpc\()bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.if \bpc == 8 + ldr r8, [sp, #52] +.else + ldrd r8, r9, [sp, #52] +.endif + cmp r7, #2 + add r10, r0, r1 + add r11, r2, r3 + add r12, r4, #2*FILTER_OUT_STRIDE + add lr, r5, #2*FILTER_OUT_STRIDE + vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] +.if \bpc == 16 + vdup.16 q14, r9 +.endif + mov r8, #4*FILTER_OUT_STRIDE + lsl r1, r1, #1 + lsl r3, r3, #1 + add r9, r6, #7 + bic r9, r9, #7 // Aligned width +.if \bpc == 8 + sub r1, r1, r9 + sub r3, r3, r9 +.else + sub r1, r1, r9, lsl #1 + sub r3, r3, r9, lsl #1 +.endif + sub r8, r8, r9, lsl #1 + mov r9, r6 + blt 2f +1: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r11, :64]! +.else + vld1.16 {q0}, [r2, :128]! + vld1.16 {q8}, [r11, :128]! +.endif + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [r12, :128]! + vld1.16 {q2}, [r5, :128]! + vld1.16 {q10}, [lr, :128]! + subs r6, r6, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u +.else + vshl.i16 q0, q0, #4 // u + vshl.i16 q8, q8, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vsub.i16 q9, q9, q8 // t1 - u + vsub.i16 q10, q10, q8 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vshll.u16 q11, d16, #7 // u << 7 + vshll.u16 q8, d17, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) + vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u) + vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) + vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) + vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) +.if \bpc == 8 + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vrshrn.i32 d22, q11, #11 + vrshrn.i32 d23, q8, #11 + vqmovun.s16 d6, q3 + vqmovun.s16 d22, q11 + vst1.8 {d6}, [r0, :64]! + vst1.8 {d22}, [r10, :64]! +.else + vqrshrun.s32 d6, q3, #11 + vqrshrun.s32 d7, q0, #11 + vqrshrun.s32 d22, q11, #11 + vqrshrun.s32 d23, q8, #11 + vmin.u16 q3, q3, q14 + vmin.u16 q11, q11, q14 + vst1.16 {q3}, [r0, :128]! + vst1.16 {q11}, [r10, :128]! +.endif + bgt 1b + + subs r7, r7, #2 + cmp r7, #1 + blt 0f + mov r6, r9 + add r0, r0, r1 + add r10, r10, r1 + add r2, r2, r3 + add r11, r11, r3 + add r4, r4, r8 + add r12, r12, r8 + add r5, r5, r8 + add lr, lr, r8 + beq 2f + b 1b + +2: +.if \bpc == 8 + vld1.8 {d0}, [r2, :64]! +.else + vld1.16 {q0}, [r2, :128]! +.endif + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r5, :128]! 
+ subs r6, r6, #8 +.if \bpc == 8 + vshll.u8 q0, d0, #4 // u +.else + vshl.i16 q0, q0, #4 // u +.endif + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) +.if \bpc == 8 + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vqmovun.s16 d6, q3 + vst1.8 {d6}, [r0, :64]! +.else + vqrshrun.s32 d6, q3, #11 + vqrshrun.s32 d7, q0, #11 + vmin.u16 q3, q3, q14 + vst1.16 {q3}, [r0, :128]! +.endif + bgt 1b +0: + pop {r4-r11,pc} +endfunc +.endm diff -Nru dav1d-0.7.1/src/arm/32/mc16.S dav1d-0.9.1/src/arm/32/mc16.S --- dav1d-0.7.1/src/arm/32/mc16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/32/mc16.S 2021-07-28 21:38:28.865851900 +0000 @@ -0,0 +1,3644 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Janne Grunau + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define PREP_BIAS 8192 + +.macro avg d0, d00, d01, d1, d10, d11 + vld1.16 {q0, q1}, [r2, :128]! + vld1.16 {q2, q3}, [r3, :128]! + vqadd.s16 q0, q0, q2 + vqadd.s16 q1, q1, q3 + vmax.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vmax.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits + vshl.s16 \d0, q0, q13 // -(intermediate_bits+1) + vshl.s16 \d1, q1, q13 // -(intermediate_bits+1) +.endm + +.macro w_avg d0, d00, d01, d1, d10, d11 + vld1.16 {q0, q1}, [r2, :128]! + vld1.16 {q2, q3}, [r3, :128]! + // This difference requires a 17 bit range, and all bits are + // significant for the following multiplication. 
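The widening that the w_avg macro performs next is easier to follow against the per-pixel arithmetic it implements. Below is a scalar sketch of that intent (helper and parameter names are assumptions, and it is not claimed bit-exact: the NEON code instead computes tmp2 + ((tmp1 - tmp2) * weight >> 4) and then applies the rounding shift and PREP_BIAS correction in separate steps):

    #include <stdint.h>

    #define PREP_BIAS 8192

    static inline int clip_pixel(int v, int bitdepth_max) {
        return v < 0 ? 0 : v > bitdepth_max ? bitdepth_max : v;   /* clamp to [0, bitdepth_max] */
    }

    /* tmp1/tmp2 hold prep()-domain intermediates; weight is in [0, 16]. */
    static void w_avg_row_sketch(uint16_t *dst, const int16_t *tmp1,
                                 const int16_t *tmp2, int w,
                                 int weight, int bitdepth_max)
    {
        /* As in bidir_fn below: intermediate_bits = clz(bitdepth_max) - 18. */
        const int intermediate_bits = __builtin_clz(bitdepth_max) - 18;
        const int sh  = intermediate_bits + 4;        /* 4 extra bits come from the /16 weighting */
        const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
        for (int x = 0; x < w; x++)
            dst[x] = (uint16_t)clip_pixel(
                (tmp1[x] * weight + tmp2[x] * (16 - weight) + rnd) >> sh,
                bitdepth_max);
    }
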
+ vsubl.s16 \d0, d4, d0 + vsubl.s16 q0, d5, d1 + vsubl.s16 \d1, d6, d2 + vsubl.s16 q1, d7, d3 + vmul.s32 \d0, \d0, q4 + vmul.s32 q0, q0, q4 + vmul.s32 \d1, \d1, q4 + vmul.s32 q1, q1, q4 + vshr.s32 \d0, \d0, #4 + vshr.s32 q0, q0, #4 + vshr.s32 \d1, \d1, #4 + vshr.s32 q1, q1, #4 + vaddw.s16 \d0, \d0, d4 + vaddw.s16 q0, q0, d5 + vaddw.s16 \d1, \d1, d6 + vaddw.s16 q1, q1, d7 + vmovn.i32 \d00, \d0 + vmovn.i32 \d01, q0 + vmovn.i32 \d10, \d1 + vmovn.i32 \d11, q1 + vrshl.s16 \d0, \d0, q13 // -intermediate_bits + vrshl.s16 \d1, \d1, q13 // -intermediate_bits + vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits + vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits + vmin.s16 \d0, \d0, q15 // bitdepth_max + vmin.s16 \d1, \d1, q15 // bitdepth_max + vmax.s16 \d0, \d0, q14 // 0 + vmax.s16 \d1, \d1, q14 // 0 +.endm + +.macro mask d0, d00, d01, d1, d10, d11 + vld1.8 {q7}, [r6, :128]! + vld1.16 {q0, q1}, [r2, :128]! + vneg.s8 q7, q7 + vld1.16 {q2, q3}, [r3, :128]! + vmovl.s8 q6, d14 + vmovl.s8 q7, d15 + vmovl.s16 q4, d12 + vmovl.s16 q5, d13 + vmovl.s16 q6, d14 + vmovl.s16 q7, d15 + vsubl.s16 \d0, d4, d0 + vsubl.s16 q0, d5, d1 + vsubl.s16 \d1, d6, d2 + vsubl.s16 q1, d7, d3 + vmul.s32 \d0, \d0, q4 + vmul.s32 q0, q0, q5 + vmul.s32 \d1, \d1, q6 + vmul.s32 q1, q1, q7 + vshr.s32 \d0, \d0, #6 + vshr.s32 q0, q0, #6 + vshr.s32 \d1, \d1, #6 + vshr.s32 q1, q1, #6 + vaddw.s16 \d0, \d0, d4 + vaddw.s16 q0, q0, d5 + vaddw.s16 \d1, \d1, d6 + vaddw.s16 q1, q1, d7 + vmovn.i32 \d00, \d0 + vmovn.i32 \d01, q0 + vmovn.i32 \d10, \d1 + vmovn.i32 \d11, q1 + vrshl.s16 \d0, \d0, q13 // -intermediate_bits + vrshl.s16 \d1, \d1, q13 // -intermediate_bits + vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits + vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits + vmin.s16 \d0, \d0, q15 // bitdepth_max + vmin.s16 \d1, \d1, q15 // bitdepth_max + vmax.s16 \d0, \d0, q14 // 0 + vmax.s16 \d1, \d1, q14 // 0 +.endm + +.macro bidir_fn type, bdmax +function \type\()_16bpc_neon, export=1 + push {r4-r7,lr} + ldrd r4, r5, [sp, #20] + ldr r6, [sp, #28] + clz r4, r4 +.ifnc \type, avg + ldr r7, [sp, #32] + vmov.i16 q14, #0 + vdup.16 q15, r7 // bitdepth_max +.endif +.ifc \type, w_avg + vpush {q4} +.endif +.ifc \type, mask + vpush {q4-q7} +.endif + clz r7, \bdmax + sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18 +.ifc \type, avg + mov lr, #1 + movw r12, #2*PREP_BIAS + lsl lr, lr, r7 // 1 << intermediate_bits + neg r12, r12 // -2*PREP_BIAS + add r7, r7, #1 + sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits + neg r7, r7 // -(intermediate_bits+1) + vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits + vdup.16 q13, r7 // -(intermediate_bits+1) +.else + mov r12, #PREP_BIAS + lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits + neg r7, r7 // -intermediate_bits + vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits + vdup.16 q13, r7 // -intermediate_bits +.endif +.ifc \type, w_avg + vdup.32 q4, r6 + vneg.s32 q4, q4 +.endif + adr r7, L(\type\()_tbl) + sub r4, r4, #24 + \type q8, d16, d17, q9, d18, d19 + ldr r4, [r7, r4, lsl #2] + add r7, r7, r4 + bx r7 + + .align 2 +L(\type\()_tbl): + .word 1280f - L(\type\()_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_tbl) + CONFIG_THUMB + +40: + add r7, r0, r1 + lsl r1, r1, #1 +4: + subs r5, r5, #4 + vst1.16 {d16}, [r0, :64], r1 + vst1.16 {d17}, [r7, :64], r1 + vst1.16 {d18}, [r0, :64], r1 + vst1.16 
{d19}, [r7, :64], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 4b +80: + add r7, r0, r1 + lsl r1, r1, #1 +8: + vst1.16 {q8}, [r0, :128], r1 + subs r5, r5, #2 + vst1.16 {q9}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 8b +160: +16: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #2 + vst1.16 {q10, q11}, [r0, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 16b +320: + add r7, r0, #32 +32: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 32b +640: + add r7, r0, #32 + mov r12, #64 + sub r1, r1, #64 +64: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 64b +1280: + add r7, r0, #32 + mov r12, #64 + sub r1, r1, #192 +128: + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r12 + \type q8, d16, d17, q9, d18, d19 + vst1.16 {q10, q11}, [r7, :128], r12 + \type q10, d20, d21, q11, d22, d23 + vst1.16 {q8, q9}, [r0, :128], r1 + subs r5, r5, #1 + vst1.16 {q10, q11}, [r7, :128], r1 + ble 0f + \type q8, d16, d17, q9, d18, d19 + b 128b +0: +.ifc \type, mask + vpop {q4-q7} +.endif +.ifc \type, w_avg + vpop {q4} +.endif + pop {r4-r7,pc} +endfunc +.endm + +bidir_fn avg, r6 +bidir_fn w_avg, r7 +bidir_fn mask, r7 + + +.macro w_mask_fn type +function w_mask_\type\()_16bpc_neon, export=1 + push {r4-r10,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #96] + ldrd r6, r7, [sp, #104] + ldr r8, [sp, #112] + clz r9, r4 + adr lr, L(w_mask_\type\()_tbl) + vdup.16 q15, r8 // bitdepth_max + sub r9, r9, #24 + clz r8, r8 // clz(bitdepth_max) + ldr r9, [lr, r9, lsl #2] + add r9, lr, r9 + sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 + mov r10, #PREP_BIAS*64 + neg r8, r8 // -sh + movw r12, #27615 // (64 + 1 - 38)<> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {q6}, [r6, :128]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.8 {d12}, [r6, :64]! 
+.elseif \type == 420 + vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition) + vadd.i16 d13, d14, d15 + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r12, :64], r1 + vst1.16 {d6}, [r0, :64], r1 + vst1.16 {d7}, [r12, :64], r1 + bgt 4b + vpop {q4-q7} + pop {r4-r10,pc} +8: + vld1.16 {q2, q3}, [r2, :128]! // tmp1 + vld1.16 {q4, q5}, [r3, :128]! // tmp2 + subs r5, r5, #2 + vdup.32 q13, r10 // PREP_BIAS*64 + vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) + vabd.s16 q7, q3, q5 + vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) + vsubl.s16 q9, d9, d5 + vsubl.s16 q10, d10, d6 + vsubl.s16 q11, d11, d7 + vqsub.u16 q6, q0, q6 // 27615 - abs() + vqsub.u16 q7, q0, q7 + vshll.s16 q5, d7, #6 // tmp1 << 6 + vshll.s16 q4, d6, #6 + vshll.s16 q3, d5, #6 + vshll.s16 q2, d4, #6 + vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {q6}, [r6, :128]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.8 {d12}, [r6, :64]! +.elseif \type == 420 + vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 8b + vpop {q4-q7} + pop {r4-r10,pc} +1280: +640: +320: +160: + sub r1, r1, r4, lsl #1 +.if \type == 444 + add lr, r6, r4 +.elseif \type == 422 + add lr, r6, r4, lsr #1 +.endif + add r7, r2, r4, lsl #1 + add r9, r3, r4, lsl #1 +161: + mov r8, r4 +16: + vld1.16 {q2}, [r2, :128]! // tmp1 + vld1.16 {q4}, [r3, :128]! // tmp2 + vld1.16 {q3}, [r7, :128]! + vld1.16 {q5}, [r9, :128]! 
+ subs r8, r8, #8 + vdup.32 q13, r10 // PREP_BIAS*64 + vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) + vabd.s16 q7, q3, q5 + vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) + vsubl.s16 q9, d9, d5 + vsubl.s16 q10, d10, d6 + vsubl.s16 q11, d11, d7 + vqsub.u16 q6, q0, q6 // 27615 - abs() + vqsub.u16 q7, q0, q7 + vshll.s16 q5, d7, #6 // tmp1 << 6 + vshll.s16 q4, d6, #6 + vshll.s16 q3, d5, #6 + vshll.s16 q2, d4, #6 + vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {d12}, [r6, :64]! + vst1.8 {d13}, [lr, :64]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.32 {d12[0]}, [r6, :32]! + vst1.32 {d12[1]}, [lr, :32]! +.elseif \type == 420 + vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + bgt 16b + subs r5, r5, #2 + add r2, r2, r4, lsl #1 + add r3, r3, r4, lsl #1 + add r7, r7, r4, lsl #1 + add r9, r9, r4, lsl #1 +.if \type == 444 + add r6, r6, r4 + add lr, lr, r4 +.elseif \type == 422 + add r6, r6, r4, lsr #1 + add lr, lr, r4, lsr #1 +.endif + add r0, r0, r1 + add r12, r12, r1 + bgt 161b + vpop {q4-q7} + pop {r4-r10,pc} +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + +function blend_16bpc_neon, export=1 + push {r4-r5,lr} + ldrd r4, r5, [sp, #12] + clz lr, r3 + adr r3, L(blend_tbl) + sub lr, lr, #26 + ldr lr, [r3, lr, lsl #2] + add r3, r3, lr + bx r3 + + .align 2 +L(blend_tbl): + .word 320f - L(blend_tbl) + CONFIG_THUMB + .word 160f - L(blend_tbl) + CONFIG_THUMB + .word 80f - L(blend_tbl) + CONFIG_THUMB + .word 40f - L(blend_tbl) + CONFIG_THUMB + +40: + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld1.8 {d4}, [r5, :64]! + vld1.16 {q1}, [r2, :128]! + vld1.16 {d0}, [r0, :64] + vneg.s8 d4, d4 // -m + subs r4, r4, #2 + vld1.16 {d1}, [r12, :64] + vmovl.s8 q2, d4 + vshl.i16 q2, q2, #9 // -m << 9 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r12, :64], r1 + bgt 4b + pop {r4-r5,pc} +80: + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld1.8 {q8}, [r5, :128]! + vld1.16 {q2, q3}, [r2, :128]! 
+ vneg.s8 q9, q8 // -m + vld1.16 {q0}, [r0, :128] + vld1.16 {q1}, [r12, :128] + vmovl.s8 q8, d18 + vmovl.s8 q9, d19 + vshl.i16 q8, q8, #9 // -m << 9 + vshl.i16 q9, q9, #9 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + subs r4, r4, #2 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q9 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + bgt 8b + pop {r4-r5,pc} +160: + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld1.8 {q12, q13}, [r5, :128]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #2 + vneg.s8 q14, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vneg.s8 q15, q13 + vld1.16 {q10, q11}, [r2, :128]! + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + vmovl.s8 q15, d31 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 + vshl.i16 q15, q15, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vqrdmulh.s16 q11, q11, q15 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128], r1 + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +320: + add r12, r0, #32 +32: + vld1.8 {q12, q13}, [r5, :128]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #1 + vneg.s8 q14, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vneg.s8 q15, q13 + vld1.16 {q10, q11}, [r2, :128]! + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + vmovl.s8 q15, d31 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 + vshl.i16 q15, q15, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vqrdmulh.s16 q11, q11, q15 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128], r1 + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5,pc} +endfunc + +function blend_h_16bpc_neon, export=1 + push {r4-r5,lr} + ldr r4, [sp, #12] + movrel r5, X(obmc_masks) + add r5, r5, r4 + sub r4, r4, r4, lsr #2 + clz lr, r3 + adr r12, L(blend_h_tbl) + sub lr, lr, #24 + ldr lr, [r12, lr, lsl #2] + add r12, r12, lr + bx r12 + + .align 2 +L(blend_h_tbl): + .word 1280f - L(blend_h_tbl) + CONFIG_THUMB + .word 640f - L(blend_h_tbl) + CONFIG_THUMB + .word 320f - L(blend_h_tbl) + CONFIG_THUMB + .word 160f - L(blend_h_tbl) + CONFIG_THUMB + .word 80f - L(blend_h_tbl) + CONFIG_THUMB + .word 40f - L(blend_h_tbl) + CONFIG_THUMB + .word 20f - L(blend_h_tbl) + CONFIG_THUMB + +20: + add r12, r0, r1 + lsl r1, r1, #1 +2: + vld2.8 {d4[], d5[]}, [r5, :16]! + vld1.16 {d2}, [r2, :64]! + vext.8 d4, d4, d5, #6 + subs r4, r4, #2 + vneg.s8 d4, d4 // -m + vld1.32 {d0[]}, [r0, :32] + vld1.32 {d0[1]}, [r12, :32] + vmovl.s8 q2, d4 + vshl.i16 d4, d4, #9 // -m << 9 + vsub.i16 d2, d0, d2 // a - b + vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 + vadd.i16 d0, d0, d2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[1]}, [r12, :32], r1 + bgt 2b + pop {r4-r5,pc} +40: + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld2.8 {d4[], d5[]}, [r5, :16]! + vld1.16 {q1}, [r2, :128]! 
+ vext.8 d4, d4, d5, #4 + subs r4, r4, #2 + vneg.s8 d4, d4 // -m + vld1.16 {d0}, [r0, :64] + vld1.16 {d1}, [r12, :64] + vmovl.s8 q2, d4 + vshl.i16 q2, q2, #9 // -m << 9 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r12, :64], r1 + bgt 4b + pop {r4-r5,pc} +80: + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld2.8 {d16[], d17[]}, [r5, :16]! + vld1.16 {q2, q3}, [r2, :128]! + vneg.s8 q9, q8 // -m + vld1.16 {q0}, [r0, :128] + subs r4, r4, #2 + vmovl.s8 q8, d18 + vmovl.s8 q9, d19 + vld1.16 {q1}, [r12, :128] + vshl.i16 q8, q8, #9 // -m << 9 + vshl.i16 q9, q9, #9 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q9 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + bgt 8b + pop {r4-r5,pc} +160: + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld2.8 {d24[], d25[]}, [r5, :16]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #2 + vneg.s8 q13, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vmovl.s8 q12, d26 + vld1.16 {q10, q11}, [r2, :128]! + vmovl.s8 q13, d27 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q12 + vqrdmulh.s16 q10, q10, q13 + vqrdmulh.s16 q11, q11, q13 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vadd.i16 q3, q3, q11 + vst1.16 {q0, q1}, [r0, :128], r1 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +1280: +640: +320: + sub r1, r1, r3, lsl #1 +321: + vld1.8 {d24[]}, [r5]! + mov r12, r3 + vneg.s8 d24, d24 // -m + vmovl.s8 q12, d24 + vshl.i16 q12, q12, #9 // -m << 9 +32: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {q0, q1}, [r0, :128]! + subs r12, r12, #32 + vld1.16 {q10, q11}, [r2, :128]! + vld1.16 {q2, q3}, [r0, :128] + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + sub r0, r0, #32 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q12 + vqrdmulh.s16 q10, q10, q12 + vqrdmulh.s16 q11, q11, q12 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r0, :128]! + bgt 32b + subs r4, r4, #1 + add r0, r0, r1 + bgt 321b + pop {r4-r5,pc} +endfunc + +function blend_v_16bpc_neon, export=1 + push {r4,lr} + ldr r4, [sp, #8] + movrel lr, X(obmc_masks) + add lr, lr, r3 + clz r12, r3 + adr r3, L(blend_v_tbl) + sub r12, r12, #26 + ldr r12, [r3, r12, lsl #2] + add r3, r3, r12 + bx r3 + + .align 2 +L(blend_v_tbl): + .word 320f - L(blend_v_tbl) + CONFIG_THUMB + .word 160f - L(blend_v_tbl) + CONFIG_THUMB + .word 80f - L(blend_v_tbl) + CONFIG_THUMB + .word 40f - L(blend_v_tbl) + CONFIG_THUMB + .word 20f - L(blend_v_tbl) + CONFIG_THUMB + +20: + add r12, r0, r1 + lsl r1, r1, #1 + vld1.8 {d4[]}, [lr] + vneg.s8 d4, d4 // -m + vmovl.s8 q2, d4 + vshl.i16 d4, d4, #9 // -m << 9 +2: + vld1.32 {d2[]}, [r2, :32]! 
+ vld1.16 {d0[]}, [r0, :16] + subs r4, r4, #2 + vld1.16 {d2[1]}, [r2, :16] + vld1.16 {d0[1]}, [r12, :16] + add r2, r2, #4 + vsub.i16 d2, d0, d2 // a - b + vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 + vadd.i16 d0, d0, d2 + vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d0[1]}, [r12, :16], r1 + bgt 2b + pop {r4,pc} +40: + vld1.32 {d4[]}, [lr, :32] + add r12, r0, r1 + vneg.s8 d4, d4 // -m + lsl r1, r1, #1 + vmovl.s8 q2, d4 + sub r1, r1, #4 + vshl.i16 q2, q2, #9 // -m << 9 +4: + vld1.16 {q1}, [r2, :128]! + vld1.16 {d0}, [r0, :64] + vld1.16 {d1}, [r12, :64] + subs r4, r4, #2 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.32 {d0[0]}, [r0, :32]! + vst1.32 {d1[0]}, [r12, :32]! + vst1.16 {d0[2]}, [r0, :16], r1 + vst1.16 {d1[2]}, [r12, :16], r1 + bgt 4b + pop {r4,pc} +80: + vld1.8 {d16}, [lr, :64] + add r12, r0, r1 + vneg.s8 d16, d16 // -m + lsl r1, r1, #1 + vmovl.s8 q8, d16 + sub r1, r1, #8 + vshl.i16 q8, q8, #9 // -m << 9 +8: + vld1.16 {q2, q3}, [r2, :128]! + vld1.16 {q0}, [r0, :128] + vld1.16 {q1}, [r12, :128] + subs r4, r4, #2 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q8 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {d0}, [r0, :64]! + vst1.16 {d2}, [r12, :64]! + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d3[0]}, [r12, :32], r1 + bgt 8b + pop {r4,pc} +160: + vld1.8 {q12}, [lr, :128] + add r12, r0, r1 + vneg.s8 q13, q12 // -m + lsl r1, r1, #1 + vmovl.s8 q12, d26 + vmovl.s8 q13, d27 + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 d26, d26, #9 +16: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {d0, d1, d2}, [r0, :64] + subs r4, r4, #2 + vld1.16 {q10, q11}, [r2, :128]! + vsub.i16 q8, q0, q8 // a - b + vld1.16 {d4, d5, d6}, [r12, :64] + vsub.i16 d18, d2, d18 + vsub.i16 q10, q2, q10 + vsub.i16 d22, d6, d22 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 d18, d18, d26 + vqrdmulh.s16 q10, q10, q12 + vqrdmulh.s16 d22, d22, d26 + vadd.i16 q0, q0, q8 + vadd.i16 d2, d2, d18 + vadd.i16 q2, q2, q10 + vst1.16 {d0, d1, d2}, [r0, :64], r1 + vadd.i16 d6, d6, d22 + vst1.16 {d4, d5, d6}, [r12, :64], r1 + bgt 16b + pop {r4,pc} +320: + vld1.8 {d24, d25, d26}, [lr, :64] + vneg.s8 q14, q12 // -m + vneg.s8 d30, d26 + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + sub r1, r1, #32 + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 +32: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {q0, q1}, [r0, :128]! + subs r4, r4, #1 + vld1.16 {q10}, [r2, :128] + vsub.i16 q8, q0, q8 // a - b + vld1.16 {q2}, [r0, :128] + sub r0, r0, #32 + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128]! + add r2, r2, #32 + vst1.16 {q2}, [r0, :128], r1 + bgt 32b + pop {r4,pc} +endfunc + +// This has got the same signature as the put_8tap functions, +// and assumes that r9 is set to (clz(w)-24). 
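The put_neon helper that follows is the unfiltered copy path; its prep_neon counterpart further down additionally rebases pixels into the intermediate domain (shift left by intermediate_bits, then subtract PREP_BIAS, as its vshl/vsub pairs show). A scalar sketch of both row operations, under assumed names, with strides taken in bytes as in the assembly:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define PREP_BIAS 8192

    /* Unfiltered put: a plain strided copy of w 16-bit pixels per row. */
    static void put_row_copy(uint16_t *dst, ptrdiff_t dst_stride_bytes,
                             const uint16_t *src, ptrdiff_t src_stride_bytes,
                             int w, int h)
    {
        do {
            memcpy(dst, src, w * sizeof(*dst));
            dst = (uint16_t *)((char *)dst + dst_stride_bytes);
            src = (const uint16_t *)((const char *)src + src_stride_bytes);
        } while (--h);
    }

    /* Unfiltered prep: the same walk over src, but each pixel becomes
     * (pixel << intermediate_bits) - PREP_BIAS, written to a packed buffer
     * (the assembly precomputes r8 = w*2, i.e. a w*2-byte output stride). */
    static void prep_row_scale(int16_t *tmp, const uint16_t *src,
                               ptrdiff_t src_stride_bytes,
                               int w, int h, int intermediate_bits)
    {
        do {
            for (int x = 0; x < w; x++)
                tmp[x] = (int16_t)((src[x] << intermediate_bits) - PREP_BIAS);
            tmp += w;
            src = (const uint16_t *)((const char *)src + src_stride_bytes);
        } while (--h);
    }
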
+function put_neon + adr r10, L(put_tbl) + ldr r9, [r10, r9, lsl #2] + add r10, r10, r9 + bx r10 + + .align 2 +L(put_tbl): + .word 1280f - L(put_tbl) + CONFIG_THUMB + .word 640f - L(put_tbl) + CONFIG_THUMB + .word 320f - L(put_tbl) + CONFIG_THUMB + .word 16f - L(put_tbl) + CONFIG_THUMB + .word 80f - L(put_tbl) + CONFIG_THUMB + .word 4f - L(put_tbl) + CONFIG_THUMB + .word 2f - L(put_tbl) + CONFIG_THUMB + +2: + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d1[]}, [r2], r3 + subs r5, r5, #2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + bgt 2b + pop {r4-r11,pc} +4: + vld1.16 {d0}, [r2], r3 + vld1.16 {d1}, [r2], r3 + subs r5, r5, #2 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r0, :64], r1 + bgt 4b + pop {r4-r11,pc} +80: + add r8, r0, r1 + lsl r1, r1, #1 + add r9, r2, r3 + lsl r3, r3, #1 +8: + vld1.16 {q0}, [r2], r3 + vld1.16 {q1}, [r9], r3 + subs r5, r5, #2 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r8, :128], r1 + bgt 8b + pop {r4-r11,pc} +16: + vld1.16 {q0, q1}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q0, q1}, [r0, :128], r1 + bgt 16b + pop {r4-r11,pc} +320: + sub r1, r1, #32 + sub r3, r3, #32 +32: + vld1.16 {q0, q1}, [r2]! + vst1.16 {q0, q1}, [r0, :128]! + vld1.16 {q2, q3}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 32b + pop {r4-r11,pc} +640: + sub r1, r1, #96 + sub r3, r3, #96 +64: + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q14, q15}, [r0, :128], r1 + bgt 64b + pop {r4-r11,pc} +1280: + sub r1, r1, #224 + sub r3, r3, #224 +128: + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2]! + vst1.16 {q14, q15}, [r0, :128]! + vld1.16 {q8, q9}, [r2]! + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q10, q11}, [r2]! + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q12, q13}, [r2]! + vst1.16 {q12, q13}, [r0, :128]! + vld1.16 {q14, q15}, [r2], r3 + subs r5, r5, #1 + vst1.16 {q14, q15}, [r0, :128], r1 + bgt 128b + pop {r4-r11,pc} +endfunc + +// This has got the same signature as the prep_8tap functions, +// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and +// r8 to w*2. +function prep_neon + adr r10, L(prep_tbl) + ldr r9, [r10, r9, lsl #2] + vdup.16 q15, r7 // intermediate_bits + vmov.i16 q14, #PREP_BIAS + add r10, r10, r9 + bx r10 + + .align 2 +L(prep_tbl): + .word 1280f - L(prep_tbl) + CONFIG_THUMB + .word 640f - L(prep_tbl) + CONFIG_THUMB + .word 320f - L(prep_tbl) + CONFIG_THUMB + .word 16f - L(prep_tbl) + CONFIG_THUMB + .word 80f - L(prep_tbl) + CONFIG_THUMB + .word 40f - L(prep_tbl) + CONFIG_THUMB + +40: + add r9, r1, r2 + lsl r2, r2, #1 +4: + vld1.16 {d0}, [r1], r2 + vld1.16 {d1}, [r9], r2 + subs r4, r4, #2 + vshl.s16 q0, q0, q15 + vsub.i16 q0, q0, q14 + vst1.16 {q0}, [r0, :128]! + bgt 4b + pop {r4-r11,pc} +80: + add r9, r1, r2 + lsl r2, r2, #1 +8: + vld1.16 {q0}, [r1], r2 + vld1.16 {q1}, [r9], r2 + subs r4, r4, #2 + vshl.s16 q0, q0, q15 + vshl.s16 q1, q1, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vst1.16 {q0, q1}, [r0, :128]! 
+ bgt 8b + pop {r4-r11,pc} +16: + vld1.16 {q0, q1}, [r1], r2 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1], r2 + subs r4, r4, #2 + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vshl.s16 q3, q3, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q3, q3, q14 + vst1.16 {q2, q3}, [r0, :128]! + bgt 16b + pop {r4-r11,pc} +320: + sub r2, r2, #32 +32: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1], r2 + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vshl.s16 q3, q3, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q3, q3, q14 + vst1.16 {q2, q3}, [r0, :128]! + bgt 32b + pop {r4-r11,pc} +640: + sub r2, r2, #96 +64: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1]! + vshl.s16 q1, q1, q15 + vld1.16 {q8, q9}, [r1]! + vshl.s16 q2, q2, q15 + vld1.16 {q10, q11}, [r1], r2 + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q9, q9, q14 + vst1.16 {q2, q3}, [r0, :128]! + vsub.i16 q10, q10, q14 + vst1.16 {q8, q9}, [r0, :128]! + vsub.i16 q11, q11, q14 + vst1.16 {q10, q11}, [r0, :128]! + bgt 64b + pop {r4-r11,pc} +1280: + sub r2, r2, #224 +128: + vld1.16 {q0, q1}, [r1]! + subs r4, r4, #1 + vshl.s16 q0, q0, q15 + vld1.16 {q2, q3}, [r1]! + vshl.s16 q1, q1, q15 + vld1.16 {q8, q9}, [r1]! + vshl.s16 q2, q2, q15 + vld1.16 {q10, q11}, [r1]! + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vld1.16 {q0, q1}, [r1]! + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 + vst1.16 {q2, q3}, [r0, :128]! + vld1.16 {q2, q3}, [r1]! + vsub.i16 q11, q11, q14 + vshl.s16 q0, q0, q15 + vst1.16 {q8, q9}, [r0, :128]! + vld1.16 {q8, q9}, [r1]! + vshl.s16 q1, q1, q15 + vshl.s16 q2, q2, q15 + vst1.16 {q10, q11}, [r0, :128]! + vld1.16 {q10, q11}, [r1], r2 + vshl.s16 q3, q3, q15 + vshl.s16 q8, q8, q15 + vshl.s16 q9, q9, q15 + vshl.s16 q10, q10, q15 + vshl.s16 q11, q11, q15 + vsub.i16 q0, q0, q14 + vsub.i16 q1, q1, q14 + vsub.i16 q2, q2, q14 + vsub.i16 q3, q3, q14 + vsub.i16 q8, q8, q14 + vst1.16 {q0, q1}, [r0, :128]! + vsub.i16 q9, q9, q14 + vst1.16 {q2, q3}, [r0, :128]! + vsub.i16 q10, q10, q14 + vst1.16 {q8, q9}, [r0, :128]! + vsub.i16 q11, q11, q14 + vst1.16 {q10, q11}, [r0, :128]! 
+ bgt 128b + pop {r4-r11,pc} +endfunc + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + vld1.\wd {\d0[]}, [\s0], \strd + vld1.\wd {\d1[]}, [\s1], \strd +.ifnb \d2 + vld1.\wd {\d2[]}, [\s0], \strd + vld1.\wd {\d3[]}, [\s1], \strd +.endif +.ifnb \d4 + vld1.\wd {\d4[]}, [\s0], \strd +.endif +.ifnb \d5 + vld1.\wd {\d5[]}, [\s1], \strd +.endif +.ifnb \d6 + vld1.\wd {\d6[]}, [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + vld1.16 {\d0}, [\s0], \strd + vld1.16 {\d1}, [\s1], \strd +.ifnb \d2 + vld1.16 {\d2}, [\s0], \strd + vld1.16 {\d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.16 {\d4}, [\s0], \strd +.endif +.ifnb \d5 + vld1.16 {\d5}, [\s1], \strd +.endif +.ifnb \d6 + vld1.16 {\d6}, [\s0], \strd +.endif +.endm +.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5 + vld1.16 {\d0, \d1}, [\s0], \strd +.ifnb \d2 + vld1.16 {\d2, \d3}, [\s1], \strd +.endif +.ifnb \d4 + vld1.16 {\d4, \d5}, [\s0], \strd +.endif +.endm +.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5 + load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5 +.endm +.macro interleave_1_32 r0, r1, r2, r3, r4 + vext.8 \r0, \r0, \r1, #4 + vext.8 \r1, \r1, \r2, #4 +.ifnb \r3 + vext.8 \r2, \r2, \r3, #4 + vext.8 \r3, \r3, \r4, #4 +.endif +.endm +.macro vmin_u16 c, r0, r1, r2, r3 + vmin.u16 \r0, \r0, \c +.ifnb \r1 + vmin.u16 \r1, \r1, \c +.endif +.ifnb \r2 + vmin.u16 \r2, \r2, \c + vmin.u16 \r3, \r3, \c +.endif +.endm +.macro vsub_i16 c, r0, r1, r2, r3 + vsub.i16 \r0, \r0, \c +.ifnb \r1 + vsub.i16 \r1, \r1, \c +.endif +.ifnb \r2 + vsub.i16 \r2, \r2, \c + vsub.i16 \r3, \r3, \c +.endif +.endm +.macro vmull_vmlal_4 d, s0, s1, s2, s3 + vmull.s16 \d, \s0, d0[0] + vmlal.s16 \d, \s1, d0[1] + vmlal.s16 \d, \s2, d0[2] + vmlal.s16 \d, \s3, d0[3] +.endm +.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 + vmull.s16 \d, \s0, d0[0] + vmlal.s16 \d, \s1, d0[1] + vmlal.s16 \d, \s2, d0[2] + vmlal.s16 \d, \s3, d0[3] + vmlal.s16 \d, \s4, d1[0] + vmlal.s16 \d, \s5, d1[1] + vmlal.s16 \d, \s6, d1[2] + vmlal.s16 \d, \s7, d1[3] +.endm +.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3 + vqrshrun.s32 \d0, \q0, #\shift +.ifnb \q1 + vqrshrun.s32 \d1, \q1, #\shift +.endif +.ifnb \q2 + vqrshrun.s32 \d2, \q2, #\shift + vqrshrun.s32 \d3, \q3, #\shift +.endif +.endm +.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3 + vmovn.i32 \d0, \q0 +.ifnb \q1 + vmovn.i32 \d1, \q1 +.endif +.ifnb \q2 + vmovn.i32 \d2, \q2 + vmovn.i32 \d3, \q3 +.endif +.endm +.macro vrshl_s32 shift, r0, r1, r2, r3 + vrshl.s32 \r0, \r0, \shift + vrshl.s32 \r1, \r1, \shift +.ifnb \r2 + vrshl.s32 \r2, \r2, \shift + vrshl.s32 \r3, \r3, \shift +.endif +.endm +.macro vst1_32 strd, r0, r1 + vst1.32 {\r0[0]}, [r0, :32], \strd + vst1.32 {\r0[1]}, [r9, :32], \strd +.ifnb \r1 + vst1.32 {\r1[0]}, [r0, :32], \strd + vst1.32 {\r1[1]}, [r9, :32], \strd +.endif +.endm +.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 + vst1.16 {\r0}, [r0, \align], \strd + vst1.16 {\r1}, [r9, \align], \strd +.ifnb \r2 + vst1.16 {\r2}, [r0, \align], \strd + vst1.16 {\r3}, [r9, \align], \strd +.endif +.ifnb \r4 + vst1.16 {\r4}, [r0, \align], \strd + vst1.16 {\r5}, [r9, \align], \strd + vst1.16 {\r6}, [r0, \align], \strd + vst1.16 {\r7}, [r9, \align], \strd +.endif +.endm +.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3 +.ifc \type, put + vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + vmin_u16 q15, \q0, \q1 
+.else + vrshl_s32 q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits) + vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 + vsub_i16 q15, \q0, \q1 // PREP_BIAS +.endif +.endm +.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1_reg \strd, :64, \d0, \d1, \d2, \d3 +.endm +.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1_reg \strd, :128, \q0, \q1 +.endm +.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 + finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 + vst1.16 {\q0, \q1}, [r0, :128], \strd +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + movw r9, \type_h + movw r10, \type_v + b \op\()_8tap_neon +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2 +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap_neon + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, r11 + mul \my, \my, r11 + add \mx, \mx, r9 // mx, 8tap_h, 4tap_h + add \my, \my, r10 // my, 8tap_v, 4tap_v + +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + + vdup.16 q15, \bdmax // bitdepth_max + clz \bdmax, \bdmax + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + tst \mx, #(0x7f << 14) + sub r9, r9, #24 + add lr, \bdmax, #6 // 6 + intermediate_bits + rsb r12, \bdmax, #6 // 6 - intermediate_bits + movrel r11, X(mc_subpel_filters), -8 + bne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + bne L(\type\()_8tap_v) + b \type\()_neon + +L(\type\()_8tap_h): + cmp \w, #4 + ubfx r10, \mx, #7, #7 + and \mx, \mx, #0x7f + it gt + movgt \mx, r10 + tst \my, #(0x7f << 14) + add \mx, r11, \mx, lsl #3 + bne L(\type\()_8tap_hv) + + adr r10, L(\type\()_8tap_h_tbl) + vdup.32 q14, r12 // 6 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s32 q14, q14 // -(6-intermediate_bits) +.ifc \type, put + vdup.16 q13, \bdmax // intermediate_bits +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q13, q13 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_8tap_h_tbl): + .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +2: + vld1.16 {q2}, [\src], \s_strd + vld1.16 {q3}, [\sr2], \s_strd 
+ vext.8 d5, d4, d5, #2 + vext.8 d7, d6, d7, #2 + subs \h, \h, #2 + vtrn.32 d4, d6 + vtrn.32 d5, d7 + vmull.s16 q1, d4, d0[0] + vmlal.s16 q1, d5, d0[1] + vmlal.s16 q1, d6, d0[2] + vmlal.s16 q1, d7, d0[3] + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vqmovun.s32 d2, q1 + vrshl.s16 d2, d2, d26 // -intermediate_bits + vmin.u16 d2, d2, d30 + vst1.32 {d2[0]}, [\dst, :32], \d_strd + vst1.32 {d2[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + sub \src, \src, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q11}, [\sr2], \s_strd + vext.8 d18, d16, d17, #2 + vext.8 d19, d16, d17, #4 + vext.8 d20, d16, d17, #6 + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d21, d22, d23, #6 + subs \h, \h, #2 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmlal.s16 q2, d20, d0[3] + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d24, d0[1] + vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d4, q2 + vmovn.s32 d5, q3 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: +160: +320: +640: +1280: // 8xN, 16xN, 32xN, ... h + vpush {q4-q5} + vld1.8 {d0}, [\mx, :64] + sub \src, \src, #6 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +81: + vld1.16 {q8, q9}, [\src]! + vld1.16 {q10, q11}, [\sr2]! + mov \mx, \w + +8: + vmull.s16 q1, d16, d0[0] + vmull.s16 q2, d17, d0[0] + vmull.s16 q3, d20, d0[0] + vmull.s16 q4, d21, d0[0] +.irpc i, 1234567 + vext.8 q12, q8, q9, #(2*\i) + vext.8 q5, q10, q11, #(2*\i) +.if \i < 4 + vmlal.s16 q1, d24, d0[\i] + vmlal.s16 q2, d25, d0[\i] + vmlal.s16 q3, d10, d0[\i] + vmlal.s16 q4, d11, d0[\i] +.else + vmlal.s16 q1, d24, d1[\i-4] + vmlal.s16 q2, d25, d1[\i-4] + vmlal.s16 q3, d10, d1[\i-4] + vmlal.s16 q4, d11, d1[\i-4] +.endif +.endr + subs \mx, \mx, #8 + vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q4, q4, q14 // -(6-intermediate_bits) +.ifc \type, put + vqmovun.s32 d2, q1 + vqmovun.s32 d3, q2 + vqmovun.s32 d4, q3 + vqmovun.s32 d5, q4 + vrshl.s16 q1, q1, q13 // -intermediate_bits + vrshl.s16 q2, q2, q13 // -intermediate_bits + vmin.u16 q1, q1, q15 + vmin.u16 q2, q2, q15 +.else + vmovn.s32 d2, q1 + vmovn.s32 d3, q2 + vmovn.s32 d4, q3 + vmovn.s32 d5, q4 + vsub.i16 q1, q1, q13 // PREP_BIAS + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + vst1.16 {q1}, [\dst, :128]! + vst1.16 {q2}, [\ds2, :128]! + ble 9f + + vmov q8, q9 + vmov q10, q11 + vld1.16 {q9}, [\src]! + vld1.16 {q11}, [\sr2]! 
+ b 8b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 81b + vpop {q4-q5} + pop {r4-r11,pc} + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfx r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 + add \my, r11, \my, lsl #3 + +.ifc \type, prep + vdup.32 q14, r12 // 6 - intermediate_bits + vmov.i16 q15, #PREP_BIAS +.endif + adr r10, L(\type\()_8tap_v_tbl) + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vneg.s32 q14, q14 // -(6-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_8tap_v_tbl): + .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + bgt 28f + + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + // 2x2 v + load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + interleave_1_32 d1, d2, d3, d4, d5 + bgt 24f + vmull_vmlal_4 q8, d1, d2, d3, d4 + vqrshrun_s32 6, q8, d16 + vmin_u16 d30, d16 + vst1_32 \d_strd, d16 + pop {r4-r11,pc} + +24: // 2x4 v + load_32 \sr2, \src, \s_strd, d6, d7 + interleave_1_32 d5, d6, d7 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d3, d4, d5, d6 + vqrshrun_s32 6, q8, d16, q9, d17 + vmin_u16 q15, q8 + vst1_32 \d_strd, d16, d17 + pop {r4-r11,pc} + +28: // 2x8, 2x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + vmovl.s8 q0, d0 + + load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16 + interleave_1_32 d2, d3, d4, d5, d6 + interleave_1_32 d6, d7, d16 +216: + subs \h, \h, #8 + load_32 \sr2, \src, \s_strd, d17, d18, d19, d20 + load_32 \sr2, \src, \s_strd, d21, d22, d23, d24 + interleave_1_32 d16, d17, d18, d19, d20 + interleave_1_32 d20, d21, d22, d23, d24 + vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 + vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19 + vmull_vmlal_8 q2, d6, d7, d16, d17, d18, d19, d20, d21 + vmull_vmlal_8 q3, d16, d17, d18, d19, d20, d21, d22, d23 + vqrshrun_s32 6, q13, d26, q1, d27, q2, d2, q3, d3 + vmin_u16 q15, q13, q1 + vst1_32 \d_strd, d26, d27 + vst1_32 \d_strd, d2, d3 + ble 0f + vmov q1, q9 + vmov q2, q10 + vmov q3, q11 + vmov d16, d24 + b 216b +0: + pop {r4-r11,pc} +.endif + +40: + bgt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 + vmull_vmlal_4 q8, d1, d2, d3, d4 + vmull_vmlal_4 q9, d2, d3, d4, d5 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 + ble 0f + load_reg \sr2, \src, \s_strd, d6, d7 + vmull_vmlal_4 q8, d3, d4, d5, d6 + vmull_vmlal_4 q9, d4, d5, d6, d7 + shift_store_4 \type, \d_strd, q8, q9, d16, d17 +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16 v + vld1.8 {d0}, [\my, :64] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, 
\sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22 + +48: + subs \h, \h, #4 + load_reg \sr2, \src, \s_strd, d23, d24, d25, d26 + vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 + vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 + vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25 + vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26 + shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5 + ble 0f + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov d22, d26 + b 48b +0: + pop {r4-r11,pc} + +80: + bgt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + + load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9 + vmull_vmlal_4 q10, d2, d4, d6, d16 + vmull_vmlal_4 q11, d3, d5, d7, d17 + vmull_vmlal_4 q12, d4, d6, d16, d18 + vmull_vmlal_4 q13, d5, d7, d17, d19 + shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23 + ble 0f + load_reg \sr2, \src, \s_strd, q10, q11 + vmull_vmlal_4 q1, d6, d16, d18, d20 + vmull_vmlal_4 q2, d7, d17, d19, d21 + vmull_vmlal_4 q12, d16, d18, d20, d22 + vmull_vmlal_4 q13, d17, d19, d21, d23 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5 +0: + pop {r4-r11,pc} + +880: // 8x6, 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\my, :64] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11 + +88: + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q12, q13 + vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24 + vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25 + vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26 + vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27 + shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5 + ble 9f + subs \h, \h, #2 + load_reg \sr2, \src, \s_strd, q1, q2 + vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2 + vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3 + vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4 + vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5 + shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9 + ble 9f + vmov q5, q9 + vmov q6, q10 + vmov q7, q11 + vmov q8, q12 + vmov q9, q13 + vmov q10, q1 + vmov q11, q2 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +160: + bgt 1680b + + // 16x2, 16x4 v + vpush {q6-q7} + add \my, \my, #2 + vld1.32 {d0[]}, [\my] + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + + load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11 +16: + load_16s16 \src, \src, \s_strd, q12, q13 + subs \h, \h, #1 + vmull_vmlal_4 q1, d12, d16, d20, d24 + vmull_vmlal_4 q2, d13, d17, d21, d25 + vmull_vmlal_4 q3, d14, d18, d22, d26 + vmull_vmlal_4 q6, d15, d19, d23, d27 + shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5 + ble 0f + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + vmov q11, q13 + b 16b +0: + vpop {q6-q7} + pop {r4-r11,pc} + + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfx 
r10, \my, #7, #7 + and \my, \my, #0x7f + it gt + movgt \my, r10 +4: + add \my, r11, \my, lsl #3 + + adr r10, L(\type\()_8tap_hv_tbl) + neg r12, r12 // -(6-intermediate_bits) + ldr r9, [r10, r9, lsl #2] + vdup.32 q14, r12 // -(6-intermediate_bits) +.ifc \type, put + neg r8, lr // -(6+intermeidate_bits) +.else + vmov.i16 q13, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vdup.32 q13, r8 // -(6+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_8tap_hv_tbl): + .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB + +20: +.ifc \type, put + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 280f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + + // 2x2, 2x4 hv + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + +2: + bl L(\type\()_8tap_filter_2) + + vext.8 d18, d17, d24, #4 + vmull.s16 q2, d16, d2[0] + vmlal.s16 q2, d17, d2[1] + vmlal.s16 q2, d18, d2[2] + vmlal.s16 q2, d24, d2[3] + + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vmin.u16 d4, d4, d30 + subs \h, \h, #2 + vst1.32 {d4[0]}, [\dst, :32], \d_strd + vst1.32 {d4[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + vmov d17, d24 + b 2b + +280: // 2x8, 2x16, 2x32 hv + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vmull.s16 q11, d22, d0 + vmull.s16 q12, d24, d0 + vpadd.s32 d22, d22, d23 + vpadd.s32 d23, d24, d25 + vpadd.s32 d22, d22, d23 + vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) + vmovn.i32 d16, q8 + + bl L(\type\()_8tap_filter_2) + + vext.8 d16, d16, d16, #4 + vext.8 d16, d16, d24, #4 + vmov d17, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d18, d17, d24, #4 + vmov d19, d24 + bl L(\type\()_8tap_filter_2) + vext.8 d20, d19, d24, #4 + vmov d21, d24 + +28: + bl L(\type\()_8tap_filter_2) + vext.8 d22, d21, d24, #4 + vmull.s16 q3, d16, d2[0] + vmlal.s16 q3, d17, d2[1] + vmlal.s16 q3, d18, d2[2] + vmlal.s16 q3, d19, d2[3] + vmlal.s16 q3, d20, d3[0] + vmlal.s16 q3, d21, d3[1] + vmlal.s16 q3, d22, d3[2] + vmlal.s16 q3, d24, d3[3] + + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d6, q3 + vmin.u16 d6, d6, d30 + subs \h, \h, #2 + vst1.32 {d6[0]}, [\dst, :32], \d_strd + vst1.32 {d6[1]}, [\ds2, :32], \d_strd + ble 0f + vmov q8, q9 + vmov q9, q10 + vmov d20, d22 + vmov d21, d24 + b 28b +0: + pop {r4-r11,pc} + +L(\type\()_8tap_filter_2): + vld1.16 {q11}, [\sr2], \s_strd + vld1.16 {q12}, [\src], \s_strd + vext.8 d23, d22, d23, #2 + vext.8 d25, d24, d25, #2 + vtrn.32 q11, q12 + vmull.s16 q3, d22, d0[0] + vmlal.s16 q3, d23, d0[1] + vmlal.s16 q3, d24, d0[2] + vmlal.s16 q3, d25, 
d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + bx lr +.endif + +40: + add \mx, \mx, #2 + vld1.32 {d0[]}, [\mx] + bgt 480f + add \my, \my, #2 + vld1.32 {d2[]}, [\my] + sub \sr2, \src, #2 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + // 4x2, 4x4 hv + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d17, q10 + + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +4: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d17, d2[0] + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q2, d19, d2[2] + vmlal.s16 q2, d24, d2[3] + vmull.s16 q3, d18, d2[0] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q3, d24, d2[2] + vmlal.s16 q3, d25, d2[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d17, d19 + vmov q9, q12 + b 4b +0: + pop {r4-r11,pc} + +480: // 4x8, 4x16, 4x32 hv + vpush {d13-d15} + vld1.8 {d2}, [\my, :64] + sub \src, \src, #2 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 q10, d23, d0[3] + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d13, q10 + + bl L(\type\()_8tap_filter_4) + vmov q7, q12 + bl L(\type\()_8tap_filter_4) + vmov q8, q12 + bl L(\type\()_8tap_filter_4) + vmov q9, q12 + +48: + bl L(\type\()_8tap_filter_4) + vmull.s16 q2, d13, d2[0] + vmlal.s16 q2, d14, d2[1] + vmlal.s16 q2, d15, d2[2] + vmlal.s16 q2, d16, d2[3] + vmlal.s16 q2, d17, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q2, d19, d3[2] + vmlal.s16 q2, d24, d3[3] + vmull.s16 q3, d14, d2[0] + vmlal.s16 q3, d15, d2[1] + vmlal.s16 q3, d16, d2[2] + vmlal.s16 q3, d17, d2[3] + vmlal.s16 q3, d18, d3[0] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q3, d24, d3[2] + vmlal.s16 q3, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vmin.u16 q2, q2, q15 +.else + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vsub.i16 q2, q2, q13 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {d4}, [\dst, :64], \d_strd + vst1.16 {d5}, [\ds2, :64], \d_strd + ble 0f + vmov d13, d15 + vmov q7, q8 + vmov q8, q9 + vmov q9, q12 + b 48b +0: + vpop {d13-d15} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_4): + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d24, d20, d21, #2 + vext.8 d25, d20, d21, #4 + vext.8 d21, d20, d21, #6 + vmull.s16 q3, d20, d0[0] + vmlal.s16 q3, d24, d0[1] + vmlal.s16 q3, d25, d0[2] + vmlal.s16 q3, d21, d0[3] + vext.8 d24, d22, d23, #2 + vext.8 d25, d22, d23, #4 + vext.8 d23, d22, d23, #6 + vmull.s16 q10, d22, d0[0] + vmlal.s16 q10, d24, d0[1] + vmlal.s16 q10, d25, d0[2] + vmlal.s16 
q10, d23, d0[3] + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) + vmovn.i32 d24, q3 + vmovn.i32 d25, q10 + bx lr + +80: +160: +320: + bgt 880f + add \my, \my, #2 + vld1.8 {d0}, [\mx, :64] + vld1.32 {d2[]}, [\my] + sub \src, \src, #6 + sub \src, \src, \s_strd + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d16, q2 + vmovn.i32 d17, q3 + + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +8: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d16, d2[0] + vmull.s16 q3, d17, d2[0] + vmull.s16 q13, d18, d2[0] + vmull.s16 q14, d19, d2[0] +.ifc \type, put + vdup.32 q8, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d18, d2[1] + vmlal.s16 q3, d19, d2[1] + vmlal.s16 q13, d20, d2[1] + vmlal.s16 q14, d21, d2[1] + vmlal.s16 q2, d20, d2[2] + vmlal.s16 q3, d21, d2[2] + vmlal.s16 q13, d22, d2[2] + vmlal.s16 q14, d23, d2[2] + vmlal.s16 q2, d22, d2[3] + vmlal.s16 q3, d23, d2[3] + vmlal.s16 q13, d24, d2[3] + vmlal.s16 q14, d25, d2[3] +.ifc \type, put + vdup.16 q9, \bdmax // bitdepth_max + vrshl.s32 q2, q2, q8 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q8 // -(6+intermediate_bits) + vrshl.s32 q13, q13, q8 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q8 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q9, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q9 // PREP_BIAS + vsub.i16 q3, q3, q9 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 8b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 164b +0: + pop {r4-r11,pc} + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv +640: +1280: + vpush {q4-q7} + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] + sub \src, \src, #6 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + vmovl.s8 q0, d0 + vmovl.s8 q1, d2 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + vld1.16 {q11, q12}, [\src], \s_strd + vmull.s16 q2, d22, d0[0] + vmull.s16 q3, d23, d0[0] + vdup.32 q14, r12 // -(6-intermediate_bits) +.irpc i, 1234567 + vext.8 q10, q11, q12, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d20, d0[\i] + vmlal.s16 q3, d21, d0[\i] +.else + vmlal.s16 q2, d20, d1[\i - 4] + vmlal.s16 q3, d21, d1[\i - 4] +.endif +.endr + vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) + vmovn.i32 d8, q2 + vmovn.i32 d9, q3 + + bl L(\type\()_8tap_filter_8) + vmov q5, q11 + vmov q6, q12 + bl L(\type\()_8tap_filter_8) + vmov q7, q11 + vmov q8, q12 + bl L(\type\()_8tap_filter_8) + vmov q9, q11 + vmov q10, q12 + +88: + bl L(\type\()_8tap_filter_8) + vmull.s16 q2, d8, d2[0] + vmull.s16 q3, d9, d2[0] + vmull.s16 q13, d10, d2[0] + vmull.s16 q14, d11, d2[0] +.ifc \type, put + vdup.32 q4, r8 // -(6+intermediate_bits) +.endif + vmlal.s16 q2, d10, d2[1] + vmlal.s16 q3, d11, d2[1] + vmlal.s16 q13, d12, d2[1] + vmlal.s16 q14, d13, d2[1] + vmlal.s16 q2, d12, d2[2] + vmlal.s16 q3, d13, d2[2] + vmlal.s16 q13, d14, d2[2] + vmlal.s16 q14, d15, d2[2] + vmlal.s16 q2, d14, d2[3] + vmlal.s16 q3, d15, d2[3] + vmlal.s16 q13, d16, d2[3] + vmlal.s16 q14, d17, d2[3] + vmlal.s16 q2, d16, d3[0] + vmlal.s16 q3, d17, d3[0] + vmlal.s16 q13, d18, d3[0] + vmlal.s16 q14, d19, d3[0] + vmlal.s16 q2, d18, d3[1] + vmlal.s16 q3, d19, d3[1] + vmlal.s16 q13, d20, d3[1] + vmlal.s16 q14, d21, d3[1] + vmlal.s16 q2, d20, d3[2] + vmlal.s16 q3, d21, d3[2] + vmlal.s16 q13, d22, d3[2] + vmlal.s16 q14, d23, d3[2] + vmlal.s16 q2, d22, d3[3] + vmlal.s16 q3, d23, d3[3] + vmlal.s16 q13, d24, d3[3] + vmlal.s16 q14, d25, d3[3] +.ifc \type, put + vrshl.s32 q2, q2, q4 // -(6+intermediate_bits) + vrshl.s32 q3, q3, q4 // -(6+intermediate_bits) + vrshl.s32 q13, q13, q4 // -(6+intermediate_bits) + vrshl.s32 q14, q14, q4 // -(6+intermediate_bits) + vqmovun.s32 d4, q2 + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q13 + vqmovun.s32 d7, q14 + vmin.u16 q2, q2, q15 + vmin.u16 q3, q3, q15 +.else + vmov.i16 q5, #PREP_BIAS + vrshrn.i32 d4, q2, #6 + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q13, #6 + vrshrn.i32 d7, q14, #6 + vsub.i16 q2, q2, q5 // PREP_BIAS + vsub.i16 q3, q3, q5 // PREP_BIAS +.endif + subs \h, \h, #2 + vst1.16 {q2}, [\dst, :128], \d_strd + vst1.16 {q3}, [\ds2, :128], \d_strd + ble 9f + vmov q4, q6 + vmov q5, q7 + vmov q6, q8 + vmov q7, q9 + vmov q8, q10 + vmov q9, q11 + vmov q10, q12 + b 88b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 168b +0: + vpop {q4-q7} + pop {r4-r11,pc} + +L(\type\()_8tap_filter_8): + vld1.16 {q13, q14}, [\sr2], \s_strd + vmull.s16 q2, d26, d0[0] + vmull.s16 q3, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q2, d24, d0[\i] + vmlal.s16 q3, d25, d0[\i] +.else + vmlal.s16 q2, d24, d1[\i - 4] + vmlal.s16 q3, d25, d1[\i - 4] +.endif +.endr + vdup.32 q12, r12 // -(6-intermediate_bits) + vld1.16 {q13, q14}, [\src], \s_strd + vrshl.s32 q2, q2, q12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q12 // -(6-intermediate_bits) + 
vmovn.i32 d4, q2 + vmovn.i32 d5, q3 + + vmull.s16 q3, d26, d0[0] + vmull.s16 q11, d27, d0[0] +.irpc i, 1234567 + vext.8 q12, q13, q14, #(2*\i) +.if \i < 4 + vmlal.s16 q3, d24, d0[\i] + vmlal.s16 q11, d25, d0[\i] +.else + vmlal.s16 q3, d24, d1[\i - 4] + vmlal.s16 q11, d25, d1[\i - 4] +.endif +.endr + vdup.32 q13, r12 // -(6-intermediate_bits) + vrshl.s32 q3, q3, q13 // -(6-intermediate_bits) + vrshl.s32 q11, q11, q13 // -(6-intermediate_bits) + + vmovn.i32 d24, q3 + vmovn.i32 d25, q11 + vmov q11, q2 + bx lr +endfunc + +function \type\()_bilin_16bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] +.ifc \bdmax, r8 + ldr r8, [sp, #52] +.endif + vdup.16 q1, \mx + vdup.16 q3, \my + rsb r9, \mx, #16 + rsb r10, \my, #16 + vdup.16 q0, r9 + vdup.16 q2, r10 +.ifc \type, prep + lsl \d_strd, \w, #1 +.endif + clz \bdmax, \bdmax // bitdepth_max + clz r9, \w + sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 + cmp \mx, #0 + sub r9, r9, #24 + rsb r11, \bdmax, #4 // 4 - intermediate_bits + add r12, \bdmax, #4 // 4 + intermediate_bits + bne L(\type\()_bilin_h) + cmp \my, #0 + bne L(\type\()_bilin_v) + b \type\()_neon + +L(\type\()_bilin_h): + cmp \my, #0 + bne L(\type\()_bilin_hv) + + adr r10, L(\type\()_bilin_h_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.16 q14, \bdmax // intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s16 q14, q14 // -intermediate_bits +.endif + bx r10 + + .align 2 +L(\type\()_bilin_h_tbl): + .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + vld1.16 {d16}, [\src], \s_strd + vld1.16 {d18}, [\sr2], \s_strd + vext.8 d17, d16, d16, #2 + vext.8 d19, d18, d18, #2 + vtrn.32 d16, d18 + vtrn.32 d17, d19 + subs \h, \h, #2 + vmul.i16 d16, d16, d0 + vmla.i16 d16, d17, d2 + vrshl.u16 d16, d16, d30 + vrshl.u16 d16, d16, d28 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + bgt 2b + pop {r4-r11,pc} +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + vld1.16 {q8}, [\src], \s_strd + vld1.16 {q10}, [\sr2], \s_strd + vext.8 q9, q8, q8, #2 + vext.8 q11, q10, q10, #2 + vmov d17, d20 + vmov d19, d22 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vrshl.u16 q8, q8, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 +.else + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + bgt 4b + pop {r4-r11,pc} + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + vld1.16 {d16, d17, d18}, [\src], \s_strd + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vext.8 q9, q8, q9, #2 + vext.8 q11, q10, q11, #2 + subs \h, \h, #2 + vmul.i16 q8, q8, q0 + vmla.i16 q8, q9, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q11, q1 + vrshl.u16 q8, q8, q15 + vrshl.u16 q10, q10, q15 +.ifc \type, put + vrshl.u16 q8, q8, q14 + 
vrshl.u16 q10, q10, q14 +.else + vsub.i16 q8, q8, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q10}, [\ds2, :128], \d_strd + bgt 8b + pop {r4-r11,pc} +160: +320: +640: +1280: // 16xN, 32xN, ... h + vpush {q4-q7} + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, lsl #1 + sub \s_strd, \s_strd, #16 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, lsl #1 +.endif +161: + vld1.16 {q4}, [\src]! + vld1.16 {q9}, [\sr2]! + mov \mx, \w + +16: + vld1.16 {q5, q6}, [\src]! + vld1.16 {q10, q11}, [\sr2]! + vext.8 q7, q4, q5, #2 + vext.8 q8, q5, q6, #2 + vext.8 q12, q9, q10, #2 + vext.8 q13, q10, q11, #2 + vmul.i16 q4, q4, q0 + vmla.i16 q4, q7, q1 + vmul.i16 q5, q5, q0 + vmla.i16 q5, q8, q1 + vmul.i16 q9, q9, q0 + vmla.i16 q9, q12, q1 + vmul.i16 q10, q10, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q4, q4, q15 + vrshl.u16 q5, q5, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + subs \mx, \mx, #16 +.ifc \type, put + vrshl.u16 q4, q4, q14 + vrshl.u16 q5, q5, q14 + vrshl.u16 q9, q9, q14 + vrshl.u16 q10, q10, q14 +.else + vsub.i16 q4, q4, q14 + vsub.i16 q5, q5, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 +.endif + vst1.16 {q4, q5}, [\dst, :128]! + vst1.16 {q9, q10}, [\ds2, :128]! + ble 9f + + vmov q4, q6 + vmov q9, q11 + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + bgt 161b + vpop {q4-q7} + pop {r4-r11,pc} + + +L(\type\()_bilin_v): + cmp \h, #4 + adr r10, L(\type\()_bilin_v_tbl) +.ifc \type, prep + vdup.16 q15, r11 // 4 - intermediate_bits +.endif + ldr r9, [r10, r9, lsl #2] +.ifc \type, prep + vmov.i16 q14, #PREP_BIAS + vneg.s16 q15, q15 // -(4-intermediate_bits) +.endif + add r10, r10, r9 + bx r10 + + .align 2 +L(\type\()_bilin_v_tbl): + .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + vld1.32 {d16[]}, [\src], \s_strd + bgt 24f + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vmul.i16 d16, d16, d4 + vmla.i16 d16, d17, d6 + vrshr.u16 d16, d16, #4 + vst1.32 {d16[0]}, [\dst, :32] + vst1.32 {d16[1]}, [\ds2, :32] + pop {r4-r11,pc} +24: // 2x4, 2x8, ... 
v + vld1.32 {d17[]}, [\sr2], \s_strd + vld1.32 {d18[]}, [\src], \s_strd + vld1.32 {d19[]}, [\sr2], \s_strd + vld1.32 {d20[]}, [\src], \s_strd + vext.8 d16, d16, d17, #4 + vext.8 d17, d17, d18, #4 + vext.8 d18, d18, d19, #4 + vext.8 d19, d19, d20, #4 + vswp d17, d18 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + subs \h, \h, #4 + vrshr.u16 q8, q8, #4 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + vst1.32 {d17[0]}, [\dst, :32], \d_strd + vst1.32 {d17[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d20 + b 24b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {d16}, [\src], \s_strd +4: + vld1.16 {d17}, [\sr2], \s_strd + vld1.16 {d19}, [\src], \s_strd + vmov d18, d17 + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 +.else + vrshl.u16 q8, q8, q15 + vsub.i16 q8, q8, q14 +.endif + vst1.16 {d16}, [\dst, :64], \d_strd + vst1.16 {d17}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + vld1.16 {q8}, [\src], \s_strd +8: + vld1.16 {q9}, [\sr2], \s_strd + vld1.16 {q10}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q9, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q10, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 +.endif + vst1.16 {q8}, [\dst, :128], \d_strd + vst1.16 {q9}, [\ds2, :128], \d_strd + ble 0f + vmov q8, q10 + b 8b +0: + pop {r4-r11,pc} + +160: // 16xN, 32xN, ... +320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q8, q9}, [\src], \s_strd +2: + vld1.16 {q10, q11}, [\sr2], \s_strd + vld1.16 {q12, q13}, [\src], \s_strd + vmul.i16 q8, q8, q2 + vmla.i16 q8, q10, q3 + vmul.i16 q9, q9, q2 + vmla.i16 q9, q11, q3 + vmul.i16 q10, q10, q2 + vmla.i16 q10, q12, q3 + vmul.i16 q11, q11, q2 + vmla.i16 q11, q13, q3 + subs \h, \h, #2 +.ifc \type, put + vrshr.u16 q8, q8, #4 + vrshr.u16 q9, q9, #4 + vrshr.u16 q10, q10, #4 + vrshr.u16 q11, q11, #4 +.else + vrshl.u16 q8, q8, q15 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + vrshl.u16 q11, q11, q15 + vsub.i16 q8, q8, q14 + vsub.i16 q9, q9, q14 + vsub.i16 q10, q10, q14 + vsub.i16 q11, q11, q14 +.endif + vst1.16 {q8, q9}, [\dst, :128], \d_strd + vst1.16 {q10, q11}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q12 + vmov q9, q13 + b 2b +9: + subs \w, \w, #16 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #32 + add \dst, \dst, #32 + b 1b +0: + pop {r4-r11,pc} + +L(\type\()_bilin_hv): + adr r10, L(\type\()_bilin_hv_tbl) + vdup.16 q15, r11 // 4 - intermediate_bits + ldr r9, [r10, r9, lsl #2] + vneg.s16 q15, q15 // -(4-intermediate_bits) +.ifc \type, put + vdup.32 q14, r12 // 4 + intermediate_bits +.else + vmov.i16 q14, #PREP_BIAS +.endif + add r10, r10, r9 +.ifc \type, put + vneg.s32 q14, q14 // -(4+intermediate_bits) +.endif + bx r10 + + .align 2 +L(\type\()_bilin_hv_tbl): + .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + 
.word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB + +20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + vext.8 d16, d16, d16, #4 + +2: + vld1.16 {d20}, [\sr2], \s_strd + vld1.16 {d22}, [\src], \s_strd + vext.8 d21, d20, d20, #2 + vext.8 d23, d22, d22, #2 + vtrn.32 d20, d22 + vtrn.32 d21, d23 + vmul.i16 d18, d20, d0 + vmla.i16 d18, d21, d2 + vrshl.u16 d18, d18, d30 + + vext.8 d16, d16, d18, #4 + + vmull.u16 q8, d16, d4 + vmlal.u16 q8, d18, d6 + vrshl.u32 q8, q8, q14 + vmovn.i32 d16, q8 + subs \h, \h, #2 + vst1.32 {d16[0]}, [\dst, :32], \d_strd + vst1.32 {d16[1]}, [\ds2, :32], \d_strd + ble 0f + vmov d16, d18 + b 2b +0: + pop {r4-r11,pc} +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {q10}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vmul.i16 d16, d20, d0 + vmla.i16 d16, d21, d2 + vrshl.u16 d16, d16, d30 + +4: + vld1.16 {q10}, [\sr2], \s_strd + vld1.16 {q11}, [\src], \s_strd + vext.8 d21, d20, d21, #2 + vext.8 d23, d22, d23, #2 + vswp d21, d22 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vrshl.u16 q9, q9, q15 + + vmull.u16 q10, d16, d4 + vmlal.u16 q10, d18, d6 + vmull.u16 q11, d18, d4 + vmlal.u16 q11, d19, d6 +.ifc \type, put + vrshl.u32 q10, q10, q14 + vrshl.u32 q11, q11, q14 + vmovn.i32 d20, q10 + vmovn.i32 d21, q11 +.else + vrshrn.i32 d20, q10, #4 + vrshrn.i32 d21, q11, #4 + vsub.i16 q10, q10, q14 +.endif + subs \h, \h, #2 + vst1.16 {d20}, [\dst, :64], \d_strd + vst1.16 {d21}, [\ds2, :64], \d_strd + ble 0f + vmov d16, d19 + b 4b +0: + pop {r4-r11,pc} + +80: // 8xN, 16xN, ... 
hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + vld1.16 {d20, d21, d22}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vmul.i16 q8, q10, q0 + vmla.i16 q8, q11, q1 + vrshl.u16 q8, q8, q15 + +2: + vld1.16 {d20, d21, d22}, [\sr2], \s_strd + vld1.16 {d24, d25, d26}, [\src], \s_strd + vext.8 q11, q10, q11, #2 + vext.8 q13, q12, q13, #2 + vmul.i16 q9, q10, q0 + vmla.i16 q9, q11, q1 + vmul.i16 q10, q12, q0 + vmla.i16 q10, q13, q1 + vrshl.u16 q9, q9, q15 + vrshl.u16 q10, q10, q15 + + vmull.u16 q11, d16, d4 + vmlal.u16 q11, d18, d6 + vmull.u16 q12, d17, d4 + vmlal.u16 q12, d19, d6 + vmull.u16 q8, d18, d4 + vmlal.u16 q8, d20, d6 + vmull.u16 q9, d19, d4 + vmlal.u16 q9, d21, d6 +.ifc \type, put + vrshl.u32 q11, q11, q14 + vrshl.u32 q12, q12, q14 + vrshl.u32 q8, q8, q14 + vrshl.u32 q9, q9, q14 + vmovn.i32 d22, q11 + vmovn.i32 d23, q12 + vmovn.i32 d16, q8 + vmovn.i32 d17, q9 +.else + vrshrn.i32 d22, q11, #4 + vrshrn.i32 d23, q12, #4 + vrshrn.i32 d16, q8, #4 + vrshrn.i32 d17, q9, #4 + vsub.i16 q11, q11, q14 + vsub.i16 q8, q8, q14 +.endif + subs \h, \h, #2 + vst1.16 {q11}, [\dst, :128], \d_strd + vst1.16 {q8}, [\ds2, :128], \d_strd + ble 9f + vmov q8, q10 + b 2b +9: + subs \w, \w, #8 + ble 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + mls \src, \s_strd, \my, \src + mls \dst, \d_strd, \my, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 + add \dst, \dst, #16 + b 1b +0: + pop {r4-r11,pc} +endfunc +.endm + +filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 +filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10 + +.macro load_filter_ptr src + asr r12, \src, #10 + add r12, r11, r12, lsl #3 +.endm + +.macro load_filter_coef dst, src, inc + add \src, \src, \inc + vld1.8 {\dst}, [r12, :64] +.endm + +.macro load_filter_row dst, src, inc + load_filter_ptr \src + load_filter_coef \dst, \src, \inc +.endm + +function warp_filter_horz_neon + load_filter_ptr r5 // filter 0 + vld1.16 {q6,q7}, [r2], r3 + + load_filter_coef d0, r5, r7 // filter 0 + load_filter_row d2, r5, r7 // filter 1 + vmovl.s8 q0, d0 // filter 0 + vext.8 q3, q6, q7, #2*1 // filter 1 pixels + vmovl.s8 q1, d2 // filter 1 + + vmull.s16 q4, d12, d0 // filter 0 output (0-3) + vmull.s16 q5, d13, d1 // filter 0 output (4-7) + + load_filter_ptr r5 // filter 2 + + vmull.s16 q2, d6, d2 // filter 1 output (0-3) + vmull.s16 q3, d7, d3 // filter 1 output (4-7) + + load_filter_coef d0, r5, r7 // filter 2 + + vpadd.i32 d8, d8, d9 // half pixel 0 (2x32) + vpadd.i32 d9, d10, d11 // half pixel 0 (2x32) + + load_filter_ptr r5 // filter 3 + + vpadd.i32 d4, d4, d5 // half pixel 1 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 1 (2x32) + + vmovl.s8 q0, d0 // filter 2 + vext.8 q3, q6, q7, #2*2 // filter 2 pixels + + vpadd.i32 d8, d8, d9 // pixel 0 (2x32) + vpadd.i32 d9, d4, d5 // pixel 1 (2x32) + + load_filter_coef d2, r5, r7 // filter 3 + + vmull.s16 q2, d6, d0 // filter 2 output (0-3) + vmull.s16 q3, d7, d1 // filter 2 output (4-7) + + load_filter_ptr r5 // filter 4 + + vpadd.i32 d8, d8, d9 // pixel 0,1 + + vpadd.i32 d9, d4, d5 // half pixel 2 (2x32) + vpadd.i32 d10, d6, d7 // half pixel 2 (2x32) + + vmovl.s8 q1, d2 // filter 3 + vext.8 q3, q6, q7, #2*3 // filter 3 pixels + + load_filter_coef d0, r5, r7 // filter 4 + + vpadd.i32 d9, d9, d10 // pixel 2 (2x32) + + vmull.s16 q2, d6, d2 // filter 3 output (0-3) + vmull.s16 q3, d7, d3 // filter 3 output (4-7) + + vmovl.s8 q0, d0 // filter 4 + load_filter_ptr r5 // 
filter 5 + + vpadd.i32 d10, d4, d5 // half pixel 3 (2x32) + vpadd.i32 d11, d6, d7 // half pixel 3 (2x32) + + vext.8 q3, q6, q7, #2*4 // filter 4 pixels + load_filter_coef d2, r5, r7 // filter 5 + + vpadd.i32 d10, d10, d11 // pixel 3 (2x32) + + vpadd.i32 d9, d9, d10 // pixel 2,3 + + vmull.s16 q2, d6, d0 // filter 4 output (0-3) + vmull.s16 q3, d7, d1 // filter 4 output (4-7) + + vmovl.s8 q1, d2 // filter 5 + load_filter_ptr r5 // filter 6 + + vpadd.i32 d10, d4, d5 // half pixel 4 (2x32) + vpadd.i32 d11, d6, d7 // half pixel 4 (2x32) + + vext.8 q3, q6, q7, #2*5 // filter 5 pixels + load_filter_coef d0, r5, r7 // filter 6 + + vpadd.i32 d10, d10, d11 // pixel 4 (2x32) + + vmull.s16 q2, d6, d2 // filter 5 output (0-3) + vmull.s16 q3, d7, d3 // filter 5 output (4-7) + + vmovl.s8 q0, d0 // filter 6 + load_filter_ptr r5 // filter 7 + + vpadd.i32 d4, d4, d5 // half pixel 5 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 5 (2x32) + + vext.8 q3, q6, q7, #2*6 // filter 6 pixels + load_filter_coef d2, r5, r7 // filter 7 + + vpadd.i32 d11, d4, d5 // pixel 5 (2x32) + + vmull.s16 q2, d6, d0 // filter 6 output (0-3) + vmull.s16 q3, d7, d1 // filter 6 output (4-7) + + vmovl.s8 q1, d2 // filter 7 + + vpadd.i32 d10, d10, d11 // pixel 4,5 + + vpadd.i32 d4, d4, d5 // half pixel 6 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 6 (2x32) + + vext.8 q3, q6, q7, #2*7 // filter 7 pixels + + vpadd.i32 d11, d4, d5 // pixel 6 (2x32) + + vmull.s16 q2, d6, d2 // filter 7 output (0-3) + vmull.s16 q3, d7, d3 // filter 7 output (4-7) + + vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits) + + vpadd.i32 d4, d4, d5 // half pixel 7 (2x32) + vpadd.i32 d5, d6, d7 // half pixel 7 (2x32) + + sub r5, r5, r7, lsl #3 + + vpadd.i32 d4, d4, d5 // pixel 7 (2x32) + + add r5, r5, r8 + + vpadd.i32 d11, d11, d4 // pixel 6,7 + + vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits) + vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits) + + bx lr +endfunc + +// void dav1d_warp_affine_8x8_16bpc_neon( +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *const abcd, int mx, int my, +// const int bitdepth_max) +.macro warp t +function warp_affine_8x8\t\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + sub sp, sp, #8 + + clz r7, r7 + // intermediate_bits = clz(bitdepth_max) - 18 +.ifb \t + sub r8, r7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7 +.endif + sub r7, r7, #25 // -(7 - intermediate_bits) +.ifb \t + neg r8, r8 // -(7 + intermediate_bits) +.endif + str r7, [sp] // spill -(7 - intermediate_bits) on stack +.ifb \t + str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack +.endif + + ldrd r8, r9, [r4] + sxth r7, r8 + asr r8, r8, #16 + asr r4, r9, #16 + sxth r9, r9 + mov r10, #8 + sub r2, r2, r3, lsl #1 + sub r2, r2, r3 + sub r2, r2, #6 + movrel r11, X(mc_warp_filter), 64*8 +.ifnb \t + lsl r1, r1, #1 +.endif + add r5, r5, #512 + add r6, r6, #512 + + bl warp_filter_horz_neon + vmovn.i32 d16, q4 + vmovn.i32 d17, q5 + bl warp_filter_horz_neon + vmovn.i32 d18, q4 + vmovn.i32 d19, q5 + bl warp_filter_horz_neon + vmovn.i32 d20, q4 + vmovn.i32 d21, q5 + bl warp_filter_horz_neon + vmovn.i32 d22, q4 + vmovn.i32 d23, q5 + bl warp_filter_horz_neon + vmovn.i32 d24, q4 + vmovn.i32 d25, q5 + bl warp_filter_horz_neon + vmovn.i32 d26, q4 + vmovn.i32 d27, q5 + bl warp_filter_horz_neon + vmovn.i32 d28, q4 + vmovn.i32 d29, q5 + +1: + bl warp_filter_horz_neon + vmovn.i32 d30, q4 + vmovn.i32 d31, q5 + + load_filter_row d8, r6, r9 + 
load_filter_row d9, r6, r9 + load_filter_row d10, r6, r9 + load_filter_row d11, r6, r9 + load_filter_row d12, r6, r9 + load_filter_row d13, r6, r9 + load_filter_row d14, r6, r9 + load_filter_row d15, r6, r9 + transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15 + vmovl.s8 q1, d8 + vmovl.s8 q2, d9 + vmovl.s8 q3, d10 + vmovl.s8 q4, d11 + vmovl.s8 q5, d12 + vmovl.s8 q6, d13 + + sub r6, r6, r9, lsl #3 + + // This ordering of vmull/vmlal is highly beneficial for + // Cortex A8/A9/A53 here, but harmful for Cortex A7. + vmull.s16 q0, d16, d2 + vmlal.s16 q0, d18, d4 + vmlal.s16 q0, d20, d6 + vmlal.s16 q0, d22, d8 + vmlal.s16 q0, d24, d10 + vmlal.s16 q0, d26, d12 + vmull.s16 q1, d17, d3 + vmlal.s16 q1, d19, d5 + vmlal.s16 q1, d21, d7 + vmlal.s16 q1, d23, d9 + vmlal.s16 q1, d25, d11 + vmlal.s16 q1, d27, d13 + + vmovl.s8 q2, d14 + vmovl.s8 q3, d15 + + vmlal.s16 q0, d28, d4 + vmlal.s16 q0, d30, d6 + vmlal.s16 q1, d29, d5 + vmlal.s16 q1, d31, d7 + +.ifb \t + ldr lr, [sp, #4] // -(7 + intermediate_bits) + ldr r12, [sp, #120] // bitdepth_max + vdup.32 q2, lr // -(7 + intermediate_bits) + vdup.16 q3, r12 // bitdepth_max +.endif + + vmov q8, q9 + vmov q9, q10 +.ifb \t + vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits) + vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits) +.else + vrshrn.s32 d0, q0, #7 + vrshrn.s32 d1, q1, #7 + vmov.i16 q3, #PREP_BIAS +.endif + vmov q10, q11 +.ifb \t + vqmovun.s32 d0, q0 + vqmovun.s32 d1, q1 +.else + vsub.i16 q0, q0, q3 // PREP_BIAS +.endif + vmov q11, q12 + vmov q12, q13 +.ifb \t + vmin.u16 q0, q0, q3 // bitdepth_max +.endif + vmov q13, q14 + vmov q14, q15 + subs r10, r10, #1 + vst1.16 {q0}, [r0, :128], r1 + + add r6, r6, r4 + bgt 1b + + add sp, sp, #8 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +warp +warp t + +// void dav1d_emu_edge_16bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_16bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + ldrd r8, r9, [sp, #52] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub r12, r3, #1 // ih - 1 + cmp r5, r3 + sub lr, r2, #1 // iw - 1 + it lt + movlt r12, r5 // min(y, ih - 1) + cmp r4, r2 + bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0) + it lt + movlt lr, r4 // min(x, iw - 1) + bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0) + mla r8, r12, r9, r8 // ref += iclip() * stride + add r8, r8, lr, lsl #1 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add r10, r5, r1 // y + bh + neg r5, r5 // -y + sub r10, r10, r3 // y + bh - ih + sub r12, r1, #1 // bh - 1 + cmp r10, r1 + bic r5, r5, r5, asr #31 // max(-y, 0) + it ge + movge r10, r12 // min(y + bh - ih, bh-1) + cmp r5, r1 + bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0) + it ge + movge r5, r12 // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add r11, r4, r0 // x + bw + neg r4, r4 // -x + sub r11, r11, r2 // x + bw - iw + sub lr, r0, #1 // bw - 1 + cmp r11, r0 + bic r4, r4, r4, asr #31 // max(-x, 0) + it ge + movge r11, lr // min(x + bw - iw, bw-1) + cmp r4, r0 + bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0) + it ge + movge r4, lr // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + 
// center_w = bw - left_ext - right_ext + sub r1, r1, r5 // bh - top_ext + mla r6, r5, r7, r6 + sub r2, r0, r4 // bw - left_ext + sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext + sub r2, r2, r11 // center_w = bw - left_ext - right_ext + + mov r0, r6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + vld1.16 {d0[], d1[]}, [r8] + mov r12, r6 // out = dst + mov r3, r4 + vmov q1, q0 +1: + subs r3, r3, #16 + vst1.16 {q0, q1}, [r12, :128]! + bgt 1b +.endif + mov lr, r8 + add r12, r6, r4, lsl #1 // out = dst + left_ext + mov r3, r2 +1: + vld1.16 {q0, q1}, [lr]! + subs r3, r3, #32 + vld1.16 {q2, q3}, [lr]! +.if \need_left + vst1.16 {q0, q1}, [r12]! + vst1.16 {q2, q3}, [r12]! +.else + vst1.16 {q0, q1}, [r12, :128]! + vst1.16 {q2, q3}, [r12, :128]! +.endif + bgt 1b +.if \need_right + add r3, r8, r2, lsl #1 // in + center_w + sub r3, r3, #2 // in + center_w - 1 + add r12, r6, r4, lsl #1 // dst + left_ext + vld1.16 {d0[], d1[]}, [r3] + add r12, r12, r2, lsl #1 // out = dst + left_ext + center_w + mov r3, r11 + vmov q1, q0 +1: + subs r3, r3, #16 + vst1.16 {q0, q1}, [r12]! + bgt 1b +.endif + + subs r1, r1, #1 // center_h-- + add r6, r6, r7 + add r8, r8, r9 + bgt 0b +.endm + + cmp r4, #0 + beq 2f + // need_left + cmp r11, #0 + beq 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cmp r11, #0 + beq 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + cmp r10, #0 + // Storing the original dst in r0 overwrote bw, recalculate it here + add r2, r2, r4 // center_w + left_ext + add r2, r2, r11 // bw = center_w + left_ext + right_ext + + beq 3f + // need_bottom + sub r8, r6, r7 // ref = dst - stride + mov r4, r2 + sub r12, r7, #32 +1: + vld1.16 {q0, q1}, [r8, :128]! + mov r3, r10 + vld1.16 {q2, q3}, [r8, :128]! +2: + vst1.16 {q0, q1}, [r6, :128]! + subs r3, r3, #1 + vst1.16 {q2, q3}, [r6, :128], r12 + bgt 2b + mls r6, r7, r10, r6 // dst -= bottom_ext * stride + subs r4, r4, #32 // bw -= 32 + add r6, r6, #64 // dst += 32 + bgt 1b + +3: + cmp r5, #0 + beq 3f + // need_top + mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride + sub r12, r7, #32 +1: + vld1.16 {q0, q1}, [r0, :128]! + mov r3, r5 + vld1.16 {q2, q3}, [r0, :128]! +2: + vst1.16 {q0, q1}, [r6, :128]! 
+ subs r3, r3, #1 + vst1.16 {q2, q3}, [r6, :128], r12 + bgt 2b + mls r6, r7, r5, r6 // dst -= top_ext * stride + subs r2, r2, #32 // bw -= 32 + add r6, r6, #64 // dst += 32 + bgt 1b + +3: + pop {r4-r11,pc} +endfunc diff -Nru dav1d-0.7.1/src/arm/32/mc.S dav1d-0.9.1/src/arm/32/mc.S --- dav1d-0.7.1/src/arm/32/mc.S 2020-06-21 11:48:54.960126400 +0000 +++ dav1d-0.9.1/src/arm/32/mc.S 2021-07-28 21:38:28.861851700 +0000 @@ -71,8 +71,7 @@ .macro bidir_fn type function \type\()_8bpc_neon, export=1 push {r4-r6,lr} - ldr r4, [sp, #16] - ldr r5, [sp, #20] + ldrd r4, r5, [sp, #16] clz r4, r4 .ifnc \type, avg ldr lr, [sp, #24] @@ -220,10 +219,8 @@ .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 push {r4-r9,lr} - ldr r4, [sp, #28] - ldr r5, [sp, #32] - ldr r6, [sp, #36] - ldr r7, [sp, #40] + ldrd r4, r5, [sp, #28] + ldrd r6, r7, [sp, #36] clz r8, r4 adr r9, L(w_mask_\type\()_tbl) sub r8, r8, #24 @@ -455,8 +452,7 @@ function blend_8bpc_neon, export=1 push {r4-r5,lr} - ldr r4, [sp, #12] - ldr r5, [sp, #16] + ldrd r4, r5, [sp, #12] clz lr, r3 adr r3, L(blend_tbl) sub lr, lr, #26 @@ -566,17 +562,17 @@ endfunc function blend_h_8bpc_neon, export=1 - push {r4-r8,lr} - ldr r4, [sp, #24] + push {r4-r5,lr} + ldr r4, [sp, #12] movrel r5, X(obmc_masks) add r5, r5, r4 sub r4, r4, r4, lsr #2 - clz r6, r3 - adr r7, L(blend_h_tbl) - sub r6, r6, #24 - ldr r6, [r7, r6, lsl #2] - add r7, r7, r6 - bx r7 + clz lr, r3 + adr r12, L(blend_h_tbl) + sub lr, lr, #24 + ldr lr, [r12, lr, lsl #2] + add r12, r12, lr + bx r12 .align 2 L(blend_h_tbl): @@ -594,7 +590,7 @@ lsl r1, r1, #1 2: vld1.16 {d2[], d3[]}, [r5, :16]! - vld1.32 {d1[0]}, [r2, :32]! + vld1.32 {d1[]}, [r2, :32]! subs r4, r4, #2 vld1.16 {d0[]}, [r0, :16] vzip.8 d2, d3 @@ -606,7 +602,7 @@ vst1.16 {d20[0]}, [r0, :16], r1 vst1.16 {d20[1]}, [r12, :16], r1 bgt 2b - pop {r4-r8,pc} + pop {r4-r5,pc} 40: vmov.i8 d22, #64 add r12, r0, r1 @@ -625,7 +621,7 @@ vst1.32 {d20[0]}, [r0, :32], r1 vst1.32 {d20[1]}, [r12, :32], r1 bgt 4b - pop {r4-r8,pc} + pop {r4-r5,pc} 80: vmov.i8 q8, #64 add r12, r0, r1 @@ -646,7 +642,7 @@ vst1.u8 {d22}, [r0, :64], r1 vst1.u8 {d23}, [r12, :64], r1 bgt 8b - pop {r4-r8,pc} + pop {r4-r5,pc} 160: vmov.i8 q12, #64 add r12, r0, r1 @@ -673,7 +669,7 @@ vst1.u8 {q9}, [r0, :128], r1 vst1.u8 {q10}, [r12, :128], r1 bgt 16b - pop {r4-r8,pc} + pop {r4-r5,pc} 320: 640: 1280: @@ -682,7 +678,7 @@ 321: vld1.u8 {d6[]}, [r5]! vsub.i8 d7, d20, d6 - mov r8, r3 + mov r12, r3 32: vld1.u8 {q8, q9}, [r2, :128]! vld1.u8 {q0, q1}, [r0, :128] @@ -698,25 +694,25 @@ vmlal.u8 q14, d3, d7 vrshrn.i16 d2, q15, #6 vrshrn.i16 d3, q14, #6 + subs r12, r12, #32 vst1.u8 {q0, q1}, [r0, :128]! 
- subs r8, r8, #32 bgt 32b add r0, r0, r1 subs r4, r4, #1 bgt 321b - pop {r4-r8,pc} + pop {r4-r5,pc} endfunc function blend_v_8bpc_neon, export=1 - push {r4-r5,lr} - ldr r4, [sp, #12] - movrel r5, X(obmc_masks) - add r5, r5, r3 - clz lr, r3 + push {r4,lr} + ldr r4, [sp, #8] + movrel lr, X(obmc_masks) + add lr, lr, r3 + clz r12, r3 adr r3, L(blend_v_tbl) - sub lr, lr, #26 - ldr lr, [r3, lr, lsl #2] - add r3, r3, lr + sub r12, r12, #26 + ldr r12, [r3, r12, lsl #2] + add r3, r3, r12 bx r3 .align 2 @@ -729,7 +725,7 @@ 20: vmov.i8 d22, #64 - vld1.8 {d2[]}, [r5] + vld1.8 {d2[]}, [lr] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d3, d22, d2 @@ -746,10 +742,10 @@ vst1.8 {d6[0]}, [r0], r1 vst1.8 {d6[1]}, [r12], r1 bgt 2b - pop {r4-r5,pc} + pop {r4,pc} 40: vmov.i8 d22, #64 - vld1.32 {d4[]}, [r5, :32] + vld1.32 {d4[]}, [lr, :32] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d5, d22, d4 @@ -767,10 +763,10 @@ vst1.8 {d20[2]}, [r0], r1 vst1.8 {d20[6]}, [r12], r1 bgt 4b - pop {r4-r5,pc} + pop {r4,pc} 80: vmov.i8 d16, #64 - vld1.u8 {d2}, [r5, :64] + vld1.u8 {d2}, [lr, :64] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d17, d16, d2 @@ -791,10 +787,10 @@ vst1.16 {d22[2]}, [r0, :16], r1 vst1.16 {d23[2]}, [r12, :16], r1 bgt 8b - pop {r4-r5,pc} + pop {r4,pc} 160: vmov.i8 q12, #64 - vld1.u8 {q14}, [r5, :128] + vld1.u8 {q14}, [lr, :128] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 q11, q12, q14 @@ -821,10 +817,10 @@ vst1.32 {d19[0]}, [r0, :32], r1 vst1.32 {d21[0]}, [r12, :32], r1 bgt 16b - pop {r4-r5,pc} + pop {r4,pc} 320: vmov.i8 q10, #64 - vld1.u8 {q2, q3}, [r5, :128] + vld1.u8 {q2, q3}, [lr, :128] vsub.i8 q11, q10, q2 vsub.i8 d24, d20, d6 32: @@ -842,7 +838,7 @@ vrshrn.i16 d2, q15, #6 vst1.u8 {d0, d1, d2}, [r0, :64], r1 bgt 32b - pop {r4-r5,pc} + pop {r4,pc} endfunc @@ -1403,12 +1399,12 @@ vld1.8 {d24}, [\sr2], \s_strd vmovl.u8 q8, d16 vmovl.u8 q12, d24 - vext.8 q9, q8, q8, #2 - vext.8 q10, q8, q8, #4 - vext.8 q11, q8, q8, #6 - vext.8 q13, q12, q12, #2 - vext.8 q14, q12, q12, #4 - vext.8 q15, q12, q12, #6 + vext.8 d18, d16, d17, #2 + vext.8 d20, d16, d17, #4 + vext.8 d22, d16, d17, #6 + vext.8 d26, d24, d25, #2 + vext.8 d28, d24, d25, #4 + vext.8 d30, d24, d25, #6 subs \h, \h, #2 vmul.s16 d4, d16, d0[0] vmla.s16 d4, d18, d0[1] @@ -1431,7 +1427,7 @@ pop {r4-r11,pc} 80: // 8xN h - vld1.8 {d0}, [\mx] + vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1482,7 +1478,7 @@ // one temporary for vext in the loop. That's slower on A7 and A53, // (but surprisingly, marginally faster on A8 and A73). 
vpush {q4-q6} - vld1.8 {d0}, [\mx] + vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd @@ -1629,7 +1625,7 @@ 28: // 2x8, 2x16 v vpush {q4-q7} - vld1.8 {d0}, [\my] + vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd @@ -1709,7 +1705,7 @@ 480: // 4x8, 4x16 v vpush {q4} - vld1.8 {d0}, [\my] + vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd @@ -1782,7 +1778,7 @@ 640: 1280: vpush {q4} - vld1.8 {d0}, [\my] + vld1.8 {d0}, [\my, :64] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 @@ -1951,11 +1947,10 @@ bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d26, #4 - vmov d19, d26 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] - vmlal.s16 q2, d19, d2[3] + vmlal.s16 q2, d26, d2[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 @@ -1964,11 +1959,11 @@ vst1.16 {d4[1]}, [\ds2, :16], \d_strd ble 0f vmov d16, d18 - vmov d17, d19 + vmov d17, d26 b 2b 280: // 2x8, 2x16, 2x32 hv - vld1.8 {d2}, [\my] + vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd @@ -2001,7 +1996,6 @@ 28: bl L(\type\()_8tap_filter_2) vext.8 d22, d21, d26, #4 - vmov d23, d26 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] @@ -2009,7 +2003,7 @@ vmlal.s16 q2, d20, d3[0] vmlal.s16 q2, d21, d3[1] vmlal.s16 q2, d22, d3[2] - vmlal.s16 q2, d23, d3[3] + vmlal.s16 q2, d26, d3[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 @@ -2022,7 +2016,7 @@ vmov d18, d20 vmov d19, d21 vmov d20, d22 - vmov d21, d23 + vmov d21, d26 b 28b 0: @@ -2108,7 +2102,7 @@ b 4b 480: // 4x8, 4x16, 4x32 hv - vld1.8 {d2}, [\my] + vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd @@ -2211,7 +2205,7 @@ bgt 880f vpush {q4-q7} add \my, \my, #2 - vld1.8 {d0}, [\mx] + vld1.8 {d0}, [\mx, :64] vld1.32 {d2[]}, [\my] sub \src, \src, #3 sub \src, \src, \s_strd @@ -2301,8 +2295,8 @@ 640: 1280: vpush {q4-q7} - vld1.8 {d0}, [\mx] - vld1.8 {d2}, [\my] + vld1.8 {d0}, [\mx, :64] + vld1.8 {d2}, [\my, :64] sub \src, \src, #3 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 @@ -2972,8 +2966,8 @@ .endm .macro load_filter_coef dst, src, inc - vld1.8 {\dst}, [r12, :64] add \src, \src, \inc + vld1.8 {\dst}, [r12, :64] .endm .macro load_filter_row dst, src, inc @@ -2984,71 +2978,56 @@ function warp_filter_horz_neon load_filter_ptr r5 // filter 0 vld1.16 {q7}, [r2], r3 + vmov.i8 q6, #128 load_filter_coef d0, r5, r7 // filter 0 - vmovl.u8 q6, d14 // original pixels - load_filter_row d2, r5, r7 // filter 1 - vmovl.u8 q7, d15 // original pixels - load_filter_row d4, r5, r7 // filter 2 - vmovl.s8 q0, d0 // filter 0 - vext.8 q3, q6, q7, #2*1 // filter 1 pixels + load_filter_row d1, r5, r7 // filter 1 + load_filter_row d2, r5, r7 // filter 2 load_filter_ptr r5 // filter 3 - vmovl.s8 q1, d2 // filter 1 - vmul.i16 q5, q6, q0 // filter 0 output - load_filter_coef d0, r5, r7 // filter 3 - vmovl.s8 q2, d4 // filter 2 + veor q7, q7, q6 // subtract by 128 to allow using vmull + load_filter_coef d3, r5, r7 // filter 3 + vext.8 d12, d14, d15, #1 // filter 1 pixels + vext.8 d13, d14, d15, #2 // filter 2 pixels load_filter_ptr r5 // filter 4 - vext.8 q4, q6, q7, #2*2 // filter 2 pixels - vmul.i16 q3, q3, q1 // filter 1 output - load_filter_coef d2, r5, r7 // filter 4 - vmul.i16 q4, q4, q2 // filter 2 output - vext.8 q2, q6, q7, #2*3 // filter 3 pixels - vmovl.s8 q0, d0 // filter 3 - 
vpaddl.s16 q5, q5 // pixel 0 (4x32) - vpaddl.s16 q3, q3 // pixel 1 (4x32) - vmul.i16 q0, q2, q0 // filter 3 output + vmull.s8 q2, d14, d0 // filter 0 output + vmull.s8 q3, d12, d1 // filter 1 output + load_filter_coef d0, r5, r7 // filter 4 load_filter_ptr r5 // filter 5 - vext.8 q2, q6, q7, #2*4 // filter 4 pixels - vmovl.s8 q1, d2 // filter 4 - vpaddl.s16 q4, q4 // pixel 2 (4x32) - vpadd.s32 d10, d10, d11 // pixel 0 (2x32) - vpadd.s32 d11, d6, d7 // pixel 1 (2x32) - load_filter_coef d6, r5, r7 // filter 5 - vmul.i16 q1, q2, q1 // filter 4 output - vpadd.s32 d8, d8, d9 // pixel 2 (2x32) + vext.8 d12, d14, d15, #3 // filter 3 pixels + vmull.s8 q4, d13, d2 // filter 2 output + vext.8 d13, d14, d15, #4 // filter 4 pixels + vpadd.i16 d4, d4, d5 // pixel 0 (4x16) + vpadd.i16 d5, d6, d7 // pixel 1 (4x16) + load_filter_coef d1, r5, r7 // filter 5 load_filter_ptr r5 // filter 6 - vpaddl.s16 q0, q0 // pixel 3 (4x32) - vpadd.s32 d10, d10, d11 // pixel 0,1 - vext.8 q2, q6, q7, #2*5 // filter 5 pixels - vmovl.s8 q3, d6 // filter 5 - vpaddl.s16 q1, q1 // pixel 4 (4x32) - vpadd.s32 d9, d0, d1 // pixel 3 (2x32) + vmull.s8 q5, d12, d3 // filter 3 output + vext.8 d12, d14, d15, #5 // filter 5 pixels + vmull.s8 q3, d13, d0 // filter 4 output load_filter_coef d0, r5, r7 // filter 6 - vmul.i16 q2, q2, q3 // filter 5 output - vpadd.s32 d11, d8, d9 // pixel 2,3 + vext.8 d13, d14, d15, #6 // filter 6 pixels load_filter_ptr r5 // filter 7 - vpaddl.s16 q2, q2 // pixel 5 (4x32) - vpadd.s32 d8, d2, d3 // pixel 4 (2x32) - vext.8 q3, q6, q7, #2*6 // filter 6 pixels - vmovl.s8 q0, d0 // filter 6 - vpadd.s32 d9, d4, d5 // pixel 5 (2x32) - load_filter_coef d4, r5, r7 // filter 7 - vpadd.s32 d8, d8, d9 // pixel 4,5 - vext.8 q1, q6, q7, #2*7 // filter 7 pixels - vmovl.s8 q2, d4 // filter 7 - vmul.i16 q3, q3, q0 // filter 6 output - vmul.i16 q1, q1, q2 // filter 7 output + vpadd.i16 d8, d8, d9 // pixel 2 (4x16) + vpadd.i16 d9, d10, d11 // pixel 3 (4x16) + vmull.s8 q5, d12, d1 // filter 5 output + load_filter_coef d1, r5, r7 // filter 7 + vext.8 d14, d14, d15, #7 // filter 7 pixels + vpadd.i16 d6, d6, d7 // pixel 4 (4x16) + vpadd.i16 d10, d10, d11 // pixel 5 (4x16) + vmull.s8 q6, d13, d0 // filter 6 output + vmull.s8 q7, d14, d1 // filter 7 output + sub r5, r5, r7, lsl #3 - vpaddl.s16 q3, q3 // pixel 6 (4x32) - vpaddl.s16 q1, q1 // pixel 7 (4x32) - vpadd.s32 d6, d6, d7 // pixel 6 (2x32) - vpadd.s32 d2, d2, d3 // pixel 7 (2x32) - vpadd.s32 d9, d6, d2 // pixel 6,7 - add r5, r5, r8 + vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16) + vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16) + vpadd.i16 d12, d12, d13 // pixel 6 (4x16) + vpadd.i16 d14, d14, d15 // pixel 7 (4x16) + vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16) + vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16) + vpadd.i16 d4, d4, d5 // pixel 0-3 + vpadd.i16 d5, d6, d10 // pixel 4-7 - vrshrn.s32 d10, q5, #3 - vrshrn.s32 d11, q4, #3 + add r5, r5, r8 bx lr endfunc @@ -3080,23 +3059,23 @@ add r6, r6, #512 bl warp_filter_horz_neon - vmov q8, q5 + vrshr.s16 q8, q2, #3 bl warp_filter_horz_neon - vmov q9, q5 + vrshr.s16 q9, q2, #3 bl warp_filter_horz_neon - vmov q10, q5 + vrshr.s16 q10, q2, #3 bl warp_filter_horz_neon - vmov q11, q5 + vrshr.s16 q11, q2, #3 bl warp_filter_horz_neon - vmov q12, q5 + vrshr.s16 q12, q2, #3 bl warp_filter_horz_neon - vmov q13, q5 + vrshr.s16 q13, q2, #3 bl warp_filter_horz_neon - vmov q14, q5 + vrshr.s16 q14, q2, #3 1: bl warp_filter_horz_neon - vmov q15, q5 + vrshr.s16 q15, q2, #3 load_filter_row d8, r6, r9 load_filter_row d9, r6, r9 @@ -3139,12 +3118,19 @@ vmlal.s16 q1, 
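The rewritten warp_filter_horz_neon works on pixels that were XORed with #128 (the "subtract by 128 to allow using vmull" above), so the horizontal taps can be applied with vmull.s8 without widening first. Because the warp coefficients sum to a fixed power of two (128 in AV1), the bias only offsets every output by a constant, which is restored after the vertical pass by the vmov.i16 #128 / #0x800 plus vadd.i16 added below. A small C sketch of the identity being exploited, assuming the taps sum to 128:

    #include <stdint.h>

    /* sum(coef[i]*px[i]) == sum(coef[i]*(px[i]-128)) + 128*sum(coef[i]).
     * Each (px - 128) fits in int8, so every product fits vmull.s8's
     * 16-bit result; the constant 128*128 term is folded into a single
     * add after the later shifts instead of being applied per sample. */
    static int warp_tap_sum_biased(const uint8_t *px, const int8_t coef[8])
    {
        int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += coef[i] * (px[i] - 128);
        return sum + 128 * 128;   /* equals the unbiased sum */
    }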
d29, d5 vmlal.s16 q1, d31, d7 +.ifb \t + vmov.i16 q7, #128 +.else + vmov.i16 q7, #0x800 +.endif + vmov q8, q9 vmov q9, q10 vqrshrn.s32 d0, q0, #\shift vmov q10, q11 vqrshrn.s32 d1, q1, #\shift vmov q11, q12 + vadd.i16 q0, q0, q7 vmov q12, q13 .ifb \t vqmovun.s16 d0, q0 @@ -3240,10 +3226,9 @@ .macro v_loop need_left, need_right 0: .if \need_left - vld1.8 {d0[]}, [r8] + vld1.8 {d0[], d1[]}, [r8] mov r12, r6 // out = dst mov r3, r4 - vmov d1, d0 1: subs r3, r3, #16 vst1.8 {q0}, [r12, :128]! @@ -3265,10 +3250,9 @@ add r3, r8, r2 // in + center_w sub r3, r3, #1 // in + center_w - 1 add r12, r6, r4 // dst + left_ext - vld1.8 {d0[]}, [r3] + vld1.8 {d0[], d1[]}, [r3] add r12, r12, r2 // out = dst + left_ext + center_w mov r3, r11 - vmov d1, d0 1: subs r3, r3, #16 vst1.8 {q0}, [r12]! diff -Nru dav1d-0.7.1/src/arm/32/util.S dav1d-0.9.1/src/arm/32/util.S --- dav1d-0.7.1/src/arm/32/util.S 2020-06-21 11:48:54.960126400 +0000 +++ dav1d-0.9.1/src/arm/32/util.S 2021-07-28 21:38:28.865851900 +0000 @@ -69,6 +69,56 @@ #endif .endm +// This macro clobbers r7 (and r12 on windows) and stores data at the +// bottom of the stack; sp is the start of the space allocated that +// the caller can use. +.macro sub_sp_align space +#if CONFIG_THUMB + mov r7, sp + and r7, r7, #15 +#else + and r7, sp, #15 +#endif + sub sp, sp, r7 + // Now the stack is aligned, store the amount of adjustment back + // on the stack, as we don't want to waste a register as frame + // pointer. + str r7, [sp, #-16]! +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub r7, sp, #4096 + ldr r12, [r7] + sub r7, r7, #(\space - 4096) + mov sp, r7 +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro add_sp_align space +.if \space >= 4096 + add sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + add sp, sp, #(\space)%4096 +.endif + ldr r7, [sp], #16 + // Add back the original stack adjustment + add sp, sp, r7 +.endm + .macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vtrn.32 \q0, \q2 vtrn.32 \q1, \q3 @@ -108,6 +158,14 @@ vtrn.8 \r2, \r3 .endm +.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vswp \r1, \r4 // vtrn.64 \q0, \q2 + vswp \r3, \r6 // vtrn.64 \q1, \q3 + + vtrn.32 \q0, \q1 + vtrn.32 \q2, \q3 +.endm + .macro transpose_4x4h q0, q1, r0, r1, r2, r3 vtrn.32 \q0, \q1 diff -Nru dav1d-0.7.1/src/arm/64/cdef.S dav1d-0.9.1/src/arm/64/cdef.S --- dav1d-0.7.1/src/arm/64/cdef.S 2020-06-21 11:48:54.960126400 +0000 +++ dav1d-0.9.1/src/arm/64/cdef.S 2021-07-28 21:38:28.865851900 +0000 @@ -363,10 +363,8 @@ neg v20.16b, v21.16b // -imin() bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() - smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain() - smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain() - smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain() - smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain() + mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain() + mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain() .endm // void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, @@ -418,8 +416,11 @@ ld1 {v0.s}[3], [x14] // px .endif - movi v1.8h, #0 // sum - movi v2.8h, #0 // sum 
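The sub_sp_align/add_sp_align macros added to util.S above give a function a 16-byte-aligned scratch area without dedicating a register as a frame pointer: the alignment remainder is stored in the 16 bytes just below the aligned stack pointer and read back on exit, and on Windows the allocation is additionally split so no single decrement skips a 4 KB guard page. A C model of the bookkeeping (a sketch only; the real macros manipulate sp directly):

    #include <stdint.h>
    #include <string.h>

    /* Align "sp" down to 16 bytes, remember the adjustment just below the
     * aligned address, then reserve `space` bytes (stack grows downward). */
    static uint8_t *sub_sp_align(uint8_t *sp, size_t space)
    {
        uintptr_t adj = (uintptr_t)sp & 15;    /* and r7, sp, #15     */
        sp -= adj;                             /* now 16-byte aligned */
        sp -= 16;
        memcpy(sp, &adj, sizeof(adj));         /* str r7, [sp, #-16]! */
        return sp - space;                     /* sub sp, sp, #space  */
    }

    static uint8_t *add_sp_align(uint8_t *sp, size_t space)
    {
        uintptr_t adj;
        sp += space;                           /* add sp, sp, #space  */
        memcpy(&adj, sp, sizeof(adj));         /* ldr r7, [sp], #16   */
        return sp + 16 + adj;                  /* undo the alignment  */
    }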
+ // We need 9-bits or two 8-bit accululators to fit the sum. + // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228. + // Start sum at -1 instead of 0 to help handle rounding later. + movi v1.16b, #255 // sum + movi v2.16b, #0 // sum .if \min mov v3.16b, v0.16b // min mov v4.16b, v0.16b // max @@ -468,16 +469,16 @@ .endif b.ne 2b - sshr v5.8h, v1.8h, #15 // -(sum < 0) - sshr v6.8h, v2.8h, #15 // -(sum < 0) - add v1.8h, v1.8h, v5.8h // sum - (sum < 0) - add v2.8h, v2.8h, v6.8h // sum - (sum < 0) - srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 - srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4 - uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4 - uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4 - sqxtun v0.8b, v1.8h - sqxtun2 v0.16b, v2.8h + // Perform halving adds since the value won't fit otherwise. + // To handle the offset for negative values, use both halving w/ and w/o rounding. + srhadd v5.16b, v1.16b, v2.16b // sum >> 1 + shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 + sshr v1.16b, v5.16b, #7 // sum < 0 + bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 + + srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4 + + usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4 .if \min umin v0.16b, v0.16b, v4.16b umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) diff -Nru dav1d-0.7.1/src/arm/64/cdef_tmpl.S dav1d-0.9.1/src/arm/64/cdef_tmpl.S --- dav1d-0.7.1/src/arm/64/cdef_tmpl.S 2020-06-21 11:48:54.960126400 +0000 +++ dav1d-0.9.1/src/arm/64/cdef_tmpl.S 2021-07-28 21:38:28.865851900 +0000 @@ -107,7 +107,7 @@ .macro filter_func w, bpc, pri, sec, min, suffix function cdef_filter\w\suffix\()_\bpc\()bpc_neon .if \bpc == 8 - ldr w8, [sp] // bitdepth_max + ldr w8, [sp] // edges cmp w8, #0xf b.eq cdef_filter\w\suffix\()_edged_8bpc_neon .endif @@ -311,6 +311,30 @@ .endif .endm +// Steps for loading and preparing each row +.macro dir_load_step1 s1, bpc +.if \bpc == 8 + ld1 {\s1\().8b}, [x0], x1 +.else + ld1 {\s1\().8h}, [x0], x1 +.endif +.endm + +.macro dir_load_step2 s1, bpc +.if \bpc == 8 + usubl \s1\().8h, \s1\().8b, v31.8b +.else + ushl \s1\().8h, \s1\().8h, v8.8h +.endif +.endm + +.macro dir_load_step3 s1, bpc +// Nothing for \bpc == 8 +.if \bpc != 8 + sub \s1\().8h, \s1\().8h, v31.8h +.endif +.endm + // int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, // unsigned *const var) .macro find_dir bpc @@ -333,21 +357,15 @@ movi v3.8h, #0 // v2-v3 sum_diag[1] movi v5.8h, #0 // v4-v5 sum_hv[0-1] movi v7.8h, #0 // v6-v7 sum_alt[0] + dir_load_step1 v26, \bpc // Setup first row early movi v17.8h, #0 // v16-v17 sum_alt[1] movi v18.8h, #0 // v18-v19 sum_alt[2] + dir_load_step2 v26, \bpc movi v19.8h, #0 + dir_load_step3 v26, \bpc movi v21.8h, #0 // v20-v21 sum_alt[3] .irpc i, 01234567 -.if \bpc == 8 - ld1 {v26.8b}, [x0], x1 - usubl v26.8h, v26.8b, v31.8b -.else - ld1 {v26.8h}, [x0], x1 - ushl v26.8h, v26.8h, v8.8h - sub v26.8h, v26.8h, v31.8h -.endif - addv h25, v26.8h // [y] rev64 v27.8h, v26.8h addp v28.8h, v26.8h, v30.8h // [(x >> 1)] @@ -355,48 +373,59 @@ ext v27.16b, v27.16b, v27.16b, #8 // [-x] rev64 v29.4h, v28.4h // [-(x >> 1)] ins v4.h[\i], v25.h[0] // sum_hv[0] - +.if \i < 6 + ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) + ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) + add v18.8h, v18.8h, v22.8h // sum_alt[2] + add v19.4h, v19.4h, v23.4h // sum_alt[2] +.else + add v18.8h, v18.8h, v26.8h // sum_alt[2] +.endif +.if \i == 0 + mov v20.16b, v26.16b // sum_alt[3] +.elseif \i == 1 + add v20.8h, v20.8h, v26.8h // sum_alt[3] +.else + ext v24.16b, 
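Both the old 16-bit accumulation and the new pair of 8-bit accumulators above have to reproduce CDEF's asymmetric rounding, px + ((8 + sum - (sum < 0)) >> 4), clamped to the minimum/maximum of the sampled taps; starting one accumulator at -1 and mixing rounding and non-rounding halving adds is a way to reach the same result without ever widening past 8 bits. The scalar form, matching the comments in the replaced code (names illustrative):

    #include <stdint.h>

    /* CDEF output for one pixel: round the >> 4 toward zero for negative
     * sums, add to the centre pixel and clamp to the tap min/max. */
    static uint8_t cdef_px(int px, int sum, int tap_min, int tap_max)
    {
        int v = px + ((8 + sum - (sum < 0)) >> 4);
        if (v < tap_min) v = tap_min;
        if (v > tap_max) v = tap_max;
        return (uint8_t)v;
    }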
v30.16b, v26.16b, #(16-2*(\i/2)) + ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) + add v20.8h, v20.8h, v24.8h // sum_alt[3] + add v21.4h, v21.4h, v25.4h // sum_alt[3] +.endif .if \i == 0 mov v0.16b, v26.16b // sum_diag[0] + dir_load_step1 v26, \bpc mov v2.16b, v27.16b // sum_diag[1] + dir_load_step2 v26, \bpc mov v6.16b, v28.16b // sum_alt[0] + dir_load_step3 v26, \bpc mov v16.16b, v29.16b // sum_alt[1] .else ext v22.16b, v30.16b, v26.16b, #(16-2*\i) ext v23.16b, v26.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v27.16b, #(16-2*\i) ext v25.16b, v27.16b, v30.16b, #(16-2*\i) +.if \i != 7 // Nothing to load for the final row + dir_load_step1 v26, \bpc // Start setting up the next row early. +.endif add v0.8h, v0.8h, v22.8h // sum_diag[0] add v1.8h, v1.8h, v23.8h // sum_diag[0] add v2.8h, v2.8h, v24.8h // sum_diag[1] add v3.8h, v3.8h, v25.8h // sum_diag[1] +.if \i != 7 + dir_load_step2 v26, \bpc +.endif ext v22.16b, v30.16b, v28.16b, #(16-2*\i) ext v23.16b, v28.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v29.16b, #(16-2*\i) ext v25.16b, v29.16b, v30.16b, #(16-2*\i) +.if \i != 7 + dir_load_step3 v26, \bpc +.endif add v6.8h, v6.8h, v22.8h // sum_alt[0] add v7.4h, v7.4h, v23.4h // sum_alt[0] add v16.8h, v16.8h, v24.8h // sum_alt[1] add v17.4h, v17.4h, v25.4h // sum_alt[1] .endif -.if \i < 6 - ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) - ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) - add v18.8h, v18.8h, v22.8h // sum_alt[2] - add v19.4h, v19.4h, v23.4h // sum_alt[2] -.else - add v18.8h, v18.8h, v26.8h // sum_alt[2] -.endif -.if \i == 0 - mov v20.16b, v26.16b // sum_alt[3] -.elseif \i == 1 - add v20.8h, v20.8h, v26.8h // sum_alt[3] -.else - ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) - ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) - add v20.8h, v20.8h, v24.8h // sum_alt[3] - add v21.4h, v21.4h, v25.4h // sum_alt[3] -.endif .endr movi v31.4s, #105 diff -Nru dav1d-0.7.1/src/arm/64/film_grain16.S dav1d-0.9.1/src/arm/64/film_grain16.S --- dav1d-0.7.1/src/arm/64/film_grain16.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/64/film_grain16.S 2021-07-28 21:38:28.865851900 +0000 @@ -0,0 +1,853 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0] + umov w15, \src2[1] + umov w16, \src1[2] + add x14, x14, x3 + umov w17, \src2[3] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4] + add x16, x16, x3 + ld1 {\dst2}[1+\off], [x15] + umov w15, \src2[5] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6] + add x14, x14, x3 + ld1 {\dst2}[3+\off], [x17] + umov w17, \src2[7] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[5+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[7+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2, src3, src4 + gather_interleaved \dst1, \dst2, \src1, \src3, 0 + gather_interleaved \dst2, \dst1, \src3, \src1, 0 + gather_interleaved \dst1, \dst2, \src2, \src4, 8 + gather_interleaved \dst2, \dst1, \src4, \src2, 8 +.endm + +function gather32_neon + gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h + ret +endfunc + +function gather16_neon + gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 + gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 + ins v6.d[1], v7.d[0] + ret +endfunc + +const overlap_coeffs_0, align=4 + .short 27, 17, 0, 0 + .short 17, 27, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .short 23, 0, 0, 0 + .short 22, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw #1 // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type, +// const int bitdepth_max); +function fgy_32x32_16bpc_neon, export=1 + str x30, [sp, #-80]! 
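gather32_neon/gather16_neon above act as a software gather: each pixel lane is moved out to a general-purpose register and used as a scalar index into the scaling table at x3, since NEON has no gather load. Functionally it is nothing more than a per-pixel table lookup, as in this sketch (the caller masks src to the table range first; names are illustrative):

    #include <stdint.h>

    /* dst[i] = scaling[src[i]] for a run of pixels; the asm interleaves
     * two destination vectors purely for scheduling reasons. */
    static void gather_scaling(uint8_t *dst, const uint16_t *src,
                               const uint8_t *scaling, int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] = scaling[src[i]];
    }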
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + str d14, [sp, #64] + eor w4, w4, #15 // 15 - scaling_shift + ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w10, [sp, #96] // bitdepth_max + ldr w6, [x6] // offsets[0][0] + dup v26.8h, w10 // bitdepth_max + clz w10, w10 + ldr w8, [sp, #80] // clip + sub w10, w10, #24 // -bitdepth_min_8 + mov x9, #GRAIN_WIDTH*2 // grain_lut stride + neg w10, w10 // bitdepth_min_8 + + dup v29.8h, w4 // 15 - scaling_shift + dup v27.8h, w10 // bitdepth_min_8 + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #235 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v26.16b // bitdepth_max +2: + + ushr v26.8h, v26.8h, #1 // grain_max + not v25.16b, v26.16b // grain_min + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + + add x5, x5, #18 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #88] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v8.8h, v27.h[0] + dup v9.8h, v27.h[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src +.if \ox + ld1 {v20.4h}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v21.8h, v22.8h, v23.8h, v24.8h}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v14.4h}, [x8], x9 // grain_lut top old +.endif + mvni v4.8h, #0xf0, lsl #8 // 0x0fff + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut + + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
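This loop adds the film grain noise to each luma pixel: look the source value up in the scaling table, scale the grain by it with a rounding shift, add, and clamp to the configured range. Pre-shifting the scaling value left by (15 - scaling_shift) lets a single sqrdmulh do the multiply-and-round, since sqrdmulh(a, b) = round2(a*b, 15). A scalar sketch of the per-pixel operation and of that equivalence (overlap blending of neighbouring grain blocks is left out; names are illustrative):

    #include <stdint.h>

    static inline int round2(int x, int sh) { return (x + (1 << (sh - 1))) >> sh; }

    /* dst = clip(src + round2(scaling[src] * grain, scaling_shift), lo, hi).
     * With s = scaling[src] << (15 - scaling_shift),
     * round2(s * grain, 15) == round2(scaling[src] * grain, scaling_shift),
     * which is what the ushl ... v29 / sqrdmulh pairs compute. */
    static uint16_t fgy_px(uint16_t src, int grain, const uint8_t *scaling,
                           int scaling_shift, int lo, int hi)
    {
        int v = src + round2(scaling[src] * grain, scaling_shift);
        if (v < lo) v = lo;
        if (v > hi) v = hi;
        return (uint16_t)v;
    }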
+ and v0.16b, v0.16b, v4.16b + and v1.16b, v1.16b, v4.16b + and v2.16b, v2.16b, v4.16b + and v3.16b, v3.16b, v4.16b + bl gather32_neon + +.if \ox + smull v20.4s, v20.4h, v27.4h + smlal v20.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v14.4s, v14.4h, v27.4h + smlal v14.4s, v21.4h, v28.4h + sqrshrn v20.4h, v20.4s, #5 + sqrshrn v14.4h, v14.4s, #5 + smin v20.4h, v20.4h, v26.4h + smin v14.4h, v14.4h, v26.4h + smax v20.4h, v20.4h, v25.4h + smax v14.4h, v14.4h, v25.4h +.endif + +.if \ox + smull v10.4s, v20.4h, v9.4h +.else + smull v10.4s, v16.4h, v9.4h +.endif + smull2 v11.4s, v16.8h, v9.8h + smull v12.4s, v17.4h, v9.4h + smull2 v13.4s, v17.8h, v9.8h + smull v16.4s, v18.4h, v9.4h + smull2 v17.4s, v18.8h, v9.8h + smull v18.4s, v19.4h, v9.4h + smull2 v19.4s, v19.8h, v9.8h +.if \ox + smlal v10.4s, v14.4h, v8.4h +.else + smlal v10.4s, v21.4h, v8.4h +.endif + smlal2 v11.4s, v21.8h, v8.8h + smlal v12.4s, v22.4h, v8.4h + smlal2 v13.4s, v22.8h, v8.8h + smlal v16.4s, v23.4h, v8.4h + smlal2 v17.4s, v23.8h, v8.8h + smlal v18.4s, v24.4h, v8.4h + smlal2 v19.4s, v24.8h, v8.8h + sqrshrn v10.4h, v10.4s, #5 + sqrshrn2 v10.8h, v11.4s, #5 + sqrshrn v11.4h, v12.4s, #5 + sqrshrn2 v11.8h, v13.4s, #5 + sqrshrn v12.4h, v16.4s, #5 + sqrshrn2 v12.8h, v17.4s, #5 + sqrshrn v13.4h, v18.4s, #5 + sqrshrn2 v13.8h, v19.4s, #5 + smin v16.8h, v10.8h, v26.8h + smin v17.8h, v11.8h, v26.8h + smin v18.8h, v12.8h, v26.8h + smin v19.8h, v13.8h, v26.8h + smax v16.8h, v16.8h, v25.8h + smax v17.8h, v17.8h, v25.8h + smax v18.8h, v18.8h, v25.8h + smax v19.8h, v19.8h, v25.8h +.endif + + uxtl v4.8h, v6.8b // scaling +.if \ox && !\oy + sqrshrn v20.4h, v20.4s, #5 +.endif + uxtl2 v5.8h, v6.16b +.if \ox && !\oy + smin v20.4h, v20.4h, v26.4h +.endif + uxtl v6.8h, v7.8b +.if \ox && !\oy + smax v20.4h, v20.4h, v25.4h +.endif + uxtl2 v7.8h, v7.16b +.if \ox && !\oy + ins v16.d[0], v20.d[0] +.endif + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v21.8h, v17.8h, v5.8h + sqrdmulh v22.8h, v18.8h, v6.8h + sqrdmulh v23.8h, v19.8h, v7.8h + + usqadd v0.8h, v20.8h // *src + noise + usqadd v1.8h, v21.8h + usqadd v2.8h, v22.8h + usqadd v3.8h, v23.8h + + umax v0.8h, v0.8h, v30.8h + umax v1.8h, v1.8h, v30.8h + umax v2.8h, v2.8h, v30.8h + umax v3.8h, v3.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w7, w7, #1 +.if \oy + dup v8.8h, v28.h[0] + dup v9.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox\()0) +.endif + ldr d14, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t 
is_id, +// const ptrdiff_t type, +// const int bitdepth_max); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_16bpc_neon, export=1 + str x30, [sp, #-80]! + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + + ldp x8, x9, [sp, #80] // offsets, h + ldp x10, x11, [sp, #96] // uv, is_id + ldr w16, [sp, #120] // bitdepth_max + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + dup v23.8h, w16 // bitdepth_max + clz w16, w16 + eor w13, w13, #15 // 15 - scaling_shift + sub w16, w16, #24 // -bitdepth_min_8 + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + neg w16, w16 // bitdepth_min_8 + ld1r {v8.8h}, [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1r {v9.8h}, [x15] // uv_mult + + dup v29.8h, w13 // 15 - scaling_shift + dup v27.8h, w16 // bitdepth_min_8 + + cbz w12, 1f + // clip + movi v30.8h, #16 + movi v31.8h, #240 + sshl v30.8h, v30.8h, v27.8h + sshl v31.8h, v31.8h, v27.8h + cbz w11, 2f + // is_id + movi v31.8h, #235 + sshl v31.8h, v31.8h, v27.8h + b 2f +1: + // no clip + movi v30.8h, #0 + mov v31.16b, v23.16b // bitdepth_max +2: + + ushr v15.8h, v23.8h, #1 // grain_max + sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8 + not v14.16b, v15.16b // grain_min + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH*2 // grain_lut stride + + add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #112] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx\()_tbl) + + ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.8h, #23 + movi v26.8h, #22 +.else + movi v25.8h, #27 + movi v26.8h, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy +L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): +1: +.if \ox + ld1 {v4.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v5.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut + +.if \ox + smull v4.4s, v4.4h, v27.4h + smlal v4.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v5.4s, v5.4h, v27.4h + 
smlal v5.4s, v0.4h, v28.4h + sqrshrn v4.4h, v4.4s, #5 + sqrshrn v5.4h, v5.4s, #5 + smin v4.4h, v4.4h, v15.4h + smin v5.4h, v5.4h, v15.4h + smax v4.4h, v4.4h, v14.4h + smax v5.4h, v5.4h, v14.4h + ins v16.d[0], v4.d[0] + ins v0.d[0], v5.d[0] +.endif + + smull v6.4s, v16.4h, v26.4h + smull2 v7.4s, v16.8h, v26.8h + smull v10.4s, v17.4h, v26.4h + smull2 v11.4s, v17.8h, v26.8h + smull v16.4s, v18.4h, v26.4h + smull2 v17.4s, v18.8h, v26.8h + smull v18.4s, v19.4h, v26.4h + smull2 v19.4s, v19.8h, v26.8h + smlal v6.4s, v0.4h, v25.4h + smlal2 v7.4s, v0.8h, v25.8h + smlal v10.4s, v1.4h, v25.4h + smlal2 v11.4s, v1.8h, v25.8h + smlal v16.4s, v2.4h, v25.4h + smlal2 v17.4s, v2.8h, v25.8h + smlal v18.4s, v3.4h, v25.4h + smlal2 v19.4s, v3.8h, v25.8h + sqrshrn v6.4h, v6.4s, #5 + sqrshrn2 v6.8h, v7.4s, #5 + sqrshrn v7.4h, v10.4s, #5 + sqrshrn2 v7.8h, v11.4s, #5 + sqrshrn v10.4h, v16.4s, #5 + sqrshrn2 v10.8h, v17.4s, #5 + sqrshrn v11.4h, v18.4s, #5 + sqrshrn2 v11.8h, v19.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v4.4h, v4.4s, #5 + smin v4.4h, v4.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v6.8h, v15.8h + smin v17.8h, v7.8h, v15.8h + smin v18.8h, v10.8h, v15.8h + smin v19.8h, v11.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h + smax v18.8h, v18.8h, v14.8h + smax v19.8h, v19.8h, v14.8h +.endif + +.if \ox && !\oy + smax v4.4h, v4.4h, v14.4h +.endif + ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src +.if \ox && !\oy + ins v16.d[0], v4.d[0] +.endif + +.if !\csfl + smull v4.4s, v0.4h, v8.4h + smull2 v5.4s, v0.8h, v8.8h + smull v6.4s, v1.4h, v8.4h + smull2 v7.4s, v1.8h, v8.8h + smull v0.4s, v2.4h, v8.4h + smull2 v1.4s, v2.8h, v8.8h + smull v2.4s, v3.4h, v8.4h + smull2 v3.4s, v3.8h, v8.8h + smlal v4.4s, v10.4h, v9.4h + smlal2 v5.4s, v10.8h, v9.8h + smlal v6.4s, v11.4h, v9.4h + smlal2 v7.4s, v11.8h, v9.8h + smlal v0.4s, v12.4h, v9.4h + smlal2 v1.4s, v12.8h, v9.8h + smlal v2.4s, v13.4h, v9.4h + smlal2 v3.4s, v13.8h, v9.8h + shrn v4.4h, v4.4s, #6 + shrn2 v4.8h, v5.4s, #6 + shrn v5.4h, v6.4s, #6 + shrn2 v5.8h, v7.4s, #6 + shrn v6.4h, v0.4s, #6 + shrn2 v6.8h, v1.4s, #6 + shrn v7.4h, v2.4s, #6 + shrn2 v7.8h, v3.4s, #6 + add v0.8h, v4.8h, v24.8h + add v1.8h, v5.8h, v24.8h + add v2.8h, v6.8h, v24.8h + add v3.8h, v7.8h, v24.8h + movi v20.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smin v2.8h, v2.8h, v23.8h + smin v3.8h, v3.8h, v23.8h + smax v0.8h, v0.8h, v20.8h + smax v1.8h, v1.8h, v20.8h + smax v2.8h, v2.8h, v20.8h + smax v3.8h, v3.8h, v20.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. 
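In the !csfl path above, the value fed into the chroma scaling lookup is not the chroma sample itself but a mix of the co-located luma (averaged first when subsampled) and the chroma sample, plus a bit-depth-scaled offset, clamped to the pixel range: that is what the smull/smlal by uv_luma_mult/uv_mult, the shrn #6, the add of v24 and the smin/smax compute. A scalar sketch with illustrative names:

    #include <stdint.h>

    /* Index into the chroma scaling LUT when chroma is not scaled
     * straight from luma (csfl == 0):
     *   clip((luma*uv_luma_mult + chroma*uv_mult) >> 6
     *        + (uv_offset << bitdepth_min_8), 0, bitdepth_max)        */
    static int fguv_lut_index(int luma, int chroma, int uv_luma_mult,
                              int uv_mult, int uv_offset,
                              int bitdepth_min_8, int bitdepth_max)
    {
        int v = ((luma * uv_luma_mult + chroma * uv_mult) >> 6)
                + (uv_offset << bitdepth_min_8);
        return v < 0 ? 0 : v > bitdepth_max ? bitdepth_max : v;
    }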
+ and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b + and v2.16b, v2.16b, v23.16b + and v3.16b, v3.16b, v23.16b +.endif + + bl gather32_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + uxtl v6.8h, v7.8b + uxtl2 v7.8h, v7.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + sqrdmulh v18.8h, v18.8h, v6.8h + sqrdmulh v19.8h, v19.8h, v7.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + usqadd v12.8h, v18.8h + usqadd v13.8h, v19.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umax v2.8h, v12.8h, v30.8h + umax v3.8h, v13.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + + subs w9, w9, #1 +.if \oy + dup v25.8h, v28.h[0] + dup v26.8h, v28.h[1] +.endif + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +1: +.if \ox + ld1 {v18.4h}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v19.4h}, [x11], x10 // grain_lut top old +.endif + ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut + +.if \ox + smull v18.4s, v18.4h, v27.4h + smlal v18.4s, v16.4h, v28.4h +.endif + +.if \oy +.if \ox + smull v19.4s, v19.4h, v27.4h + smlal v19.4s, v20.4h, v28.4h + sqrshrn v18.4h, v18.4s, #5 + sqrshrn v19.4h, v19.4s, #5 + smin v18.4h, v18.4h, v15.4h + smin v19.4h, v19.4h, v15.4h + smax v18.4h, v18.4h, v14.4h + smax v19.4h, v19.4h, v14.4h + ins v16.d[0], v18.d[0] + ins v20.d[0], v19.d[0] +.endif + + smull v0.4s, v16.4h, v26.4h + smull2 v1.4s, v16.8h, v26.8h + smull v2.4s, v17.4h, v26.4h + smull2 v3.4s, v17.8h, v26.8h + smlal v0.4s, v20.4h, v25.4h + smlal2 v1.4s, v20.8h, v25.8h + smlal v2.4s, v21.4h, v25.4h + smlal2 v3.4s, v21.8h, v25.8h + sqrshrn v16.4h, v0.4s, #5 + sqrshrn2 v16.8h, v1.4s, #5 + sqrshrn v17.4h, v2.4s, #5 + sqrshrn2 v17.8h, v3.4s, #5 +.endif + +.if \ox && !\oy + sqrshrn v18.4h, v18.4s, #5 + smin v18.4h, v18.4h, v15.4h +.endif + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma +.if \oy + smin v16.8h, v16.8h, v15.8h + smin v17.8h, v17.8h, v15.8h + smax v16.8h, v16.8h, v14.8h + smax v17.8h, v17.8h, v14.8h +.endif + +.if \ox && !\oy + smax v18.4h, v18.4h, v14.4h +.endif + ld1 {v10.8h, v11.8h}, [x1], x2 // src +.if \ox && !\oy + ins 
v16.d[0], v18.d[0] +.endif + addp v0.8h, v0.8h, v1.8h + addp v1.8h, v2.8h, v3.8h + urshr v0.8h, v0.8h, #1 + urshr v1.8h, v1.8h, #1 +.if !\csfl + smull v2.4s, v0.4h, v8.4h + smull2 v3.4s, v0.8h, v8.8h + smull v0.4s, v1.4h, v8.4h + smull2 v1.4s, v1.8h, v8.8h + smlal v2.4s, v10.4h, v9.4h + smlal2 v3.4s, v10.8h, v9.8h + smlal v0.4s, v11.4h, v9.4h + smlal2 v1.4s, v11.8h, v9.8h + shrn v2.4h, v2.4s, #6 + shrn2 v2.8h, v3.4s, #6 + shrn v3.4h, v0.4s, #6 + shrn2 v3.8h, v1.4s, #6 + add v0.8h, v2.8h, v24.8h + add v1.8h, v3.8h, v24.8h + movi v2.8h, #0 + smin v0.8h, v0.8h, v23.8h + smin v1.8h, v1.8h, v23.8h + smax v0.8h, v0.8h, v2.8h + smax v1.8h, v1.8h, v2.8h +.else + // Make sure that uninitialized pixels out of range past the right + // edge are in range; their actual values shouldn't matter. + and v0.16b, v0.16b, v23.16b + and v1.16b, v1.16b, v23.16b +.endif + + bl gather16_neon + + uxtl v4.8h, v6.8b // scaling + uxtl2 v5.8h, v6.16b + + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + + sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15) + sqrdmulh v17.8h, v17.8h, v5.8h + + usqadd v10.8h, v16.8h // *src + noise + usqadd v11.8h, v17.8h + + umax v0.8h, v10.8h, v30.8h + umax v1.8h, v11.8h, v30.8h + umin v0.8h, v0.8h, v31.8h + umin v1.8h, v1.8h, v31.8h + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.8h, v1.8h}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldr x30, [sp], #80 + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff -Nru dav1d-0.7.1/src/arm/64/film_grain.S dav1d-0.9.1/src/arm/64/film_grain.S --- dav1d-0.7.1/src/arm/64/film_grain.S 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/64/film_grain.S 2021-07-28 21:38:28.865851900 +0000 @@ -0,0 +1,2227 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" +#include "src/arm/asm-offsets.h" + +#define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr w11, w2, #3 + lsr w12, w2, #12 + lsr w13, w2, #1 + eor w11, w2, w11 // (r >> 0) ^ (r >> 3) + eor w12, w12, w13 // (r >> 12) ^ (r >> 1) + eor w11, w11, w12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr w2, w2, #\steps +.endif + and w11, w11, #((1 << \steps) - 1) // bit +.if \shift + orr w2, w2, w11, lsl #(16 - \steps) // *state +.else + orr w2, w2, w11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, x2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, x2, #17 - \bits, #\bits + lsr w2, w2, #1 +.endm + +// special calling convention: +// w2 holds seed +// x3 holds dav1d_gaussian_sequence +// clobbers x11-x15 +// returns in v0.8h +function get_gaussian_neon + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 0 + increment_seed 4 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + read_rand x14, 11, 3 + ld1 {v0.h}[3], [x15] + add x14, x3, x14, lsl #1 + read_rand x15, 11, 2 + ld1 {v0.h}[4], [x14] + add x15, x3, x15, lsl #1 + read_rand x14, 11, 1 + ld1 {v0.h}[5], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[6], [x14] + ld1 {v0.h}[7], [x15] + ret +endfunc + +.macro get_grain_row r0, r1, r2, r3, r4, r5 + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r0\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r0\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r1\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r1\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r2\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r2\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r3\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r3\().16b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn \r4\().8b, \r5\().8h + bl get_gaussian_neon + srshl \r5\().8h, v0.8h, v31.8h + xtn2 \r4\().16b, \r5\().8h + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {\r5\().h}[0], [x14] + ld1 {\r5\().h}[1], [x15] + srshl v0.4h, \r5\().4h, v31.4h + xtn \r5\().8b, v0.8h +.endm + 
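increment_seed and read_rand above implement the film grain pseudo-random generator: a 16-bit LFSR whose feedback bit is the XOR of state bits 0, 1, 3 and 12, advanced several positions per call, with the random value read from the top of the state. The single-step form of the same recurrence in C (a sketch, not dav1d's helper):

    /* One LFSR step plus extraction of an n-bit value from the top of the
     * 16-bit state; increment_seed performs `steps` of these per call by
     * widening the feedback computation. */
    static unsigned get_random(unsigned *state, int bits)
    {
        unsigned r   = *state;
        unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        *state = (r >> 1) | (bit << 15);
        return (*state >> (16 - bits)) & ((1u << bits) - 1);
    }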
+.macro store_grain_row r0, r1, r2, r3, r4, r5 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b,\r3\().16b}, [x0], #32 + st1 {\r4\().16b}, [x0], #16 + st1 {\r5\().h}[0], [x0], #2 +.endm + +.macro get_grain_row_44 r0, r1, r2 + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r0\().8b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn2 \r0\().16b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r1\().8b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn2 \r1\().16b, \r2\().8h + bl get_gaussian_neon + srshl \r2\().8h, v0.8h, v31.8h + xtn \r2\().8b, \r2\().8h + + increment_seed 4 + read_rand x14, 11, 3 + read_rand x15, 11, 2 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[0], [x14] + read_rand x14, 11, 1 + ld1 {v0.h}[1], [x15] + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {v0.h}[2], [x14] + ld1 {v0.h}[3], [x15] + srshl v0.4h, v0.4h, v31.4h + xtn2 \r2\().16b, v0.8h +.endm + +.macro store_grain_row_44 r0, r1, r2 + st1 {\r0\().16b,\r1\().16b}, [x0], #32 + st1 {\r2\().16b}, [x0] + add x0, x0, #GRAIN_WIDTH-32 +.endm + +.macro get_grain_2 dst + increment_seed 2 + read_rand x14, 11, 1 + read_rand x15, 11, 0 + add x14, x3, x14, lsl #1 + add x15, x3, x15, lsl #1 + ld1 {\dst\().h}[0], [x14] + ld1 {\dst\().h}[1], [x15] + srshl v0.4h, \dst\().4h, v31.4h + xtn \dst\().8b, v0.8h +.endm + +// w15 holds the number of entries to produce +// w14 holds the previous output entry +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +function output_lag1_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #1 + madd w14, w14, w4, w11 // sum (above) + *coeff * prev output + add w14, w14, w8 // 1 << (ar_coeff_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + add w12, w12, w10 + asr w12, w12, w9 // >> (4 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.b[15], w14 + b.gt 1b + ret +endfunc + +function sum_lag1_above_neon + smull v2.8h, v3.8b, v28.8b + smull2 v3.8h, v3.16b, v28.16b + smull v4.8h, v0.8b, v27.8b + smull2 v5.8h, v0.16b, v27.16b + smull v6.8h, v1.8b, v29.8b + smull2 v7.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v4.4h + saddl2 v1.4s, v2.8h, v4.8h + saddl v2.4s, v3.4h, v5.4h + saddl2 v3.4s, v3.8h, v5.8h + saddw v4.4s, v0.4s, v6.4h + saddw2 v5.4s, v1.4s, v6.8h + saddw v6.4s, v2.4s, v7.4h + saddw2 v7.4s, v3.4s, v7.8h + ret +endfunc + +.macro sum_lag1_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag1_\edge\()_neon + str x30, [sp, #-16]! 
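output_lag1_neon above finishes the lag-1 autoregression serially: the three above-row products are summed with NEON in sum_lag1_above_neon, but each new grain value also depends on the value just written to its left, so the final combination with the left coefficient, the rounding shifts and the Gaussian sample runs one element at a time. A scalar sketch of the recurrence, following the register comments in the code (the uv variants add one extra term for the collapsed luma grain):

    /* One lag-1 AR output:
     *   g = clamp(round2(sum_above + coeff_left*prev, ar_coeff_shift)
     *             + round2(gauss, 4 + grain_scale_shift),
     *             grain_min, grain_max)                               */
    static int output_lag1(int sum_above, int prev, int coeff_left, int gauss,
                           int ar_coeff_shift, int grain_scale_shift,
                           int grain_min, int grain_max)
    {
        int g = sum_above + coeff_left * prev;
        g = (g + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;
        g += (gauss + (1 << (3 + grain_scale_shift))) >> (4 + grain_scale_shift);
        if (g > grain_max) g = grain_max;
        if (g < grain_min) g = grain_min;
        return g;
    }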
+ bl sum_lag1_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH + ld1 {v22.16b, v23.16b}, [x19], #32 + ld1 {v24.16b, v25.16b}, [x12] + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + saddlp v24.8h, v24.16b + saddlp v25.8h, v25.16b + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + rshrn v0.8b, v22.8h, #2 + rshrn2 v0.16b, v23.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.16b, v23.16b}, [x19], #32 + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + rshrn v0.8b, v22.8h, #1 + rshrn2 v0.16b, v23.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.16b}, [x19], #16 +.endif +.if \uv_layout + smull v2.8h, v0.8b, v30.8b + smull2 v3.8h, v0.16b, v30.16b + saddw v4.4s, v4.4s, v2.4h + saddw2 v5.4s, v5.4s, v2.8h + saddw v6.4s, v6.4s, v3.4h + saddw2 v7.4s, v7.4s, v3.8h +.endif +.if \uv_layout && \elems == 16 + b sum_lag1_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_lag1_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_lag1_uv_420_\edge\()_start +.else +sum_lag1_\type\()_\edge\()_start: +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + xtn2 v0.16b, v0.8h + ext v4.16b, v4.16b, v4.16b, #12 + smov w14, v0.b[15] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_lag1_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_lag1_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b + mov w15, #4 + bl output_lag1_neon + + increment_seed 4, shift=0 + mov v1.16b, v6.16b +.if \elems == 9 + mov w15, #1 + bl output_lag1_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + xtn v1.8b, v1.8h + ext v0.16b, v0.16b, v1.16b, #7 +.else + mov w15, #4 + bl output_lag1_neon + + increment_seed 4, shift=0 + mov v1.16b, v7.16b + +.ifc \edge, right + mov w15, #3 + bl output_lag1_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #1 +.else + mov w15, #4 + bl output_lag1_neon +.endif +.endif + ldr x30, [sp], #16 + ret +.endif +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 15 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 15 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 9 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 9 + +.macro sum_lag1 type, dst, left, mid, right, edge=mid + mov v3.16b, \mid\().16b + ext v0.16b, \left\().16b, \mid\().16b, #15 + ext v1.16b, \mid\().16b, \right\().16b, #1 + bl sum_\type\()_lag1_\edge\()_neon + mov \dst\().16b, v0.16b +.endm + +.macro sum_y_lag1 dst, left, mid, right, edge=mid + sum_lag1 y, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_444, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid + sum_lag1 uv_422, \dst, \left, \mid, \right, \edge +.endm + +.macro sum_uv_420_lag1 
dst, left, mid, right, edge=mid + sum_lag1 uv_420, \dst, \left, \mid, \right, \edge +.endm + +// w15 holds the number of entries to produce +// w14 and w16 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +function output_lag2_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext v0.16b, v0.16b, v0.16b, #1 + madd w11, w16, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w14, w17, w11 // += *coeff * prev output 2 + mov w16, w14 + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + add w12, w12, w10 + asr w12, w12, w9 // >> (4 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.b[15], w14 + b.gt 1b + ret +endfunc + +function sum_lag2_above_neon + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v18.16b}, [x12] // load top right + ld1 {v21.16b}, [x13] + + ext v22.16b, v16.16b, v17.16b, #14 // top left, top mid + dup v26.16b, v30.b[0] + ext v23.16b, v16.16b, v17.16b, #15 + dup v27.16b, v30.b[1] + ext v0.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v28.16b, v30.b[3] + ext v1.16b, v17.16b, v18.16b, #2 + dup v29.16b, v30.b[4] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v4.8h, v23.8b, v27.8b + smull2 v5.8h, v23.16b, v27.16b + smull v6.8h, v0.8b, v28.8b + smull2 v7.8h, v0.16b, v28.16b + smull v0.8h, v1.8b, v29.8b + smull2 v1.8h, v1.16b, v29.16b + saddl v22.4s, v2.4h, v4.4h + saddl2 v23.4s, v2.8h, v4.8h + saddl v26.4s, v3.4h, v5.4h + saddl2 v27.4s, v3.8h, v5.8h + saddl v2.4s, v0.4h, v6.4h + saddl2 v3.4s, v0.8h, v6.8h + saddl v6.4s, v1.4h, v7.4h + saddl2 v7.4s, v1.8h, v7.8h + add v4.4s, v22.4s, v2.4s + add v5.4s, v23.4s, v3.4s + add v6.4s, v26.4s, v6.4s + add v7.4s, v27.4s, v7.4s + + ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid + dup v26.16b, v30.b[5] + ext v23.16b, v19.16b, v20.16b, #15 + dup v27.16b, v30.b[6] + ext v0.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v28.16b, v30.b[8] + ext v1.16b, v20.16b, v21.16b, #2 + dup v29.16b, v30.b[9] + + smull v2.8h, v22.8b, v26.8b + smull2 v3.8h, v22.16b, v26.16b + smull v22.8h, v23.8b, v27.8b + smull2 v23.8h, v23.16b, v27.16b + smull v26.8h, v0.8b, v28.8b + smull2 v27.8h, v0.16b, v28.16b + smull v28.8h, v1.8b, v29.8b + smull2 v29.8h, v1.16b, v29.16b + saddl v0.4s, v2.4h, v22.4h + saddl2 v1.4s, v2.8h, v22.8h + saddl v2.4s, v3.4h, v23.4h + saddl2 v3.4s, v3.8h, v23.8h + saddl v22.4s, v26.4h, v28.4h + saddl2 v23.4s, v26.8h, v28.8h + saddl v26.4s, v27.4h, v29.4h + saddl2 v27.4s, v27.8h, v29.8h + add v0.4s, v0.4s, v22.4s + add v1.4s, v1.4s, v23.4s + add v2.4s, v2.4s, v26.4s + add v3.4s, v3.4s, v27.4s + dup v26.16b, v30.b[2] + dup v27.16b, v30.b[7] + smull v22.8h, v17.8b, v26.8b + smull2 v23.8h, v17.16b, v26.16b + smull v24.8h, v20.8b, v27.8b + smull2 v25.8h, v20.16b, v27.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + saddl v0.4s, v22.4h, v24.4h + saddl2 v1.4s, v22.8h, v24.8h + saddl v2.4s, v23.4h, v25.4h + saddl2 v3.4s, v23.8h, v25.8h + mov v19.16b, v20.16b + mov v20.16b, v21.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + ret +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=16 +function 
sum_\type\()_lag2_\edge\()_neon + str x30, [sp, #-16]! +.ifc \edge, left + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v17.16b}, [x12] // load the previous block right above + ld1 {v20.16b}, [x13] +.endif + bl sum_lag2_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH + ld1 {v22.16b, v23.16b}, [x19], #32 + ld1 {v24.16b, v25.16b}, [x12] + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + saddlp v24.8h, v24.16b + saddlp v25.8h, v25.16b + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + rshrn v0.8b, v22.8h, #2 + rshrn2 v0.16b, v23.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.16b, v23.16b}, [x19], #32 + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + rshrn v0.8b, v22.8h, #1 + rshrn2 v0.16b, v23.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.16b}, [x19], #16 +.endif +.if \uv_layout + dup v1.16b, v30.b[12] + smull v2.8h, v0.8b, v1.8b + smull2 v3.8h, v0.16b, v1.16b + saddw v4.4s, v4.4s, v2.4h + saddw2 v5.4s, v5.4s, v2.8h + saddw v6.4s, v6.4s, v3.4h + saddw2 v7.4s, v7.4s, v3.8h +.endif +.if \uv_layout && \elems == 16 + b sum_lag2_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_lag2_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_lag2_uv_420_\edge\()_start +.else +sum_lag2_\type\()_\edge\()_start: +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + xtn2 v0.16b, v0.8h + ext v4.16b, v4.16b, v4.16b, #12 + smov w16, v0.b[14] + smov w14, v0.b[15] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_lag2_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_lag2_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b + mov w15, #4 + bl output_lag2_neon + + increment_seed 4, shift=0 + mov v1.16b, v6.16b +.if \elems == 9 + mov w15, #1 + bl output_lag2_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + xtn v1.8b, v1.8h + ext v0.16b, v0.16b, v1.16b, #7 +.else + mov w15, #4 + bl output_lag2_neon + + increment_seed 4, shift=0 + mov v1.16b, v7.16b + +.ifc \edge, right + mov w15, #3 + bl output_lag2_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #1 +.else + mov w15, #4 + bl output_lag2_neon +.endif +.endif + st1 {v0.16b}, [x0], #16 + ldr x30, [sp], #16 + ret +.endif +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 15 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 15 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 9 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 9 + + +// w15 holds the number of entries to produce +// w14, w16 and w17 hold the previous output entries +// v0 holds the vector of produced entries +// v1 holds the input vector of sums from above +function output_lag3_neon +1: + read_shift_rand x13, 11 + mov w11, v1.s[0] + ldrsh w12, [x3, x13, lsl #1] + ext 
v0.16b, v0.16b, v0.16b, #1 + madd w11, w17, w4, w11 // sum (above) + *coeff * prev output 1 + madd w11, w16, w20, w11 // sum (above) + *coeff * prev output 2 + madd w11, w14, w21, w11 // += *coeff * prev output 3 + mov w17, w16 + mov w16, w14 + add w14, w11, w8 // 1 << (ar_coeff_shift - 1) + asr w14, w14, w7 // >> ar_coeff_shift + add w12, w12, w10 + asr w12, w12, w9 // >> (4 + grain_scale_shift) + add w14, w14, w12 + cmp w14, w5 + csel w14, w14, w5, le + cmp w14, w6 + csel w14, w14, w6, ge + subs w15, w15, #1 + ext v1.16b, v1.16b, v1.16b, #4 + ins v0.b[15], w14 + b.gt 1b + ret +endfunc + +function sum_lag3_above_neon + sub x11, x0, #3*GRAIN_WIDTH - 16 + sub x12, x0, #2*GRAIN_WIDTH - 16 + sub x13, x0, #1*GRAIN_WIDTH - 16 + ld1 {v15.16b}, [x11] // load top right + ld1 {v18.16b}, [x12] + ld1 {v21.16b}, [x13] + + ext v8.16b, v13.16b, v14.16b, #13 // top left, top mid + dup v22.16b, v29.b[0] + ext v9.16b, v13.16b, v14.16b, #14 + dup v23.16b, v29.b[1] + ext v10.16b, v13.16b, v14.16b, #15 + dup v24.16b, v29.b[2] + dup v25.16b, v29.b[3] + ext v11.16b, v14.16b, v15.16b, #1 // top mid, top right + dup v26.16b, v29.b[4] + ext v12.16b, v14.16b, v15.16b, #2 + dup v27.16b, v29.b[5] + ext v13.16b, v14.16b, v15.16b, #3 + dup v28.16b, v29.b[6] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v14.8b, v25.8b + smull2 v13.8h, v14.16b, v25.16b + add v4.4s, v22.4s, v0.4s + add v5.4s, v23.4s, v1.4s + add v6.4s, v24.4s, v2.4s + add v7.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid + dup v22.16b, v29.b[7] + ext v9.16b, v16.16b, v17.16b, #14 + dup v23.16b, v29.b[8] + ext v10.16b, v16.16b, v17.16b, #15 + dup v24.16b, v29.b[9] + dup v25.16b, v29.b[10] + ext v11.16b, v17.16b, v18.16b, #1 // top mid, top right + dup v26.16b, v29.b[11] + ext v12.16b, v17.16b, v18.16b, #2 + dup v27.16b, v29.b[12] + ext v13.16b, v17.16b, v18.16b, #3 + dup v28.16b, v29.b[13] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v17.8b, v25.8b + smull2 v13.8h, v17.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add 
v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v13.4h + saddw2 v7.4s, v7.4s, v13.8h + + ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid + dup v22.16b, v29.b[14] + ext v9.16b, v19.16b, v20.16b, #14 + dup v23.16b, v29.b[15] + ext v10.16b, v19.16b, v20.16b, #15 + dup v24.16b, v30.b[0] + dup v25.16b, v30.b[1] + ext v11.16b, v20.16b, v21.16b, #1 // top mid, top right + dup v26.16b, v30.b[2] + ext v12.16b, v20.16b, v21.16b, #2 + dup v27.16b, v30.b[3] + ext v13.16b, v20.16b, v21.16b, #3 + dup v28.16b, v30.b[4] + + smull v0.8h, v8.8b, v22.8b + smull2 v1.8h, v8.16b, v22.16b + smull v2.8h, v9.8b, v23.8b + smull2 v3.8h, v9.16b, v23.16b + smull v8.8h, v10.8b, v24.8b + smull2 v9.8h, v10.16b, v24.16b + smull v10.8h, v11.8b, v26.8b + smull2 v11.8h, v11.16b, v26.16b + saddl v22.4s, v0.4h, v2.4h + saddl2 v23.4s, v0.8h, v2.8h + saddl v24.4s, v1.4h, v3.4h + saddl2 v26.4s, v1.8h, v3.8h + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + smull v8.8h, v12.8b, v27.8b + smull2 v9.8h, v12.16b, v27.16b + smull v10.8h, v13.8b, v28.8b + smull2 v11.8h, v13.16b, v28.16b + smull v12.8h, v20.8b, v25.8b + smull2 v19.8h, v20.16b, v25.16b + add v22.4s, v22.4s, v0.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v2.4s + add v26.4s, v26.4s, v3.4s + saddl v0.4s, v8.4h, v10.4h + saddl2 v1.4s, v8.8h, v10.8h + saddl v2.4s, v9.4h, v11.4h + saddl2 v3.4s, v9.8h, v11.8h + add v4.4s, v4.4s, v22.4s + add v5.4s, v5.4s, v23.4s + add v6.4s, v6.4s, v24.4s + add v7.4s, v7.4s, v26.4s + mov v13.16b, v14.16b + mov v14.16b, v15.16b + add v4.4s, v4.4s, v0.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v2.4s + add v7.4s, v7.4s, v3.4s + mov v16.16b, v17.16b + mov v17.16b, v18.16b + saddw v4.4s, v4.4s, v12.4h + saddw2 v5.4s, v5.4s, v12.8h + saddw v6.4s, v6.4s, v19.4h + saddw2 v7.4s, v7.4s, v19.8h + + mov v19.16b, v20.16b + mov v20.16b, v21.16b + ret +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=16 +function sum_\type\()_lag3_\edge\()_neon + str x30, [sp, #-16]! 
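For orientation, the per-sample work in the output_lag3 loop above (the madd/asr/csel sequence) corresponds roughly to the scalar recurrence sketched below. This is a reading aid only: sum_above stands for the contribution of the three rows above accumulated by sum_lag3_above_neon, and the assumption that the value rounded by 4 + grain_scale_shift is the freshly drawn Gaussian sample is inferred from the register setup, which is not fully shown in this hunk.

    /* One lag-3 AR grain output, 8 bpc.  Illustrative names only. */
    static int grain_output_lag3(int sum_above, int coeff1, int prev1,
                                 int coeff2, int prev2, int coeff3, int prev3,
                                 int gauss, int ar_coeff_shift,
                                 int grain_scale_shift)
    {
        int sum = sum_above + coeff1 * prev1 + coeff2 * prev2 + coeff3 * prev3;
        sum = (sum + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift;   /* asr w14, w14, w7 */
        int noise = (gauss + (1 << (4 + grain_scale_shift - 1)))
                    >> (4 + grain_scale_shift);                        /* asr w12, w12, w9 */
        int out = sum + noise;
        if (out > 127)  out = 127;     /* csel against w5 */
        if (out < -128) out = -128;    /* csel against w6 */
        return out;
    }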
+.ifc \edge, left + sub x11, x0, #3*GRAIN_WIDTH + sub x12, x0, #2*GRAIN_WIDTH + sub x13, x0, #1*GRAIN_WIDTH + ld1 {v14.16b}, [x11] // load the previous block right above + ld1 {v17.16b}, [x12] + ld1 {v20.16b}, [x13] +.endif + bl sum_lag3_above_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH + ld1 {v22.16b, v23.16b}, [x19], #32 + ld1 {v24.16b, v25.16b}, [x12] + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + saddlp v24.8h, v24.16b + saddlp v25.8h, v25.16b + add v22.8h, v22.8h, v24.8h + add v23.8h, v23.8h, v25.8h + rshrn v0.8b, v22.8h, #2 + rshrn2 v0.16b, v23.8h, #2 +.endif +.ifc \type, uv_422 + ld1 {v22.16b, v23.16b}, [x19], #32 + saddlp v22.8h, v22.16b + saddlp v23.8h, v23.16b + rshrn v0.8b, v22.8h, #1 + rshrn2 v0.16b, v23.8h, #1 +.endif +.ifc \type, uv_444 + ld1 {v0.16b}, [x19], #16 +.endif +.if \uv_layout + dup v1.16b, v30.b[8] + smull v2.8h, v0.8b, v1.8b + smull2 v3.8h, v0.16b, v1.16b + saddw v4.4s, v4.4s, v2.4h + saddw2 v5.4s, v5.4s, v2.8h + saddw v6.4s, v6.4s, v3.4h + saddw2 v7.4s, v7.4s, v3.8h +.endif +.if \uv_layout && \elems == 16 + b sum_lag3_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 15 + b sum_lag3_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 9 + b sum_lag3_uv_420_\edge\()_start +.else +sum_lag3_\type\()_\edge\()_start: +.ifc \edge, left + increment_seed 4 + read_rand x12, 11, 3 + read_rand x13, 11, 2 + read_rand x14, 11, 1 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v0.h}[5], [x12] + ld1 {v0.h}[6], [x13] + ld1 {v0.h}[7], [x14] + lsl x2, x2, #1 // shift back the state as if we'd done increment_seed with shift=0 + srshl v0.8h, v0.8h, v31.8h + xtn2 v0.16b, v0.8h + ext v4.16b, v4.16b, v4.16b, #12 + smov w17, v0.b[13] + smov w16, v0.b[14] + smov w14, v0.b[15] + + mov v1.16b, v4.16b + mov w15, #1 + bl output_lag3_neon +.else + increment_seed 4, shift=0 + mov v1.16b, v4.16b + mov w15, #4 + bl output_lag3_neon +.endif + + increment_seed 4, shift=0 + mov v1.16b, v5.16b + mov w15, #4 + bl output_lag3_neon + + increment_seed 4, shift=0 + mov v1.16b, v6.16b +.if \elems == 9 + mov w15, #1 + bl output_lag3_neon + lsr w2, w2, #3 + + read_rand x12, 11, 2 + read_rand x13, 11, 1 + read_rand x14, 11, 0 + add x12, x3, x12, lsl #1 + add x13, x3, x13, lsl #1 + add x14, x3, x14, lsl #1 + ld1 {v1.h}[0], [x12] + ld1 {v1.h}[1], [x13] + ld1 {v1.h}[2], [x14] + srshl v1.4h, v1.4h, v31.4h + xtn v1.8b, v1.8h + ext v0.16b, v0.16b, v1.16b, #7 +.else + mov w15, #4 + bl output_lag3_neon + + increment_seed 4, shift=0 + mov v1.16b, v7.16b + +.ifc \edge, right + mov w15, #3 + bl output_lag3_neon + read_shift_rand x15, 11 + add x15, x3, x15, lsl #1 + ld1 {v1.h}[0], [x15] + srshl v1.4h, v1.4h, v31.4h + ext v0.16b, v0.16b, v1.16b, #1 +.else + mov w15, #4 + bl output_lag3_neon +.endif +.endif + st1 {v0.16b}, [x0], #16 + ldr x30, [sp], #16 + ret +.endif +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 15 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 15 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 9 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 9 + +function generate_grain_rows_neon + str x30, [sp, #-16]! +1: + get_grain_row v16, v17, v18, v19, v20, v21 + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b + ldr x30, [sp], #16 + ret +endfunc + +function generate_grain_rows_44_neon + str x30, [sp, #-16]! 
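The left-edge path above draws fresh grain values by indexing gaussian_sequence with an 11-bit random value and applying a rounding right shift (srshl by the negated 4 + grain_scale_shift). A minimal C sketch, where get_random_number() is a hypothetical stand-in for the LFSR state that increment_seed/read_rand maintain:

    #include <stdint.h>

    unsigned get_random_number(int bits, unsigned *seed);   /* hypothetical helper */

    static int sample_gaussian_8bpc(const int16_t *gaussian_sequence,
                                    unsigned *seed, int grain_scale_shift)
    {
        const int shift = 4 + grain_scale_shift;
        const int g = gaussian_sequence[get_random_number(11, seed)];
        return (g + (1 << (shift - 1))) >> shift;   /* rounding shift, as srshl by -shift */
    }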
+1: + get_grain_row_44 v16, v17, v18 + subs w1, w1, #1 + store_grain_row_44 v16, v17, v18 + b.gt 1b + ldr x30, [sp], #16 + ret +endfunc + +function get_grain_row_neon + str x30, [sp, #-16]! + get_grain_row v16, v17, v18, v19, v20, v21 + ldr x30, [sp], #16 + ret +endfunc + +function get_grain_row_44_neon + str x30, [sp, #-16]! + get_grain_row_44 v16, v17, v18 + ldr x30, [sp], #16 + ret +endfunc + +function add_uv_444_coeff_lag0_neon + str x30, [sp, #-16]! +add_coeff_lag0_start: + smull v2.8h, v0.8b, v27.8b + smull2 v3.8h, v0.16b, v27.16b + srshl v2.8h, v2.8h, v28.8h + srshl v3.8h, v3.8h, v28.8h + saddw v2.8h, v2.8h, v1.8b + saddw2 v3.8h, v3.8h, v1.16b + sqxtn v2.8b, v2.8h + sqxtn2 v2.16b, v3.8h + ldr x30, [sp], #16 + ret +endfunc + +function add_uv_420_coeff_lag0_neon + str x30, [sp, #-16]! + ld1 {v4.16b, v5.16b}, [x19], #32 + ld1 {v6.16b, v7.16b}, [x12], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + saddlp v6.8h, v6.16b + saddlp v7.8h, v7.16b + add v4.8h, v4.8h, v6.8h + add v5.8h, v5.8h, v7.8h + rshrn v4.8b, v4.8h, #2 + rshrn2 v4.16b, v5.8h, #2 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +function add_uv_422_coeff_lag0_neon + str x30, [sp, #-16]! + ld1 {v4.16b, v5.16b}, [x19], #32 + saddlp v4.8h, v4.16b + saddlp v5.8h, v5.16b + rshrn v4.8b, v4.8h, #1 + rshrn2 v4.16b, v5.8h, #1 + and v0.16b, v4.16b, v0.16b + b add_coeff_lag0_start +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_8bpc_neon, export=1 + stp x30, x19, [sp, #-96]! + +.ifc \type, uv_444 + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH + mov x1, x2 + mul w13, w13, w14 +.endif + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add x4, x1, #FGD_AR_COEFFS_Y +.else + add x4, x1, #FGD_AR_COEFFS_UV +.endif + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + +.ifc \type, uv_444 + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne +.endif + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + +.ifc \type, uv_444 + eor w2, w2, w11 +.endif + + br x16 + + ret + +L(generate_grain_\type\()_lag0): +.ifc \type, y + mov w1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #1 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_neon + mov w1, #GRAIN_HEIGHT-3 +1: + ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 + bl get_grain_row_neon + and v0.16b, v22.16b, v29.16b + mov v1.16b, v16.16b + bl add_uv_444_coeff_lag0_neon + mov v0.16b, v23.16b + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + ld1 {v26.16b}, [x19], #16 + mov v0.16b, v24.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + add x19, x19, #2 + mov v0.16b, v25.16b + mov v1.16b, v19.16b + mov v18.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + and v0.16b, v26.16b, v30.16b + mov v1.16b, v20.16b + mov v19.16b, v2.16b + bl add_uv_444_coeff_lag0_neon + mov 
v20.16b, v2.16b + subs w1, w1, #1 + store_grain_row v16, v17, v18, v19, v20, v21 + b.gt 1b +.endif + ldp x30, x19, [sp], #96 + ret + +L(generate_grain_\type\()_lag1): + ld1r {v27.16b}, [x4], #1 // ar_coeffs_y[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_y[1] + ld1r {v29.16b}, [x4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb w4, [x4, #1] // ar_coeffs_y[4] +.else + add x4, x4, #2 +.endif + + mov w1, #3 +.ifc \type, uv_444 + ld1r {v30.16b}, [x4] // ar_coeffs_uv[5] + ldursb w4, [x4, #-1] // ar_coeffs_uv[4] +.endif + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + sum_\type\()_lag1 v22, v16, v16, v17, left + sum_\type\()_lag1 v23, v16, v17, v18 + sum_\type\()_lag1 v24, v17, v18, v19 + sum_\type\()_lag1 v25, v18, v19, v20 + sum_\type\()_lag1 v20, v19, v20, v21, right + get_grain_2 v21 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + store_grain_row v22, v23, v24, v25, v20, v21 + mov v16.16b, v22.16b + mov v17.16b, v23.16b + mov v18.16b, v24.16b + mov v19.16b, v25.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + ret + +L(generate_grain_\type\()_lag2): + ld1 {v30.16b}, [x4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x30, x19, [sp], #96 + ret + +L(generate_grain_\type\()_lag3): + ldr q29, [x4] // ar_coeffs_y[0-15] + ldr q30, [x4, #16] // ar_coeffs_y[16-23], ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_neon + + mov w1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + get_grain_2 v16 + subs w1, w1, #1 +.ifc \type, uv_444 + add x19, x19, #2 +.endif + st1 {v16.h}[0], [x0], #2 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + ret + +L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH-(3*32) +.else + sub \reg, \reg, #3*32-GRAIN_WIDTH +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_8bpc_neon, export=1 + stp x30, x19, [sp, #-96]! 
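Before the br x16 dispatch above, the uv_444 variant selects the per-plane coefficient row (the rows are 28 bytes apart, hence the uv * 28 offset) and perturbs the seed; the jump table then picks the lag0..lag3 filter from data->ar_coeff_lag. A hedged C sketch of just that setup; the surrounding Dav1dFilmGrainData layout is assumed rather than shown here:

    #include <stdint.h>

    /* Plane selection and seed perturbation for the uv_444 generator. */
    static const int8_t *setup_uv_grain(const int8_t ar_coeffs_uv[2][28],
                                        unsigned seed, int uv,
                                        unsigned *seed_out)
    {
        *seed_out = seed ^ (uv ? 0x49d8 : 0xb524);  /* eor w2, w2, w11 */
        return ar_coeffs_uv[uv];                    /* add x4, x4, uv * 28 */
    }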
+ + mov w13, w3 + mov w14, #28 + add x19, x1, #3*GRAIN_WIDTH-3 + mov x1, x2 + mul w13, w13, w14 + + movrel x3, X(gaussian_sequence) + ldr w2, [x1, #FGD_SEED] + ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] + add x4, x1, #FGD_AR_COEFFS_UV + adr x16, L(gen_grain_\type\()_tbl) + ldr w17, [x1, #FGD_AR_COEFF_LAG] + add w9, w9, #4 + ldrh w17, [x16, w17, uxtw #1] + dup v31.8h, w9 // 4 + data->grain_scale_shift + sub x16, x16, w17, uxtw + neg v31.8h, v31.8h + + cmp w13, #0 + mov w11, #0x49d8 + mov w14, #0xb524 + add x4, x4, w13, uxtw // Add offset to ar_coeffs_uv[1] + csel w11, w11, w14, ne + + ldr w7, [x1, #FGD_AR_COEFF_SHIFT] + mov w8, #1 + mov w10, #1 + lsl w8, w8, w7 // 1 << ar_coeff_shift + lsl w10, w10, w9 // 1 << (4 + data->grain_scale_shift) + lsr w8, w8, #1 // 1 << (ar_coeff_shift - 1) + lsr w10, w10, #1 // 1 << (4 + data->grain_scale_shift - 1) + mov w5, #127 + mov w6, #-128 + + eor w2, w2, w11 + + br x16 + + ret + +L(generate_grain_\type\()_lag0): + dup v28.8h, w7 + ld1r {v27.16b}, [x4] // ar_coeffs_uv[0] + movi v0.16b, #0 + movi v1.16b, #255 + ext v29.16b, v0.16b, v1.16b, #13 + ext v30.16b, v1.16b, v0.16b, #7 + neg v28.8h, v28.8h + + mov w1, #3 + bl generate_grain_rows_44_neon + set_height w1, \type +1: + bl get_grain_row_44_neon +.ifc \type, uv_420 + add x12, x19, #GRAIN_WIDTH +.endif + mov v0.16b, v29.16b + mov v1.16b, v16.16b + bl add_\type\()_coeff_lag0_neon + movi v0.16b, #255 + mov v1.16b, v17.16b + mov v16.16b, v2.16b + bl add_\type\()_coeff_lag0_neon + mov v0.16b, v30.16b + mov v1.16b, v18.16b + mov v17.16b, v2.16b + bl add_\type\()_coeff_lag0_neon + mov v18.16b, v2.16b + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v16, v17, v18 + b.gt 1b + + ldp x30, x19, [sp], #96 + ret + +L(generate_grain_\type\()_lag1): + ld1r {v27.16b}, [x4], #1 // ar_coeffs_uv[0] + ld1r {v28.16b}, [x4], #1 // ar_coeffs_uv[1] + ld1r {v29.16b}, [x4] // ar_coeffs_uv[2] + add x4, x4, #2 + + mov w1, #3 + ld1r {v30.16b}, [x4] // ar_coeffs_uv[5] + ldursb w4, [x4, #-1] // ar_coeffs_uv[4] + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + sum_\type\()_lag1 v20, v16, v16, v17, left + sum_\type\()_lag1 v21, v16, v17, v18 + sum_\type\()_lag1 v18, v17, v18, v18, right + subs w1, w1, #1 + increment_y_ptr x19, \type + store_grain_row_44 v20, v21, v18 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + b.gt 1b + + ldp x30, x19, [sp], #96 + ret + +L(generate_grain_\type\()_lag2): + ld1 {v30.16b}, [x4] // ar_coeffs_uv[0-12] + + smov w4, v30.b[10] + smov w17, v30.b[11] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag2_left_neon + bl sum_\type\()_lag2_mid_neon + bl sum_\type\()_lag2_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH-48 + b.gt 1b + + ldp x30, x19, [sp], #96 + ret + +L(generate_grain_\type\()_lag3): + ldr q29, [x4] // ar_coeffs_uv[0-15] + ldr q30, [x4, #16] // ar_coeffs_uv[16-24] + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x20, x21, [sp, #80] + + smov w4, v30.b[5] + smov w20, v30.b[6] + smov w21, v30.b[7] + + mov w1, #3 + bl generate_grain_rows_44_neon + + set_height w1, \type +1: + bl sum_\type\()_lag3_left_neon + bl sum_\type\()_lag3_mid_neon + bl sum_\type\()_lag3_right_neon + subs w1, w1, #1 + increment_y_ptr x19, \type + add x0, x0, #GRAIN_WIDTH-48 + b.gt 1b + + ldp x20, x21, [sp, #80] + ldp d14, d15, [sp, #64] + ldp d12, d13, [sp, #48] + ldp d10, d11, [sp, #32] + ldp d8, d9, [sp, #16] + ldp x30, x19, [sp], #96 + ret + 
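The subsampled generators above feed averaged luma grain into the chroma AR filter: the saddlp/add/rshrn sequences in the uv_420 and uv_422 helpers reduce 2x2 and 2x1 luma samples respectively. A plain-C sketch of those two averages:

    #include <stdint.h>

    static int avg_luma_420(int8_t a, int8_t b, int8_t c, int8_t d)
    {
        return (a + b + c + d + 2) >> 2;   /* saddlp + add + rshrn #2 */
    }

    static int avg_luma_422(int8_t a, int8_t b)
    {
        return (a + b + 1) >> 1;           /* saddlp + rshrn #1 */
    }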
+L(gen_grain_\type\()_tbl): + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) + .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 + +.macro gather_interleaved dst1, dst2, src1, src2, off + umov w14, \src1[0+\off] + umov w15, \src2[8+\off] + umov w16, \src1[2+\off] + add x14, x14, x3 + umov w17, \src2[10+\off] + add x15, x15, x3 + ld1 {\dst1}[0+\off], [x14] + umov w14, \src1[4+\off] + add x16, x16, x3 + ld1 {\dst2}[8+\off], [x15] + umov w15, \src2[12+\off] + add x17, x17, x3 + ld1 {\dst1}[2+\off], [x16] + umov w16, \src1[6+\off] + add x14, x14, x3 + ld1 {\dst2}[10+\off], [x17] + umov w17, \src2[14+\off] + add x15, x15, x3 + ld1 {\dst1}[4+\off], [x14] + add x16, x16, x3 + ld1 {\dst2}[12+\off], [x15] + add x17, x17, x3 + ld1 {\dst1}[6+\off], [x16] + ld1 {\dst2}[14+\off], [x17] +.endm + +.macro gather dst1, dst2, src1, src2 + gather_interleaved \dst1, \dst2, \src1, \src2, 0 + gather_interleaved \dst2, \dst1, \src2, \src1, 0 + gather_interleaved \dst1, \dst2, \src1, \src2, 1 + gather_interleaved \dst2, \dst1, \src2, \src1, 1 +.endm + +function gather32_neon + gather v4.b, v5.b, v0.b, v1.b + ret +endfunc + +function gather16_neon + gather_interleaved v4.b, v5.b, v0.b, v0.b, 0 + gather_interleaved v4.b, v5.b, v0.b, v0.b, 1 + ins v4.d[1], v5.d[1] + ret +endfunc + +const overlap_coeffs_0, align=4 + .byte 27, 17, 0, 0, 0, 0, 0, 0 + .byte 17, 27, 32, 32, 32, 32, 32, 32 +endconst + +const overlap_coeffs_1, align=4 + .byte 23, 0, 0, 0, 0, 0, 0, 0 + .byte 22, 32, 32, 32, 32, 32, 32, 32 +endconst + +.macro calc_offset offx, offy, src, sx, sy + and \offy, \src, #0xF // randval & 0xF + lsr \offx, \src, #4 // randval >> 4 +.if \sy == 0 + add \offy, \offy, \offy // 2 * (randval & 0xF) +.endif +.if \sx == 0 + add \offx, \offx, \offx // 2 * (randval >> 4) +.endif +.endm + +.macro add_offset dst, offx, offy, src, stride + madd \dst, \stride, \offy, \src // grain_lut += grain_stride * offy + add \dst, \dst, \offx, uxtw // grain_lut += offx +.endm + +// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const int scaling_shift, +// const entry grain_lut[][GRAIN_WIDTH], +// const int offsets[][2], +// const int h, const ptrdiff_t clip, +// const ptrdiff_t type); +function fgy_32x32_8bpc_neon, export=1 + str x30, [sp, #-16]! 
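The calc_offset/add_offset macros defined above split each offsets[][] entry into grain x/y offsets and advance the grain_lut pointer, per their inline comments. A C sketch of the same computation for orientation; the entry typedef and the helper name are placeholders (the 8bpc grain LUT element is a signed byte in this file):

    #include <stddef.h>
    #include <stdint.h>

    typedef int8_t entry;   /* element type named in the prototype above */

    static const entry *grain_lut_at(const entry *grain_lut, ptrdiff_t grain_stride,
                                     unsigned randval, int sx, int sy)
    {
        int offy = randval & 0xF;       /* and  \offy, \src, #0xF */
        int offx = randval >> 4;        /* lsr  \offx, \src, #4   */
        if (!sy) offy *= 2;             /* 2 * (randval & 0xF)    */
        if (!sx) offx *= 2;             /* 2 * (randval >> 4)     */
        return grain_lut + offy * grain_stride + offx;   /* madd, then add */
    }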
+ ldr w11, [x6, #8] // offsets[1][0] + ldr w13, [x6, #4] // offsets[0][1] + ldr w15, [x6, #12] // offsets[1][1] + ldr w6, [x6] // offsets[0][0] + ldr w8, [sp, #16] // clip + mov x9, #GRAIN_WIDTH // grain_lut stride + + neg w4, w4 + dup v29.8h, w4 // -scaling_shift + + movrel x16, overlap_coeffs_0 + + cbz w8, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + + add x5, x5, #9 // grain_lut += 9 + add x5, x5, x9, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x9 // grain_lut += grain_stride + + calc_offset w11, w12, w11, 0, 0 + calc_offset w13, w14, w13, 0, 0 + calc_offset w15, w16, w15, 0, 0 + calc_offset w6, w10, w6, 0, 0 + + add_offset x12, w11, x12, x5, x9 + add_offset x14, w13, x14, x5, x9 + add_offset x16, w15, x16, x5, x9 + add_offset x5, w6, x10, x5, x9 + + ldr w11, [sp, #24] // type + adr x13, L(fgy_loop_tbl) + + add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + + tst w11, #1 + ldrh w11, [x13, w11, uxtw #1] + + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx + + sub x11, x13, w11, uxtw + + b.eq 1f + // y overlap + dup v6.16b, v27.b[0] + dup v7.16b, v27.b[1] + mov w10, w7 // backup actual h + mov w7, #2 +1: + br x11 +endfunc + +function fgy_loop_neon +.macro fgy ox, oy +L(loop_\ox\oy): +1: + ld1 {v0.16b, v1.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x9 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x6], x9 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x8], x9 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v7.8b +.else + smull v16.8h, v18.8b, v7.8b +.endif + smull2 v17.8h, v18.16b, v7.16b + smull v18.8h, v19.8b, v7.8b + smull2 v19.8h, v19.16b, v7.16b +.if \ox + smlal v16.8h, v21.8b, v6.8b +.else + smlal v16.8h, v22.8b, v6.8b +.endif + smlal2 v17.8h, v22.16b, v6.16b + smlal v18.8h, v23.8b, v6.8b + smlal2 v19.8h, v23.16b, v6.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v0.8b // *src + noise + uaddw2 v17.8h, v17.8h, v0.16b + uaddw v18.8h, v18.8h, v1.8b + uaddw2 v19.8h, v19.8h, v1.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax v0.16b, v0.16b, v30.16b + 
umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w7, w7, #1 +.if \oy + dup v6.16b, v28.b[0] + dup v7.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w10, #2 + sub w7, w10, #2 // restore actual remaining h + b.gt L(loop_\ox\()0) +.endif + ldr x30, [sp], #16 + ret +.endm + + fgy 0, 0 + fgy 0, 1 + fgy 1, 0 + fgy 1, 1 + +L(fgy_loop_tbl): + .hword L(fgy_loop_tbl) - L(loop_00) + .hword L(fgy_loop_tbl) - L(loop_01) + .hword L(fgy_loop_tbl) - L(loop_10) + .hword L(fgy_loop_tbl) - L(loop_11) +endfunc + +// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst, +// const pixel *const src, +// const ptrdiff_t stride, +// const uint8_t scaling[SCALING_SIZE], +// const Dav1dFilmGrainData *const data, +// const entry grain_lut[][GRAIN_WIDTH], +// const pixel *const luma_row, +// const ptrdiff_t luma_stride, +// const int offsets[][2], +// const ptrdiff_t h, const ptrdiff_t uv, +// const ptrdiff_t is_id, +// const ptrdiff_t type); +.macro fguv layout, sx, sy +function fguv_32x32_\layout\()_8bpc_neon, export=1 + str x30, [sp, #-32]! + str d8, [sp, #16] + ldp x8, x9, [sp, #32] // offsets, h + ldp x10, x11, [sp, #48] // uv, is_id + + ldr w13, [x4, #FGD_SCALING_SHIFT] + ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE] + neg w13, w13 // -scaling_shift + + // !csfl + add x10, x4, x10, lsl #2 // + 4*uv + add x14, x10, #FGD_UV_LUMA_MULT + add x15, x10, #FGD_UV_MULT + add x10, x10, #FGD_UV_OFFSET + ld1 {v8.h}[0], [x14] // uv_luma_mult + ld1r {v24.8h}, [x10] // uv_offset + ld1 {v8.h}[1], [x15] // uv_mult + + dup v29.8h, w13 // -scaling_shift + + cbz w12, 1f + // clip + movi v30.16b, #16 + movi v31.16b, #240 + cbz w11, 2f + // is_id + movi v31.16b, #235 + b 2f +1: + // no clip + movi v30.16b, #0 + movi v31.16b, #255 +2: + + ldr w12, [x8, #8] // offsets[1][0] + ldr w14, [x8, #4] // offsets[0][1] + ldr w16, [x8, #12] // offsets[1][1] + ldr w8, [x8] // offsets[0][0] + + mov x10, #GRAIN_WIDTH // grain_lut stride + + add x5, x5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6 +.if \sy + add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride + add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride +.else + add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride + add x5, x5, x10 // grain_lut += grain_stride +.endif + + calc_offset w12, w13, w12, \sx, \sy + calc_offset w14, w15, w14, \sx, \sy + calc_offset w16, w17, w16, \sx, \sy + calc_offset w8, w11, w8, \sx, \sy + + add_offset x13, w12, x13, x5, x10 + add_offset x15, w14, x15, x5, x10 + add_offset x17, w16, x17, x5, x10 + add_offset x5, w8, x11, x5, x10 + + add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by + add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + + ldr w13, [sp, #64] // type + + movrel x16, overlap_coeffs_\sx + adr x14, L(fguv_loop_sx\sx\()_tbl) + + ld1 {v27.8b, v28.8b}, [x16] // overlap_coeffs + tst w13, #1 + ldrh w13, [x14, w13, uxtw #1] + + b.eq 1f + // y overlap + sub w12, w9, #(2 >> \sy) // backup remaining h + mov w9, #(2 >> \sy) + +1: + sub x13, x14, w13, uxtw + +.if \sy + movi v25.16b, #23 + movi v26.16b, #22 +.else + movi v25.16b, #27 + movi v26.16b, #17 +.endif + +.if \sy + add x7, x7, x7 // luma_stride *= 2 +.endif + + br x13 +endfunc +.endm + +fguv 420, 1, 1 +fguv 422, 1, 0 +fguv 444, 0, 0 + +function fguv_loop_sx0_neon +.macro fguv_loop_sx0 csfl, ox, oy 
+L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b, v7.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b, v23.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b, v19.16b}, [x5], x10 // grain_lut + +.if !\csfl + uxtl v2.8h, v0.8b + uxtl2 v3.8h, v0.16b + uxtl v4.8h, v1.8b + uxtl2 v5.8h, v1.16b + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + uxtl v16.8h, v7.8b + uxtl2 v17.8h, v7.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v4.8h, v4.8h, v8.h[0] + mul v5.8h, v5.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + mul v16.8h, v16.8h, v8.h[1] + mul v17.8h, v17.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v17.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + sshr v4.8h, v4.8h, #6 + sshr v5.8h, v5.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + add v4.8h, v4.8h, v24.8h + add v5.8h, v5.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h + sqxtun v1.8b, v4.8h + sqxtun2 v1.16b, v5.8h +.endif + + bl gather32_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b + smull v18.8h, v19.8b, v26.8b + smull2 v19.8h, v19.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + smlal v18.8h, v23.8b, v25.8b + smlal2 v19.8h, v23.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 + sqrshrn v23.8b, v18.8h, #5 + sqrshrn2 v23.16b, v19.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b + sxtl v18.8h, v23.8b + sxtl2 v19.8h, v23.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b + sxtl v18.8h, v19.8b + sxtl2 v19.8h, v19.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + uxtl v4.8h, v5.8b + uxtl2 v5.8h, v5.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + mul v18.8h, v18.8h, v4.8h + mul v19.8h, v19.8h, v5.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + srshl v18.8h, v18.8h, v29.8h + srshl v19.8h, v19.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + uaddw v18.8h, v18.8h, v7.8b + uaddw2 v19.8h, v19.8h, v7.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + sqxtun v1.8b, v18.8h + sqxtun2 v1.16b, v19.8h + + umax v0.16b, v0.16b, v30.16b + umax v1.16b, v1.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + umin v1.16b, v1.16b, v31.16b + + subs w9, w9, #1 +.if \oy + dup v25.16b, v28.b[0] + dup v26.16b, v28.b[1] +.endif + st1 {v0.16b, v1.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0) +.endif + b 9f +.endm + fguv_loop_sx0 0, 0, 0 + fguv_loop_sx0 0, 0, 1 + fguv_loop_sx0 0, 1, 0 + fguv_loop_sx0 0, 1, 1 + fguv_loop_sx0 1, 0, 0 + fguv_loop_sx0 1, 0, 1 + fguv_loop_sx0 1, 1, 0 + fguv_loop_sx0 1, 1, 1 + +9: + 
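Setting aside the edge-overlap blending, the fgy/fguv inner loops above reduce to the per-pixel formula spelled out in their comments: round2(scaling * grain, scaling_shift) added to the source and clamped. For chroma with chroma-scaling-from-luma disabled, the scaling[] index is first remapped through uv_luma_mult/uv_mult/uv_offset (the mul/sqadd/sshr #6 sequence); for luma the index is simply the source pixel. A hedged C sketch, with illustrative helper names; mn/mx are 0/255 without clip, 16/235 for luma or is_id, and 16/240 for clipped chroma:

    #include <stdint.h>

    static int clip_int(int v, int lo, int hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Grain application for one pixel, overlap blending omitted. */
    static uint8_t apply_grain_px(uint8_t idx, uint8_t src, int grain,
                                  const uint8_t *scaling, int scaling_shift,
                                  int mn, int mx)
    {
        const int noise = (scaling[idx] * grain + (1 << (scaling_shift - 1)))
                          >> scaling_shift;            /* round2(scaling * grain, scaling_shift) */
        return (uint8_t)clip_int(src + noise, mn, mx); /* *src + noise, then umax/umin */
    }

    /* scaling[] index for chroma when chroma-scaling-from-luma is off. */
    static uint8_t fguv_scaling_index(int luma, int uv, int uv_luma_mult,
                                      int uv_mult, int uv_offset)
    {
        const int v = ((luma * uv_luma_mult + uv * uv_mult) >> 6) + uv_offset;
        return (uint8_t)clip_int(v, 0, 255);           /* sqxtun saturation */
    }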
ldr d8, [sp, #16] + ldr x30, [sp], #32 + ret + +L(fguv_loop_sx0_tbl): + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) + .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) +endfunc + +function fguv_loop_sx1_neon +.macro fguv_loop_sx1 csfl, ox, oy +L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): +1: + ld1 {v0.16b, v1.16b}, [x6], x7 // luma + ld1 {v6.16b}, [x1], x2 // src +.if \ox + ld1 {v20.8b}, [x4], x10 // grain_lut old +.endif +.if \oy + ld1 {v22.16b}, [x8], x10 // grain_lut top +.endif +.if \ox && \oy + ld1 {v21.8b}, [x11], x10 // grain_lut top old +.endif + ld1 {v18.16b}, [x5], x10 // grain_lut + + uaddlp v2.8h, v0.16b + uaddlp v3.8h, v1.16b +.if \csfl + rshrn v0.8b, v2.8h, #1 + rshrn2 v0.16b, v3.8h, #1 +.else + urshr v2.8h, v2.8h, #1 + urshr v3.8h, v3.8h, #1 + uxtl v0.8h, v6.8b + uxtl2 v1.8h, v6.16b + mul v2.8h, v2.8h, v8.h[0] + mul v3.8h, v3.8h, v8.h[0] + mul v0.8h, v0.8h, v8.h[1] + mul v1.8h, v1.8h, v8.h[1] + sqadd v2.8h, v2.8h, v0.8h + sqadd v3.8h, v3.8h, v1.8h + sshr v2.8h, v2.8h, #6 + sshr v3.8h, v3.8h, #6 + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v24.8h + sqxtun v0.8b, v2.8h + sqxtun2 v0.16b, v3.8h +.endif + + bl gather16_neon + +.if \ox + smull v20.8h, v20.8b, v27.8b + smlal v20.8h, v18.8b, v28.8b +.endif + +.if \oy +.if \ox + smull v21.8h, v21.8b, v27.8b + smlal v21.8h, v22.8b, v28.8b + sqrshrn v20.8b, v20.8h, #5 + sqrshrn v21.8b, v21.8h, #5 +.endif + +.if \ox + smull v16.8h, v20.8b, v26.8b +.else + smull v16.8h, v18.8b, v26.8b +.endif + smull2 v17.8h, v18.16b, v26.16b +.if \ox + smlal v16.8h, v21.8b, v25.8b +.else + smlal v16.8h, v22.8b, v25.8b +.endif + smlal2 v17.8h, v22.16b, v25.16b + sqrshrn v22.8b, v16.8h, #5 + sqrshrn2 v22.16b, v17.8h, #5 +.endif + + // sxtl of grain +.if \oy + sxtl v16.8h, v22.8b + sxtl2 v17.8h, v22.16b +.elseif \ox + sqrshrn v20.8b, v20.8h, #5 + sxtl2 v17.8h, v18.16b + sxtl v16.8h, v20.8b +.else + sxtl v16.8h, v18.8b + sxtl2 v17.8h, v18.16b +.endif + + uxtl v2.8h, v4.8b // scaling + uxtl2 v3.8h, v4.16b + + mul v16.8h, v16.8h, v2.8h // scaling * grain + mul v17.8h, v17.8h, v3.8h + + srshl v16.8h, v16.8h, v29.8h // round2(scaling * grain, scaling_shift) + srshl v17.8h, v17.8h, v29.8h + + uaddw v16.8h, v16.8h, v6.8b // *src + noise + uaddw2 v17.8h, v17.8h, v6.16b + + sqxtun v0.8b, v16.8h + sqxtun2 v0.16b, v17.8h + + umax v0.16b, v0.16b, v30.16b + umin v0.16b, v0.16b, v31.16b + +.if \oy + mov v16.16b, v25.16b +.endif + subs w9, w9, #1 +.if \oy + mov v25.16b, v26.16b + mov v26.16b, v16.16b +.endif + st1 {v0.16b}, [x0], x2 // dst + b.gt 1b + +.if \oy + cmp w12, #0 + mov w9, w12 // restore actual remaining h + b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0) +.endif + + b 9f +.endm + fguv_loop_sx1 0, 0, 0 + fguv_loop_sx1 0, 0, 1 + fguv_loop_sx1 0, 1, 0 + fguv_loop_sx1 0, 1, 1 + fguv_loop_sx1 1, 0, 0 + fguv_loop_sx1 1, 0, 1 + fguv_loop_sx1 1, 1, 0 + fguv_loop_sx1 1, 1, 1 + +9: + ldr d8, [sp, #16] + ldr x30, [sp], #32 + ret + +L(fguv_loop_sx1_tbl): + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) + .hword L(fguv_loop_sx1_tbl) - 
L(fguv_loop_sx1_csfl1_00) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) + .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) +endfunc diff -Nru dav1d-0.7.1/src/arm/64/ipred16.S dav1d-0.9.1/src/arm/64/ipred16.S --- dav1d-0.7.1/src/arm/64/ipred16.S 2020-06-21 11:48:54.964126300 +0000 +++ dav1d-0.9.1/src/arm/64/ipred16.S 2021-07-28 21:38:28.869851800 +0000 @@ -562,9 +562,9 @@ L(ipred_dc_h4): ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h + add x2, x2, #2 br x3 L(ipred_dc_w4): - add x2, x2, #2 ld1 {v1.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.4h @@ -594,9 +594,9 @@ L(ipred_dc_h8): ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h + add x2, x2, #2 br x3 L(ipred_dc_w8): - add x2, x2, #2 ld1 {v1.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.8h @@ -626,10 +626,10 @@ L(ipred_dc_h16): ld1 {v0.8h, v1.8h}, [x2], #32 addp v0.8h, v0.8h, v1.8h + add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w16): - add x2, x2, #2 ld1 {v1.8h, v2.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -663,10 +663,10 @@ addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h + add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w32): - add x2, x2, #2 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -709,10 +709,10 @@ addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h + add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w64): - add x2, x2, #2 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -920,10 +920,10 @@ lsl x1, x1, #1 br x5 40: - sub x2, x2, #8 - mov x7, #-8 ld1r {v6.2d}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor + sub x2, x2, #8 + mov x7, #-8 dup v5.8h, v6.h[3] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor @@ -963,10 +963,10 @@ b.gt 4b ret 80: - sub x2, x2, #8 - mov x7, #-8 ld1 {v6.8h}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor + sub x2, x2, #8 + mov x7, #-8 dup v5.8h, v6.h[7] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor @@ -1382,7 +1382,9 @@ sxtl v21.8h, v21.8b sxtl v22.8h, v22.8b dup v31.8h, w8 +.if \bpc == 10 movi v30.8h, #0 +.endif br x5 40: ldur d0, [x2, #2] // top (0-3) @@ -1421,7 +1423,6 @@ smin v2.8h, v2.8h, v31.8h subs w4, w4, #2 st1 {v2.d}[0], [x0], x1 - uxtl v0.8h, v2.8b ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] st1 {v2.d}[1], [x6], x1 b.gt 4b @@ -2125,7 +2126,7 @@ dup v16.4s, w8 // width + height adr x7, L(ipred_cfl_tbl) rbit w8, w8 // rbit(width + height) - sub w9, w9, #22 // 22 leading bits, minus table offset 4 + sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) ldrh w9, [x7, w9, uxtw #1] @@ -2143,9 +2144,9 @@ L(ipred_cfl_h4): ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h + add x2, x2, #2 br x9 L(ipred_cfl_w4): - add x2, x2, #2 ld1 {v2.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.4h @@ -2168,9 +2169,9 @@ L(ipred_cfl_h8): ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h + add x2, x2, #2 br x9 L(ipred_cfl_w8): - add x2, x2, #2 ld1 {v2.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.8h @@ -2193,10 +2194,10 @@ L(ipred_cfl_h16): ld1 {v2.8h, v3.8h}, [x2], #32 addp v0.8h, v2.8h, v3.8h + add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w16): - add x2, x2, #2 ld1 {v2.8h, v3.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v2.8h, v2.8h, v3.8h @@ -2222,10 +2223,10 @@ addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h + add x2, x2, #2 uaddlv s0, v0.8h br 
x9 L(ipred_cfl_w32): - add x2, x2, #2 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] add v0.4s, v0.4s, v16.4s addp v2.8h, v2.8h, v3.8h @@ -2398,7 +2399,6 @@ // Double the height and reuse the w4 summing/subtracting lsl w6, w6, #1 - lsl w9, w9, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): @@ -2547,7 +2547,6 @@ b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b - b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f @@ -2576,7 +2575,6 @@ // Quadruple the height and reuse the w4 summing/subtracting lsl w6, w6, #2 - lsl w9, w9, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_tbl): @@ -2832,3 +2830,248 @@ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) endfunc + +// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_444_tbl) + sub w8, w8, #26 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + ld1 {v0.4h}, [x1], x2 + ld1 {v0.d}[1], [x10], x2 + ld1 {v1.4h}, [x1], x2 + ld1 {v1.d}[1], [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + ld1 {v3.8h}, [x10], x2 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cbnz w3, L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + shl v0.8h, v0.8h, #3 + shl v2.8h, v2.8h, #3 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, 
v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr x7, L(ipred_cfl_ac_444_w32_tbl) + ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + lsr x2, x2, #1 // Restore the stride to one line increments + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_444_w32_wpad0): +1: // Copy and expand input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + shl v2.8h, v2.8h, #3 + shl v3.8h, v3.8h, #3 + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): +1: // Copy and expand input, padding 8 + ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 + shl v2.8h, v2.8h, #3 + shl v0.8h, v0.8h, #3 + shl v1.8h, v1.8h, #3 + dup v3.8h, v2.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + ld1 {v0.8h, v1.8h}, [x1], x2 + shl v1.8h, v1.8h, #3 + shl v0.8h, v0.8h, #3 + dup v2.8h, v1.h[7] + dup v3.8h, v1.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + ld1 {v0.8h}, [x1], x2 + shl v0.8h, v0.8h, #3 + dup v1.8h, v0.h[7] + dup v2.8h, v0.h[7] + dup v3.8h, v0.h[7] + subs w8, w8, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + +L(ipred_cfl_ac_444_w32_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl w6, w6, #3 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_444_tbl): + .hword 
L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + +L(ipred_cfl_ac_444_w32_tbl): + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) +endfunc diff -Nru dav1d-0.7.1/src/arm/64/ipred.S dav1d-0.9.1/src/arm/64/ipred.S --- dav1d-0.7.1/src/arm/64/ipred.S 2020-06-21 11:48:54.964126300 +0000 +++ dav1d-0.9.1/src/arm/64/ipred.S 2021-07-28 21:38:28.865851900 +0000 @@ -502,9 +502,9 @@ ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr uaddlv h0, v0.8b + add x2, x2, #1 br x3 L(ipred_dc_w4): - add x2, x2, #1 ld1 {v1.s}[0], [x2] ins v1.s[1], wzr add v0.4h, v0.4h, v16.4h @@ -534,9 +534,9 @@ L(ipred_dc_h8): ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b + add x2, x2, #1 br x3 L(ipred_dc_w8): - add x2, x2, #1 ld1 {v1.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.8b @@ -565,9 +565,9 @@ L(ipred_dc_h16): ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b + add x2, x2, #1 br x3 L(ipred_dc_w16): - add x2, x2, #1 ld1 {v1.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -597,10 +597,10 @@ ld1 {v0.16b, v1.16b}, [x2], #32 uaddlv h0, v0.16b uaddlv h1, v1.16b + add x2, x2, #1 add v0.4h, v0.4h, v1.4h br x3 L(ipred_dc_w32): - add x2, x2, #1 ld1 {v1.16b, v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -637,10 +637,10 @@ uaddlv h3, v3.16b add v0.4h, v0.4h, v1.4h add v2.4h, v2.4h, v3.4h + add x2, x2, #1 add v0.4h, v0.4h, v2.4h br x3 L(ipred_dc_w64): - add x2, x2, #1 ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -884,10 +884,10 @@ lsl x1, x1, #1 br x5 40: - sub x2, x2, #4 - mov x7, #-4 ld1r {v6.2s}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor + sub x2, x2, #4 + mov x7, #-4 dup v5.16b, v6.b[3] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor @@ -922,10 +922,10 @@ b.gt 4b ret 80: - sub x2, x2, #4 - mov x7, #-4 ld1 {v6.8b}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor + sub x2, x2, #4 + mov x7, #-4 dup v5.16b, v6.b[7] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor @@ -1460,12 +1460,14 @@ subs w3, w3, #16 sqrshrun v6.8b, v6.8h, #4 - ins v0.h[2], v2.h[7] st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 - ins v0.b[0], v6.b[7] st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 + b.le 8f + ins v0.h[2], v2.h[7] + ins v0.b[0], v6.b[7] ins v0.b[2], v6.b[3] - b.gt 2b + b 2b +8: subs w4, w4, #2 b.le 9f sub x8, x6, w9, uxtw @@ -1815,7 +1817,7 @@ dup v16.8h, w8 // width + height adr x7, L(ipred_cfl_tbl) rbit w8, w8 // rbit(width + height) - sub w9, w9, #22 // 22 leading bits, minus table offset 4 + sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) ldrh w9, [x7, w9, uxtw #1] @@ -1832,10 +1834,10 @@ L(ipred_cfl_h4): ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr + add x2, x2, #1 uaddlv h0, v0.8b br x9 L(ipred_cfl_w4): - add x2, x2, #1 ld1 {v2.s}[0], [x2] ins v2.s[1], wzr add v0.4h, v0.4h, v16.4h @@ -1858,9 +1860,9 @@ L(ipred_cfl_h8): ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b + add x2, x2, #1 br x9 L(ipred_cfl_w8): - add x2, x2, #1 ld1 {v2.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.8b @@ -1882,9 +1884,9 @@ L(ipred_cfl_h16): ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b + add x2, x2, #1 br x9 
L(ipred_cfl_w16): - add x2, x2, #1 ld1 {v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b @@ -1907,10 +1909,10 @@ ld1 {v2.16b, v3.16b}, [x2], #32 uaddlv h2, v2.16b uaddlv h3, v3.16b + add x2, x2, #1 add v0.4h, v2.4h, v3.4h br x9 L(ipred_cfl_w32): - add x2, x2, #1 ld1 {v2.16b, v3.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b @@ -2078,6 +2080,7 @@ sub x0, x0, w6, uxtw #4 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] +L(ipred_cfl_ac_420_w8_subtract_dc): 6: // Subtract dc from ac ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w6, w6, #4 @@ -2223,7 +2226,6 @@ b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b - b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f @@ -2244,7 +2246,6 @@ // Double the height and reuse the w8 summing/subtracting lsl w6, w6, #1 - lsl w9, w9, #1 b L(ipred_cfl_ac_420_w8_calc_subtract_dc) L(ipred_cfl_ac_420_tbl): @@ -2474,3 +2475,290 @@ .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) endfunc + +// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_8bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_444_tbl) + sub w8, w8, #26 + ldrh w8, [x7, w8, uxtw #1] + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + ld1 {v0.s}[0], [x1], x2 + ld1 {v0.s}[1], [x10], x2 + ld1 {v1.s}[0], [x1], x2 + ld1 {v1.s}[1], [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v1.8h, v1.8b, #3 + subs w8, w8, #4 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x10], x2 + ld1 {v2.8b}, [x1], x2 + ushll v0.8h, v0.8b, #3 + ld1 {v3.8b}, [x10], x2 + ushll v1.8h, v1.8b, #3 + ushll v2.8h, v2.8b, #3 + ushll v3.8h, v3.8b, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cbnz w3, L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + ld1 {v0.16b}, [x1], x2 + ld1 {v2.16b}, [x10], x2 + ld1 {v4.16b}, [x1], x2 + ushll2 v1.8h, v0.16b, #3 + ushll v0.8h, v0.8b, #3 + ld1 {v6.16b}, [x10], x2 + ushll2 v3.8h, v2.16b, #3 + ushll v2.8h, v2.8b, #3 + ushll2 v5.8h, v4.16b, #3 + ushll v4.8h, v4.8b, #3 + ushll2 v7.8h, v6.16b, #3 + ushll v6.8h, v6.8b, #3 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + mov v0.16b, v6.16b + mov v1.16b, v7.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b 
+ b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + ld1 {v0.8b}, [x1], x2 + ld1 {v2.8b}, [x10], x2 + ld1 {v4.8b}, [x1], x2 + ld1 {v6.8b}, [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v2.8h, v2.8b, #3 + ushll v4.8h, v4.8b, #3 + ushll v6.8h, v6.8b, #3 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + dup v5.8h, v4.h[7] + dup v7.8h, v6.h[7] + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + mov v0.16b, v6.16b + mov v1.16b, v7.16b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr x7, L(ipred_cfl_ac_444_w32_tbl) + ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_444_w32_wpad0): +1: // Copy and expand input, without padding + ld1 {v2.16b, v3.16b}, [x1], x2 + ld1 {v6.16b, v7.16b}, [x10], x2 + ushll v0.8h, v2.8b, #3 + ushll2 v1.8h, v2.16b, #3 + ushll v2.8h, v3.8b, #3 + ushll2 v3.8h, v3.16b, #3 + ushll v4.8h, v6.8b, #3 + ushll2 v5.8h, v6.16b, #3 + ushll v6.8h, v7.8b, #3 + ushll2 v7.8h, v7.16b, #3 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): +1: // Copy and expand input, padding 8 + ldr d2, [x1, #16] + ld1 {v1.16b}, [x1], x2 + ldr d6, [x10, #16] + ld1 {v5.16b}, [x10], x2 + ushll v2.8h, v2.8b, #3 + ushll v0.8h, v1.8b, #3 + ushll2 v1.8h, v1.16b, #3 + ushll v6.8h, v6.8b, #3 + ushll v4.8h, v5.8b, #3 + ushll2 v5.8h, v5.16b, #3 + dup v3.8h, v2.h[7] + dup v7.8h, v6.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + ld1 {v1.16b}, [x1], x2 + ld1 {v5.16b}, [x10], x2 + ushll v0.8h, v1.8b, #3 + ushll2 v1.8h, v1.16b, #3 + ushll v4.8h, v5.8b, #3 + ushll2 v5.8h, v5.16b, #3 + dup v2.8h, v1.h[7] + dup v3.8h, v1.h[7] + dup v6.8h, v5.h[7] + dup v7.8h, v5.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + ld1 {v0.8b}, [x1], x2 + ld1 {v4.8b}, [x10], x2 + ushll v0.8h, v0.8b, #3 + ushll v4.8h, v4.8b, #3 + dup v1.8h, v0.h[7] + dup v2.8h, v0.h[7] + dup v3.8h, v0.h[7] + dup v5.8h, v4.h[7] + dup v6.8h, v4.h[7] + dup v7.8h, v4.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add 
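The new 4:4:4 cfl_ac functions above follow the same pattern as the 4:2:0/4:2:2 ones: scale the luma samples by 8 (ushll/shl #3), accumulate a sum, then subtract the rounded average, with log2sz = ctz(width) + ctz(height) as computed via rbit + clz. Ignoring the NEON blocking and the wpad/hpad edge replication, a C sketch of the net effect:

    #include <stddef.h>
    #include <stdint.h>

    /* __builtin_ctz is the gcc/clang builtin; width/height are powers of two. */
    static void cfl_ac_444_sketch(int16_t *ac, const uint8_t *ypx,
                                  ptrdiff_t stride, int width, int height)
    {
        int sum = 0;
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++) {
                const int v = ypx[y * stride + x] << 3;   /* ushll #3 */
                ac[y * width + x] = (int16_t)v;
                sum += v;
            }
        const int log2sz = __builtin_ctz(width) + __builtin_ctz(height);
        const int dc = (sum + (1 << (log2sz - 1))) >> log2sz;
        for (int i = 0; i < width * height; i++)
            ac[i] -= (int16_t)dc;                         /* "Subtract dc from ac" */
    }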
v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 1b + +L(ipred_cfl_ac_444_w32_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + add v16.8h, v16.8h, v4.8h + add v17.8h, v17.8h, v5.8h + add v18.8h, v18.8h, v6.8h + add v19.8h, v19.8h, v7.8h + b.gt 2b +3: + + // Quadruple the height and reuse the w8 subtracting + lsl w6, w6, #2 + // Aggregate the sums, with wider intermediates earlier than in + // ipred_cfl_ac_420_w8_calc_subtract_dc. + uaddlp v0.4s, v16.8h + uaddlp v1.4s, v17.8h + uaddlp v2.4s, v18.8h + uaddlp v3.4s, v19.8h + add v0.4s, v0.4s, v1.4s + add v2.4s, v2.4s, v3.4s + add v0.4s, v0.4s, v2.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #4 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] + b L(ipred_cfl_ac_420_w8_subtract_dc) + +L(ipred_cfl_ac_444_tbl): + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) + .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + +L(ipred_cfl_ac_444_w32_tbl): + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) + .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) +endfunc diff -Nru dav1d-0.7.1/src/arm/64/itx16.S dav1d-0.9.1/src/arm/64/itx16.S --- dav1d-0.7.1/src/arm/64/itx16.S 2020-06-21 11:48:54.964126300 +0000 +++ dav1d-0.9.1/src/arm/64/itx16.S 2021-07-28 21:38:28.869851800 +0000 @@ -124,7 +124,7 @@ .endif .endm -.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 .endif @@ -132,10 +132,7 @@ srshr \shift, \shift, #\shiftbits .endif .ifnb \addsrc - sqadd \adddst, \adddst, \addsrc -.endif -.ifnb \max - smax \max, \max, v6.8h + usqadd \adddst, \addsrc .endif .ifnb \min smin \min, \min, v7.8h @@ -146,63 +143,57 @@ .endm .macro load_add_store_8x16 dst, src mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store v2.8h, v16.8h, , , , , , \dst, \src - load_add_store v3.8h, v17.8h, , , , , , \dst, \src - load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src - load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src - load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src - load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src - load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src - load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src - load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src - load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src - load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src - load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src - load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, 
v24.8h, v23.8h, \dst, \src - load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src - load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src - load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src - load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src - load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src - load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src - load_add_store , , , , , v31.8h, v30.8h, \dst, \src - load_add_store , , , , , , v31.8h, \dst, \src + load_add_store v2.8h, v16.8h, , , , , \dst, \src + load_add_store v3.8h, v17.8h, , , , , \dst, \src + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src + load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src + load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src + load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src + load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src + load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src + load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src + load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src + load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src + load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src + load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src + load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src + load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src + load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src + load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src + load_add_store , , , , v27.8h, v26.8h, \dst, \src + load_add_store , , , , , v27.8h, \dst, \src .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits - load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits - load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits - load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits - load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits - load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits - load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits - load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits - load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits - load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits - load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits - load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits - load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits + load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits + load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits + load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits + load_add_store v18.8h, 
v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits + load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits + load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , , , , v19.8h, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src, shiftbits=4 mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits - load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits - load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits - load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits - load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits - load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits - load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits - load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits - load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits + load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits + load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits + load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits + load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits + load_add_store , , , , , v5.8h, \dst, \src, \shiftbits .endm -.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src .ifnb \load ld1 {\load}[0], [\src], x1 .endif @@ -216,14 +207,11 @@ ld1 {\load}[1], [\src], x1 .endif .ifnb \addsrc - sqadd \adddst, \adddst, \addsrc + usqadd \adddst, \addsrc .endif .ifnb \store st1 {\store}[0], [\dst], x1 .endif -.ifnb \max - smax \max, \max, v6.8h -.endif .ifnb \min smin \min, \min, v7.8h .endif @@ -233,37 +221,33 @@ .endm .macro load_add_store_4x16 dst, src mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src - load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src - load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src - load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src - load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src - load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src - load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src - load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src - load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src - load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src - load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src - load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src - load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src - load_add_store4 , , , , , , , , v30.d, \dst, \src + load_add_store4 v0.d, v17, v16, , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src + 
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src + load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src + load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src + load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src + load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src + load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src + load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src + load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src + load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src + load_add_store4 , , , , , , , v23.d, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src - load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src - load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src - load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src - load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src - load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src - load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src - load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src - load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src - load_add_store4 , , , , , , , , v22.d, \dst, \src + load_add_store4 v0.d, v17, v16, , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src + load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src + load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src + load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src + load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src + load_add_store4 , , , , , , , v3.d, \dst, \src .endm .macro idct_dc w, h, shift @@ -291,7 +275,6 @@ .endm function idct_dc_w4_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.d}[0], [x0], x1 @@ -299,11 +282,9 @@ ld1 {v1.d}[0], [x0], x1 subs w4, w4, #4 ld1 {v1.d}[1], [x0], x1 - sqadd v0.8h, v0.8h, v16.8h + usqadd v0.8h, v16.8h sub x0, x0, x1, lsl #2 - sqadd v1.8h, v1.8h, v16.8h - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h + usqadd v1.8h, v16.8h smin v0.8h, v0.8h, v31.8h st1 {v0.d}[0], [x0], x1 smin v1.8h, v1.8h, v31.8h @@ -315,23 +296,18 @@ endfunc function idct_dc_w8_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h}, [x0], x1 subs w4, w4, #4 ld1 {v1.8h}, [x0], x1 - sqadd v0.8h, v0.8h, v16.8h + usqadd v0.8h, v16.8h ld1 {v2.8h}, [x0], x1 - sqadd v1.8h, v1.8h, v16.8h + usqadd v1.8h, v16.8h ld1 {v3.8h}, [x0], x1 - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h sub x0, x0, x1, lsl #2 - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h st1 {v0.8h}, [x0], x1 @@ -345,21 +321,16 @@ endfunc function idct_dc_w16_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h}, [x0], x1 subs w4, w4, #2 ld1 {v2.8h, v3.8h}, [x0], x1 - sqadd v0.8h, v0.8h, v16.8h - sqadd v1.8h, v1.8h, v16.8h + usqadd v0.8h, v16.8h + usqadd v1.8h, v16.8h sub x0, x0, x1, lsl #1 - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h - smax v0.8h, v0.8h, 
v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h @@ -371,19 +342,14 @@ endfunc function idct_dc_w32_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w4, w4, #1 - sqadd v0.8h, v0.8h, v16.8h - sqadd v1.8h, v1.8h, v16.8h - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h + usqadd v0.8h, v16.8h + usqadd v1.8h, v16.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h @@ -394,30 +360,21 @@ endfunc function idct_dc_w64_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff sub x1, x1, #64 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 subs w4, w4, #1 - sqadd v0.8h, v0.8h, v16.8h + usqadd v0.8h, v16.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] - sqadd v1.8h, v1.8h, v16.8h + usqadd v1.8h, v16.8h sub x0, x0, #64 - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h - sqadd v4.8h, v4.8h, v16.8h - sqadd v5.8h, v5.8h, v16.8h - sqadd v6.8h, v6.8h, v16.8h - sqadd v7.8h, v7.8h, v16.8h - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h - smax v4.8h, v4.8h, v30.8h - smax v5.8h, v5.8h, v30.8h - smax v6.8h, v6.8h, v30.8h - smax v7.8h, v7.8h, v30.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h + usqadd v4.8h, v16.8h + usqadd v5.8h, v16.8h + usqadd v6.8h, v16.8h + usqadd v7.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h @@ -445,12 +402,12 @@ .macro idct_4 r0, r1, r2, r3 mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] - mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] + mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] srshr v6.4s, v6.4s, #12 - srshr v7.4s, v4.4s, #12 srshr v2.4s, v2.4s, #12 + srshr v7.4s, v4.4s, #12 srshr v3.4s, v3.4s, #12 sqadd \r0\().4s, v2.4s, v6.4s sqsub \r3\().4s, v2.4s, v6.4s @@ -575,16 +532,14 @@ L(itx_4x4_end): mvni v31.8h, #0xfc, lsl #8 // 0x3ff sub x0, x0, x1, lsl #2 - sqadd v16.8h, v16.8h, v0.8h - sqadd v18.8h, v18.8h, v1.8h - smax v16.8h, v16.8h, v30.8h - smax v18.8h, v18.8h, v30.8h - smin v16.8h, v16.8h, v31.8h - st1 {v16.d}[0], [x0], x1 - smin v18.8h, v18.8h, v31.8h - st1 {v16.d}[1], [x0], x1 - st1 {v18.d}[0], [x0], x1 - st1 {v18.d}[1], [x0], x1 + usqadd v0.8h, v16.8h + usqadd v1.8h, v18.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 br x15 endfunc @@ -647,7 +602,7 @@ srshr \r1\().4s, v2.4s, #12 // t4a srshr \r7\().4s, v4.4s, #12 // t7a srshr \r3\().4s, v6.4s, #12 // t5a - srshr \r5\().4s, v7.4s, #12 // taa + srshr \r5\().4s, v7.4s, #12 // t6a sqadd v2.4s, \r1\().4s, \r3\().4s // t4 sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a @@ -1052,7 +1007,7 @@ srshr v4.4s, v4.4s, #12 // t11 srshr v5.4s, v6.4s, #12 // t12 - mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a + mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a srshr v2.4s, v2.4s, #12 // t10a srshr v3.4s, v6.4s, #12 // t13a @@ -1488,10 +1443,10 @@ st1 {v2.4s}, [x6], x11 .endr blr x4 - rshrn v28.4h, v16.4s, #1 - rshrn v29.4h, v17.4s, #1 - rshrn v30.4h, v18.4s, #1 - rshrn v31.4h, v19.4s, #1 + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn 
v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 b 2f @@ -1511,10 +1466,10 @@ st1 {v2.4s}, [x6], x11 .endr blr x4 - rshrn v24.4h, v16.4s, #1 - rshrn v25.4h, v17.4s, #1 - rshrn v26.4h, v18.4s, #1 - rshrn v27.4h, v19.4s, #1 + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 b 2f @@ -1533,10 +1488,10 @@ st1 {v2.4s}, [x6], x11 .endr blr x4 - rshrn v20.4h, v16.4s, #1 - rshrn v21.4h, v17.4s, #1 - rshrn v22.4h, v18.4s, #1 - rshrn v23.4h, v19.4s, #1 + sqrshrn v20.4h, v16.4s, #1 + sqrshrn v21.4h, v17.4s, #1 + sqrshrn v22.4h, v18.4s, #1 + sqrshrn v23.4h, v19.4s, #1 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 b 2f @@ -1552,10 +1507,10 @@ st1 {v2.4s}, [x2], x11 .endr blr x4 - rshrn v16.4h, v16.4s, #1 - rshrn v17.4h, v17.4s, #1 - rshrn v18.4h, v18.4s, #1 - rshrn v19.4h, v19.4s, #1 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 blr x5 @@ -2219,7 +2174,6 @@ neg x9, x8 mov x10, x6 - movi v0.8h, #0 mvni v1.8h, #0xfc, lsl #8 // 0x3ff .macro combine r0, r1, r2, r3, op, stride ld1 {v5.8h}, [x7], \stride @@ -2231,27 +2185,23 @@ ld1 {v4.8h}, [x10], x1 srshr v5.8h, v5.8h, #4 \op v6.8h, v6.8h, \r1 - sqadd v5.8h, v5.8h, v2.8h + usqadd v2.8h, v5.8h srshr v6.8h, v6.8h, #4 \op v7.8h, v7.8h, \r2 - smax v2.8h, v5.8h, v0.8h ld1 {v5.8h}, [x7], \stride - sqadd v6.8h, v6.8h, v3.8h + usqadd v3.8h, v6.8h smin v2.8h, v2.8h, v1.8h srshr v7.8h, v7.8h, #4 \op v5.8h, v5.8h, \r3 st1 {v2.8h}, [x6], x1 ld1 {v2.8h}, [x10], x1 - smax v3.8h, v6.8h, v0.8h - sqadd v7.8h, v7.8h, v4.8h + usqadd v4.8h, v7.8h smin v3.8h, v3.8h, v1.8h srshr v5.8h, v5.8h, #4 st1 {v3.8h}, [x6], x1 - smax v4.8h, v7.8h, v0.8h - sqadd v5.8h, v5.8h, v2.8h + usqadd v2.8h, v5.8h smin v4.8h, v4.8h, v1.8h st1 {v4.8h}, [x6], x1 - smax v2.8h, v5.8h, v0.8h smin v2.8h, v2.8h, v1.8h st1 {v2.8h}, [x6], x1 .endm @@ -2652,8 +2602,10 @@ mov w8, #(16 - \i) cmp w3, w12 b.lt 1f +.if \i < 12 ldrh w12, [x13], #2 .endif +.endif mov x8, #4*16 bl inv_txfm_horz_scale_dct_32x4_neon .endr @@ -3195,7 +3147,6 @@ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff .macro add_dest_addsub src0, src1, src2, src3 ld1 {v0.8h}, [x6], x1 @@ -3211,18 +3162,14 @@ srshr v4.8h, v4.8h, #4 srshr v5.8h, v5.8h, #4 srshr \src0, \src0, #4 - sqadd v0.8h, v0.8h, v4.8h + usqadd v0.8h, v4.8h srshr \src2, \src2, #4 - sqadd v1.8h, v1.8h, \src0 - sqadd v2.8h, v2.8h, v5.8h - smax v0.8h, v0.8h, v6.8h - sqadd v3.8h, v3.8h, \src2 - smax v1.8h, v1.8h, v6.8h + usqadd v1.8h, \src0 + usqadd v2.8h, v5.8h smin v0.8h, v0.8h, v7.8h - smax v2.8h, v2.8h, v6.8h + usqadd v3.8h, \src2 smin v1.8h, v1.8h, v7.8h st1 {v0.8h}, [x6], x1 - smax v3.8h, v3.8h, v6.8h smin v2.8h, v2.8h, v7.8h st1 {v1.8h}, [x9], x10 smin v3.8h, v3.8h, v7.8h @@ -3240,29 +3187,6 @@ br x14 endfunc -.macro sub_sp space -#ifdef _WIN32 -.if \space > 8192 - // Here, we'd need to touch two (or more) pages while decrementing - // the stack pointer. 
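In the itx16.S hunks above, each sqadd followed by an smax with zero is collapsed into a single usqadd when the residual is added back to the 16 bpc destination pixels: accumulating a signed value into an unsigned register already saturates at zero, so only the clamp against the pixel maximum (the smin against 0x3ff above) remains. Per lane this computes roughly the following, shown as a scalar C sketch (the helper name and parameters are illustrative, not dav1d's C code):

    #include <stdint.h>

    /* Model of "usqadd vdst, vres" followed by "smin vdst, vdst, vmax":
     * the unsigned pixel saturates at 0 and 65535 while the signed
     * residual is accumulated, so the old smax-with-zero step becomes
     * implicit. */
    static inline uint16_t add_residual_16bpc(uint16_t px, int16_t residual,
                                              uint16_t pixel_max)
    {
        int32_t sum = (int32_t)px + residual;
        if (sum < 0)         sum = 0;          /* low-side saturation (usqadd) */
        if (sum > 65535)     sum = 65535;      /* high-side saturation (usqadd) */
        if (sum > pixel_max) sum = pixel_max;  /* smin with the pixel maximum */
        return (uint16_t)sum;
    }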
- .error "sub_sp_align doesn't support values over 8K at the moment" -.elseif \space > 4096 - sub x16, sp, #4096 - ldr xzr, [x16] - sub sp, x16, #(\space - 4096) -.else - sub sp, sp, #\space -.endif -#else -.if \space >= 4096 - sub sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - sub sp, sp, #(\space)%4096 -.endif -#endif -.endm - function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 idct_dc 64, 64, 2 @@ -3492,8 +3416,10 @@ mov w8, #(32 - \i) cmp w3, w12 b.lt 1f +.if \i < 28 ldrh w12, [x13], #2 .endif +.endif add x7, x2, #(\i*4) mov x8, #32*4 bl inv_txfm_horz_16x4_neon diff -Nru dav1d-0.7.1/src/arm/64/itx.S dav1d-0.9.1/src/arm/64/itx.S --- dav1d-0.7.1/src/arm/64/itx.S 2020-06-21 11:48:54.964126300 +0000 +++ dav1d-0.9.1/src/arm/64/itx.S 2021-07-28 21:38:28.869851800 +0000 @@ -718,7 +718,7 @@ rshrn_sz \r7, v4, v5, #12, \sz // t7a smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a rshrn_sz \r3, v6, v7, #12, \sz // t5a - rshrn_sz \r5, v2, v3, #12, \sz // taa + rshrn_sz \r5, v2, v3, #12, \sz // t6a sqadd v2\sz, \r1\sz, \r3\sz // t4 sqsub \r1\sz, \r1\sz, \r3\sz // t5a @@ -1085,7 +1085,7 @@ rshrn_sz v4, v4, v5, #12, \sz // t11 rshrn_sz v5, v6, v7, #12, \sz // t12 - smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a + smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a rshrn_sz v2, v2, v3, #12, \sz // t10a rshrn_sz v3, v6, v7, #12, \sz // t13a @@ -3002,29 +3002,6 @@ br x14 endfunc -.macro sub_sp space -#ifdef _WIN32 -.if \space > 8192 - // Here, we'd need to touch two (or more) pages while decrementing - // the stack pointer. - .error "sub_sp_align doesn't support values over 8K at the moment" -.elseif \space > 4096 - sub x16, sp, #4096 - ldr xzr, [x16] - sub sp, x16, #(\space - 4096) -.else - sub sp, sp, #\space -.endif -#else -.if \space >= 4096 - sub sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - sub sp, sp, #(\space)%4096 -.endif -#endif -.endm - function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 @@ -3149,8 +3126,10 @@ mov w8, #(32 - \i) cmp w3, w12 b.lt 1f +.if \i < 24 ldrh w12, [x13], #2 .endif +.endif add x7, x2, #(\i*2) mov x8, #32*2 bl inv_txfm_horz_scale_dct_32x8_neon @@ -3254,8 +3233,10 @@ mov w8, #(32 - \i) cmp w3, w12 b.lt 1f +.if \i < 24 ldrh w12, [x13], #2 .endif +.endif add x7, x2, #(\i*2) mov x8, #32*2 bl inv_txfm_horz_16x8_neon diff -Nru dav1d-0.7.1/src/arm/64/loopfilter16.S dav1d-0.9.1/src/arm/64/loopfilter16.S --- dav1d-0.7.1/src/arm/64/loopfilter16.S 2020-06-21 11:48:54.968126300 +0000 +++ dav1d-0.9.1/src/arm/64/loopfilter16.S 2021-07-28 21:38:28.869851800 +0000 @@ -150,10 +150,9 @@ movi v6.8h, #4 add v2.8h, v2.8h, v4.8h smin v2.8h, v2.8h, v3.8h // f = iclip_diff() - movi v7.8h, #3 smax v2.8h, v2.8h, v9.8h // f = iclip_diff() sqadd v4.8h, v6.8h, v2.8h // f + 4 - sqadd v5.8h, v7.8h, v2.8h // f + 3 + sqadd v5.8h, v5.8h, v2.8h // f + 3 smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1) smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1) sshr v4.8h, v4.8h, #3 // f1 @@ -785,7 +784,7 @@ orr w6, w6, w7 // vmask[0] |= vmask[1] 1: - tst w6, #0x0f + tst w6, #0x03 .ifc \dir, v ld1 {v0.8b}, [x4], #8 ld1 {v1.8b}, [x3], #8 @@ -808,11 +807,11 @@ ld1r {v6.8b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0] + cmtst v2.2s, v1.2s, v2.2s // L != 0 mul v1.2s, v1.2s, v4.2s // L .ifc \type, y dup v15.2s, w2 // vmask[2] .endif - cmtst v2.2s, v1.2s, v2.2s // L != 0 dup v14.2s, w7 // vmask[1] mov x16, v2.d[0] cmp x16, #0 @@ -847,14 +846,14 
@@ ushl v10.8h, v10.8h, v31.8h .ifc \type, y - tst w2, #0x0f + tst w2, #0x03 b.eq 2f // wd16 bl lpf_\dir\()_16_8_neon b 8f 2: .endif - tst w7, #0x0f + tst w7, #0x03 b.eq 3f .ifc \type, y // wd8 diff -Nru dav1d-0.7.1/src/arm/64/loopfilter.S dav1d-0.9.1/src/arm/64/loopfilter.S --- dav1d-0.7.1/src/arm/64/loopfilter.S 2020-06-21 11:48:54.964126300 +0000 +++ dav1d-0.9.1/src/arm/64/loopfilter.S 2021-07-28 21:38:28.869851800 +0000 @@ -132,12 +132,11 @@ .endif b.eq 1f // skip wd == 4 case .endif - - usubl v2.8h, v22.8b, v25.8b // p1 - q1 - usubl2 v3.8h, v22.16b, v25.16b + movi v3.16b, #128 + eor v2.16b, v22.16b, v3.16b // p1 - 128 + eor v3.16b, v25.16b, v3.16b // q1 - 128 cmhi v0.16b, v0.16b, v12.16b // hev - sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1) - sqxtn2 v2.16b, v3.8h + sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1) and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) usubl v2.8h, v24.8b, v23.8b @@ -155,35 +154,23 @@ sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127) sshr v4.16b, v4.16b, #3 // f1 sshr v5.16b, v5.16b, #3 // f2 - uxtl v2.8h, v23.8b // p0 - uxtl2 v3.8h, v23.16b - uxtl v6.8h, v24.8b // q0 - uxtl2 v7.8h, v24.16b - saddw v2.8h, v2.8h, v5.8b - saddw2 v3.8h, v3.8h, v5.16b - ssubw v6.8h, v6.8h, v4.8b - ssubw2 v7.8h, v7.8h, v4.16b + mov v2.16b, v23.16b // p0 + mov v3.16b, v24.16b // q0 + neg v6.16b, v4.16b // -f1 srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1 - sqxtun v2.8b, v2.8h // out p0 - sqxtun2 v2.16b, v3.8h - sqxtun v6.8b, v6.8h // out q0 - sqxtun2 v6.16b, v7.8h + // p0 + f2, q0 - f1 + usqadd v2.16b, v5.16b // out p0 + usqadd v3.16b, v6.16b // out q0 + neg v6.16b, v4.16b // -((f1 + 1) >> 1) bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) - uxtl v2.8h, v22.8b // p1 - uxtl2 v3.8h, v22.16b - bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) - uxtl v6.8h, v25.8b // q1 - uxtl2 v7.8h, v25.16b - saddw v2.8h, v2.8h, v4.8b - saddw2 v3.8h, v3.8h, v4.16b - ssubw v6.8h, v6.8h, v4.8b - ssubw2 v7.8h, v7.8h, v4.16b - sqxtun v2.8b, v2.8h // out p1 - sqxtun2 v2.16b, v3.8h - sqxtun v6.8b, v6.8h // out q1 - sqxtun2 v6.16b, v7.8h + bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4) + mov v2.16b, v22.16b // p1 + mov v3.16b, v25.16b // q1 + // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1) + usqadd v2.16b, v4.16b // out p1 + usqadd v3.16b, v6.16b // out q1 bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) - bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) + bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 @@ -1034,11 +1021,11 @@ ld1r {v6.16b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0] + cmtst v2.4s, v1.4s, v2.4s // L != 0 mul v1.4s, v1.4s, v4.4s // L .ifc \type, y dup v15.4s, w2 // vmask[2] .endif - cmtst v2.4s, v1.4s, v2.4s // L != 0 dup v14.4s, w7 // vmask[1] mov x16, v2.d[0] mov x17, v2.d[1] diff -Nru dav1d-0.7.1/src/arm/64/looprestoration16.S dav1d-0.9.1/src/arm/64/looprestoration16.S --- dav1d-0.7.1/src/arm/64/looprestoration16.S 2020-06-21 11:48:54.968126300 +0000 +++ dav1d-0.9.1/src/arm/64/looprestoration16.S 2021-07-28 21:38:28.869851800 +0000 @@ -28,655 +28,1058 @@ #include "src/arm/asm.S" #include "util.S" -// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], -// const pixel *src, ptrdiff_t stride, -// const int16_t fh[7], const intptr_t w, -// int h, enum LrEdgeFlags edges, -// const int bitdepth_max); -function wiener_filter_h_16bpc_neon, export=1 - ldr w8, [sp] // bitdepth_max - ld1 {v0.8h}, [x4] - clz w8, w8 +const 
right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter7_16bpc_neon, export=1 + ldr w8, [sp] +#ifdef __APPLE__ + ldr w9, [sp, #4] +#else + ldr w9, [sp, #8] +#endif + stp x29, x30, [sp, #-32]! + stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*6 + + dup v28.8h, w9 // bitdepth_max + clz w9, w9 movi v30.4s, #1 - sub w9, w8, #38 // -(bitdepth + 6) - sub w8, w8, #25 // -round_bits_h - neg w9, w9 // bitdepth + 6 - dup v1.4s, w9 - dup v29.4s, w8 // -round_bits_h + sub w10, w9, #38 // -(bitdepth + 6) + sub w11, w9, #11 // round_bits_v + sub w9, w9, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w9 // -round_bits_h + dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 - ushl v30.4s, v30.4s, v1.4s // 1 << (bitdepth + 6) - mov w8, w5 - // Calculate mid_stride - add w10, w5, #7 - bic w10, w10, #7 - lsl w10, w10, #1 - - // Clear the last unused element of v0, to allow filtering a single - // pixel with one plain mul+addv. 
- ins v0.h[7], wzr + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x9 - t6 + // x10 - t5 + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_7) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x9, x14 // t6 + mov x10, x14 // t5 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + mov x13, x14 // t2 + subs w6, w6, #1 // h-- + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + +L(main_7): + add x15, x14, #384*2 // t0 = t1 + 384*2 +L(main_loop_7): + bl wiener_filter7_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_7) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v3_7) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter7_hv_16bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter7_hv_16bpc_neon +L(v1_7): + bl wiener_filter7_v_16bpc_neon + + mov sp, x29 + ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + ret + +L(no_top_7): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x9, x14 // t6 + mov x10, x14 // t5 + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 // t2 + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter7_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x15, x15, #384*2*4 // t0 += 384*2*4 + bl wiener_filter7_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_7) +L(v3_7): + bl wiener_filter7_v_16bpc_neon +L(v2_7): + bl wiener_filter7_v_16bpc_neon + b L(v1_7) +endfunc - // Set up pointers for reading/writing alternate rows - add x12, x0, x10 - lsl w10, w10, #1 - add x13, x2, x3 - lsl x3, x3, #1 - - // Subtract the width from mid_stride - sub x10, x10, w5, uxtw #1 - - // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. - cmp w5, #8 - add w11, w5, #13 - bic w11, w11, #7 - b.ge 1f - mov w11, #16 -1: - sub x3, x3, w11, uxtw #1 + +function wiener_filter7_h_16bpc_neon + stp x3, x5, [sp, #-32]! + str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f // LR_HAVE_LEFT - cbnz x1, 0f + cbnz x2, 0f // left == NULL - sub x2, x2, #6 - sub x13, x13, #6 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. 
- // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. - add x3, x3, #6 - - -1: // Loop vertically - ld1 {v2.8h, v3.8h}, [x2], #32 - ld1 {v4.8h, v5.8h}, [x13], #32 + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x1, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v1.d}[1], [x1], #8 - // Move x2/x13 back to account for the last 3 pixels we loaded earlier, + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. - sub x2, x2, #6 - sub x13, x13, #6 - ld1 {v6.d}[1], [x1], #8 + sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 - ext v2.16b, v1.16b, v2.16b, #10 - ext v5.16b, v4.16b, v5.16b, #10 - ext v4.16b, v6.16b, v4.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 b 2f -0: - // !LR_HAVE_LEFT, fill v1 with the leftmost pixel - // and shift v2/v3 to have 3x the first pixel at the front. - dup v1.8h, v2.h[0] - dup v6.8h, v4.h[0] - // Move x2 back to account for the last 3 pixels we loaded before, + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. - sub x2, x2, #6 - sub x13, x13, #6 + sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 - ext v2.16b, v1.16b, v2.16b, #10 - ext v5.16b, v4.16b, v5.16b, #10 - ext v4.16b, v6.16b, v4.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 2: + ld1 {v4.8h}, [x3], #16 - tst w7, #2 // LR_HAVE_RIGHT + tst w8, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with - // here since we can find it pretty easily from here. - sub w9, w5, #14 - ldr h27, [x2, w9, sxtw #1] - ldr h28, [x13, w9, sxtw #1] - // Fill v27/v28 with the right padding pixel - dup v27.8h, v27.h[0] - dup v28.8h, v28.h[0] + 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. - cmp w5, #11 - b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can filter 4 pixels - b 6f -4: // Loop horizontally -.macro ushll_sz d0, d1, src, shift, wd - ushll \d0\().4s, \src\().4h, \shift -.ifc \wd, .8h - ushll2 \d1\().4s, \src\().8h, \shift -.endif -.endm -.macro add_sz d0, d1, s0, s1, c, wd - add \d0\().4s, \s0\().4s, \c\().4s -.ifc \wd, .8h - add \d1\().4s, \s1\().4s, \c\().4s -.endif -.endm -.macro srshl_sz d0, d1, s0, s1, c, wd - srshl \d0\().4s, \s0\().4s, \c\().4s -.ifc \wd, .8h - srshl \d1\().4s, \s1\().4s, \c\().4s -.endif -.endm -.macro sqxtun_sz dst, s0, s1, wd - sqxtun \dst\().4h, \s0\().4s -.ifc \wd, .8h - sqxtun2 \dst\().8h, \s1\().4s -.endif -.endm + // Check whether we need to pad the right edge + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. 
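In scalar terms, the mask load and bit/bif selects that follow implement exactly the padding described in the comment above: every lane from w+3 onwards is overwritten with the last usable pixel, and the zeros-then-ones table at right_ext_mask only serves to make that select branchless. A rough C equivalent, with buf[] standing in for the 24 halfwords held in v2-v4 (names are illustrative):

    #include <stdint.h>

    /* buf[0..23] models v2-v4: 3 columns of left context plus the row data.
     * The padding pixel sits at lane w+2; lanes w+3..23 are overwritten with
     * it, which is what the single unaligned load from right_ext_mask plus
     * one bit/bif per vector achieves without a branch. */
    static void pad_right_edge_7tap(uint16_t buf[24], int w)
    {
        const uint16_t pad = buf[w + 2];
        for (int i = w + 3; i < 24; i++)
            buf[i] = pad;
    }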
+ movrel x7, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b -.macro filter wd +4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - ext v16.16b, v2.16b, v3.16b, #2 ext v17.16b, v2.16b, v3.16b, #4 - ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 - ushll_sz v6, v7, v18, #7, \wd - smlal v6.4s, v2.4h, v0.h[0] - smlal v6.4s, v16.4h, v0.h[1] - smlal v6.4s, v17.4h, v0.h[2] - smlal v6.4s, v18.4h, v0.h[3] - smlal v6.4s, v19.4h, v0.h[4] - smlal v6.4s, v20.4h, v0.h[5] - smlal v6.4s, v21.4h, v0.h[6] -.ifc \wd, .8h - smlal2 v7.4s, v2.8h, v0.h[0] - smlal2 v7.4s, v16.8h, v0.h[1] - smlal2 v7.4s, v17.8h, v0.h[2] - smlal2 v7.4s, v18.8h, v0.h[3] - smlal2 v7.4s, v19.8h, v0.h[4] - smlal2 v7.4s, v20.8h, v0.h[5] - smlal2 v7.4s, v21.8h, v0.h[6] -.endif - ext v19.16b, v4.16b, v5.16b, #2 - ext v20.16b, v4.16b, v5.16b, #4 - ext v21.16b, v4.16b, v5.16b, #6 - ext v22.16b, v4.16b, v5.16b, #8 - ext v23.16b, v4.16b, v5.16b, #10 - ext v24.16b, v4.16b, v5.16b, #12 - ushll_sz v16, v17, v21, #7, \wd - smlal v16.4s, v4.4h, v0.h[0] - smlal v16.4s, v19.4h, v0.h[1] - smlal v16.4s, v20.4h, v0.h[2] - smlal v16.4s, v21.4h, v0.h[3] - smlal v16.4s, v22.4h, v0.h[4] - smlal v16.4s, v23.4h, v0.h[5] - smlal v16.4s, v24.4h, v0.h[6] -.ifc \wd, .8h - smlal2 v17.4s, v4.8h, v0.h[0] - smlal2 v17.4s, v19.8h, v0.h[1] - smlal2 v17.4s, v20.8h, v0.h[2] - smlal2 v17.4s, v21.8h, v0.h[3] - smlal2 v17.4s, v22.8h, v0.h[4] - smlal2 v17.4s, v23.8h, v0.h[5] - smlal2 v17.4s, v24.8h, v0.h[6] -.endif - mvni v24\wd, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 - add_sz v6, v7, v6, v7, v30, \wd - add_sz v16, v17, v16, v17, v30, \wd - srshl_sz v6, v7, v6, v7, v29, \wd - srshl_sz v16, v17, v16, v17, v29, \wd - sqxtun_sz v6, v6, v7, \wd - sqxtun_sz v7, v16, v17, \wd - umin v6\wd, v6\wd, v24\wd - umin v7\wd, v7\wd, v24\wd - sub v6\wd, v6\wd, v31\wd - sub v7\wd, v7\wd, v31\wd -.endm - filter .8h - st1 {v6.8h}, [x0], #16 - st1 {v7.8h}, [x12], #16 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] + smull2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v16.4s, v18.4h, v0.h[3] + smlal v16.4s, v19.4h, v0.h[2] + smlal v16.4s, v20.4h, v0.h[1] + smlal v16.4s, v21.4h, v0.h[0] + smull2 v17.4s, v18.8h, v0.h[3] + smlal2 v17.4s, v19.8h, v0.h[2] + smlal2 v17.4s, v20.8h, v0.h[1] + smlal2 v17.4s, v21.8h, v0.h[0] - subs w5, w5, #8 - b.le 9f - tst w7, #2 // LR_HAVE_RIGHT - mov v2.16b, v3.16b - mov v4.16b, v5.16b - ld1 {v3.8h}, [x2], #16 - ld1 {v5.8h}, [x13], #16 + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, 
v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w5, w5, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -5: // Filter 4 pixels, 7 <= w < 11 - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - - subs w5, w5, #4 // 3 <= w < 7 - ext v2.16b, v2.16b, v3.16b, #8 - ext v3.16b, v3.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - ext v5.16b, v5.16b, v5.16b, #8 - -6: // Pad the right edge and filter the last few pixels. - // w < 7, w+3 pixels valid in v2-v3 - cmp w5, #5 - b.lt 7f - b.gt 8f - // w == 5, 8 pixels valid in v2, v3 invalid - mov v3.16b, v27.16b - mov v5.16b, v28.16b - b 88f - -7: // 1 <= w < 5, 4-7 pixels valid in v2 - sub w9, w5, #1 - // w9 = (pixels valid - 4) - adr x11, L(variable_shift_tbl) - ldrh w9, [x11, w9, uxtw #1] - sub x11, x11, w9, uxth - mov v3.16b, v27.16b - mov v5.16b, v28.16b - br x11 -44: // 4 pixels valid in v2/v4, fill the high half with padding. - ins v2.d[1], v3.d[0] - ins v4.d[1], v5.d[0] - b 88f - // Shift v2 right, shifting out invalid pixels, - // shift v2 left to the original offset, shifting in padding pixels. -55: // 5 pixels valid - ext v2.16b, v2.16b, v2.16b, #10 - ext v2.16b, v2.16b, v3.16b, #6 - ext v4.16b, v4.16b, v4.16b, #10 - ext v4.16b, v4.16b, v5.16b, #6 - b 88f -66: // 6 pixels valid, fill the upper 2 pixels with padding. - ins v2.s[3], v3.s[0] - ins v4.s[3], v5.s[0] - b 88f -77: // 7 pixels valid, fill the last pixel with padding. - ins v2.h[7], v3.h[0] - ins v4.h[7], v5.h[0] - b 88f - -L(variable_shift_tbl): - .hword L(variable_shift_tbl) - 44b - .hword L(variable_shift_tbl) - 55b - .hword L(variable_shift_tbl) - 66b - .hword L(variable_shift_tbl) - 77b - -8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3 - ins v27.h[0], v3.h[0] - ins v28.h[0], v5.h[0] - mov v3.16b, v27.16b - mov v5.16b, v28.16b - -88: - // w < 7, v2-v3 padded properly - cmp w5, #4 - b.lt 888f - - // w >= 4, filter 4 pixels - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - subs w5, w5, #4 // 0 <= w < 4 - ext v2.16b, v2.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - b.eq 9f -888: // 1 <= w < 4, filter 1 pixel at a time - smull v6.4s, v2.4h, v0.4h - smull2 v7.4s, v2.8h, v0.8h - smull v16.4s, v4.4h, v0.4h - smull2 v17.4s, v4.8h, v0.8h - add v6.4s, v6.4s, v7.4s - add v16.4s, v16.4s, v17.4s - addv s6, v6.4s - addv s7, v16.4s - dup v16.4h, v2.h[3] - ins v16.h[1], v4.h[3] - ins v6.s[1], v7.s[0] - mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 - ushll v16.4s, v16.4h, #7 +0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 + ret +endfunc + +function wiener_filter7_v_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, afterwards. + stp x10, x11, [sp, #-64]! 
+ stp x12, x13, [sp, #16] + stp x14, x14, [sp, #32] + stp x0, x5, [sp, #48] +1: + ld1 {v16.8h, v17.8h}, [x9], #32 + ld1 {v18.8h, v19.8h}, [x10], #32 + ld1 {v20.8h, v21.8h}, [x11], #32 + ld1 {v22.8h, v23.8h}, [x12], #32 + ld1 {v24.8h, v25.8h}, [x13], #32 + ld1 {v6.8h, v7.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[4] + smlal v2.4s, v18.4h, v0.h[5] + smlal v2.4s, v20.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[7] + smlal v2.4s, v24.4h, v0.h[6] + smlal v2.4s, v6.4h, v0.h[5] + smlal v2.4s, v6.4h, v0.h[4] + smull2 v3.4s, v16.8h, v0.h[4] + smlal2 v3.4s, v18.8h, v0.h[5] + smlal2 v3.4s, v20.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[7] + smlal2 v3.4s, v24.8h, v0.h[6] + smlal2 v3.4s, v6.8h, v0.h[5] + smlal2 v3.4s, v6.8h, v0.h[4] + smull v4.4s, v17.4h, v0.h[4] + smlal v4.4s, v19.4h, v0.h[5] + smlal v4.4s, v21.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[7] + smlal v4.4s, v25.4h, v0.h[6] + smlal v4.4s, v7.4h, v0.h[5] + smlal v4.4s, v7.4h, v0.h[4] + smull2 v5.4s, v17.8h, v0.h[4] + smlal2 v5.4s, v19.8h, v0.h[5] + smlal2 v5.4s, v21.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[7] + smlal2 v5.4s, v25.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + smlal2 v5.4s, v7.8h, v0.h[4] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + subs w5, w5, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x5, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #64 + + add x0, x0, x1 + ret +endfunc + +function wiener_filter7_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, and x15==x9, afterwards. + stp x10, x11, [sp, #-80]! + stp x12, x13, [sp, #16] + stp x14, x15, [sp, #32] + stp x10, x0, [sp, #48] + stp x3, x5, [sp, #64] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. 
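Once the row is padded, wiener_filter7_h above and this hv variant run essentially the same horizontal loop: because the 7-tap filter is symmetric, the mirrored neighbours are pre-added and only the four distinct coefficients in v0.h[0..3] are multiplied, after which the bias, rounding shift and clamps produce the int16 intermediate. Per output pixel that is roughly the following scalar sketch (the helper and its parameters are illustrative, not dav1d's C code):

    #include <stdint.h>

    /* px[0..6] are the seven input pixels centred on px[3]; taps[0..3] are
     * v0.h[0..3]. The 1 << (bitdepth + 6) bias, the rounding shift by
     * round_bits_h (srshl by a negative amount), the clamp to [0, 0x7fff]
     * (sqxtun + umin) and the final subtraction of 8192 mirror the code above. */
    static inline int16_t wiener7_h_px(const uint16_t px[7], const int16_t taps[4],
                                       int bitdepth, int round_bits_h)
    {
        int32_t sum = (int32_t)taps[3] * px[3]
                    + (int32_t)taps[2] * (px[2] + px[4])
                    + (int32_t)taps[1] * (px[1] + px[5])
                    + (int32_t)taps[0] * (px[0] + px[6]);
        sum += 1 << (bitdepth + 6);
        sum = (sum + (1 << (round_bits_h - 1))) >> round_bits_h;
        if (sum < 0)     sum = 0;
        if (sum > 32767) sum = 32767;
        return (int16_t)(sum - 8192);
    }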
+ movrel x7, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] + smull2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v24.4s, v18.4h, v0.h[3] + smlal v24.4s, v19.4h, v0.h[2] + smlal v24.4s, v20.4h, v0.h[1] + smlal v24.4s, v21.4h, v0.h[0] + smull2 v25.4s, v18.8h, v0.h[3] + smlal2 v25.4s, v19.8h, v0.h[2] + smlal2 v25.4s, v20.8h, v0.h[1] + smlal2 v25.4s, v21.8h, v0.h[0] + + ld1 {v16.8h, v17.8h}, [x9], #32 + + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 add v6.4s, v6.4s, v30.4s - add v6.4s, v6.4s, v16.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x10], #32 srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x11], #32 sqxtun v6.4h, v6.4s - umin v6.4h, v6.4h, v24.4h - sub v6.4h, v6.4h, v31.4h - st1 {v6.h}[0], [x0], #2 - st1 {v6.h}[1], [x12], #2 - subs w5, w5, #1 - ext v2.16b, v2.16b, v3.16b, #2 - ext v4.16b, v4.16b, v5.16b, #2 - b.gt 888b + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x12], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + ld1 {v24.8h, v25.8h}, [x13], #32 + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + ld1 {v8.8h, v9.8h}, [x14], #32 + + smull v1.4s, v16.4h, v0.h[4] + smlal v1.4s, v18.4h, v0.h[5] + smlal v1.4s, v20.4h, v0.h[6] + smlal v1.4s, v22.4h, v0.h[7] + smlal v1.4s, v24.4h, v0.h[6] + smlal v1.4s, v8.4h, v0.h[5] + smlal v1.4s, v6.4h, v0.h[4] + smull2 v5.4s, v16.8h, v0.h[4] + smlal2 v5.4s, v18.8h, v0.h[5] + smlal2 v5.4s, v20.8h, v0.h[6] + smlal2 v5.4s, v22.8h, v0.h[7] + smlal2 v5.4s, v24.8h, v0.h[6] + smlal2 v5.4s, v8.8h, v0.h[5] + smlal2 v5.4s, v6.8h, v0.h[4] + smull v26.4s, v17.4h, v0.h[4] + smlal v26.4s, v19.4h, v0.h[5] + smlal v26.4s, v21.4h, v0.h[6] + smlal v26.4s, v23.4h, v0.h[7] + smlal v26.4s, v25.4h, v0.h[6] + smlal v26.4s, v9.4h, v0.h[5] + smlal v26.4s, v7.4h, v0.h[4] + smull2 v16.4s, v17.8h, v0.h[4] + smlal2 v16.4s, v19.8h, v0.h[5] + smlal2 v16.4s, v21.8h, v0.h[6] + smlal2 v16.4s, v23.8h, v0.h[7] + smlal2 v16.4s, v25.8h, v0.h[6] + smlal2 v16.4s, v9.8h, v0.h[5] + smlal2 v16.4s, v7.8h, v0.h[4] + srshl v1.4s, v1.4s, v27.4s // -round_bits_v + srshl v5.4s, v5.4s, v27.4s + srshl v26.4s, v26.4s, v27.4s + srshl v16.4s, v16.4s, v27.4s + sqxtun v18.4h, v1.4s + sqxtun2 v18.8h, v5.4s + sqxtun v19.4h, v26.4s + sqxtun2 v19.8h, v16.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v18.8h, v18.8h, v28.8h // bitdepth_max + umin v19.8h, v19.8h, v28.8h + subs w5, w5, #16 + + 
st1 {v18.8h, v19.8h}, [x0], #32 -9: - subs w6, w6, #2 b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x10 - add x12, x12, x10 - add x2, x2, x3 - add x13, x13, x3 - mov w5, w8 - b 1b + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + 0: + ldp x3, x5, [sp, #64] + ldp x15, x0, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #80 + + add x3, x3, x1 + add x0, x0, x1 + ret -.purgem filter endfunc -// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, -// const int16_t *mid, int w, int h, -// const int16_t fv[7], enum LrEdgeFlags edges, -// ptrdiff_t mid_stride, const int bitdepth_max); -function wiener_filter_v_16bpc_neon, export=1 - ldr w8, [sp] // bitdepth_max - ld1 {v0.8h}, [x5] - dup v31.8h, w8 - clz w8, w8 - movi v1.8h, #128 - sub w8, w8, #11 // round_bits_v - add v1.8h, v1.8h, v0.8h - dup v30.4s, w8 - mov w8, w4 - neg v30.4s, v30.4s // -round_bits_v - - // Calculate the number of rows to move back when looping vertically - mov w11, w4 - tst w6, #4 // LR_HAVE_TOP - b.eq 0f - sub x2, x2, x7, lsl #1 - add w11, w11, #2 -0: - tst w6, #8 // LR_HAVE_BOTTOM +// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter5_16bpc_neon, export=1 + ldr w8, [sp] +#ifdef __APPLE__ + ldr w9, [sp, #4] +#else + ldr w9, [sp, #8] +#endif + stp x29, x30, [sp, #-32]! + stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*4 + + dup v28.8h, w9 // bitdepth_max + clz w9, w9 + movi v30.4s, #1 + sub w10, w9, #38 // -(bitdepth + 6) + sub w11, w9, #11 // round_bits_v + sub w9, w9, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w9 // -round_bits_h + dup v27.4s, w11 // -round_bits_v + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_5) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x12, x14 // t3 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + +L(main_5): + mov x15, x11 // t0 = t4 +L(main_loop_5): + bl wiener_filter5_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_5) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v2_5) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter5_hv_16bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter5_hv_16bpc_neon +L(end_5): + + 
mov sp, x29 + ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + ret + +L(no_top_5): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter5_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x15, x15, #384*2*3 // t0 += 384*2*3 + bl wiener_filter5_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_5) +L(v2_5): + bl wiener_filter5_v_16bpc_neon + add x0, x0, x1 + mov x11, x12 + mov x12, x13 + mov x13, x14 +L(v1_5): + bl wiener_filter5_v_16bpc_neon + b L(end_5) +endfunc + + +function wiener_filter5_h_16bpc_neon + stp x3, x5, [sp, #-32]! + str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT b.eq 1f - add w11, w11, #2 + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f -1: // Start of horizontal loop; start one vertical filter slice. - // Load rows into v16-v19 and pad properly. - tst w6, #4 // LR_HAVE_TOP - ld1 {v16.8h}, [x2], x7 - b.eq 2f - // LR_HAVE_TOP - ld1 {v18.8h}, [x2], x7 - mov v17.16b, v16.16b - ld1 {v19.8h}, [x2], x7 - b 3f -2: // !LR_HAVE_TOP - mov v17.16b, v16.16b - mov v18.16b, v16.16b - mov v19.16b, v16.16b - -3: - cmp w4, #4 - b.lt 5f - // Start filtering normally; fill in v20-v22 with unique rows. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - ld1 {v22.8h}, [x2], x7 - -4: -.macro filter compare - subs w4, w4, #1 +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. 
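The w-22 and w-23 offsets in these right-edge blocks are just the lane arithmetic spelled out: after the loads above, x3 points one element past the 24 halfwords held in v2-v4, i.e. at lane 24. With 3 columns of left context (the 7-tap functions) the last valid lane is w+2, which lies w-22 halfwords before x3; with 2 columns (the 5-tap functions, as here) it is lane w+1, at w-23. A tiny helper just to make the indexing concrete (illustrative only, not dav1d code):

    #include <stdint.h>

    /* lane24 corresponds to where x3 points once the v2-v4 loads are done. */
    static const uint16_t *last_valid_pixel(const uint16_t *lane24,
                                            int w, int left_cols /* 3 or 2 */)
    {
        const int lane = w + left_cols - 1; /* w+2 for 7-tap, w+1 for 5-tap */
        return lane24 + (lane - 24);        /* w-22 resp. w-23 halfwords back */
    }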
+ movrel x7, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - smull v2.4s, v16.4h, v0.h[0] - smlal v2.4s, v17.4h, v0.h[1] - smlal v2.4s, v18.4h, v0.h[2] - smlal v2.4s, v19.4h, v1.h[3] - smlal v2.4s, v20.4h, v0.h[4] - smlal v2.4s, v21.4h, v0.h[5] - smlal v2.4s, v22.4h, v0.h[6] - smull2 v3.4s, v16.8h, v0.h[0] - smlal2 v3.4s, v17.8h, v0.h[1] - smlal2 v3.4s, v18.8h, v0.h[2] - smlal2 v3.4s, v19.8h, v1.h[3] - smlal2 v3.4s, v20.8h, v0.h[4] - smlal2 v3.4s, v21.8h, v0.h[5] - smlal2 v3.4s, v22.8h, v0.h[6] - srshl v2.4s, v2.4s, v30.4s // round_bits_v - srshl v3.4s, v3.4s, v30.4s - sqxtun v2.4h, v2.4s - sqxtun2 v2.8h, v3.4s - umin v2.8h, v2.8h, v31.8h // bitdepth_max - st1 {v2.8h}, [x0], x1 -.if \compare - cmp w4, #4 -.else - b.le 9f -.endif - mov v16.16b, v17.16b - mov v17.16b, v18.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b - mov v20.16b, v21.16b - mov v21.16b, v22.16b -.endm - filter 1 - b.lt 7f - ld1 {v22.8h}, [x2], x7 - b 4b - -5: // Less than 4 rows in total; not all of v20-v21 are filled yet. - tst w6, #8 // LR_HAVE_BOTTOM - b.eq 6f - // LR_HAVE_BOTTOM - cmp w4, #2 - // We load at least 2 rows in all cases. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - b.gt 53f // 3 rows in total - b.eq 52f // 2 rows in total -51: // 1 row in total, v19 already loaded, load edge into v20-v22. - mov v22.16b, v21.16b - b 8f -52: // 2 rows in total, v19 already loaded, load v20 with content data - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - mov v23.16b, v22.16b - b 8f -53: - // 3 rows in total, v19 already loaded, load v20 and v21 with content - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f - -6: - // !LR_HAVE_BOTTOM - cmp w4, #2 - b.gt 63f // 3 rows in total - b.eq 62f // 2 rows in total -61: // 1 row in total, v19 already loaded, pad that into v20-v22. - mov v20.16b, v19.16b - mov v21.16b, v19.16b - mov v22.16b, v19.16b - b 8f -62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. - ld1 {v20.8h}, [x2], x7 - mov v21.16b, v20.16b - mov v22.16b, v20.16b - mov v23.16b, v20.16b - b 8f -63: - // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - b 8f - -7: - // All registers up to v21 are filled already, 3 valid rows left. - // < 4 valid rows left; fill in padding and filter the last - // few rows. - tst w6, #8 // LR_HAVE_BOTTOM - b.eq 71f - // LR_HAVE_BOTTOM; load 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f -71: - // !LR_HAVE_BOTTOM, pad 3 rows - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - -8: // At this point, all registers up to v22-v24 are loaded with - // edge/padding (depending on how many rows are left). 
- filter 0 // This branches to 9f when done - mov v22.16b, v23.16b - mov v23.16b, v24.16b - b 8b + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v16.4s, v17.4h, v0.h[3] + smlal v16.4s, v18.4h, v0.h[2] + smlal v16.4s, v19.4h, v0.h[1] + smull2 v17.4s, v17.8h, v0.h[3] + smlal2 v17.4s, v18.8h, v0.h[2] + smlal2 v17.4s, v19.8h, v0.h[1] + + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w5, w5, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 -9: // End of one vertical slice. - subs w3, w3, #8 b.le 0f - // Move pointers back up to the top and loop horizontally. - msub x0, x1, x8, x0 - msub x2, x7, x11, x2 - add x0, x0, #16 - add x2, x2, #16 - mov w4, w8 - b 1b + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 ret -.purgem filter endfunc -// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride, -// const pixel *src, int w, int h); -function copy_narrow_16bpc_neon, export=1 - adr x5, L(copy_narrow_tbl) - ldrh w6, [x5, w3, uxtw #1] - sub x5, x5, w6, uxth - br x5 -10: - add x7, x0, x1 - lsl x1, x1, #1 -18: - subs w4, w4, #8 - b.lt 110f - ld1 {v0.8h}, [x2], #16 - st1 {v0.h}[0], [x0], x1 - st1 {v0.h}[1], [x7], x1 - st1 {v0.h}[2], [x0], x1 - st1 {v0.h}[3], [x7], x1 - st1 {v0.h}[4], [x0], x1 - st1 {v0.h}[5], [x7], x1 - st1 {v0.h}[6], [x0], x1 - st1 {v0.h}[7], [x7], x1 - b.le 0f - b 18b -110: - add w4, w4, #8 - asr x1, x1, #1 -11: - subs w4, w4, #1 - ld1 {v0.h}[0], [x2], #2 - st1 {v0.h}[0], [x0], x1 - b.gt 11b -0: +function wiener_filter5_v_16bpc_neon + stp x11, x12, [sp, #-48]! 
+ stp x13, x14, [sp, #16] + stp x0, x5, [sp, #32] +1: + ld1 {v16.8h, v17.8h}, [x11], #32 + ld1 {v18.8h, v19.8h}, [x12], #32 + ld1 {v20.8h, v21.8h}, [x13], #32 + ld1 {v22.8h, v23.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[5] + smlal v2.4s, v18.4h, v0.h[6] + smlal v2.4s, v20.4h, v0.h[7] + smlal v2.4s, v22.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[5] + smull2 v3.4s, v16.8h, v0.h[5] + smlal2 v3.4s, v18.8h, v0.h[6] + smlal2 v3.4s, v20.8h, v0.h[7] + smlal2 v3.4s, v22.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[5] + smull v4.4s, v17.4h, v0.h[5] + smlal v4.4s, v19.4h, v0.h[6] + smlal v4.4s, v21.4h, v0.h[7] + smlal v4.4s, v23.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[5] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + + subs w5, w5, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x5, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #48 + ret +endfunc + +function wiener_filter5_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x11 gets the value + // of x12, etc, and x15==x11, afterwards. + stp x12, x13, [sp, #-64]! + stp x14, x15, [sp, #16] + stp x12, x0, [sp, #32] + stp x3, x5, [sp, #48] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f -20: - add x7, x0, x1 - lsl x1, x1, #1 -24: - subs w4, w4, #4 - b.lt 210f - ld1 {v0.4s}, [x2], #16 - st1 {v0.s}[0], [x0], x1 - st1 {v0.s}[1], [x7], x1 - st1 {v0.s}[2], [x0], x1 - st1 {v0.s}[3], [x7], x1 - b.le 0f - b 24b -210: - add w4, w4, #4 - asr x1, x1, #1 -22: - subs w4, w4, #1 - ld1 {v0.s}[0], [x2], #4 - st1 {v0.s}[0], [x0], x1 - b.gt 22b 0: - ret + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 2x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 -30: - ldr w5, [x2] - ldrh w6, [x2, #4] - add x2, x2, #6 - subs w4, w4, #1 - str w5, [x0] - strh w6, [x0, #4] - add x0, x0, x1 - b.gt 30b - ret +2: + ld1 {v4.8h}, [x3], #16 + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. 
+ sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v24.4s, v17.4h, v0.h[3] + smlal v24.4s, v18.4h, v0.h[2] + smlal v24.4s, v19.4h, v0.h[1] + smull2 v25.4s, v17.8h, v0.h[3] + smlal2 v25.4s, v18.8h, v0.h[2] + smlal2 v25.4s, v19.8h, v0.h[1] + + ld1 {v16.8h, v17.8h}, [x11], #32 + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x12], #32 + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x13], #32 + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x14], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + smull v8.4s, v16.4h, v0.h[5] + smlal v8.4s, v18.4h, v0.h[6] + smlal v8.4s, v20.4h, v0.h[7] + smlal v8.4s, v22.4h, v0.h[6] + smlal v8.4s, v6.4h, v0.h[5] + smull2 v9.4s, v16.8h, v0.h[5] + smlal2 v9.4s, v18.8h, v0.h[6] + smlal2 v9.4s, v20.8h, v0.h[7] + smlal2 v9.4s, v22.8h, v0.h[6] + smlal2 v9.4s, v6.8h, v0.h[5] + smull v1.4s, v17.4h, v0.h[5] + smlal v1.4s, v19.4h, v0.h[6] + smlal v1.4s, v21.4h, v0.h[7] + smlal v1.4s, v23.4h, v0.h[6] + smlal v1.4s, v7.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + srshl v8.4s, v8.4s, v27.4s // -round_bits_v + srshl v9.4s, v9.4s, v27.4s + srshl v1.4s, v1.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v8.4h, v8.4s + sqxtun2 v8.8h, v9.4s + sqxtun v9.4h, v1.4s + sqxtun2 v9.8h, v5.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v8.8h, v8.8h, v28.8h // bitdepth_max + umin v9.8h, v9.8h, v28.8h + + subs w5, w5, #16 + + st1 {v8.8h, v9.8h}, [x0], #32 -40: - add x7, x0, x1 - lsl x1, x1, #1 -42: - subs w4, w4, #2 - b.lt 41f - ld1 {v0.2d}, [x2], #16 - st1 {v0.d}[0], [x0], x1 - st1 {v0.d}[1], [x7], x1 b.le 0f - b 42b -41: - ld1 {v0.4h}, [x2] - st1 {v0.4h}, [x0] -0: - ret + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
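
The horizontal half of the loop above exploits the symmetry of the Wiener filter: pixels at equal distance from the centre share a coefficient, so each pair is summed once and then costs a single multiply. Roughly, per output position, and leaving out the fixed-point bias, rounding shift and clamping that the asm applies afterwards (the function name and the compact c[] indexing are illustrative, not the filter[2][8] layout the asm receives):

    #include <stdint.h>

    // 5-tap symmetric Wiener sum around src[0]; c[0] is the centre tap,
    // c[1] the +-1 tap, c[2] the +-2 tap.  The 7-tap variant just adds a
    // c[3] * (src[-3] + src[3]) term.
    static int32_t wiener_h_sym5(const uint16_t *src, const int16_t c[3])
    {
        return (int32_t)c[0] *  src[0]
             + (int32_t)c[1] * (src[-1] + src[1])
             + (int32_t)c[2] * (src[-2] + src[2]);
    }

In the 16bpc path the result is additionally clamped against bitdepth_max (the umin instructions commented as such above) before being stored as an intermediate row.
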
-50: - ldr x5, [x2] - ldrh w6, [x2, #8] - add x2, x2, #10 - subs w4, w4, #1 - str x5, [x0] - strh w6, [x0, #8] - add x0, x0, x1 - b.gt 50b - ret +0: + ldp x3, x5, [sp, #48] + ldp x15, x0, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #64 -60: - ldr x5, [x2] - ldr w6, [x2, #8] - add x2, x2, #12 - subs w4, w4, #1 - str x5, [x0] - str w6, [x0, #8] + add x3, x3, x1 add x0, x0, x1 - b.gt 60b - ret -70: - ldr x5, [x2] - ldr w6, [x2, #8] - ldrh w7, [x2, #12] - add x2, x2, #14 - subs w4, w4, #1 - str x5, [x0] - str w6, [x0, #8] - strh w7, [x0, #12] - add x0, x0, x1 - b.gt 70b ret - -L(copy_narrow_tbl): - .hword 0 - .hword L(copy_narrow_tbl) - 10b - .hword L(copy_narrow_tbl) - 20b - .hword L(copy_narrow_tbl) - 30b - .hword L(copy_narrow_tbl) - 40b - .hword L(copy_narrow_tbl) - 50b - .hword L(copy_narrow_tbl) - 60b - .hword L(copy_narrow_tbl) - 70b endfunc #define SUM_STRIDE (384+16) @@ -699,25 +1102,15 @@ mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 -1: sub x9, x9, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 // Subtract the number of pixels read from the input from the stride - add w13, w5, #14 - bic w13, w13, #7 + add w13, w13, #8 sub x4, x4, w13, uxtw #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -770,16 +1163,9 @@ ext v16.16b, v18.16b, v16.16b, #12 2: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h - tst w7, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with + // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub w13, w5, #(2 + 16 - 2 + 1) ldr h30, [x3, w13, sxtw #1] @@ -788,151 +1174,65 @@ dup v30.8h, v30.h[0] dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #10 b.ge 4f // If w >= 10, all used input pixels are valid - cmp w5, #6 - b.ge 5f // If w >= 6, we can filter 4 pixels - b 6f + + // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in v0/1.h[w] onwards + movrel x13, right_ext_mask + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally -.macro ext_n dst1, dst2, src1, src2, src3, n, w - ext \dst1, \src1, \src2, \n -.if \w > 4 - ext \dst2, \src2, \src3, \n -.endif -.endm -.macro add_n dst1, dst2, src1, src2, src3, src4, w - add \dst1, \src1, \src3 -.if \w > 4 - add \dst2, \src2, \src4 -.endif -.endm - -.macro add3 w, wd - ext v24.16b, v0.16b, v1.16b, #2 - ext v25.16b, v0.16b, v1.16b, #4 - ext v26.16b, v16.16b, v17.16b, #2 - ext v27.16b, v16.16b, v17.16b, #4 - add v6\wd, v0\wd, v24\wd - add v7\wd, v16\wd, v26\wd - add v6\wd, v6\wd, v25\wd - add v7\wd, v7\wd, v27\wd - - ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w - ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w - - add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w - add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w - - ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w - ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w - - add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w - add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w -.endm - add3 8, .8h + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6.8h, v0.8h, v26.8h + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v16.8h, v28.8h + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + + subs w5, w5, #8 + st1 {v6.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 st1 {v24.4s,v25.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 ld1 {v17.8h}, [x12], #16 - mov v2.16b, v4.16b - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - mov v18.16b, v20.16b - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 6 <= w < 10 - add3 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - - subs w5, w5, #4 // 2 <= w < 6 - ext v0.16b, v0.16b, v1.16b, #8 - ext v16.16b, v16.16b, v17.16b, #8 - -6: // Pad the right edge and produce the last few pixels. - // 2 <= w < 6, 2-5 pixels valid in v0 - sub w13, w5, #2 - // w13 = (pixels valid - 2) - adr x14, L(box3_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. 
-22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v16.16b, v16.16b, v31.16b, #12 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #6 - ext v16.16b, v16.16b, v16.16b, #6 - ext v0.16b, v0.16b, v30.16b, #10 - ext v16.16b, v16.16b, v31.16b, #10 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #8 - ext v16.16b, v16.16b, v16.16b, #8 - ext v0.16b, v0.16b, v30.16b, #8 - ext v16.16b, v16.16b, v31.16b, #8 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #10 - ext v16.16b, v16.16b, v16.16b, #10 - ext v0.16b, v0.16b, v30.16b, #6 - ext v16.16b, v16.16b, v31.16b, #6 - b 88f - -L(box3_variable_shift_tbl): - .hword L(box3_variable_shift_tbl) - 22b - .hword L(box3_variable_shift_tbl) - 33b - .hword L(box3_variable_shift_tbl) - 44b - .hword L(box3_variable_shift_tbl) - 55b - -88: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - - add3 4, .4h - subs w5, w5, #4 - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v0.16b, #8 - ext v16.16b, v16.16b, v16.16b, #8 - mov v2.16b, v3.16b - mov v3.16b, v4.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b - // Only one needed pixel left, but do a normal 4 pixel - // addition anyway - add3 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -947,7 +1247,6 @@ b 1b 0: ret -.purgem add3 endfunc // void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, @@ -966,23 +1265,11 @@ mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - // Subtract the number of pixels read from the input from the stride. - tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - add w14, w5, #13 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 - add w14, w5, #15 -1: sub x9, x9, w13, uxtw #1 - bic w14, w14, #7 - sub x4, x4, w14, uxtw #1 + add w13, w13, #8 + sub x4, x4, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 @@ -1026,7 +1313,7 @@ // and shift v0/v1 to have 3x the first pixel at the front. dup v2.8h, v0.h[0] dup v18.8h, v16.h[0] - // Move x3 back to account for the last 6 bytes we loaded before, + // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub x3, x3, #6 sub x12, x12, #6 @@ -1036,16 +1323,9 @@ ext v16.16b, v18.16b, v16.16b, #10 2: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h - tst w7, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with + // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub w13, w5, #(2 + 16 - 3 + 1) ldr h30, [x3, w13, sxtw #1] @@ -1054,171 +1334,84 @@ dup v30.8h, v30.h[0] dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can produce 4 pixels - b 6f + + // 1 <= w < 11, w+1 pixels valid in v0-v1. 
For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -2 + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally -.macro add5 w, wd - ext v24.16b, v0.16b, v1.16b, #2 - ext v25.16b, v0.16b, v1.16b, #4 + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6.8h, v0.8h, v26.8h + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v16.8h, v28.8h + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h + ext v26.16b, v0.16b, v1.16b, #6 + ext v28.16b, v16.16b, v17.16b, #6 ext v27.16b, v0.16b, v1.16b, #8 + ext v29.16b, v16.16b, v17.16b, #8 - add v6\wd, v0\wd, v24\wd - add v25\wd, v25\wd, v26\wd - add v6\wd, v6\wd, v27\wd + add v6.8h, v6.8h, v26.8h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7.8h, v7.8h, v28.8h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6.8h, v6.8h, v27.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h + add v7.8h, v7.8h, v29.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h - ext v26.16b, v16.16b, v17.16b, #2 - ext v27.16b, v16.16b, v17.16b, #4 - ext v28.16b, v16.16b, v17.16b, #6 - ext v29.16b, v16.16b, v17.16b, #8 + subs w5, w5, #8 - add v7\wd, v16\wd, v26\wd - add v27\wd, v27\wd, v28\wd - add v7\wd, v7\wd, v29\wd - add v6\wd, v6\wd, v25\wd - add v7\wd, v7\wd, v27\wd - - ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w - ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w - ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w - - add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w - add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w - add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w - add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w - - ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w - ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w - ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w - - add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w - add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w - add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w - add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w -.endm - add5 8, .8h st1 {v6.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 st1 {v24.4s,v25.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 ld1 {v17.8h}, [x12], #16 - mov v2.16b, v4.16b - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - mov v18.16b, v20.16b - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
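
The box3/box5 horizontal passes above only need running box sums for the self-guided filter: for every position, the sum of the pixels in a 3-wide (box3) or 5-wide (box5) window and the sum of their squares, matching the int16_t *sum / int32_t *sumsq outputs in the function signatures. A scalar sketch, hedged in that the real code also handles edge padding and emits two rows per iteration, and that the output alignment relative to the row depends on the left-edge offset (names are illustrative):

    #include <stdint.h>

    // Horizontal box sums for one row: sum[i] and sumsq[i] cover the n
    // consecutive pixels src[i] .. src[i+n-1], with n = 3 or 5.
    static void box_sums(int16_t *sum, int32_t *sumsq,
                         const uint16_t *src, int w, int n)
    {
        for (int i = 0; i < w; i++) {
            int32_t s = 0, ss = 0;
            for (int j = 0; j < n; j++) {
                s  += src[i + j];
                ss += (int32_t)src[i + j] * src[i + j];
            }
            sum[i]   = (int16_t)s;
            sumsq[i] = ss;
        }
    }

The NEON version gets the same result with shifted copies of the row (the ext instructions) and widening multiply-accumulates, producing eight or sixteen positions per iteration.
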
-5: // Produce 4 pixels, 7 <= w < 11 - add5 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - - subs w5, w5, #4 // 3 <= w < 7 - ext v0.16b, v0.16b, v1.16b, #8 - ext v16.16b, v16.16b, v17.16b, #8 - -6: // Pad the right edge and produce the last few pixels. - // w < 7, w+1 pixels valid in v0/v4 - sub w13, w5, #1 - // w13 = pixels valid - 2 - adr x14, L(box5_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - mov v1.16b, v30.16b - mov v17.16b, v31.16b - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. -22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v16.16b, v16.16b, v31.16b, #12 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #6 - ext v16.16b, v16.16b, v16.16b, #6 - ext v0.16b, v0.16b, v30.16b, #10 - ext v16.16b, v16.16b, v31.16b, #10 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #8 - ext v16.16b, v16.16b, v16.16b, #8 - ext v0.16b, v0.16b, v30.16b, #8 - ext v16.16b, v16.16b, v31.16b, #8 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #10 - ext v16.16b, v16.16b, v16.16b, #10 - ext v0.16b, v0.16b, v30.16b, #6 - ext v16.16b, v16.16b, v31.16b, #6 - b 88f -66: // 6 pixels valid - ext v0.16b, v0.16b, v0.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v0.16b, v0.16b, v30.16b, #4 - ext v16.16b, v16.16b, v31.16b, #4 - b 88f -77: // 7 pixels valid - ext v0.16b, v0.16b, v0.16b, #14 - ext v16.16b, v16.16b, v16.16b, #14 - ext v0.16b, v0.16b, v30.16b, #2 - ext v16.16b, v16.16b, v31.16b, #2 - b 88f - -L(box5_variable_shift_tbl): - .hword L(box5_variable_shift_tbl) - 22b - .hword L(box5_variable_shift_tbl) - 33b - .hword L(box5_variable_shift_tbl) - 44b - .hword L(box5_variable_shift_tbl) - 55b - .hword L(box5_variable_shift_tbl) - 66b - .hword L(box5_variable_shift_tbl) - 77b - -88: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h - - add5 4, .4h - subs w5, w5, #4 - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v1.16b, #8 - ext v16.16b, v16.16b, v17.16b, #8 - mov v2.16b, v3.16b - mov v3.16b, v4.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b - add5 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -1233,7 +1426,6 @@ b 1b 0: ret -.purgem add5 endfunc sgr_funcs 16 diff -Nru dav1d-0.7.1/src/arm/64/looprestoration.S dav1d-0.9.1/src/arm/64/looprestoration.S --- dav1d-0.7.1/src/arm/64/looprestoration.S 2020-06-21 11:48:54.968126300 +0000 +++ dav1d-0.9.1/src/arm/64/looprestoration.S 2021-07-28 21:38:28.869851800 +0000 @@ -28,591 +28,945 @@ #include "src/arm/asm.S" #include "util.S" -// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], -// const pixel *src, ptrdiff_t stride, -// const int16_t fh[7], const intptr_t w, -// int h, enum LrEdgeFlags edges); -function wiener_filter_h_8bpc_neon, export=1 - mov w8, w5 - ld1 {v0.8h}, [x4] - mov w9, #(1 << 14) - (1 << 2) - dup v30.8h, w9 +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges); +function wiener_filter7_8bpc_neon, export=1 + ldr w8, [sp] + stp x29, x30, [sp, #-16]! + mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*6 + + mov w17, #(1 << 14) - (1 << 2) + dup v30.8h, w17 movi v31.8h, #8, lsl #8 - // Calculate mid_stride - add w10, w5, #7 - bic w10, w10, #7 - lsl w10, w10, #1 - - // Clear the last unused element of v0, to allow filtering a single - // pixel with one plain mul+addv. - ins v0.h[7], wzr - // Set up pointers for reading/writing alternate rows - add x12, x0, x10 - lsl w10, w10, #1 - add x13, x2, x3 - lsl x3, x3, #1 - - // Subtract the width from mid_stride - sub x10, x10, w5, uxtw #1 - - // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. - cmp w5, #8 - add w11, w5, #13 - bic w11, w11, #7 - b.ge 1f - mov w11, #16 -1: - sub x3, x3, w11, uxtw + // x9 - t6 + // x10 - t5 + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_7) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter7_h_8bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x9, x14 // t6 + mov x10, x14 // t5 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + mov x13, x14 // t2 + subs w6, w6, #1 // h-- + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + +L(main_7): + add x15, x14, #384*2 // t0 = t1 + 384*2 +L(main_loop_7): + bl wiener_filter7_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_7) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v3_7) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter7_hv_8bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter7_hv_8bpc_neon +L(v1_7): + bl wiener_filter7_v_8bpc_neon + + mov sp, x29 + ldp x29, x30, [sp], #16 + ret + +L(no_top_7): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x9, x14 // t6 + mov x10, x14 // t5 + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 
// t2 + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter7_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x15, x15, #384*2*4 // t0 += 384*2*4 + bl wiener_filter7_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_7) +L(v3_7): + bl wiener_filter7_v_8bpc_neon +L(v2_7): + bl wiener_filter7_v_8bpc_neon + b L(v1_7) +endfunc + + +function wiener_filter7_h_8bpc_neon + stp x3, x5, [sp, #-32]! + str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f // LR_HAVE_LEFT - cbnz x1, 0f + cbnz x2, 0f // left == NULL - sub x2, x2, #3 - sub x13, x13, #3 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. - add x3, x3, #3 - - -1: // Loop vertically - ld1 {v3.16b}, [x2], #16 - ld1 {v5.16b}, [x13], #16 + sub x3, x3, #3 + ld1 {v3.16b}, [x3], #16 + b 2f - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x1, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v2.s}[3], [x1], #4 - // Move x2/x13 back to account for the last 3 bytes we loaded earlier, + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. - sub x2, x2, #3 - sub x13, x13, #3 - ld1 {v4.s}[3], [x1], #4 - ext v3.16b, v2.16b, v3.16b, #13 - ext v5.16b, v4.16b, v5.16b, #13 + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 b 2f -0: + +1: + ld1 {v3.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v2 with the leftmost byte // and shift v3 to have 3x the first byte at the front. - dup v2.16b, v3.b[0] - dup v4.16b, v5.b[0] - // Move x2 back to account for the last 3 bytes we loaded before, + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, // which we shifted out. - sub x2, x2, #3 - sub x13, x13, #3 - ext v3.16b, v2.16b, v3.16b, #13 - ext v5.16b, v4.16b, v5.16b, #13 + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 2: - uxtl v2.8h, v3.8b - uxtl2 v3.8h, v3.16b - uxtl v4.8h, v5.8b - uxtl2 v5.8h, v5.16b + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b - tst w7, #2 // LR_HAVE_RIGHT + tst w8, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with - // here since we can find it pretty easily from here. - sub w9, w5, #14 - ldr b28, [x2, w9, sxtw] - ldr b29, [x13, w9, sxtw] - // Fill v28/v29 with the right padding pixel - dup v28.8b, v28.b[0] - dup v29.8b, v29.b[0] - uxtl v28.8h, v28.8b - uxtl v29.8h, v29.8b + 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. - cmp w5, #11 - b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can filter 4 pixels - b 6f + + // Check whether we need to pad the right edge + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. 
+ sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -6 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b 4: // Loop horizontally -.macro filter wd // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - ext v16.16b, v2.16b, v3.16b, #2 ext v17.16b, v2.16b, v3.16b, #4 - ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 - mul v6\wd, v2\wd, v0.h[0] - mla v6\wd, v16\wd, v0.h[1] - mla v6\wd, v17\wd, v0.h[2] - mla v6\wd, v18\wd, v0.h[3] - mla v6\wd, v19\wd, v0.h[4] - mla v6\wd, v20\wd, v0.h[5] - mla v6\wd, v21\wd, v0.h[6] - ext v22.16b, v4.16b, v5.16b, #2 - ext v23.16b, v4.16b, v5.16b, #4 - ext v24.16b, v4.16b, v5.16b, #6 - ext v25.16b, v4.16b, v5.16b, #8 - ext v26.16b, v4.16b, v5.16b, #10 - ext v27.16b, v4.16b, v5.16b, #12 - mul v7\wd, v4\wd, v0.h[0] - mla v7\wd, v22\wd, v0.h[1] - mla v7\wd, v23\wd, v0.h[2] - mla v7\wd, v24\wd, v0.h[3] - mla v7\wd, v25\wd, v0.h[4] - mla v7\wd, v26\wd, v0.h[5] - mla v7\wd, v27\wd, v0.h[6] - - shl v18\wd, v18\wd, #7 - shl v24\wd, v24\wd, #7 - sub v18\wd, v18\wd, v30\wd - sub v24\wd, v24\wd, v30\wd - sqadd v6\wd, v6\wd, v18\wd - sqadd v7\wd, v7\wd, v24\wd - sshr v6\wd, v6\wd, #3 - sshr v7\wd, v7\wd, #3 - add v6\wd, v6\wd, v31\wd - add v7\wd, v7\wd, v31\wd -.endm - filter .8h - st1 {v6.8h}, [x0], #16 - st1 {v7.8h}, [x12], #16 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + shl v22.8h, v18.8h, #7 + mul v6.8h, v18.8h, v0.h[3] + mla v6.8h, v19.8h, v0.h[4] + mla v6.8h, v20.8h, v0.h[5] + mla v6.8h, v21.8h, v0.h[6] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + shl v23.8h, v18.8h, #7 + mul v7.8h, v18.8h, v0.h[3] + mla v7.8h, v19.8h, v0.h[4] + mla v7.8h, v20.8h, v0.h[5] + mla v7.8h, v21.8h, v0.h[6] + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h - subs w5, w5, #8 - b.le 9f - tst w7, #2 // LR_HAVE_RIGHT - mov v2.16b, v3.16b - mov v4.16b, v5.16b - ld1 {v3.8b}, [x2], #8 - ld1 {v5.8b}, [x13], #8 - uxtl v3.8h, v3.8b - uxtl v5.8h, v5.8b + subs w5, w5, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -5: // Filter 4 pixels, 7 <= w < 11 - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - - subs w5, w5, #4 // 3 <= w < 7 - ext v2.16b, v2.16b, v3.16b, #8 - ext v3.16b, v3.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - ext v5.16b, v5.16b, v5.16b, #8 - -6: // Pad the right edge and filter the last few pixels. 
- // w < 7, w+3 pixels valid in v2-v3 - cmp w5, #5 - b.lt 7f - b.gt 8f - // w == 5, 8 pixels valid in v2, v3 invalid - mov v3.16b, v28.16b - mov v5.16b, v29.16b - b 88f - -7: // 1 <= w < 5, 4-7 pixels valid in v2 - sub w9, w5, #1 - // w9 = (pixels valid - 4) - adr x11, L(variable_shift_tbl) - ldrh w9, [x11, w9, uxtw #1] - sub x11, x11, w9, uxth - mov v3.16b, v28.16b - mov v5.16b, v29.16b - br x11 -44: // 4 pixels valid in v2/v4, fill the high half with padding. - ins v2.d[1], v3.d[0] - ins v4.d[1], v5.d[0] - b 88f - // Shift v2 right, shifting out invalid pixels, - // shift v2 left to the original offset, shifting in padding pixels. -55: // 5 pixels valid - ext v2.16b, v2.16b, v2.16b, #10 - ext v2.16b, v2.16b, v3.16b, #6 - ext v4.16b, v4.16b, v4.16b, #10 - ext v4.16b, v4.16b, v5.16b, #6 - b 88f -66: // 6 pixels valid, fill the upper 2 pixels with padding. - ins v2.s[3], v3.s[0] - ins v4.s[3], v5.s[0] - b 88f -77: // 7 pixels valid, fill the last pixel with padding. - ins v2.h[7], v3.h[0] - ins v4.h[7], v5.h[0] - b 88f - -L(variable_shift_tbl): - .hword L(variable_shift_tbl) - 44b - .hword L(variable_shift_tbl) - 55b - .hword L(variable_shift_tbl) - 66b - .hword L(variable_shift_tbl) - 77b - -8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3 - ins v28.h[0], v3.h[0] - ins v29.h[0], v5.h[0] - mov v3.16b, v28.16b - mov v5.16b, v29.16b - -88: - // w < 7, v2-v3 padded properly - cmp w5, #4 - b.lt 888f - - // w >= 4, filter 4 pixels - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - subs w5, w5, #4 // 0 <= w < 4 - ext v2.16b, v2.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - b.eq 9f -888: // 1 <= w < 4, filter 1 pixel at a time - mul v6.8h, v2.8h, v0.8h - mul v7.8h, v4.8h, v0.8h - addv h6, v6.8h - addv h7, v7.8h - dup v16.4h, v2.h[3] - ins v16.h[1], v4.h[3] - ins v6.h[1], v7.h[0] - shl v16.4h, v16.4h, #7 - sub v16.4h, v16.4h, v30.4h - sqadd v6.4h, v6.4h, v16.4h - sshr v6.4h, v6.4h, #3 - add v6.4h, v6.4h, v31.4h - st1 {v6.h}[0], [x0], #2 - st1 {v6.h}[1], [x12], #2 - subs w5, w5, #1 - ext v2.16b, v2.16b, v3.16b, #2 - ext v4.16b, v4.16b, v5.16b, #2 - b.gt 888b +0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 + ret +endfunc + +function wiener_filter7_v_8bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, afterwards. + stp x10, x11, [sp, #-64]! 
+ stp x12, x13, [sp, #16] + stp x14, x14, [sp, #32] + stp x0, x5, [sp, #48] +1: + ld1 {v20.8h, v21.8h}, [x11], #32 + ld1 {v24.8h, v25.8h}, [x13], #32 + + ld1 {v18.8h, v19.8h}, [x10], #32 + add v24.8h, v24.8h, v20.8h + ld1 {v26.8h, v27.8h}, [x14], #32 + + ld1 {v16.8h, v17.8h}, [x9], #32 + add v28.8h, v26.8h, v18.8h + ld1 {v22.8h, v23.8h}, [x12], #32 + + add v16.8h, v26.8h, v16.8h + add v25.8h, v25.8h, v21.8h + + smull v2.4s, v22.4h, v1.h[3] + smlal v2.4s, v24.4h, v1.h[4] + smlal v2.4s, v28.4h, v1.h[5] + smlal v2.4s, v16.4h, v1.h[6] + add v29.8h, v27.8h, v19.8h + smull2 v3.4s, v22.8h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[4] + smlal2 v3.4s, v28.8h, v1.h[5] + smlal2 v3.4s, v16.8h, v1.h[6] + add v17.8h, v27.8h, v17.8h + smull v4.4s, v23.4h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[4] + smlal v4.4s, v29.4h, v1.h[5] + smlal v4.4s, v17.4h, v1.h[6] + smull2 v5.4s, v23.8h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[4] + smlal2 v5.4s, v29.8h, v1.h[5] + smlal2 v5.4s, v17.8h, v1.h[6] + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v3.4h, v4.4s, #11 + sqrshrun2 v3.8h, v5.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun2 v2.16b, v3.8h + subs w5, w5, #16 + st1 {v2.16b}, [x0], #16 + b.gt 1b + + ldp x0, x5, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #64 + + add x0, x0, x1 + ret +endfunc + +function wiener_filter7_hv_8bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, and x15==x9, afterwards. + stp x10, x11, [sp, #-80]! + stp x12, x13, [sp, #16] + stp x14, x15, [sp, #32] + stp x10, x0, [sp, #48] + stp x3, x5, [sp, #64] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #3 + ld1 {v3.16b}, [x3], #16 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + b 2f +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. 
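
Throughout the new filter7/filter5 functions, registers x9-x15 hold pointers to the intermediate rows t6..t0 in the stack scratch area (six rows of 384 int16_t for the 7-tap filter, four for the 5-tap one), and the "backing up/restoring registers shifted" prologue/epilogue trick above moves every pointer down one slot per processed row. A simplified C view of that rotation; the asm reuses the scratch rows more aggressively than this plain ring, but the per-row effect is the same (the function name is illustrative):

    #include <stdint.h>
    #include <string.h>

    // t[0] plays the role of t6 (oldest row), t[6] the role of t0 (the row
    // being produced).  After one output row everything moves up one slot
    // and the oldest buffer becomes available for the next incoming row.
    static void rotate_rows(int16_t *t[7])
    {
        int16_t *const recycled = t[0];
        memmove(&t[0], &t[1], 6 * sizeof(t[0]));
        t[6] = recycled;
    }

Keeping the rotation in the register save/restore sequence means the hot loops themselves never touch the pointer bookkeeping.
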
+ movrel x7, right_ext_mask, -6 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + shl v22.8h, v18.8h, #7 + mul v6.8h, v18.8h, v0.h[3] + mla v6.8h, v19.8h, v0.h[4] + mla v6.8h, v20.8h, v0.h[5] + mla v6.8h, v21.8h, v0.h[6] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + shl v23.8h, v18.8h, #7 + mul v7.8h, v18.8h, v0.h[3] + mla v7.8h, v19.8h, v0.h[4] + mla v7.8h, v20.8h, v0.h[5] + mla v7.8h, v21.8h, v0.h[6] + + ld1 {v20.8h, v21.8h}, [x11], #32 + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + ld1 {v26.8h, v27.8h}, [x13], #32 + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + ld1 {v18.8h, v19.8h}, [x10], #32 + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + ld1 {v28.8h, v29.8h}, [x14], #32 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + ld1 {v16.8h, v17.8h}, [x9], #32 + add v26.8h, v20.8h, v26.8h + + ld1 {v24.8h, v25.8h}, [x12], #32 + add v28.8h, v18.8h, v28.8h + + add v16.8h, v16.8h, v6.8h + add v27.8h, v21.8h, v27.8h + + smull v18.4s, v24.4h, v1.h[3] + smlal v18.4s, v26.4h, v1.h[4] + smlal v18.4s, v28.4h, v1.h[5] + smlal v18.4s, v16.4h, v1.h[6] + add v29.8h, v19.8h, v29.8h + smull2 v19.4s, v24.8h, v1.h[3] + smlal2 v19.4s, v26.8h, v1.h[4] + smlal2 v19.4s, v28.8h, v1.h[5] + smlal2 v19.4s, v16.8h, v1.h[6] + add v17.8h, v17.8h, v7.8h + smull v20.4s, v25.4h, v1.h[3] + smlal v20.4s, v27.4h, v1.h[4] + smlal v20.4s, v29.4h, v1.h[5] + smlal v20.4s, v17.4h, v1.h[6] + smull2 v21.4s, v25.8h, v1.h[3] + smlal2 v21.4s, v27.8h, v1.h[4] + smlal2 v21.4s, v29.8h, v1.h[5] + smlal2 v21.4s, v17.8h, v1.h[6] + sqrshrun v18.4h, v18.4s, #11 + sqrshrun2 v18.8h, v19.4s, #11 + sqrshrun v19.4h, v20.4s, #11 + sqrshrun2 v19.8h, v21.4s, #11 + st1 {v6.8h, v7.8h}, [x15], #32 + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h + subs w5, w5, #16 + + st1 {v18.16b}, [x0], #16 -9: - subs w6, w6, #2 b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x10 - add x12, x12, x10 - add x2, x2, x3 - add x13, x13, x3 - mov w5, w8 - b 1b + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
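
The vertical combine in the loop above is the same symmetry trick applied down the columns: rows at equal distance from the centre row are added first, each pair then costs one widening multiply, and the 8bpc result is narrowed with a rounding shift and saturation. A scalar sketch of one output pixel, with the intermediate-format bias left out and the c[] indexing simplified relative to filter[1] (names are illustrative):

    #include <stdint.h>

    // m[0]..m[6] are the seven intermediate rows, oldest to newest;
    // c[0] = centre tap, c[1] = +-1 rows, c[2] = +-2 rows, c[3] = +-3 rows.
    static uint8_t wiener_v_sym7(const int16_t *const m[7], int x,
                                 const int16_t c[4])
    {
        int32_t sum = (int32_t)c[0] *  m[3][x]
                    + (int32_t)c[1] * (m[2][x] + m[4][x])
                    + (int32_t)c[2] * (m[1][x] + m[5][x])
                    + (int32_t)c[3] * (m[0][x] + m[6][x]);
        sum = (sum + (1 << 10)) >> 11;                  // rounding shift, cf. sqrshrun #11
        return sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;  // saturate to 8 bits
    }

The 16bpc variant of the same pass instead applies the -round_bits_v shift and clamps against bitdepth_max, as the commented srshl/umin instructions in the 16bpc functions show.
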
+ 0: + ldp x3, x5, [sp, #64] + ldp x15, x0, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #80 + + add x3, x3, x1 + add x0, x0, x1 + ret -.purgem filter endfunc -// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, -// const int16_t *mid, int w, int h, -// const int16_t fv[7], enum LrEdgeFlags edges, -// ptrdiff_t mid_stride); -function wiener_filter_v_8bpc_neon, export=1 - mov w8, w4 - ld1 {v0.8h}, [x5] - movi v1.8h, #128 - add v1.8h, v1.8h, v0.8h - - // Calculate the number of rows to move back when looping vertically - mov w11, w4 - tst w6, #4 // LR_HAVE_TOP - b.eq 0f - sub x2, x2, x7, lsl #1 - add w11, w11, #2 -0: - tst w6, #8 // LR_HAVE_BOTTOM +// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges); +function wiener_filter5_8bpc_neon, export=1 + ldr w8, [sp] + stp x29, x30, [sp, #-16]! + mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*4 + + mov w17, #(1 << 14) - (1 << 2) + dup v30.8h, w17 + movi v31.8h, #8, lsl #8 + + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_5) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter5_h_8bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x12, x14 // t3 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + +L(main_5): + mov x15, x11 // t0 = t4 +L(main_loop_5): + bl wiener_filter5_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_5) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v2_5) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter5_hv_8bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter5_hv_8bpc_neon +L(end_5): + + mov sp, x29 + ldp x29, x30, [sp], #16 + ret + +L(no_top_5): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter5_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x15, x15, #384*2*3 // t0 += 384*2*3 + bl wiener_filter5_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_5) +L(v2_5): + bl wiener_filter5_v_8bpc_neon + add x0, x0, x1 + mov x11, x12 + mov x12, x13 + mov x13, x14 +L(v1_5): + bl wiener_filter5_v_8bpc_neon + b L(end_5) +endfunc + + +function wiener_filter5_h_8bpc_neon + stp x3, x5, [sp, #-32]! 
+ str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT b.eq 1f - add w11, w11, #2 + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + ld1 {v3.16b}, [x3], #16 + b 2f -1: // Start of horizontal loop; start one vertical filter slice. - // Load rows into v16-v19 and pad properly. - tst w6, #4 // LR_HAVE_TOP - ld1 {v16.8h}, [x2], x7 - b.eq 2f - // LR_HAVE_TOP - ld1 {v18.8h}, [x2], x7 - mov v17.16b, v16.16b - ld1 {v19.8h}, [x2], x7 - b 3f -2: // !LR_HAVE_TOP - mov v17.16b, v16.16b - mov v18.16b, v16.16b - mov v19.16b, v16.16b - -3: - cmp w4, #4 - b.lt 5f - // Start filtering normally; fill in v20-v22 with unique rows. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - ld1 {v22.8h}, [x2], x7 - -4: -.macro filter compare - subs w4, w4, #1 +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + b 2f + +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -4 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - smull v2.4s, v16.4h, v0.h[0] - smlal v2.4s, v17.4h, v0.h[1] - smlal v2.4s, v18.4h, v0.h[2] - smlal v2.4s, v19.4h, v1.h[3] - smlal v2.4s, v20.4h, v0.h[4] - smlal v2.4s, v21.4h, v0.h[5] - smlal v2.4s, v22.4h, v0.h[6] - smull2 v3.4s, v16.8h, v0.h[0] - smlal2 v3.4s, v17.8h, v0.h[1] - smlal2 v3.4s, v18.8h, v0.h[2] - smlal2 v3.4s, v19.8h, v1.h[3] - smlal2 v3.4s, v20.8h, v0.h[4] - smlal2 v3.4s, v21.8h, v0.h[5] - smlal2 v3.4s, v22.8h, v0.h[6] - sqrshrun v2.4h, v2.4s, #11 - sqrshrun2 v2.8h, v3.4s, #11 - sqxtun v2.8b, v2.8h - st1 {v2.8b}, [x0], x1 -.if \compare - cmp w4, #4 -.else - b.le 9f -.endif - mov v16.16b, v17.16b - mov v17.16b, v18.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b - mov v20.16b, v21.16b - mov v21.16b, v22.16b -.endm - filter 1 - b.lt 7f - ld1 {v22.8h}, [x2], x7 - b 4b - -5: // Less than 4 rows in total; not all of v20-v21 are filled yet. - tst w6, #8 // LR_HAVE_BOTTOM - b.eq 6f - // LR_HAVE_BOTTOM - cmp w4, #2 - // We load at least 2 rows in all cases. 
- ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - b.gt 53f // 3 rows in total - b.eq 52f // 2 rows in total -51: // 1 row in total, v19 already loaded, load edge into v20-v22. - mov v22.16b, v21.16b - b 8f -52: // 2 rows in total, v19 already loaded, load v20 with content data - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - mov v23.16b, v22.16b - b 8f -53: - // 3 rows in total, v19 already loaded, load v20 and v21 with content - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f - -6: - // !LR_HAVE_BOTTOM - cmp w4, #2 - b.gt 63f // 3 rows in total - b.eq 62f // 2 rows in total -61: // 1 row in total, v19 already loaded, pad that into v20-v22. - mov v20.16b, v19.16b - mov v21.16b, v19.16b - mov v22.16b, v19.16b - b 8f -62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. - ld1 {v20.8h}, [x2], x7 - mov v21.16b, v20.16b - mov v22.16b, v20.16b - mov v23.16b, v20.16b - b 8f -63: - // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - b 8f - -7: - // All registers up to v21 are filled already, 3 valid rows left. - // < 4 valid rows left; fill in padding and filter the last - // few rows. - tst w6, #8 // LR_HAVE_BOTTOM - b.eq 71f - // LR_HAVE_BOTTOM; load 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f -71: - // !LR_HAVE_BOTTOM, pad 3 rows - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - -8: // At this point, all registers up to v22-v24 are loaded with - // edge/padding (depending on how many rows are left). - filter 0 // This branches to 9f when done - mov v22.16b, v23.16b - mov v23.16b, v24.16b - b 8b + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + shl v22.8h, v17.8h, #7 + mul v6.8h, v17.8h, v0.h[3] + mla v6.8h, v18.8h, v0.h[4] + mla v6.8h, v19.8h, v0.h[5] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + shl v23.8h, v17.8h, #7 + mul v7.8h, v17.8h, v0.h[3] + mla v7.8h, v18.8h, v0.h[4] + mla v7.8h, v19.8h, v0.h[5] + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + subs w5, w5, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 -9: // End of one vertical slice. - subs w3, w3, #8 b.le 0f - // Move pointers back up to the top and loop horizontally. - msub x0, x1, x8, x0 - msub x2, x7, x11, x2 - add x0, x0, #8 - add x2, x2, #16 - mov w4, w8 - b 1b + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 ret -.purgem filter endfunc -// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride, -// const pixel *src, int w, int h); -function copy_narrow_8bpc_neon, export=1 - adr x5, L(copy_narrow_tbl) - ldrh w6, [x5, w3, uxtw #1] - sub x5, x5, w6, uxth - br x5 -10: - add x7, x0, x1 - lsl x1, x1, #1 -18: - subs w4, w4, #8 - b.lt 110f - ld1 {v0.8b}, [x2], #8 - st1 {v0.b}[0], [x0], x1 - st1 {v0.b}[1], [x7], x1 - st1 {v0.b}[2], [x0], x1 - st1 {v0.b}[3], [x7], x1 - st1 {v0.b}[4], [x0], x1 - st1 {v0.b}[5], [x7], x1 - st1 {v0.b}[6], [x0], x1 - st1 {v0.b}[7], [x7], x1 - b.le 0f - b 18b -110: - add w4, w4, #8 - asr x1, x1, #1 -11: - subs w4, w4, #1 - ld1 {v0.b}[0], [x2], #1 - st1 {v0.b}[0], [x0], x1 - b.gt 11b -0: +function wiener_filter5_v_8bpc_neon + stp x11, x12, [sp, #-48]! + stp x13, x14, [sp, #16] + stp x0, x5, [sp, #32] +1: + ld1 {v18.8h, v19.8h}, [x12], #32 + ld1 {v22.8h, v23.8h}, [x14], #32 + ld1 {v16.8h, v17.8h}, [x11], #32 + + add v24.8h, v22.8h, v18.8h + ld1 {v20.8h, v21.8h}, [x13], #32 + add v16.8h, v22.8h, v16.8h + add v25.8h, v23.8h, v19.8h + + smull v2.4s, v20.4h, v1.h[3] + smlal v2.4s, v24.4h, v1.h[4] + smlal v2.4s, v16.4h, v1.h[5] + add v17.8h, v23.8h, v17.8h + smull2 v3.4s, v20.8h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[4] + smlal2 v3.4s, v16.8h, v1.h[5] + smull v4.4s, v21.4h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[4] + smlal v4.4s, v17.4h, v1.h[5] + smull2 v5.4s, v21.8h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[4] + smlal2 v5.4s, v17.8h, v1.h[5] + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v3.4h, v4.4s, #11 + sqrshrun2 v3.8h, v5.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun2 v2.16b, v3.8h + subs w5, w5, #16 + st1 {v2.16b}, [x0], #16 + b.gt 1b + + ldp x0, x5, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #48 + ret +endfunc + +function wiener_filter5_hv_8bpc_neon + // Backing up/restoring registers shifted, so that x11 gets the value + // of x12, etc, and x15==x11, afterwards. + stp x12, x13, [sp, #-64]! + stp x14, x15, [sp, #16] + stp x12, x0, [sp, #32] + stp x3, x5, [sp, #48] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + ld1 {v3.16b}, [x3], #16 + b 2f -20: - add x7, x0, x1 - lsl x1, x1, #1 -24: - subs w4, w4, #4 - b.lt 210f - ld1 {v0.4h}, [x2], #8 - st1 {v0.h}[0], [x0], x1 - st1 {v0.h}[1], [x7], x1 - st1 {v0.h}[2], [x0], x1 - st1 {v0.h}[3], [x7], x1 - b.le 0f - b 24b -210: - add w4, w4, #4 - asr x1, x1, #1 -22: - subs w4, w4, #1 - ld1 {v0.h}[0], [x2], #2 - st1 {v0.h}[0], [x0], x1 - b.gt 22b 0: - ret + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + b 2f +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 2x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. 
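
Each horizontal helper starts, as here, by building a small left apron for the row: with LR_HAVE_LEFT the trailing pixels of the caller's 4-pixel left[] column are spliced in front of the row (or, when left == NULL, the pointer is simply moved back so the pixels just before the unit are read directly from the frame), and without LR_HAVE_LEFT the first pixel is replicated. A scalar sketch of that setup for the 8bpc 5-tap case, which needs 2 apron pixels (the 7-tap filters need 3, and the 16bpc code does the same with 16-bit pixels); buffer layout and names are illustrative:

    #include <stdint.h>
    #include <string.h>

    // Fill 'apron' pixels in front of the row.  'left' is the saved 4-pixel
    // column or NULL; 'src' points at the first pixel of the unit.
    static void setup_left(uint8_t *buf, const uint8_t *src,
                           const uint8_t *left, int have_left, int apron)
    {
        if (have_left && left)
            memcpy(buf, left + 4 - apron, apron);   // last pixels of left[]
        else if (have_left)
            memcpy(buf, src - apron, apron);        // frame data left of the unit
        else
            memset(buf, src[0], apron);             // replicate the first pixel
    }

With the apron in place, the inner filtering loop always sees a full window of taps and needs no per-pixel edge branches.
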
+ sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 -30: - ldrh w5, [x2] - ldrb w6, [x2, #2] - add x2, x2, #3 - subs w4, w4, #1 - strh w5, [x0] - strb w6, [x0, #2] - add x0, x0, x1 - b.gt 30b - ret +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -4 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + shl v22.8h, v17.8h, #7 + mul v6.8h, v17.8h, v0.h[3] + mla v6.8h, v18.8h, v0.h[4] + mla v6.8h, v19.8h, v0.h[5] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + shl v23.8h, v17.8h, #7 + mul v7.8h, v17.8h, v0.h[3] + mla v7.8h, v18.8h, v0.h[4] + mla v7.8h, v19.8h, v0.h[5] + + ld1 {v18.8h, v19.8h}, [x12], #32 + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + ld1 {v24.8h, v25.8h}, [x14], #32 + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + ld1 {v16.8h, v17.8h}, [x11], #32 + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + ld1 {v20.8h, v21.8h}, [x13], #32 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + add v24.8h, v24.8h, v18.8h + add v16.8h, v16.8h, v6.8h + + smull v18.4s, v20.4h, v1.h[3] + smlal v18.4s, v24.4h, v1.h[4] + smlal v18.4s, v16.4h, v1.h[5] + add v25.8h, v25.8h, v19.8h + smull2 v19.4s, v20.8h, v1.h[3] + smlal2 v19.4s, v24.8h, v1.h[4] + smlal2 v19.4s, v16.8h, v1.h[5] + add v17.8h, v17.8h, v7.8h + smull v20.4s, v21.4h, v1.h[3] + smlal v20.4s, v25.4h, v1.h[4] + smlal v20.4s, v17.4h, v1.h[5] + smull2 v21.4s, v21.8h, v1.h[3] + smlal2 v21.4s, v25.8h, v1.h[4] + smlal2 v21.4s, v17.8h, v1.h[5] + sqrshrun v18.4h, v18.4s, #11 + sqrshrun2 v18.8h, v19.4s, #11 + sqrshrun v19.4h, v20.4s, #11 + sqrshrun2 v19.8h, v21.4s, #11 + st1 {v6.8h, v7.8h}, [x15], #32 + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h + subs w5, w5, #16 + + st1 {v18.16b}, [x0], #16 -40: - add x7, x0, x1 - lsl x1, x1, #1 -42: - subs w4, w4, #2 - b.lt 41f - ld1 {v0.2s}, [x2], #8 - st1 {v0.s}[0], [x0], x1 - st1 {v0.s}[1], [x7], x1 b.le 0f - b 42b -41: - ld1 {v0.s}[0], [x2] - st1 {v0.s}[0], [x0] -0: - ret + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 
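The edge setup in wiener_filter5_hv_8bpc_neon above replaces the old narrow-width special cases: the two pixels left of the block come from the left[] buffer, from the frame itself (LR_HAVE_LEFT with left == NULL), or from replicating src[0], and without LR_HAVE_RIGHT the last valid pixel is splatted over the tail lanes by the right_ext_mask/bit sequence. Below is a scalar sketch of what that setup amounts to; pad_row_5tap is a hypothetical helper, not code from dav1d.

    #include <stdint.h>
    #include <string.h>

    /* Build a padded row: 2 pixels of left context, w payload pixels, and
     * replicated right padding when the unit does not reach the frame edge.
     * dst_len must cover w + 2 plus the filter's overread to the right. */
    static void pad_row_5tap(uint8_t *const dst, const int dst_len,
                             const uint8_t *const src, const int w,
                             const uint8_t (*const left)[4],
                             const int have_left, const int have_right)
    {
        if (!have_left) {
            dst[0] = dst[1] = src[0];   /* dup v2.16b, v3.b[0] + ext #14 */
        } else if (left) {
            dst[0] = (*left)[2];        /* ld1 {v2.s}[3], [x2] + ext #14 */
            dst[1] = (*left)[3];
        } else {
            dst[0] = src[-2];           /* sub x3, x3, #2 before the load */
            dst[1] = src[-1];
        }
        memcpy(dst + 2, src, w);
        if (!have_right)                /* right_ext_mask / bit splat */
            memset(dst + 2 + w, src[w - 1], dst_len - (2 + w));
    }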
-50: - ldr w5, [x2] - ldrb w6, [x2, #4] - add x2, x2, #5 - subs w4, w4, #1 - str w5, [x0] - strb w6, [x0, #4] - add x0, x0, x1 - b.gt 50b - ret +0: + ldp x3, x5, [sp, #48] + ldp x15, x0, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #64 -60: - ldr w5, [x2] - ldrh w6, [x2, #4] - add x2, x2, #6 - subs w4, w4, #1 - str w5, [x0] - strh w6, [x0, #4] + add x3, x3, x1 add x0, x0, x1 - b.gt 60b - ret -70: - ldr w5, [x2] - ldrh w6, [x2, #4] - ldrb w7, [x2, #6] - add x2, x2, #7 - subs w4, w4, #1 - str w5, [x0] - strh w6, [x0, #4] - strb w7, [x0, #6] - add x0, x0, x1 - b.gt 70b ret - -L(copy_narrow_tbl): - .hword 0 - .hword L(copy_narrow_tbl) - 10b - .hword L(copy_narrow_tbl) - 20b - .hword L(copy_narrow_tbl) - 30b - .hword L(copy_narrow_tbl) - 40b - .hword L(copy_narrow_tbl) - 50b - .hword L(copy_narrow_tbl) - 60b - .hword L(copy_narrow_tbl) - 70b endfunc #define SUM_STRIDE (384+16) @@ -635,25 +989,15 @@ mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 -1: sub x9, x9, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 // Subtract the number of pixels read from the input from the stride - add w13, w5, #14 - bic w13, w13, #7 + add w13, w13, #8 sub x4, x4, w13, uxtw // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -715,37 +1059,33 @@ ldr b30, [x3, w13, sxtw] ldr b31, [x12, w13, sxtw] // Fill v30/v31 with the right padding pixel - dup v30.8b, v30.b[0] - dup v31.8b, v31.b[0] + dup v30.16b, v30.b[0] + dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #10 b.ge 4f // If w >= 10, all used input pixels are valid - cmp w5, #6 - b.ge 5f // If w >= 6, we can filter 4 pixels - b 6f -4: // Loop horizontally -.macro uaddl_nh dst1, dst2, src1, src2, w - uaddl \dst1, \src1\().4h, \src2\().4h -.if \w > 4 - uaddl2 \dst2, \src1\().8h, \src2\().8h -.endif -.endm -.macro uaddw_nh dst1, dst2, src, w - uaddw \dst1, \dst1, \src\().4h -.if \w > 4 - uaddw2 \dst2, \dst2, \src\().8h -.endif -.endm -.macro add_nh dst1, dst2, src1, src2, w - add \dst1, \dst1, \src1 -.if \w > 4 - add \dst2, \dst2, \src2 -.endif -.endm + // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in v0/4.b[w] onwards + movrel x13, right_ext_mask + sub x13, x13, w5, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + bit v4.16b, v31.16b, v29.16b + + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b -.macro add3 w +4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v4.16b, v4.16b, #1 @@ -760,19 +1100,23 @@ ext v22.16b, v5.16b, v6.16b, #2 ext v23.16b, v5.16b, v6.16b, #4 - uaddl_nh v26.4s, v27.4s, v1, v20, \w - uaddw_nh v26.4s, v27.4s, v21, \w + uaddl v26.4s, v1.4h, v20.4h + uaddl2 v27.4s, v1.8h, v20.8h + uaddw v26.4s, v26.4s, v21.4h + uaddw2 v27.4s, v27.4s, v21.8h + + uaddl v28.4s, v5.4h, v22.4h + uaddl2 v29.4s, v5.8h, v22.8h + uaddw v28.4s, v28.4s, v23.4h + uaddw2 v29.4s, v29.4s, v23.8h + + subs w5, w5, #8 - uaddl_nh v28.4s, v29.4s, v5, v22, \w - uaddw_nh v28.4s, v29.4s, v23, \w -.endm - add3 8 st1 {v3.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 st1 {v28.4s,v29.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 @@ -787,83 +1131,6 @@ b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 6 <= w < 10 - add3 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - - subs w5, w5, #4 // 2 <= w < 6 - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - -6: // Pad the right edge and produce the last few pixels. - // 2 <= w < 6, 2-5 pixels valid in v0 - sub w13, w5, #2 - // w13 = (pixels valid - 2) - adr x14, L(box3_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. 
-22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #2 - ext v4.16b, v4.16b, v4.16b, #2 - ext v0.16b, v0.16b, v30.16b, #14 - ext v4.16b, v4.16b, v31.16b, #14 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #3 - ext v4.16b, v4.16b, v4.16b, #3 - ext v0.16b, v0.16b, v30.16b, #13 - ext v4.16b, v4.16b, v31.16b, #13 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v4.16b, v4.16b, v31.16b, #12 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #5 - ext v4.16b, v4.16b, v4.16b, #5 - ext v0.16b, v0.16b, v30.16b, #11 - ext v4.16b, v4.16b, v31.16b, #11 - b 88f - -L(box3_variable_shift_tbl): - .hword L(box3_variable_shift_tbl) - 22b - .hword L(box3_variable_shift_tbl) - 33b - .hword L(box3_variable_shift_tbl) - 44b - .hword L(box3_variable_shift_tbl) - 55b - -88: - umull v1.8h, v0.8b, v0.8b - umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - - add3 4 - subs w5, w5, #4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - ext v1.16b, v1.16b, v2.16b, #8 - ext v5.16b, v5.16b, v6.16b, #8 - // Only one needed pixel left, but do a normal 4 pixel - // addition anyway - add3 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -878,7 +1145,6 @@ b 1b 0: ret -.purgem add3 endfunc // void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, @@ -897,23 +1163,11 @@ mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - // Subtract the number of pixels read from the input from the stride. - tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - add w14, w5, #13 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 - add w14, w5, #15 -1: sub x9, x9, w13, uxtw #1 - bic w14, w14, #7 - sub x4, x4, w14, uxtw + add w13, w13, #8 + sub x4, x4, w13, uxtw // Store the width for the vertical loop mov w8, w5 @@ -976,18 +1230,34 @@ ldr b30, [x3, w13, sxtw] ldr b31, [x12, w13, sxtw] // Fill v30/v31 with the right padding pixel - dup v30.8b, v30.b[0] - dup v31.8b, v31.b[0] + dup v30.16b, v30.b[0] + dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can produce 4 pixels - b 6f + + // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the + // buffer pointer. 
+ movrel x13, right_ext_mask, -1 + sub x13, x13, w5, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + bit v4.16b, v31.16b, v29.16b + + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally -.macro add5 w ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v0.16b, v0.16b, #3 @@ -1014,22 +1284,30 @@ ext v22.16b, v5.16b, v6.16b, #6 ext v23.16b, v5.16b, v6.16b, #8 - uaddl_nh v26.4s, v27.4s, v1, v16, \w - uaddl_nh v16.4s, v17.4s, v17, v18, \w - uaddl_nh v28.4s, v29.4s, v5, v20, \w - uaddw_nh v26.4s, v27.4s, v19, \w - uaddl_nh v20.4s, v21.4s, v21, v22, \w - uaddw_nh v28.4s, v29.4s, v23, \w - add_nh v26.4s, v27.4s, v16.4s, v17.4s, \w - add_nh v28.4s, v29.4s, v20.4s, v21.4s, \w -.endm - add5 8 + uaddl v26.4s, v1.4h, v16.4h + uaddl2 v27.4s, v1.8h, v16.8h + uaddl v16.4s, v17.4h, v18.4h + uaddl2 v17.4s, v17.8h, v18.8h + uaddl v28.4s, v5.4h, v20.4h + uaddl2 v29.4s, v5.8h, v20.8h + uaddw v26.4s, v26.4s, v19.4h + uaddw2 v27.4s, v27.4s, v19.8h + uaddl v20.4s, v21.4h, v22.4h + uaddl2 v21.4s, v21.8h, v22.8h + uaddw v28.4s, v28.4s, v23.4h + uaddw2 v29.4s, v29.4s, v23.8h + add v26.4s, v26.4s, v16.4s + add v27.4s, v27.4s, v17.4s + add v28.4s, v28.4s, v20.4s + add v29.4s, v29.4s, v21.4s + + subs w5, w5, #8 + st1 {v3.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 st1 {v28.4s,v29.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 @@ -1043,95 +1321,6 @@ b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 7 <= w < 11 - add5 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - - subs w5, w5, #4 // 3 <= w < 7 - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - -6: // Pad the right edge and produce the last few pixels. - // w < 7, w+1 pixels valid in v0/v4 - sub w13, w5, #1 - // w13 = pixels valid - 2 - adr x14, L(box5_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. 
-22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #2 - ext v4.16b, v4.16b, v4.16b, #2 - ext v0.16b, v0.16b, v30.16b, #14 - ext v4.16b, v4.16b, v31.16b, #14 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #3 - ext v4.16b, v4.16b, v4.16b, #3 - ext v0.16b, v0.16b, v30.16b, #13 - ext v4.16b, v4.16b, v31.16b, #13 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v4.16b, v4.16b, v31.16b, #12 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #5 - ext v4.16b, v4.16b, v4.16b, #5 - ext v0.16b, v0.16b, v30.16b, #11 - ext v4.16b, v4.16b, v31.16b, #11 - b 88f -66: // 6 pixels valid - ext v0.16b, v0.16b, v0.16b, #6 - ext v4.16b, v4.16b, v4.16b, #6 - ext v0.16b, v0.16b, v30.16b, #10 - ext v4.16b, v4.16b, v31.16b, #10 - b 88f -77: // 7 pixels valid - ext v0.16b, v0.16b, v0.16b, #7 - ext v4.16b, v4.16b, v4.16b, #7 - ext v0.16b, v0.16b, v30.16b, #9 - ext v4.16b, v4.16b, v31.16b, #9 - b 88f - -L(box5_variable_shift_tbl): - .hword L(box5_variable_shift_tbl) - 22b - .hword L(box5_variable_shift_tbl) - 33b - .hword L(box5_variable_shift_tbl) - 44b - .hword L(box5_variable_shift_tbl) - 55b - .hword L(box5_variable_shift_tbl) - 66b - .hword L(box5_variable_shift_tbl) - 77b - -88: - umull v1.8h, v0.8b, v0.8b - umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - - add5 4 - subs w5, w5, #4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v0.16b, #4 - ext v1.16b, v1.16b, v2.16b, #8 - ext v4.16b, v4.16b, v4.16b, #4 - ext v5.16b, v5.16b, v6.16b, #8 - add5 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -1146,7 +1335,6 @@ b 1b 0: ret -.purgem add5 endfunc sgr_funcs 8 diff -Nru dav1d-0.7.1/src/arm/64/looprestoration_tmpl.S dav1d-0.9.1/src/arm/64/looprestoration_tmpl.S --- dav1d-0.7.1/src/arm/64/looprestoration_tmpl.S 2020-06-21 11:48:54.968126300 +0000 +++ dav1d-0.9.1/src/arm/64/looprestoration_tmpl.S 2021-07-28 21:38:28.869851800 +0000 @@ -454,7 +454,7 @@ // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int16_t *t2, // const int w, const int h, -// const int16_t wt[2]); +// const int16_t wt[2], const int bitdepth_max); function sgr_weighted2_\bpc\()bpc_neon, export=1 .if \bpc == 8 ldr x8, [sp] diff -Nru dav1d-0.7.1/src/arm/64/mc16.S dav1d-0.9.1/src/arm/64/mc16.S --- dav1d-0.7.1/src/arm/64/mc16.S 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/64/mc16.S 2021-07-28 21:38:28.873851800 +0000 @@ -909,12 +909,11 @@ b.gt 8b ret 160: - ld1 {v16.8b, v17.8b}, [x5] + ld1 {v16.16b}, [x5] sub x1, x1, #16 - neg v16.8b, v16.8b // -m - neg v17.8b, v17.8b - sxtl v16.8h, v16.8b - sxtl v17.8h, v17.8b + neg v17.16b, v16.16b // -m + sxtl v16.8h, v17.8b + sxtl2 v17.8h, v17.16b shl v16.8h, v16.8h, #9 // -m << 9 shl v17.4h, v17.4h, #9 16: @@ -1004,11 +1003,11 @@ b.gt 2b ret 4: - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x2], x3 + ld1 {v0.4h}, [x2], x3 + ld1 {v1.4h}, [x2], x3 subs w5, w5, #2 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 + st1 {v0.4h}, [x0], x1 + st1 {v1.4h}, [x0], x1 b.gt 4b ret 80: @@ -1017,11 +1016,11 @@ add x9, x2, x3 lsl x3, x3, #1 8: - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x9], x3 + ld1 {v0.8h}, [x2], x3 + ld1 {v1.8h}, [x9], x3 subs w5, w5, #2 - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x8], x1 + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x8], x1 b.gt 8b ret 16: @@ 
-2039,7 +2038,6 @@ sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 - sxtl v1.4s, v1.4h ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 @@ -2049,19 +2047,23 @@ addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) bl L(\type\()_8tap_filter_2) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). + xtn v16.4h, v16.4s - trn1 v16.2d, v16.2d, v24.2d - mov v17.16b, v24.16b + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b 2: bl L(\type\()_8tap_filter_2) - ext v18.16b, v17.16b, v24.16b, #8 - mov v19.16b, v24.16b - mul v2.4s, v16.4s, v1.s[0] - mla v2.4s, v17.4s, v1.s[1] - mla v2.4s, v18.4s, v1.s[2] - mla v2.4s, v19.4s, v1.s[3] + ext v18.8b, v17.8b, v24.8b, #4 + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s @@ -2070,8 +2072,8 @@ st1 {v2.s}[0], [\dst], \d_strd st1 {v2.s}[1], [\ds2], \d_strd b.le 0f - mov v16.16b, v18.16b - mov v17.16b, v19.16b + mov v16.8b, v18.8b + mov v17.8b, v24.8b b 2b 280: // 2x8, 2x16, 2x32 hv @@ -2085,8 +2087,6 @@ sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 - sxtl2 v2.4s, v1.8h - sxtl v1.4s, v1.4h ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 @@ -2095,29 +2095,33 @@ addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) + // The intermediates from the horizontal pass fit in 16 bit without + // any bias; we could just as well keep them as .4s, but narrowing + // them to .4h gives a significant speedup on out of order cores + // (at the cost of a smaller slowdown on in-order cores such as A53). 
bl L(\type\()_8tap_filter_2) - trn1 v16.2d, v16.2d, v24.2d - mov v17.16b, v24.16b + xtn v16.4h, v16.4s + trn1 v16.2s, v16.2s, v24.2s + mov v17.8b, v24.8b bl L(\type\()_8tap_filter_2) - ext v18.16b, v17.16b, v24.16b, #8 - mov v19.16b, v24.16b + ext v18.8b, v17.8b, v24.8b, #4 + mov v19.8b, v24.8b bl L(\type\()_8tap_filter_2) - ext v20.16b, v19.16b, v24.16b, #8 - mov v21.16b, v24.16b + ext v20.8b, v19.8b, v24.8b, #4 + mov v21.8b, v24.8b 28: bl L(\type\()_8tap_filter_2) - ext v22.16b, v21.16b, v24.16b, #8 - mov v23.16b, v24.16b - mul v3.4s, v16.4s, v1.s[0] - mla v3.4s, v17.4s, v1.s[1] - mla v3.4s, v18.4s, v1.s[2] - mla v3.4s, v19.4s, v1.s[3] - mla v3.4s, v20.4s, v2.s[0] - mla v3.4s, v21.4s, v2.s[1] - mla v3.4s, v22.4s, v2.s[2] - mla v3.4s, v23.4s, v2.s[3] + ext v22.8b, v21.8b, v24.8b, #4 + smull v3.4s, v16.4h, v1.h[0] + smlal v3.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[6] + smlal v3.4s, v24.4h, v1.h[7] srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s @@ -2126,12 +2130,12 @@ st1 {v3.s}[0], [\dst], \d_strd st1 {v3.s}[1], [\ds2], \d_strd b.le 0f - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v18.16b, v20.16b - mov v19.16b, v21.16b - mov v20.16b, v22.16b - mov v21.16b, v23.16b + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v24.8b b 28b 0: @@ -2151,6 +2155,7 @@ smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) + xtn v24.4h, v24.4s ret .endif @@ -3183,8 +3188,8 @@ .macro load_filter_row dst, src, inc asr w13, \src, #10 - ldr \dst, [x11, w13, sxtw #3] add \src, \src, \inc + ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon @@ -3338,15 +3343,7 @@ load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 - transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 - sxtl v0.8h, v0.8b - sxtl v1.8h, v1.8b - sxtl v2.8h, v2.8b - sxtl v3.8h, v3.8b - sxtl v4.8h, v4.8b - sxtl v5.8h, v5.8b - sxtl v6.8h, v6.8b - sxtl v7.8h, v7.8b + transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. 
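The repeated comment above is the core of this mc16.S change: the horizontal 8-tap output of the 2-pixel-wide columns fits in int16_t, so after the xtn the vertical pass only widens while multiplying, as smull/smlal do, instead of carrying .4s lanes through every register move. A minimal scalar model of that vertical accumulation follows; the -(6+intermediate_bits) rounding shift and the final narrowing are applied afterwards, as in the srshl/sqxtun above.

    #include <stdint.h>

    /* Vertical accumulation over 16-bit intermediates, widening per product
     * exactly as smull/smlal do. */
    static int32_t vfilter_16bit(const int16_t *const mid,
                                 const int16_t *const fv, const int taps)
    {
        int32_t sum = 0;
        for (int i = 0; i < taps; i++)
            sum += (int32_t)mid[i] * fv[i];   /* smull / smlal */
        return sum;  /* srshl by -(6+intermediate_bits), then sqxtun */
    }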
diff -Nru dav1d-0.7.1/src/arm/64/mc.S dav1d-0.9.1/src/arm/64/mc.S --- dav1d-0.7.1/src/arm/64/mc.S 2020-06-21 11:48:54.968126300 +0000 +++ dav1d-0.9.1/src/arm/64/mc.S 2021-07-28 21:38:28.873851800 +0000 @@ -1906,11 +1906,10 @@ bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v28.8b, #4 - mov v19.8b, v28.8b smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] - smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v28.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -1919,7 +1918,7 @@ st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b - mov v17.8b, v19.8b + mov v17.8b, v28.8b b 2b 280: // 2x8, 2x16, 2x32 hv @@ -1956,7 +1955,6 @@ 28: bl L(\type\()_8tap_filter_2) ext v22.8b, v21.8b, v28.8b, #4 - mov v23.8b, v28.8b smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] @@ -1964,7 +1962,7 @@ smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] - smlal v2.4s, v23.4h, v1.h[7] + smlal v2.4s, v28.4h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h @@ -1977,7 +1975,7 @@ mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b - mov v21.8b, v23.8b + mov v21.8b, v28.8b b 28b 0: @@ -2182,16 +2180,7 @@ lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - ld1 {v28.8b, v29.8b}, [\src], \s_strd - uxtl v28.8h, v28.8b - uxtl v29.8h, v29.8b - mul v24.8h, v28.8h, v0.h[0] -.irpc i, 1234567 - ext v26.16b, v28.16b, v29.16b, #(2*\i) - mla v24.8h, v26.8h, v0.h[\i] -.endr - srshr v16.8h, v24.8h, #2 - + bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b @@ -2269,16 +2258,7 @@ lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - ld1 {v28.8b, v29.8b}, [\src], \s_strd - uxtl v28.8h, v28.8b - uxtl v29.8h, v29.8b - mul v24.8h, v28.8h, v0.h[0] -.irpc i, 1234567 - ext v26.16b, v28.16b, v29.16b, #(2*\i) - mla v24.8h, v26.8h, v0.h[\i] -.endr - srshr v16.8h, v24.8h, #2 - + bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b @@ -2365,6 +2345,28 @@ 0: br x15 +L(\type\()_8tap_filter_8_first): + ld1 {v28.8b, v29.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + mul v16.8h, v28.8h, v0.h[0] + ext v24.16b, v28.16b, v29.16b, #(2*1) + ext v25.16b, v28.16b, v29.16b, #(2*2) + ext v26.16b, v28.16b, v29.16b, #(2*3) + ext v27.16b, v28.16b, v29.16b, #(2*4) + mla v16.8h, v24.8h, v0.h[1] + mla v16.8h, v25.8h, v0.h[2] + mla v16.8h, v26.8h, v0.h[3] + mla v16.8h, v27.8h, v0.h[4] + ext v24.16b, v28.16b, v29.16b, #(2*5) + ext v25.16b, v28.16b, v29.16b, #(2*6) + ext v26.16b, v28.16b, v29.16b, #(2*7) + mla v16.8h, v24.8h, v0.h[5] + mla v16.8h, v25.8h, v0.h[6] + mla v16.8h, v26.8h, v0.h[7] + srshr v16.8h, v16.8h, #2 + ret + L(\type\()_8tap_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd @@ -2918,8 +2920,8 @@ .macro load_filter_row dst, src, inc asr w13, \src, #10 - ldr \dst, [x11, w13, sxtw #3] add \src, \src, \inc + ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon @@ -2928,57 +2930,44 @@ ld1 {v16.8b, v17.8b}, [x2], x3 load_filter_row d0, w12, w7 - uxtl v16.8h, v16.8b load_filter_row d1, w12, w7 - uxtl v17.8h, v17.8b load_filter_row d2, w12, w7 - sxtl v0.8h, v0.8b load_filter_row d3, w12, w7 - sxtl v1.8h, v1.8b load_filter_row d4, w12, w7 - sxtl v2.8h, v2.8b load_filter_row d5, w12, w7 - sxtl v3.8h, v3.8b load_filter_row d6, w12, w7 - sxtl v4.8h, v4.8b + // subtract by 128 to allow using smull + eor v16.8b, v16.8b, v22.8b + eor v17.8b, v17.8b, 
v22.8b load_filter_row d7, w12, w7 - sxtl v5.8h, v5.8b - ext v18.16b, v16.16b, v17.16b, #2*1 - mul v23.8h, v16.8h, v0.8h - sxtl v6.8h, v6.8b - ext v19.16b, v16.16b, v17.16b, #2*2 - mul v18.8h, v18.8h, v1.8h - sxtl v7.8h, v7.8b - ext v20.16b, v16.16b, v17.16b, #2*3 - mul v19.8h, v19.8h, v2.8h - ext v21.16b, v16.16b, v17.16b, #2*4 - saddlp v23.4s, v23.8h - mul v20.8h, v20.8h, v3.8h - ext v22.16b, v16.16b, v17.16b, #2*5 - saddlp v18.4s, v18.8h - mul v21.8h, v21.8h, v4.8h - saddlp v19.4s, v19.8h - mul v22.8h, v22.8h, v5.8h - saddlp v20.4s, v20.8h - saddlp v21.4s, v21.8h - saddlp v22.4s, v22.8h - addp v18.4s, v23.4s, v18.4s - ext v23.16b, v16.16b, v17.16b, #2*6 - addp v19.4s, v19.4s, v20.4s - mul v23.8h, v23.8h, v6.8h - ext v20.16b, v16.16b, v17.16b, #2*7 - mul v20.8h, v20.8h, v7.8h - saddlp v23.4s, v23.8h - addp v21.4s, v21.4s, v22.4s - saddlp v20.4s, v20.8h - addp v20.4s, v23.4s, v20.4s - addp v18.4s, v18.4s, v19.4s - addp v20.4s, v21.4s, v20.4s - add w5, w5, w8 + ext v18.8b, v16.8b, v17.8b, #1 + ext v19.8b, v16.8b, v17.8b, #2 + smull v0.8h, v0.8b, v16.8b + smull v1.8h, v1.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #3 + ext v20.8b, v16.8b, v17.8b, #4 + smull v2.8h, v2.8b, v19.8b + smull v3.8h, v3.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #5 + ext v19.8b, v16.8b, v17.8b, #6 + smull v4.8h, v4.8b, v20.8b + smull v5.8h, v5.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #7 + smull v6.8h, v6.8b, v19.8b + smull v7.8h, v7.8b, v18.8b + + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + + addp v0.8h, v0.8h, v4.8h - rshrn v16.4h, v18.4s, #3 - rshrn2 v16.8h, v20.4s, #3 + add w5, w5, w8 ret endfunc @@ -3004,25 +2993,32 @@ lsl x1, x1, #1 .endif + movi v22.8b, #128 +.ifb \t + movi v23.8h, #128 +.else + movi v23.8h, #8, lsl #8 +.endif + bl warp_filter_horz_neon - mov v24.16b, v16.16b + srshr v24.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v25.16b, v16.16b + srshr v25.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v26.16b, v16.16b + srshr v26.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v27.16b, v16.16b + srshr v27.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v28.16b, v16.16b + srshr v28.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v29.16b, v16.16b + srshr v29.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v30.16b, v16.16b + srshr v30.8h, v0.8h, #3 1: add w14, w6, #512 bl warp_filter_horz_neon - mov v31.16b, v16.16b + srshr v31.8h, v0.8h, #3 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 @@ -3032,15 +3028,7 @@ load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 - transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 - sxtl v0.8h, v0.8b - sxtl v1.8h, v1.8b - sxtl v2.8h, v2.8b - sxtl v3.8h, v3.8b - sxtl v4.8h, v4.8b - sxtl v5.8h, v5.8b - sxtl v6.8h, v6.8b - sxtl v7.8h, v7.8b + transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. 
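The warp_filter_horz_neon rewrite above xors the source row with v22 (128) so the products can use smull on signed 8-bit values, and the constant error this introduces is added back after the vertical pass through v23. Since the eight AV1 warp taps sum to 128 in both passes, that offset collapses to exactly 128 for the pixel-output variant and 8 << 8 for the warp8x8t variant; the small check below assumes the final narrowing shifts of 11 and 7 used by the two 8bpc variants (the shift is a macro parameter not visible in this hunk).

    #include <assert.h>

    /* eor-by-128 filters (p - 128) instead of p; with taps summing to 128 in
     * both passes the result is off by a constant that is a multiple of each
     * rounding step, so adding v23 back at the end is exact. */
    void warp_bias_check(void)
    {
        const int tap_sum = 128;                      /* 1 << 7 for AV1 warp taps  */
        const int h_off   = (128 * tap_sum) >> 3;     /* horizontal pass, srshr #3 */
        assert(((h_off * tap_sum) >> 11) == 128);     /* pixel output: movi #128   */
        assert(((h_off * tap_sum) >> 7) == (8 << 8)); /* warp8x8t: movi #8, lsl #8 */
    }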
@@ -3068,6 +3056,7 @@ sqrshrn2 v16.8h, v17.4s, #\shift mov v27.16b, v28.16b mov v28.16b, v29.16b + add v16.8h, v16.8h, v23.8h .ifb \t sqxtun v16.8b, v16.8h .endif diff -Nru dav1d-0.7.1/src/arm/64/util.S dav1d-0.9.1/src/arm/64/util.S --- dav1d-0.7.1/src/arm/64/util.S 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/64/util.S 2021-07-28 21:38:28.873851800 +0000 @@ -59,33 +59,65 @@ #endif .endm -.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 - trn1 \t8\().8b, \r0\().8b, \r1\().8b - trn2 \t9\().8b, \r0\().8b, \r1\().8b - trn1 \r1\().8b, \r2\().8b, \r3\().8b - trn2 \r3\().8b, \r2\().8b, \r3\().8b - trn1 \r0\().8b, \r4\().8b, \r5\().8b - trn2 \r5\().8b, \r4\().8b, \r5\().8b - trn1 \r2\().8b, \r6\().8b, \r7\().8b - trn2 \r7\().8b, \r6\().8b, \r7\().8b - - trn1 \r4\().4h, \r0\().4h, \r2\().4h - trn2 \r2\().4h, \r0\().4h, \r2\().4h - trn1 \r6\().4h, \r5\().4h, \r7\().4h - trn2 \r7\().4h, \r5\().4h, \r7\().4h - trn1 \r5\().4h, \t9\().4h, \r3\().4h - trn2 \t9\().4h, \t9\().4h, \r3\().4h - trn1 \r3\().4h, \t8\().4h, \r1\().4h - trn2 \t8\().4h, \t8\().4h, \r1\().4h - - trn1 \r0\().2s, \r3\().2s, \r4\().2s - trn2 \r4\().2s, \r3\().2s, \r4\().2s - trn1 \r1\().2s, \r5\().2s, \r6\().2s - trn2 \r5\().2s, \r5\().2s, \r6\().2s - trn2 \r6\().2s, \t8\().2s, \r2\().2s - trn1 \r2\().2s, \t8\().2s, \r2\().2s - trn1 \r3\().2s, \t9\().2s, \r7\().2s - trn2 \r7\().2s, \t9\().2s, \r7\().2s +.macro sub_sp space +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl + // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 + zip1 \r0\().16b, \r0\().16b, \r1\().16b + // c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7 + zip1 \r2\().16b, \r2\().16b, \r3\().16b + // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7 + zip1 \r4\().16b, \r4\().16b, \r5\().16b + // g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7 + zip1 \r6\().16b, \r6\().16b, \r7\().16b + + // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6 + trn1 \r1\().8h, \r0\().8h, \r2\().8h + // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7 + trn2 \r3\().8h, \r0\().8h, \r2\().8h + // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6 + trn1 \r5\().8h, \r4\().8h, \r6\().8h + // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7 + trn2 \r7\().8h, \r4\().8h, \r6\().8h + + // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4 + trn1 \r0\().4s, \r1\().4s, \r5\().4s + // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6 + trn2 \r2\().4s, \r1\().4s, \r5\().4s + // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5 + trn1 \r1\().4s, \r3\().4s, \r7\().4s + // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7 + trn2 \r3\().4s, \r3\().4s, \r7\().4s + + \xtl\()2 \r4\().8h, \r0\().16b + \xtl \r0\().8h, \r0\().8b + \xtl\()2 \r6\().8h, \r2\().16b + \xtl \r2\().8h, \r2\().8b + \xtl\()2 \r5\().8h, \r1\().16b + \xtl \r1\().8h, \r1\().8b + \xtl\()2 \r7\().8h, \r3\().16b + \xtl \r3\().8h, \r3\().8b .endm .macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 diff -Nru dav1d-0.7.1/src/arm/asm-offsets.h dav1d-0.9.1/src/arm/asm-offsets.h --- dav1d-0.7.1/src/arm/asm-offsets.h 1970-01-01 00:00:00.000000000 +0000 
+++ dav1d-0.9.1/src/arm/asm-offsets.h 2021-07-28 21:38:28.873851800 +0000 @@ -0,0 +1,43 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ARM_ASM_OFFSETS_H +#define ARM_ASM_OFFSETS_H + +#define FGD_SEED 0 +#define FGD_AR_COEFF_LAG 92 +#define FGD_AR_COEFFS_Y 96 +#define FGD_AR_COEFFS_UV 120 +#define FGD_AR_COEFF_SHIFT 176 +#define FGD_GRAIN_SCALE_SHIFT 184 + +#define FGD_SCALING_SHIFT 88 +#define FGD_UV_MULT 188 +#define FGD_UV_LUMA_MULT 196 +#define FGD_UV_OFFSET 204 +#define FGD_CLIP_TO_RESTRICTED_RANGE 216 + +#endif /* ARM_ASM_OFFSETS_H */ diff -Nru dav1d-0.7.1/src/arm/asm.S dav1d-0.9.1/src/arm/asm.S --- dav1d-0.7.1/src/arm/asm.S 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/asm.S 2021-07-28 21:38:28.873851800 +0000 @@ -94,6 +94,8 @@ #ifdef __ELF__ .type EXTERN\name, %function .hidden EXTERN\name +#elif defined(__MACH__) + .private_extern EXTERN\name #endif #if HAVE_AS_FUNC .func EXTERN\name @@ -129,6 +131,8 @@ .global EXTERN\name #ifdef __ELF__ .hidden EXTERN\name +#elif defined(__MACH__) + .private_extern EXTERN\name #endif EXTERN\name: .endif diff -Nru dav1d-0.7.1/src/arm/cdef_init_tmpl.c dav1d-0.9.1/src/arm/cdef_init_tmpl.c --- dav1d-0.7.1/src/arm/cdef_init_tmpl.c 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/cdef_init_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -27,7 +27,6 @@ #include "src/cpu.h" #include "src/cdef.h" -#if BITDEPTH == 8 || ARCH_AARCH64 decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon)); void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src, @@ -72,7 +71,6 @@ DEFINE_FILTER(8, 8, 16) DEFINE_FILTER(4, 8, 8) DEFINE_FILTER(4, 4, 8) -#endif COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) { @@ -80,10 +78,8 @@ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 || ARCH_AARCH64 c->dir = BF(dav1d_cdef_find_dir, neon); c->fb[0] = cdef_filter_8x8_neon; c->fb[1] = cdef_filter_4x8_neon; c->fb[2] = cdef_filter_4x4_neon; -#endif } diff -Nru dav1d-0.7.1/src/arm/film_grain_init_tmpl.c dav1d-0.9.1/src/arm/film_grain_init_tmpl.c --- dav1d-0.7.1/src/arm/film_grain_init_tmpl.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/arm/film_grain_init_tmpl.c 2021-07-28 
21:38:28.873851800 +0000 @@ -0,0 +1,223 @@ +/* + * Copyright © 2018, Niklas Haas + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * Copyright © 2021, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/film_grain.h" +#include "asm-offsets.h" + +#if ARCH_AARCH64 + +CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV); +CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT); + +CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT); +CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET); +CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE); + +void BF(dav1d_generate_grain_y, neon)(entry buf[][GRAIN_WIDTH], + const Dav1dFilmGrainData *const data + HIGHBD_DECL_SUFFIX); + +#define GEN_GRAIN_UV(suff) \ +void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ + const entry buf_y[][GRAIN_WIDTH], \ + const Dav1dFilmGrainData *const data, \ + const intptr_t uv \ + HIGHBD_DECL_SUFFIX) + +GEN_GRAIN_UV(420); +GEN_GRAIN_UV(422); +GEN_GRAIN_UV(444); +#endif + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// same layout of parameters on the stack across platforms. +void BF(dav1d_fgy_32x32, neon)(pixel *const dst, + const pixel *const src, + const ptrdiff_t stride, + const uint8_t scaling[SCALING_SIZE], + const int scaling_shift, + const entry grain_lut[][GRAIN_WIDTH], + const int offsets[][2], + const int h, const ptrdiff_t clip, + const ptrdiff_t type + HIGHBD_DECL_SUFFIX); + +// Use ptrdiff_t instead of int for the last few parameters, to get the +// parameters on the stack with the same layout across platforms. 
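The CHECK_OFFSET() lines earlier in this new file tie the hand-maintained FGD_* byte offsets from asm-offsets.h to the actual layout of Dav1dFilmGrainData, so the NEON film grain code can index the struct directly. The macro itself is defined elsewhere in dav1d; a minimal C99 sketch of the same compile-time check, under a hypothetical name rather than the project's definition, is:

    #include <stddef.h>

    /* Fail the build if a hard-coded asm offset drifts from the C struct
     * layout; the negative-size-array trick keeps this within C99. */
    #define CHECK_OFFSET_SKETCH(type, field, name) \
        typedef char check_##name[offsetof(type, field) == (name) ? 1 : -1]

    /* e.g. CHECK_OFFSET_SKETCH(Dav1dFilmGrainData, seed, FGD_SEED); */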
+#define FGUV(suff) \ +void BF(dav1d_fguv_32x32_ ## suff, neon)(pixel *const dst, \ + const pixel *const src, \ + const ptrdiff_t stride, \ + const uint8_t scaling[SCALING_SIZE], \ + const Dav1dFilmGrainData *const data, \ + const entry grain_lut[][GRAIN_WIDTH], \ + const pixel *const luma_row, \ + const ptrdiff_t luma_stride, \ + const int offsets[][2], \ + const ptrdiff_t h, const ptrdiff_t uv, \ + const ptrdiff_t is_id, \ + const ptrdiff_t type \ + HIGHBD_DECL_SUFFIX) + +FGUV(420); +FGUV(422); +FGUV(444); + +static inline int get_random_number(const int bits, unsigned *const state) { + const int r = *state; + unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; + *state = (r >> 1) | (bit << 15); + + return (*state >> (16 - bits)) & ((1 << bits) - 1); +} + +static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, + const ptrdiff_t stride, + const Dav1dFilmGrainData *const data, const size_t pw, + const uint8_t scaling[SCALING_SIZE], + const entry grain_lut[][GRAIN_WIDTH], + const int bh, const int row_num HIGHBD_DECL_SUFFIX) +{ + const int rows = 1 + (data->overlap_flag && row_num > 0); + + // seed[0] contains the current row, seed[1] contains the previous + unsigned seed[2]; + for (int i = 0; i < rows; i++) { + seed[i] = data->seed; + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); + } + + int offsets[2 /* col offset */][2 /* row offset */]; + + // process this row in BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + + if (data->overlap_flag && bx) { + // shift previous offsets left + for (int i = 0; i < rows; i++) + offsets[1][i] = offsets[0][i]; + } + + // update current offsets + for (int i = 0; i < rows; i++) + offsets[0][i] = get_random_number(8, &seed[i]); + + int type = 0; + if (data->overlap_flag && row_num) + type |= 1; /* overlap y */ + if (data->overlap_flag && bx) + type |= 2; /* overlap x */ + + BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride, + scaling, data->scaling_shift, + grain_lut, offsets, bh, + data->clip_to_restricted_range, type + HIGHBD_TAIL_SUFFIX); + } +} + +#define fguv_ss_fn(nm, sx, sy) \ +static void \ +fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ + const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \ + const int pw, const uint8_t scaling[SCALING_SIZE], \ + const entry grain_lut[][GRAIN_WIDTH], const int bh, \ + const int row_num, const pixel *const luma_row, \ + const ptrdiff_t luma_stride, const int uv, const int is_id \ + HIGHBD_DECL_SUFFIX) \ +{ \ + const int rows = 1 + (data->overlap_flag && row_num > 0); \ + \ + /* seed[0] contains the current row, seed[1] contains the previous */ \ + unsigned seed[2]; \ + for (int i = 0; i < rows; i++) { \ + seed[i] = data->seed; \ + seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; \ + seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \ + } \ + \ + int offsets[2 /* col offset */][2 /* row offset */]; \ + \ + /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \ + for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \ + if (data->overlap_flag && bx) { \ + /* shift previous offsets left */ \ + for (int i = 0; i < rows; i++) \ + offsets[1][i] = offsets[0][i]; \ + } \ + \ + /* update current offsets */ \ + for (int i = 0; i < rows; i++) \ + offsets[0][i] = get_random_number(8, &seed[i]); \ + \ + int type = 0; \ + if (data->overlap_flag && row_num) \ + type |= 1; /* overlap y */ \ + if (data->overlap_flag && bx) \ + type |= 2; /* overlap x */ \ + 
if (data->chroma_scaling_from_luma) \ + type |= 4; \ + \ + BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \ + scaling, data, grain_lut, \ + luma_row + (bx << sx), luma_stride, \ + offsets, bh, uv, is_id, type \ + HIGHBD_TAIL_SUFFIX); \ + } \ +} + +fguv_ss_fn(420, 1, 1); +fguv_ss_fn(422, 1, 0); +fguv_ss_fn(444, 0, 0); + +COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + +#if ARCH_AARCH64 && BITDEPTH == 8 + c->generate_grain_y = BF(dav1d_generate_grain_y, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon); +#endif + + c->fgy_32x32xn = fgy_32x32xn_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon; +} diff -Nru dav1d-0.7.1/src/arm/ipred_init_tmpl.c dav1d-0.9.1/src/arm/ipred_init_tmpl.c --- dav1d-0.7.1/src/arm/ipred_init_tmpl.c 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/ipred_init_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -46,6 +46,7 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon)); decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); @@ -54,14 +55,12 @@ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 || ARCH_AARCH64 c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon); c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon); c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon); c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon); c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon); c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon); -#if ARCH_AARCH64 c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon); c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); @@ -75,8 +74,7 @@ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon); c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon); c->pal_pred = BF(dav1d_pal_pred, neon); -#endif -#endif } diff -Nru dav1d-0.7.1/src/arm/itx_init_tmpl.c dav1d-0.9.1/src/arm/itx_init_tmpl.c --- dav1d-0.7.1/src/arm/itx_init_tmpl.c 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/itx_init_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -119,7 +119,6 @@ if (bpc > 10) return; -#if ARCH_AARCH64 || BITDEPTH == 8 assign_itx17_fn( , 4, 4, neon); assign_itx16_fn(R, 4, 8, neon); assign_itx16_fn(R, 4, 16, neon); @@ -139,5 +138,4 @@ assign_itx1_fn (R, 64, 16, neon); assign_itx1_fn (R, 64, 32, neon); assign_itx1_fn ( , 64, 64, neon); -#endif } diff -Nru dav1d-0.7.1/src/arm/loopfilter_init_tmpl.c dav1d-0.9.1/src/arm/loopfilter_init_tmpl.c --- dav1d-0.7.1/src/arm/loopfilter_init_tmpl.c 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/loopfilter_init_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -38,10 +38,8 @@ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 || ARCH_AARCH64 c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon); c->loop_filter_sb[0][1] = 
BF(dav1d_lpf_v_sb_y, neon); c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon); c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, neon); -#endif } diff -Nru dav1d-0.7.1/src/arm/looprestoration_init_tmpl.c dav1d-0.9.1/src/arm/looprestoration_init_tmpl.c --- dav1d-0.7.1/src/arm/looprestoration_init_tmpl.c 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/looprestoration_init_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -27,9 +27,24 @@ #include "src/cpu.h" #include "src/looprestoration.h" -#include "src/tables.h" -#if BITDEPTH == 8 || ARCH_AARCH64 +#if ARCH_AARCH64 +void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride, + const pixel (*left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride, + const pixel (*left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges + HIGHBD_DECL_SUFFIX); +#else + // The 8bpc version calculates things slightly differently than the reference // C version. That version calculates roughly this: // int16_t sum = 0; @@ -46,12 +61,11 @@ // 1 << (bitdepth + 6 - round_bits_h). void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4], const pixel *src, ptrdiff_t stride, - const int16_t fh[7], const intptr_t w, + const int16_t fh[8], intptr_t w, int h, enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); // This calculates things slightly differently than the reference C version. // This version calculates roughly this: -// fv[3] += 128; // int32_t sum = 0; // for (int i = 0; i < 7; i++) // sum += mid[idx] * fv[i]; @@ -59,51 +73,40 @@ // This function assumes that the width is a multiple of 8. 
void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride, const int16_t *mid, int w, int h, - const int16_t fv[7], enum LrEdgeFlags edges, + const int16_t fv[8], enum LrEdgeFlags edges, ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX); -void BF(dav1d_copy_narrow, neon)(pixel *dst, ptrdiff_t stride, - const pixel *src, int w, int h); static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, const pixel (*const left)[4], const pixel *lpf, const ptrdiff_t lpf_stride, - const int w, const int h, const int16_t fh[7], - const int16_t fv[7], const enum LrEdgeFlags edges - HIGHBD_DECL_SUFFIX) + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { + const int16_t (*const filter)[8] = params->filter; ALIGN_STK_16(int16_t, mid, 68 * 384,); int mid_stride = (w + 7) & ~7; // Horizontal filter BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride, - fh, w, h, edges HIGHBD_TAIL_SUFFIX); + filter[0], w, h, edges HIGHBD_TAIL_SUFFIX); if (edges & LR_HAVE_TOP) BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride, - fh, w, 2, edges HIGHBD_TAIL_SUFFIX); + filter[0], w, 2, edges + HIGHBD_TAIL_SUFFIX); if (edges & LR_HAVE_BOTTOM) BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL, lpf + 6 * PXSTRIDE(lpf_stride), - lpf_stride, fh, w, 2, edges + lpf_stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); // Vertical filter - if (w >= 8) - BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride], - w & ~7, h, fv, edges, - mid_stride * sizeof(*mid) - HIGHBD_TAIL_SUFFIX); - if (w & 7) { - // For uneven widths, do a full 8 pixel wide filtering into a temp - // buffer and copy out the narrow slice of pixels separately into dest. - ALIGN_STK_16(pixel, tmp, 64 * 8,); - BF(dav1d_wiener_filter_v, neon)(tmp, (w & 7) * sizeof(pixel), - &mid[2*mid_stride + (w & ~7)], - w & 7, h, fv, edges, - mid_stride * sizeof(*mid) - HIGHBD_TAIL_SUFFIX); - BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h); - } + BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride], + w, h, filter[1], edges, + mid_stride * sizeof(*mid) + HIGHBD_TAIL_SUFFIX); } +#endif void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum, const pixel (*left)[4], @@ -205,94 +208,66 @@ const int w, const int h, const int16_t wt[2] HIGHBD_DECL_SUFFIX); -static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, - const int w, const int h, const int sgr_idx, - const int16_t sgr_wt[7], const enum LrEdgeFlags edges - HIGHBD_DECL_SUFFIX) +static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride, + const pixel (*const left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { - if (!dav1d_sgr_params[sgr_idx][0]) { - ALIGN_STK_16(int16_t, tmp, 64 * 384,); - dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, - w, h, dav1d_sgr_params[sgr_idx][3], edges - HIGHBD_TAIL_SUFFIX); - if (w >= 8) - BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride, - tmp, w & ~7, h, (1 << 7) - sgr_wt[1] - HIGHBD_TAIL_SUFFIX); - if (w & 7) { - // For uneven widths, do a full 8 pixel wide filtering into a temp - // buffer and copy out the narrow slice of pixels separately into - // dest. 
- ALIGN_STK_16(pixel, stripe, 64 * 8,); - BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel), - dst + (w & ~7), dst_stride, - tmp + (w & ~7), w & 7, h, - (1 << 7) - sgr_wt[1] - HIGHBD_TAIL_SUFFIX); - BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe, - w & 7, h); - } - } else if (!dav1d_sgr_params[sgr_idx][1]) { - ALIGN_STK_16(int16_t, tmp, 64 * 384,); - dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, - w, h, dav1d_sgr_params[sgr_idx][2], edges - HIGHBD_TAIL_SUFFIX); - if (w >= 8) - BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride, - tmp, w & ~7, h, sgr_wt[0] - HIGHBD_TAIL_SUFFIX); - if (w & 7) { - // For uneven widths, do a full 8 pixel wide filtering into a temp - // buffer and copy out the narrow slice of pixels separately into - // dest. - ALIGN_STK_16(pixel, stripe, 64 * 8,); - BF(dav1d_sgr_weighted1, neon)(stripe, (w & 7) * sizeof(pixel), - dst + (w & ~7), dst_stride, - tmp + (w & ~7), w & 7, h, sgr_wt[0] - HIGHBD_TAIL_SUFFIX); - BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe, - w & 7, h); - } - } else { - ALIGN_STK_16(int16_t, tmp1, 64 * 384,); - ALIGN_STK_16(int16_t, tmp2, 64 * 384,); - dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride, - w, h, dav1d_sgr_params[sgr_idx][2], edges - HIGHBD_TAIL_SUFFIX); - dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride, - w, h, dav1d_sgr_params[sgr_idx][3], edges - HIGHBD_TAIL_SUFFIX); - const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; - if (w >= 8) - BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride, - tmp1, tmp2, w & ~7, h, wt - HIGHBD_TAIL_SUFFIX); - if (w & 7) { - // For uneven widths, do a full 8 pixel wide filtering into a temp - // buffer and copy out the narrow slice of pixels separately into - // dest. 
- ALIGN_STK_16(pixel, stripe, 64 * 8,); - BF(dav1d_sgr_weighted2, neon)(stripe, (w & 7) * sizeof(pixel), - dst + (w & ~7), dst_stride, - tmp1 + (w & ~7), tmp2 + (w & ~7), - w & 7, h, wt HIGHBD_TAIL_SUFFIX); - BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, stripe, - w & 7, h); - } - } + ALIGN_STK_16(int16_t, tmp, 64 * 384,); + dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, + w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); + BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride, + tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX); +} + +static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride, + const pixel (*const left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, tmp, 64 * 384,); + dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, + w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); + BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride, + tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX); +} + +static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride, + const pixel (*const left)[4], + const pixel *lpf, const ptrdiff_t lpf_stride, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int16_t, tmp1, 64 * 384,); + ALIGN_STK_16(int16_t, tmp2, 64 * 384,); + dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride, + w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); + dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride, + w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); + const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 }; + BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride, + tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); } -#endif // BITDEPTH == 8 COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 || ARCH_AARCH64 - c->wiener = wiener_filter_neon; - if (bpc <= 10) - c->selfguided = sgr_filter_neon; +#if ARCH_AARCH64 + c->wiener[0] = BF(dav1d_wiener_filter7, neon); + c->wiener[1] = BF(dav1d_wiener_filter5, neon); +#else + c->wiener[0] = c->wiener[1] = wiener_filter_neon; #endif + if (bpc <= 10) { + c->sgr[0] = sgr_filter_5x5_neon; + c->sgr[1] = sgr_filter_3x3_neon; + c->sgr[2] = sgr_filter_mix_neon; + } } diff -Nru dav1d-0.7.1/src/arm/mc_init_tmpl.c dav1d-0.9.1/src/arm/mc_init_tmpl.c --- dav1d-0.7.1/src/arm/mc_init_tmpl.c 2020-06-21 11:48:54.972126500 +0000 +++ dav1d-0.9.1/src/arm/mc_init_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -77,7 +77,6 @@ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 || ARCH_AARCH64 init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); @@ -112,5 +111,4 @@ c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); -#endif } diff -Nru dav1d-0.7.1/src/cdef_apply_tmpl.c dav1d-0.9.1/src/cdef_apply_tmpl.c --- dav1d-0.7.1/src/cdef_apply_tmpl.c 2020-06-21 11:48:54.976126400 +0000 +++ dav1d-0.9.1/src/cdef_apply_tmpl.c 2021-07-28 21:38:28.873851800 +0000 @@ -117,7 +117,7 @@ for (int bit = 0, by = 
by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) { const int tf = f->lf.top_pre_cdef_toggle; - const int by_idx = by & 30; + const int by_idx = (by & 30) >> 1; if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM; if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration @@ -140,6 +140,11 @@ goto next_sb; } + // Create a complete 32-bit mask for the sb row ahead of time. + const uint16_t (*noskip_row)[2] = &lflvl[sb128x].noskip_mask[by_idx]; + const unsigned noskip_mask = (unsigned) noskip_row[0][1] << 16 | + noskip_row[0][0]; + const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx]; const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx]; const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1); @@ -162,11 +167,8 @@ // check if this 8x8 block had any coded coefficients; if not, // go to the next block - const unsigned bx_mask = 3U << (bx & 14); - const int bx_idx = (bx & 16) >> 4; - if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] | - lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask)) - { + const uint32_t bx_mask = 3U << (bx & 30); + if (!(noskip_mask & bx_mask)) { last_skip = 1; goto next_b; } diff -Nru dav1d-0.7.1/src/cdf.c dav1d-0.9.1/src/cdf.c --- dav1d-0.7.1/src/cdf.c 2020-06-21 11:48:54.976126400 +0000 +++ dav1d-0.9.1/src/cdf.c 2021-07-28 21:38:28.877852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -29,10 +29,9 @@ #include -#include "src/thread.h" -#include "common/intops.h" +#include "common/frame.h" -#include "src/cdf.h" +#include "src/internal.h" #include "src/tables.h" #define CDF1(x) (32768-(x)) @@ -4015,7 +4014,7 @@ update_cdf_1d(11, m.txtp_inter2); update_bit_1d(4, m.txtp_inter3); - if (!(hdr->frame_type & 1)) { + if (IS_KEY_OR_INTRA(hdr)) { update_bit_0d(m.intrabc); update_cdf_1d(N_MV_JOINTS - 1, dmv.joint); @@ -4096,11 +4095,11 @@ } } -int dav1d_cdf_thread_alloc(CdfThreadContext *const cdf, +int dav1d_cdf_thread_alloc(Dav1dContext *const c, CdfThreadContext *const cdf, struct thread_data *const t) { - cdf->ref = dav1d_ref_create(sizeof(CdfContext) + - (t != NULL) * sizeof(atomic_uint)); + cdf->ref = dav1d_ref_create_using_pool(c->cdf_pool, + sizeof(CdfContext) + sizeof(atomic_uint)); if (!cdf->ref) return DAV1D_ERR(ENOMEM); cdf->data.cdf = cdf->ref->data; if (t) { diff -Nru dav1d-0.7.1/src/cdf.h dav1d-0.9.1/src/cdf.h --- dav1d-0.7.1/src/cdf.h 2020-06-21 11:48:54.976126400 +0000 +++ dav1d-0.9.1/src/cdf.h 2021-07-28 21:38:28.877852000 +0000 @@ -140,7 +140,8 @@ } CdfThreadContext; void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx); -int dav1d_cdf_thread_alloc(CdfThreadContext *cdf, struct thread_data *t); +int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf, + struct thread_data *t); void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src); void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src); void dav1d_cdf_thread_unref(CdfThreadContext *cdf); diff -Nru dav1d-0.7.1/src/data.c dav1d-0.9.1/src/data.c --- dav1d-0.7.1/src/data.c 2020-06-21 11:48:54.980126400 +0000 +++ dav1d-0.9.1/src/data.c 2021-07-28 21:38:28.877852000 +0000 @@ -43,6 +43,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) { validate_input_or_ret(buf != NULL, NULL); + if (sz > SIZE_MAX / 2) return NULL; buf->ref = dav1d_ref_create(sz); if (!buf->ref) return NULL; buf->data = buf->ref->const_data; @@ -101,18 +102,6 @@ *dst = *src; } 
-void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) { - validate_input(dst != NULL); - validate_input(dst->data == NULL); - validate_input(src != NULL); - - if (src->ref) - validate_input(src->data != NULL); - - *dst = *src; - memset(src, 0, sizeof(*src)); -} - void dav1d_data_props_copy(Dav1dDataProps *const dst, const Dav1dDataProps *const src) { diff -Nru dav1d-0.7.1/src/data.h dav1d-0.9.1/src/data.h --- dav1d-0.7.1/src/data.h 2020-06-21 11:48:54.980126400 +0000 +++ dav1d-0.9.1/src/data.h 2021-07-28 21:38:28.877852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -33,11 +33,6 @@ void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src); /** - * Move a data reference. - */ -void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src); - -/** * Copy the source properties to the destitionatin and increase the * user_data's reference count (if it's not NULL). */ diff -Nru dav1d-0.7.1/src/decode.c dav1d-0.9.1/src/decode.c --- dav1d-0.7.1/src/decode.c 2020-06-21 11:48:54.980126400 +0000 +++ dav1d-0.9.1/src/decode.c 2021-07-28 21:38:28.877852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -35,8 +35,8 @@ #include "dav1d/data.h" +#include "common/frame.h" #include "common/intops.h" -#include "common/mem.h" #include "src/ctx.h" #include "src/decode.h" @@ -728,7 +728,7 @@ case_set(bh4, l., 1, by4); case_set(bw4, a->, 0, bx4); #undef set_ctx - if (f->frame_hdr->frame_type & 1) { + if (IS_INTER_OR_SWITCH(f->frame_hdr)) { refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx]; for (int x = 0; x < bw4; x++) { r[x].ref.ref[0] = 0; @@ -749,7 +749,7 @@ #undef set_ctx } } else { - if (f->frame_hdr->frame_type & 1 /* not intrabc */ && + if (IS_INTER_OR_SWITCH(f->frame_hdr) /* not intrabc */ && b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP) { if (b->matrix[0] == SHRT_MIN) { @@ -773,10 +773,10 @@ signabs(t->warpmv.matrix[3]), signabs(t->warpmv.matrix[4]), signabs(t->warpmv.matrix[5]), - signabs(t->warpmv.alpha), - signabs(t->warpmv.beta), - signabs(t->warpmv.gamma), - signabs(t->warpmv.delta), + signabs(t->warpmv.u.p.alpha), + signabs(t->warpmv.u.p.beta), + signabs(t->warpmv.u.p.gamma), + signabs(t->warpmv.u.p.delta), b->mv2d.y, b->mv2d.x); #undef signabs } @@ -792,7 +792,7 @@ case_set(bw4, a->, 0, bx4); #undef set_ctx - if (f->frame_hdr->frame_type & 1) { + if (IS_INTER_OR_SWITCH(f->frame_hdr)) { refmvs_block *const r = &t->rt.r[(t->by & 31) + 5 + bh4 - 1][t->bx]; for (int x = 0; x < bw4; x++) { r[x].ref.ref[0] = b->ref[0] + 1; @@ -1044,7 +1044,7 @@ if (b->skip_mode) { b->intra = 0; - } else if (f->frame_hdr->frame_type & 1) { + } else if (IS_INTER_OR_SWITCH(f->frame_hdr)) { if (seg && (seg->ref >= 0 || seg->globalmv)) { b->intra = !seg->ref; } else { @@ -1065,7 +1065,7 @@ // intra/inter-specific stuff if (b->intra) { - uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ? + uint16_t *const ymode_cdf = IS_INTER_OR_SWITCH(f->frame_hdr) ? 
ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] : ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]] [dav1d_intra_mode_context[t->l.mode[by4]]]; @@ -1253,7 +1253,7 @@ rep_macro(type, t->dir skip, off, mul * b->skip); \ /* see aomedia bug 2183 for why we use luma coordinates here */ \ rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \ - if (f->frame_hdr->frame_type & 1) { \ + if (IS_INTER_OR_SWITCH(f->frame_hdr)) { \ rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \ rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \ rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \ @@ -1294,10 +1294,10 @@ } } } - if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { + if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { splat_intraref(&t->rt, t->by, t->bx, bs); } - } else if (!(f->frame_hdr->frame_type & 1)) { + } else if (IS_KEY_OR_INTRA(f->frame_hdr)) { // intra block copy refmvs_candidate mvstack[8]; int n_mvs, ctx; @@ -1843,10 +1843,10 @@ signabs(t->warpmv.matrix[3]), signabs(t->warpmv.matrix[4]), signabs(t->warpmv.matrix[5]), - signabs(t->warpmv.alpha), - signabs(t->warpmv.beta), - signabs(t->warpmv.gamma), - signabs(t->warpmv.delta), + signabs(t->warpmv.u.p.alpha), + signabs(t->warpmv.u.p.beta), + signabs(t->warpmv.u.p.gamma), + signabs(t->warpmv.u.p.delta), b->mv[0].y, b->mv[0].x); #undef signabs if (f->frame_thread.pass) { @@ -1985,10 +1985,10 @@ #undef set_ctx } if (!b->skip) { - uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4]; + uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4 >> 1]; const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15); const int bx_idx = (bx4 & 16) >> 4; - for (int y = 0; y < bh4; y++, noskip_mask++) { + for (int y = 0; y < bh4; y += 2, noskip_mask++) { (*noskip_mask)[bx_idx] |= mask; if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway (*noskip_mask)[1] |= mask; @@ -2485,15 +2485,12 @@ lr->filter_h[1], lr->filter_h[2], ts->msac.rng); } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) { const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4); + const uint16_t *const sgr_params = dav1d_sgr_params[idx]; lr->sgr_idx = idx; - lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ? - dav1d_msac_decode_subexp(&ts->msac, - ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : - 0; - lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ? - dav1d_msac_decode_subexp(&ts->msac, - ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : - 95; + lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0; + lr->sgr_weights[1] = sgr_params[1] ? 
dav1d_msac_decode_subexp(&ts->msac, + ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : 95; memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v)); memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h)); ts->lr_ref[p] = lr; @@ -2514,20 +2511,20 @@ const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col]; const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128; - if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { + if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start, ts->tiling.col_end, ts->tiling.row_start, ts->tiling.row_end, t->by >> f->sb_shift, ts->tiling.row); } - reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass); + reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass); if (f->frame_thread.pass == 2) { for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w; t->bx < ts->tiling.col_end; t->bx += sb_step) { - if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire)) + if (atomic_load_explicit(c->flush, memory_order_acquire)) return 1; if (decode_sb(t, root_bl, c->intra_edge.root[root_bl])) return 1; @@ -2558,7 +2555,7 @@ t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start; t->bx < ts->tiling.col_end; t->bx += sb_step) { - if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire)) + if (atomic_load_explicit(c->flush, memory_order_acquire)) return 1; if (root_bl == BL_128X128) { t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx; @@ -2632,7 +2629,7 @@ } } - if (f->n_tc > 1 && f->frame_hdr->frame_type & 1) { + if (f->seq_hdr->ref_frame_mvs && f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { dav1d_refmvs_save_tmvs(&t->rt, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); @@ -2678,17 +2675,17 @@ sizeof(*f->tile_thread.titsati_index_rows) * (f->frame_hdr->tiling.rows + 1))) { - for (int tile_row = 0, tile_idx = 0; + for (int tile_row = 0, task_idx = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) { for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; - tile_col++, tile_idx++) + tile_col++, task_idx++) { - f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][0] = sby; - f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][1] = + f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0] = sby; + f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1] = tile_row * f->frame_hdr->tiling.cols + tile_col; } } @@ -2860,7 +2857,9 @@ const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd; if (lr_line_sz != f->lf.lr_line_sz) { dav1d_freep_aligned(&f->lf.lr_lpf_line[0]); - uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32); + const int num_lines = c->n_pfc > 1 ? 
f->sbh * (4 << f->seq_hdr->sb128) : 12; + // lr simd may overread the input, so slightly over-allocate the lpf buffer + uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3 + 64, 32); if (!lr_ptr) { f->lf.lr_line_sz = 0; goto error; @@ -2868,7 +2867,7 @@ for (int pl = 0; pl <= 2; pl++) { f->lf.lr_lpf_line[pl] = lr_ptr; - lr_ptr += lr_line_sz * 12; + lr_ptr += lr_line_sz * num_lines; } f->lf.lr_line_sz = lr_line_sz; @@ -2950,26 +2949,30 @@ } // init ref mvs - if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { + if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { const int ret = dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr, f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc); if (ret < 0) goto error; } + + // create post-filtering tasks + if (c->n_pfc > 1) + if (dav1d_task_create_filter_sbrow(f)) + goto error; + retval = DAV1D_ERR(EINVAL); // setup dequant tables init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq); if (f->frame_hdr->quant.qm) - for (int j = 0; j < N_RECT_TX_SIZES; j++) { - f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j]; - f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j]; - f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j]; - } - for (int i = f->frame_hdr->quant.qm; i < 2; i++) - for (int tx = 0; tx < N_RECT_TX_SIZES; tx++) - for (int pl = 0; pl < 3; pl++) - f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx]; + for (int i = 0; i < N_RECT_TX_SIZES; i++) { + f->qm[i][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][i]; + f->qm[i][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][i]; + f->qm[i][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][i]; + } + else + memset(f->qm, 0, sizeof(f->qm)); // setup jnt_comp weights if (f->frame_hdr->switchable_comp_refs) { @@ -3080,9 +3083,9 @@ f->frame_thread.pass == 1 ? 
PLANE_TYPE_BLOCK : PLANE_TYPE_Y; for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++) - reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass); + reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass); - if (f->n_tc == 1) { + if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) { Dav1dTileContext *const t = f->tc; // no tile threading - we explicitly interleave tile/sbrow decoding @@ -3102,25 +3105,38 @@ 4 * (t->by + f->sb_step), PLANE_TYPE_BLOCK)) { - return 1; + goto error; } dav1d_refmvs_load_tmvs(&f->rf, tile_row, 0, f->bw >> 1, t->by >> 1, by_end); } for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; - if (dav1d_decode_tile_sbrow(t)) goto error; } - if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) { + if (f->seq_hdr->ref_frame_mvs && f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); } // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) - f->bd_fn.filter_sbrow(f, sby); - dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4, - progress_plane_type); + if (f->frame_thread.pass != 1) { + if (c->n_pfc == 1) + f->bd_fn.filter_sbrow(f, sby); + else { + pthread_mutex_lock(&f->lf.thread.pftd->lock); + if (f->lf.thread.npf != 0 && !f->lf.thread.done) { + Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; + t->start = 1; + if (t->status == DAV1D_TASK_READY) + dav1d_task_schedule(f->lf.thread.pftd, t); + } + pthread_mutex_unlock(&f->lf.thread.pftd->lock); + } + } + if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) + dav1d_thread_picture_signal(&f->sr_cur, + (sby + 1) * f->sb_step * 4, + progress_plane_type); } } } else { @@ -3143,7 +3159,6 @@ pthread_cond_broadcast(&f->tile_thread.cond); pthread_mutex_unlock(&f->tile_thread.lock); - // loopfilter + cdef + restoration for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) @@ -3175,10 +3190,24 @@ } // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) - f->bd_fn.filter_sbrow(f, sby); - dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4, - progress_plane_type); + if (f->frame_thread.pass != 1) { + if (c->n_pfc == 1) + f->bd_fn.filter_sbrow(f, sby); + else { + pthread_mutex_lock(&f->lf.thread.pftd->lock); + if (f->lf.thread.npf != 0 && !f->lf.thread.done) { + Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; + t->start = 1; + if (t->status == DAV1D_TASK_READY) + dav1d_task_schedule(f->lf.thread.pftd, t); + } + pthread_mutex_unlock(&f->lf.thread.pftd->lock); + } + } + if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) + dav1d_thread_picture_signal(&f->sr_cur, + (sby + 1) * f->sb_step * 4, + progress_plane_type); } } @@ -3223,6 +3252,17 @@ retval = 0; error: + if (c->n_pfc > 1) { + pthread_mutex_lock(&f->lf.thread.pftd->lock); + if (!f->lf.thread.done) { + if (retval != 0) { + f->lf.thread.done = -1; + pthread_cond_signal(&f->lf.thread.pftd->cond); + } + pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock); + } + pthread_mutex_unlock(&f->lf.thread.pftd->lock); + } dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? 
UINT_MAX : FRAME_ERROR, PLANE_TYPE_ALL); for (int i = 0; i < 7; i++) { @@ -3276,8 +3316,10 @@ if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); - if (out_delayed->visible && progress != FRAME_ERROR) + if (out_delayed->visible && progress != FRAME_ERROR) { dav1d_picture_ref(&c->out, &out_delayed->p); + c->event_flags |= dav1d_picture_get_event_flags(out_delayed); + } dav1d_thread_picture_unref(out_delayed); } } else { @@ -3330,6 +3372,10 @@ f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \ f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \ f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \ + f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \ + f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \ + f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \ + f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \ f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \ f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc if (!f->seq_hdr->hbd) { @@ -3344,7 +3390,7 @@ #undef assign_bitdepth_case int ref_coded_width[7]; - if (f->frame_hdr->frame_type & 1) { + if (IS_INTER_OR_SWITCH(f->frame_hdr)) { if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) { const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame]; if (!c->refs[pri_ref].p.p.data[0]) { @@ -3398,7 +3444,7 @@ dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]); } if (f->frame_hdr->refresh_context) { - res = dav1d_cdf_thread_alloc(&f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL); + res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL); if (res < 0) goto error; } @@ -3443,8 +3489,10 @@ // move f->cur into output queue if (c->n_fc == 1) { - if (f->frame_hdr->show_frame) + if (f->frame_hdr->show_frame) { dav1d_picture_ref(&c->out, &f->sr_cur.p); + c->event_flags |= dav1d_picture_get_event_flags(&f->sr_cur); + } } else { dav1d_thread_picture_ref(out_delayed, &f->sr_cur); } @@ -3462,9 +3510,9 @@ f->bitdepth_max = (1 << f->cur.p.bpc) - 1; // ref_mvs - if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) { - f->mvs_ref = dav1d_ref_create(f->sb128h * 16 * (f->b4_stride >> 1) * - sizeof(*f->mvs)); + if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { + f->mvs_ref = dav1d_ref_create_using_pool(c->refmvs_pool, + sizeof(*f->mvs) * f->sb128h * 16 * (f->b4_stride >> 1)); if (!f->mvs_ref) { res = DAV1D_ERR(ENOMEM); goto error; @@ -3527,7 +3575,8 @@ // We're updating an existing map, but need somewhere to // put the new values. Allocate them here (the data // actually gets set elsewhere) - f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h); + f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, + sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h); if (!f->cur_segmap_ref) { dav1d_ref_dec(&f->prev_segmap_ref); res = DAV1D_ERR(ENOMEM); @@ -3542,13 +3591,14 @@ f->cur_segmap = f->prev_segmap_ref->data; } else { // We need to make a new map. Allocate one here and zero it out. 
- f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h); + const size_t segmap_size = sizeof(*f->cur_segmap) * f->b4_stride * 32 * f->sb128h; + f->cur_segmap_ref = dav1d_ref_create_using_pool(c->segmap_pool, segmap_size); if (!f->cur_segmap_ref) { res = DAV1D_ERR(ENOMEM); goto error; } f->cur_segmap = f->cur_segmap_ref->data; - memset(f->cur_segmap_ref->data, 0, f->b4_stride * 32 * f->sb128h); + memset(f->cur_segmap, 0, segmap_size); } } else { f->cur_segmap = NULL; diff -Nru dav1d-0.7.1/src/dequant_tables.c dav1d-0.9.1/src/dequant_tables.c --- dav1d-0.7.1/src/dequant_tables.c 2020-06-21 11:48:54.980126400 +0000 +++ dav1d-0.9.1/src/dequant_tables.c 2021-07-28 21:38:28.877852000 +0000 @@ -29,7 +29,7 @@ #include "src/dequant_tables.h" -const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2] = { +const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2] = { { { 4, 4, }, { 8, 8, }, { 8, 9, }, { 9, 10, }, { 10, 11, }, { 11, 12, }, { 12, 13, }, { 12, 14, }, diff -Nru dav1d-0.7.1/src/dequant_tables.h dav1d-0.9.1/src/dequant_tables.h --- dav1d-0.7.1/src/dequant_tables.h 2020-06-21 11:48:54.984126300 +0000 +++ dav1d-0.9.1/src/dequant_tables.h 2021-07-28 21:38:28.877852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -32,6 +32,6 @@ #include "src/levels.h" -extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2]; +extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2]; #endif /* DAV1D_SRC_DEQUANT_TABLES_H */ diff -Nru dav1d-0.7.1/src/ext/x86/x86inc.asm dav1d-0.9.1/src/ext/x86/x86inc.asm --- dav1d-0.7.1/src/ext/x86/x86inc.asm 2020-06-21 11:48:54.984126300 +0000 +++ dav1d-0.9.1/src/ext/x86/x86inc.asm 2021-07-28 21:38:28.877852000 +0000 @@ -1,7 +1,7 @@ ;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer +;* x86inc.asm: x86 abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2019 x264 project +;* Copyright (C) 2005-2021 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner @@ -21,23 +21,14 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** -; This is a header file for the x264ASM assembly language, which uses +; This is a header file for the x86inc.asm assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. Send patches or ideas -; to x264-devel@videolan.org . - -%include "config.asm" +; DSP functions that are most often used. 
%ifndef private_prefix - %define private_prefix dav1d + %error private_prefix not defined %endif %ifndef public_prefix @@ -118,7 +109,7 @@ ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. +; covers most use cases. ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. @@ -358,6 +349,28 @@ %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) %define high_mm_regs (16*cpuflag(avx512)) +; Large stack allocations on Windows need to use stack probing in order +; to guarantee that all stack memory is committed before accessing it. +; This is done by ensuring that the guard page(s) at the end of the +; currently committed pages are touched prior to any pages beyond that. +%if WIN64 + %assign STACK_PROBE_SIZE 8192 +%elifidn __OUTPUT_FORMAT__, win32 + %assign STACK_PROBE_SIZE 4096 +%else + %assign STACK_PROBE_SIZE 0 +%endif + +%macro PROBE_STACK 1 ; stack_size + %if STACK_PROBE_SIZE + %assign %%i STACK_PROBE_SIZE + %rep %1 / STACK_PROBE_SIZE + mov eax, [rsp-%%i] + %assign %%i %%i+STACK_PROBE_SIZE + %endrep + %endif +%endmacro + %macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 @@ -378,6 +391,7 @@ %if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + PROBE_STACK stack_size_padded SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) @@ -393,6 +407,7 @@ %xdefine rstkm rstk %endif %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + PROBE_STACK stack_size_padded mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded @@ -425,16 +440,6 @@ %endif %endmacro -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx @@ -453,7 +458,7 @@ DECLARE_REG 13, R12, 112 DECLARE_REG 14, R13, 120 -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args @@ -465,7 +470,15 @@ WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif %endmacro %macro WIN64_PUSH_XMM 0 @@ -561,7 +574,7 @@ DECLARE_REG 13, R12, 64 DECLARE_REG 14, R13, 72 -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 %assign xmm_regs_used %3 @@ -571,7 +584,15 @@ PUSH_IF_USED 9, 10, 11, 12, 13, 14 ALLOC_STACK %4 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif %endmacro %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required @@ -612,7 +633,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... +%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args @@ -627,7 +648,15 @@ PUSH_IF_USED 3, 4, 5, 6 ALLOC_STACK %4 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 + %if %0 > 4 + %ifnum %4 + DEFINE_ARGS %5 + %else + DEFINE_ARGS %4, %5 + %endif + %elifnnum %4 + DEFINE_ARGS %4 + %endif %endmacro %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required @@ -991,6 +1020,8 @@ %if WIN64 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers %endif + %xdefine bcstd 1to4 + %xdefine bcstq 1to2 %endmacro %macro INIT_YMM 0-1+ @@ -1004,6 +1035,8 @@ INIT_CPUFLAGS %1 DEFINE_MMREGS ymm AVX512_MM_PERMUTATION + %xdefine bcstd 1to8 + %xdefine bcstq 1to4 %endmacro %macro INIT_ZMM 0-1+ @@ -1017,6 +1050,8 @@ INIT_CPUFLAGS %1 DEFINE_MMREGS zmm AVX512_MM_PERMUTATION + %xdefine bcstd 1to16 + %xdefine bcstq 1to8 %endmacro INIT_XMM @@ -1128,8 +1163,7 @@ %endif %xdefine %%tmp %%f %+ 0 %ifnum %%tmp - RESET_MM_PERMUTATION - AVX512_MM_PERMUTATION + DEFINE_MMREGS mmtype %assign %%i 0 %rep num_mmregs %xdefine %%tmp %%f %+ %%i @@ -1250,6 +1284,12 @@ %error use of ``%1'' sse2 instruction in cpuname function: current_function %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) %error use of ``%1'' avx2 instruction in cpuname function: current_function + %elif __sizeofreg == 16 && notcpuflag(sse) + %error use of ``%1'' sse instruction in cpuname function: current_function + %elif __sizeofreg == 32 && notcpuflag(avx) + %error use of ``%1'' avx instruction in cpuname function: current_function + %elif __sizeofreg == 64 && notcpuflag(avx512) + %error use of ``%1'' avx512 instruction in cpuname function: current_function %elifidn %1, pextrw ; special case because the base instruction is mmx2, %ifnid %6 ; but sse4 is required for memory operands %if notcpuflag(sse4) @@ -1299,26 +1339,50 @@ %elif %0 >= 9 __instr %6, %7, %8, %9 %elif %0 == 8 - %if avx_enabled && %5 + %if avx_enabled && __sizeofreg >= 16 && %4 == 0 %xdefine __src1 %7 %xdefine __src2 %8 - %ifnum regnumof%7 - %ifnum regnumof%8 - %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 - ; Most VEX-encoded instructions require an additional byte to encode when - ; src2 is a high register (e.g. m8..15). If the instruction is commutative - ; we can swap src1 and src2 when doing so reduces the instruction length. - %xdefine __src1 %8 - %xdefine __src2 %7 + %if %5 + %ifnum regnumof%7 + %ifnum regnumof%8 + %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 + ; Most VEX-encoded instructions require an additional byte to encode when + ; src2 is a high register (e.g. m8..15). If the instruction is commutative + ; we can swap src1 and src2 when doing so reduces the instruction length. 
+ %xdefine __src1 %8 + %xdefine __src2 %7 + %endif %endif + %elifnum regnumof%8 ; put memory operands in src2 when possible + %xdefine __src1 %8 + %xdefine __src2 %7 + %else + %assign __emulate_avx 1 + %endif + %elifnnum regnumof%7 + ; EVEX allows imm8 shift instructions to be used with memory operands, + ; but VEX does not. This handles those special cases. + %ifnnum %8 + %assign __emulate_avx 1 + %elif notcpuflag(avx512) + %assign __emulate_avx 1 %endif %endif - __instr %6, __src1, __src2 + %if __emulate_avx ; a separate load is required + %if %3 + vmovaps %6, %7 + %else + vmovdqa %6, %7 + %endif + __instr %6, %8 + %else + __instr %6, __src1, __src2 + %endif %else __instr %6, %7, %8 %endif %elif %0 == 7 - %if avx_enabled && %5 + %if avx_enabled && __sizeofreg >= 16 && %5 %xdefine __src1 %6 %xdefine __src2 %7 %ifnum regnumof%6 @@ -1496,18 +1560,18 @@ AVX_INSTR pabsb, ssse3 AVX_INSTR pabsd, ssse3 AVX_INSTR pabsw, ssse3 -AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packssdw, mmx, 0, 0, 0 -AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 AVX_INSTR paddb, mmx, 0, 0, 1 -AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR paddd, mmx, 0, 0, 1 AVX_INSTR paddq, sse2, 0, 0, 1 AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR palignr, ssse3, 0, 1, 0 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 @@ -1515,71 +1579,71 @@ AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4 ; can't be emulated AVX_INSTR pblendw, sse4, 0, 1, 0 -AVX_INSTR pclmulqdq, fnord, 0, 1, 0 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 -AVX_INSTR pcmpestri, sse42 -AVX_INSTR pcmpestrm, sse42 -AVX_INSTR pcmpistri, sse42 -AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 -AVX_INSTR pcmpeqw, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 -AVX_INSTR pcmpgtw, mmx, 0, 0, 0 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 AVX_INSTR pextrb, sse4 AVX_INSTR pextrd, sse4 AVX_INSTR pextrq, sse4 AVX_INSTR pextrw, mmx2 -AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phaddd, ssse3, 0, 0, 0 AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phminposuw, sse4 -AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR pinsrb, sse4, 0, 1, 0 AVX_INSTR pinsrd, sse4, 0, 1, 0 AVX_INSTR pinsrq, sse4, 0, 1, 0 AVX_INSTR pinsrw, mmx2, 0, 1, 0 -AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaxsb, sse4, 0, 0, 1 -AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxub, mmx2, 0, 0, 1 -AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pminsb, sse4, 0, 0, 1 -AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminub, mmx2, 0, 0, 1 -AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pminuw, sse4, 
0, 0, 1 AVX_INSTR pmovmskb, mmx2 -AVX_INSTR pmovsxbw, sse4 AVX_INSTR pmovsxbd, sse4 AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxdq, sse4 AVX_INSTR pmovsxwd, sse4 AVX_INSTR pmovsxwq, sse4 -AVX_INSTR pmovsxdq, sse4 -AVX_INSTR pmovzxbw, sse4 AVX_INSTR pmovzxbd, sse4 AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmovzxwd, sse4 AVX_INSTR pmovzxwq, sse4 -AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmuldq, sse4, 0, 0, 1 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 AVX_INSTR pmulhw, mmx, 0, 0, 1 -AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmuludq, sse2, 0, 0, 1 AVX_INSTR por, mmx, 0, 0, 1 AVX_INSTR psadbw, mmx2, 0, 0, 1 @@ -1588,35 +1652,35 @@ AVX_INSTR pshufhw, sse2 AVX_INSTR pshuflw, sse2 AVX_INSTR psignb, ssse3, 0, 0, 0 -AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR psignd, ssse3, 0, 0, 0 -AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR pslld, mmx, 0, 0, 0 -AVX_INSTR psllq, mmx, 0, 0, 0 AVX_INSTR pslldq, sse2, 0, 0, 0 -AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 AVX_INSTR psrad, mmx, 0, 0, 0 -AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 AVX_INSTR psrld, mmx, 0, 0, 0 -AVX_INSTR psrlq, mmx, 0, 0, 0 AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 AVX_INSTR psubb, mmx, 0, 0, 0 -AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR psubd, mmx, 0, 0, 0 AVX_INSTR psubq, sse2, 0, 0, 0 AVX_INSTR psubsb, mmx, 0, 0, 0 AVX_INSTR psubsw, mmx, 0, 0, 0 AVX_INSTR psubusb, mmx, 0, 0, 0 AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR ptest, sse4 AVX_INSTR punpckhbw, mmx, 0, 0, 0 -AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpckhdq, mmx, 0, 0, 0 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpcklbw, mmx, 0, 0, 0 -AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 AVX_INSTR rcpps, sse, 1 AVX_INSTR rcpss, sse, 1, 0, 0 @@ -1648,8 +1712,8 @@ ; 3DNow instructions, for sharing code between AVX, SSE and 3DN AVX_INSTR pfadd, 3dnow, 1, 0, 1 -AVX_INSTR pfsub, 3dnow, 1, 0, 0 AVX_INSTR pfmul, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 ;%1 == instruction ;%2 == minimal instruction set @@ -1672,6 +1736,7 @@ GPR_INSTR bextr, bmi1 GPR_INSTR blsi, bmi1 GPR_INSTR blsmsk, bmi1 +GPR_INSTR blsr, bmi1 GPR_INSTR bzhi, bmi2 GPR_INSTR mulx, bmi2 GPR_INSTR pdep, bmi2 @@ -1713,9 +1778,9 @@ %endmacro %endmacro -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation -FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmadcswd, pmaddwd, paddd ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. 
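[Editor's note; not part of the diff itself] The x86inc.asm changes above add a PROBE_STACK macro: as its own comment explains, large stack allocations on Windows must touch the guard page(s) at the end of the currently committed stack before accessing memory beyond them, one page at a time. A minimal C illustration of that page-touching idea follows; the real macro does this in assembly relative to rsp with an 8192/4096-byte probe size, and probe_pages() below is only a hypothetical stand-in operating on an ordinary buffer.

#include <stddef.h>

/* Read one byte in every 4 KiB page of buf, in order, so the OS commits
 * each page before the buffer is used in bulk. */
static void probe_pages(const volatile unsigned char *const buf, const size_t size) {
    for (size_t off = 0; off < size; off += 4096)
        (void) buf[off];
}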
diff -Nru dav1d-0.7.1/src/fg_apply_tmpl.c dav1d-0.9.1/src/fg_apply_tmpl.c --- dav1d-0.7.1/src/fg_apply_tmpl.c 2020-06-21 11:48:54.984126300 +0000 +++ dav1d-0.9.1/src/fg_apply_tmpl.c 2021-07-28 21:38:28.877852000 +0000 @@ -135,7 +135,7 @@ assert(out->stride[1] == in->stride[1]); const int ss_ver = in->p.layout == DAV1D_PIXEL_LAYOUT_I420; const ptrdiff_t stride = out->stride[1]; - const ptrdiff_t sz = (out->p.h * stride) >> ss_ver; + const ptrdiff_t sz = ((out->p.h + ss_ver) >> ss_ver) * stride; if (sz < 0) { if (!data->num_uv_points[0]) memcpy((uint8_t*) out->data[1] + sz - stride, diff -Nru dav1d-0.7.1/src/film_grain.h dav1d-0.9.1/src/film_grain.h --- dav1d-0.7.1/src/film_grain.h 2020-06-21 11:48:54.984126300 +0000 +++ dav1d-0.9.1/src/film_grain.h 2021-07-28 21:38:28.881852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -80,6 +80,7 @@ } Dav1dFilmGrainDSPContext; bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); +bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c); bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c); #endif /* DAV1D_SRC_FILM_GRAIN_H */ diff -Nru dav1d-0.7.1/src/film_grain_tmpl.c dav1d-0.9.1/src/film_grain_tmpl.c --- dav1d-0.7.1/src/film_grain_tmpl.c 2020-06-21 11:48:54.984126300 +0000 +++ dav1d-0.9.1/src/film_grain_tmpl.c 2021-07-28 21:38:28.881852000 +0000 @@ -184,11 +184,7 @@ max_value = 235 << bitdepth_min_8; } else { min_value = 0; -#if BITDEPTH == 8 - max_value = 0xff; -#else - max_value = bitdepth_max; -#endif + max_value = BITDEPTH_MAX; } // seed[0] contains the current row, seed[1] contains the previous @@ -299,11 +295,7 @@ max_value = (is_id ? 
235 : 240) << bitdepth_min_8; } else { min_value = 0; -#if BITDEPTH == 8 - max_value = 0xff; -#else - max_value = bitdepth_max; -#endif + max_value = BITDEPTH_MAX; } // seed[0] contains the current row, seed[1] contains the previous @@ -370,7 +362,7 @@ for (int x = 0; x < xstart; x++) { int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); - grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5; + grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); grain = iclip(grain, grain_min, grain_max); add_noise_uv(x, y, grain); } @@ -381,7 +373,7 @@ for (int x = xstart; x < bw; x++) { int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); - grain = (old * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5; + grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5); grain = iclip(grain, grain_min, grain_max); add_noise_uv(x, y, grain); } @@ -391,17 +383,17 @@ // Blend the top pixel with the top left block int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y); - top = (old * w[sx][x][0] + top * w[sx][x][1] + 16) >> 5; + top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5); top = iclip(top, grain_min, grain_max); // Blend the current pixel with the left block int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); - grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5; + grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); grain = iclip(grain, grain_min, grain_max); // Mix the row rows together and apply to image - grain = (top * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5; + grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5); grain = iclip(grain, grain_min, grain_max); add_noise_uv(x, y, grain); } @@ -431,7 +423,11 @@ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c; c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c; -#if HAVE_ASM && ARCH_X86 +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM + bitfn(dav1d_film_grain_dsp_init_arm)(c); +#elif ARCH_X86 bitfn(dav1d_film_grain_dsp_init_x86)(c); #endif +#endif } diff -Nru dav1d-0.7.1/src/getbits.c dav1d-0.9.1/src/getbits.c --- dav1d-0.7.1/src/getbits.c 2020-06-21 11:48:54.984126300 +0000 +++ dav1d-0.9.1/src/getbits.c 2021-07-28 21:38:28.881852000 +0000 @@ -27,6 +27,8 @@ #include "config.h" +#include + #include "common/intops.h" #include "src/getbits.h" @@ -34,6 +36,8 @@ void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data, const size_t sz) { + // If sz were 0, c->eof would need to be initialized to 1. 
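/* [Editor's illustrative note; not part of the upstream diff] The
 * film_grain_tmpl.c hunk above replaces the open-coded "(v + 16) >> 5"
 * blending with round2(v, 5); the two are the same round-to-nearest shift.
 * A standalone sketch of that identity (round2_example() mirrors what the
 * round2() helper is expected to compute): */
static int round2_example(const int v, const unsigned shift) {
    return (v + (1 << (shift - 1))) >> shift;
}
/* e.g. round2_example(47, 5) == (47 + 16) >> 5 == 1 */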
+ assert(sz); c->ptr = c->ptr_start = data; c->ptr_end = &c->ptr_start[sz]; c->bits_left = 0; @@ -77,25 +81,23 @@ return res >> shift; } -unsigned dav1d_get_uleb128(GetBits *c) { - unsigned val = 0, more, i = 0; +unsigned dav1d_get_uleb128(GetBits *const c) { + uint64_t val = 0; + unsigned i = 0, more; do { - more = dav1d_get_bits(c, 1); - unsigned bits = dav1d_get_bits(c, 7); - if (i <= 3 || (i == 4 && bits < (1 << 4))) - val |= bits << (i * 7); - else if (bits) { - c->error = 1; - return 0; - } - if (more && ++i == 8) { - c->error = 1; - return 0; - } - } while (more); + const int v = dav1d_get_bits(c, 8); + more = v & 0x80; + val |= ((uint64_t) (v & 0x7F)) << i; + i += 7; + } while (more && i < 56); + + if (val > UINT_MAX || more) { + c->error = 1; + return 0; + } - return val; + return (unsigned) val; } unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) { diff -Nru dav1d-0.7.1/src/internal.h dav1d-0.9.1/src/internal.h --- dav1d-0.7.1/src/internal.h 2020-06-21 11:48:54.984126300 +0000 +++ dav1d-0.9.1/src/internal.h 2021-07-28 21:38:28.881852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -35,6 +35,8 @@ typedef struct Dav1dFrameContext Dav1dFrameContext; typedef struct Dav1dTileState Dav1dTileState; typedef struct Dav1dTileContext Dav1dTileContext; +typedef struct Dav1dPostFilterContext Dav1dPostFilterContext; +typedef struct Dav1dTask Dav1dTask; #include "common/attributes.h" @@ -76,14 +78,19 @@ Dav1dFrameContext *fc; unsigned n_fc; + Dav1dPostFilterContext *pfc; + unsigned n_pfc; + // cache of OBUs that make up a single frame before we submit them // to a frame worker to be decoded struct Dav1dTileGroup *tile; int n_tile_data_alloc; int n_tile_data; int n_tiles; + Dav1dMemPool *seq_hdr_pool; Dav1dRef *seq_hdr_ref; Dav1dSequenceHeader *seq_hdr; + Dav1dMemPool *frame_hdr_pool; Dav1dRef *frame_hdr_ref; Dav1dFrameHeader *frame_hdr; @@ -97,22 +104,33 @@ // decoded output picture queue Dav1dData in; Dav1dPicture out; + // dummy is a pointer to prevent compiler errors about atomic_load() + // not taking const arguments + atomic_int flush_mem, *flush; struct { Dav1dThreadPicture *out_delayed; unsigned next; - // dummy is a pointer to prevent compiler errors about atomic_load() - // not taking const arguments; the const attribute is not taken - // from pointers - atomic_int flush_mem, *flush; } frame_thread; + // postfilter threading (refer to pfc[] for per_thread thingies) + struct PostFilterThreadData { + pthread_mutex_t lock; + pthread_cond_t cond; + struct Dav1dTask *tasks; + int frame_cnt; + int inited; + } postfilter_thread; + // reference/entropy state + Dav1dMemPool *segmap_pool; + Dav1dMemPool *refmvs_pool; struct { Dav1dThreadPicture p; Dav1dRef *segmap; Dav1dRef *refmvs; unsigned refpoc[7]; } refs[8]; + Dav1dMemPool *cdf_pool; CdfThreadContext cdf[8]; Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */]; @@ -133,8 +151,12 @@ int all_layers; unsigned frame_size_limit; int drain; + enum PictureFlags frame_flags; + enum Dav1dEventFlags event_flags; Dav1dLogger logger; + + Dav1dMemPool *picture_pool; }; struct Dav1dFrameContext { @@ -175,6 +197,10 @@ recon_b_intra_fn recon_b_intra; recon_b_inter_fn recon_b_inter; filter_sbrow_fn filter_sbrow; + filter_sbrow_fn filter_sbrow_deblock; + filter_sbrow_fn filter_sbrow_cdef; + filter_sbrow_fn filter_sbrow_resize; + filter_sbrow_fn filter_sbrow_lr; backup_ipred_edge_fn 
backup_ipred_edge; read_coef_blocks_fn read_coef_blocks; } bd_fn; @@ -184,7 +210,7 @@ ptrdiff_t b4_stride; int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w; uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */]; - const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */]; + const uint8_t *qm[N_RECT_TX_SIZES][3 /* plane */]; BlockContext *a; int a_sz /* w*tile_rows */; refmvs_frame rf; @@ -231,6 +257,16 @@ pixel *p[3], *sr_p[3]; Av1Filter *mask_ptr, *prev_mask_ptr; int restore_planes; // enum LrRestorePlanes + + struct { + pthread_cond_t cond; + struct PostFilterThreadData *pftd; + struct Dav1dTask *tasks; + int num_tasks; + int npf; + int done; + int inited; + } thread; } lf; // threading (refer to tc[] for per-thread things) @@ -346,4 +382,11 @@ } tile_thread; }; +struct Dav1dPostFilterContext { + Dav1dContext *c; + struct thread_data td; + int flushed; + int die; +}; + #endif /* DAV1D_SRC_INTERNAL_H */ diff -Nru dav1d-0.7.1/src/ipred.h dav1d-0.9.1/src/ipred.h --- dav1d-0.7.1/src/ipred.h 2020-06-21 11:48:54.988126300 +0000 +++ dav1d-0.9.1/src/ipred.h 2021-07-28 21:38:28.881852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -71,6 +71,7 @@ /* * dst[x,y] = pal[idx[x,y]] * - palette indices are [0-7] + * - only 16-byte alignment is guaranteed for idx. */ #define decl_pal_pred_fn(name) \ void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \ diff -Nru dav1d-0.7.1/src/itx.h dav1d-0.9.1/src/itx.h --- dav1d-0.7.1/src/itx.h 2020-06-21 11:48:54.988126300 +0000 +++ dav1d-0.9.1/src/itx.h 2021-07-28 21:38:28.881852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -45,6 +45,6 @@ bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc); -bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c); +bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c, int bpc); #endif /* DAV1D_SRC_ITX_H */ diff -Nru dav1d-0.7.1/src/itx_tmpl.c dav1d-0.9.1/src/itx_tmpl.c --- dav1d-0.7.1/src/itx_tmpl.c 2020-06-21 11:48:54.988126300 +0000 +++ dav1d-0.9.1/src/itx_tmpl.c 2021-07-28 21:38:28.881852000 +0000 @@ -250,7 +250,7 @@ bitfn(dav1d_itx_dsp_init_arm)(c, bpc); #endif #if ARCH_X86 - bitfn(dav1d_itx_dsp_init_x86)(c); + bitfn(dav1d_itx_dsp_init_x86)(c, bpc); #endif #endif } diff -Nru dav1d-0.7.1/src/lf_mask.c dav1d-0.9.1/src/lf_mask.c --- dav1d-0.7.1/src/lf_mask.c 2020-06-21 11:48:54.992126500 +0000 +++ dav1d-0.9.1/src/lf_mask.c 2021-07-28 21:38:28.881852000 +0000 @@ -89,7 +89,7 @@ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[max_tx]; int y, x; - uint8_t txa[2 /* edge */][2 /* txsz, step */][32 /* y */][32 /* x */]; + ALIGN_STK_16(uint8_t, txa, 2 /* edge */, [2 /* txsz, step */][32 /* y */][32 /* x */]); for (int y_off = 0, y = 0; y < h4; y += t_dim->h, y_off++) for (int x_off = 0, x = 0; x < w4; x += t_dim->w, x_off++) decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][y][x], diff -Nru dav1d-0.7.1/src/lf_mask.h dav1d-0.9.1/src/lf_mask.h --- dav1d-0.7.1/src/lf_mask.h 2020-06-21 11:48:54.992126500 +0000 +++ dav1d-0.9.1/src/lf_mask.h 2021-07-28 21:38:28.881852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -40,11 +40,11 @@ } Av1FilterLUT; typedef struct Av1RestorationUnit { - enum Dav1dRestorationType type; - int16_t filter_h[3]; - int16_t filter_v[3]; + uint8_t /* enum Dav1dRestorationType */ type; + int8_t filter_h[3]; + int8_t filter_v[3]; uint8_t sgr_idx; - int16_t sgr_weights[2]; + int8_t sgr_weights[2]; } Av1RestorationUnit; // each struct describes one 128x128 area (1 or 4 SBs), pre-superres-scaling @@ -53,7 +53,7 @@ uint16_t filter_y[2 /* 0=col, 1=row */][32][3][2]; uint16_t filter_uv[2 /* 0=col, 1=row */][32][2][2]; int8_t cdef_idx[4]; // -1 means "unset" - uint16_t noskip_mask[32][2]; + uint16_t noskip_mask[16][2]; // for 8x8 blocks, but stored on a 4x8 basis } Av1Filter; // each struct describes one 128x128 area (1 or 4 SBs), post-superres-scaling diff -Nru dav1d-0.7.1/src/lib.c dav1d-0.9.1/src/lib.c --- dav1d-0.7.1/src/lib.c 2020-06-21 11:48:54.992126500 +0000 +++ dav1d-0.9.1/src/lib.c 2021-07-28 21:38:28.881852000 +0000 @@ -38,7 +38,6 @@ #include "dav1d/dav1d.h" #include "dav1d/data.h" -#include "common/mem.h" #include "common/validate.h" #include "src/cpu.h" @@ -66,6 +65,7 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) { s->n_frame_threads = 1; s->n_tile_threads = 1; + s->n_postfilter_threads = 1; s->apply_grain = 1; s->allocator.cookie = NULL; s->allocator.alloc_picture_callback = dav1d_default_picture_alloc; @@ -101,6 +101,8 @@ validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->n_postfilter_threads >= 1 && + s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL)); validate_input_or_ret(s->n_tile_threads >= 1 && s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL)); validate_input_or_ret(s->n_frame_threads >= 1 && @@ -129,6 +131,27 @@ c->all_layers = s->all_layers; 
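/* [Editor's illustrative sketch; not part of the upstream diff] The lines
 * added just below initialize several memory pools (seq_hdr_pool,
 * frame_hdr_pool, segmap_pool, refmvs_pool, cdf_pool) that back the new
 * dav1d_ref_create_using_pool() calls seen earlier in this diff. The real
 * Dav1dMemPool is not shown here; the following is only a generic sketch of
 * the recycling idea (a mutex-protected free list of reusable buffers), with
 * all names hypothetical: */
#include <stdlib.h>
#include <pthread.h>

typedef struct PoolEntry { struct PoolEntry *next; size_t size; } PoolEntry;
typedef struct { pthread_mutex_t lock; PoolEntry *free_list; } Pool;

static Pool example_pool = { PTHREAD_MUTEX_INITIALIZER, NULL };

static void *pool_get(Pool *const p, const size_t size) {
    pthread_mutex_lock(&p->lock);
    PoolEntry *e = p->free_list;
    if (e && e->size >= size) p->free_list = e->next; /* reuse a cached buffer */
    else e = NULL;
    pthread_mutex_unlock(&p->lock);
    if (!e) {
        e = malloc(sizeof(*e) + size); /* header followed by payload */
        if (e) e->size = size;
    }
    return e ? e + 1 : NULL; /* hand out the payload, keep the header hidden */
}

static void pool_put(Pool *const p, void *const data) {
    PoolEntry *const e = (PoolEntry *) data - 1; /* recover the header */
    pthread_mutex_lock(&p->lock);
    e->next = p->free_list;
    p->free_list = e;
    pthread_mutex_unlock(&p->lock);
}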
c->frame_size_limit = s->frame_size_limit; + if (dav1d_mem_pool_init(&c->seq_hdr_pool) || + dav1d_mem_pool_init(&c->frame_hdr_pool) || + dav1d_mem_pool_init(&c->segmap_pool) || + dav1d_mem_pool_init(&c->refmvs_pool) || + dav1d_mem_pool_init(&c->cdf_pool)) + { + goto error; + } + + if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc && + c->allocator.release_picture_callback == dav1d_default_picture_release) + { + if (c->allocator.cookie) goto error; + if (dav1d_mem_pool_init(&c->picture_pool)) goto error; + c->allocator.cookie = c->picture_pool; + } else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc || + c->allocator.release_picture_callback == dav1d_default_picture_release) + { + goto error; + } + /* On 32-bit systems extremely large frame sizes can cause overflows in * dav1d_decode_frame() malloc size calculations. Prevent that from occuring * by enforcing a maximum frame size limit, chosen to roughly correspond to @@ -140,12 +163,49 @@ s->frame_size_limit, c->frame_size_limit); } - c->frame_thread.flush = &c->frame_thread.flush_mem; - atomic_init(c->frame_thread.flush, 0); + c->flush = &c->flush_mem; + atomic_init(c->flush, 0); + + c->n_pfc = s->n_postfilter_threads; c->n_fc = s->n_frame_threads; c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32); if (!c->fc) goto error; memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads); + + if (c->n_pfc > 1) { + c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32); + if (!c->pfc) goto error; + memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads); + if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error; + if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) { + pthread_mutex_destroy(&c->postfilter_thread.lock); + goto error; + } + c->postfilter_thread.inited = 1; + for (int n = 0; n < s->n_frame_threads; n++) { + Dav1dFrameContext *const f = &c->fc[n]; + if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error; + f->lf.thread.pftd = &c->postfilter_thread; + f->lf.thread.done = 1; + f->lf.thread.inited = 1; + } + for (int n = 0; n < s->n_postfilter_threads; ++n) { + Dav1dPostFilterContext *const pf = &c->pfc[n]; + pf->c = c; + if (pthread_mutex_init(&pf->td.lock, NULL)) goto error; + if (pthread_cond_init(&pf->td.cond, NULL)) { + pthread_mutex_destroy(&pf->td.lock); + goto error; + } + if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) { + pthread_cond_destroy(&c->postfilter_thread.cond); + pthread_mutex_destroy(&c->postfilter_thread.lock); + goto error; + } + pf->td.inited = 1; + } + } + if (c->n_fc > 1) { c->frame_thread.out_delayed = calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed)); @@ -258,7 +318,7 @@ } if (!c->seq_hdr) { - res = DAV1D_ERR(EINVAL); + res = DAV1D_ERR(ENOENT); goto error; } @@ -347,8 +407,10 @@ const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); - if (out_delayed->visible && progress != FRAME_ERROR) + if (out_delayed->visible && progress != FRAME_ERROR) { dav1d_picture_ref(&c->out, &out_delayed->p); + c->event_flags |= dav1d_picture_get_event_flags(out_delayed); + } dav1d_thread_picture_unref(out_delayed); if (output_picture_ready(c)) return output_image(c, out, &c->out); @@ -447,11 +509,17 @@ dav1d_ref_dec(&c->content_light_ref); dav1d_ref_dec(&c->itut_t35_ref); - if (c->n_fc == 1) return; + if (c->n_fc == 1 && c->n_pfc == 1) return; - // mark each currently-running frame as flushing, so that we - // exit out as quickly as the running thread 
checks this flag - atomic_store(c->frame_thread.flush, 1); + // wait for threads to complete flushing + if (c->n_pfc > 1) + pthread_mutex_lock(&c->postfilter_thread.lock); + atomic_store(c->flush, 1); + if (c->n_pfc > 1) { + pthread_cond_broadcast(&c->postfilter_thread.cond); + pthread_mutex_unlock(&c->postfilter_thread.lock); + } + if (c->n_fc == 1) goto skip_ft_flush; for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) { if (next == c->n_fc) next = 0; Dav1dFrameContext *const f = &c->fc[next]; @@ -463,13 +531,31 @@ assert(!f->cur.data[0]); } pthread_mutex_unlock(&f->frame_thread.td.lock); - Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next]; + Dav1dThreadPicture *const out_delayed = + &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0]) dav1d_thread_picture_unref(out_delayed); } - atomic_store(c->frame_thread.flush, 0); - c->frame_thread.next = 0; +skip_ft_flush: + if (c->n_pfc > 1) { + for (unsigned i = 0; i < c->n_pfc; ++i) { + Dav1dPostFilterContext *const pf = &c->pfc[i]; + pthread_mutex_lock(&pf->td.lock); + if (!pf->flushed) + pthread_cond_wait(&pf->td.cond, &pf->td.lock); + pf->flushed = 0; + pthread_mutex_unlock(&pf->td.lock); + } + pthread_mutex_lock(&c->postfilter_thread.lock); + c->postfilter_thread.tasks = NULL; + pthread_mutex_unlock(&c->postfilter_thread.lock); + for (unsigned i = 0; i < c->n_fc; ++i) { + freep(&c->fc[i].lf.thread.tasks); + c->fc[i].lf.thread.num_tasks = 0; + } + } + atomic_store(c->flush, 0); } COLD void dav1d_close(Dav1dContext **const c_out) { @@ -483,6 +569,25 @@ if (flush) dav1d_flush(c); + if (c->pfc) { + struct PostFilterThreadData *pftd = &c->postfilter_thread; + if (pftd->inited) { + pthread_mutex_lock(&pftd->lock); + for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) + c->pfc[n].die = 1; + pthread_cond_broadcast(&pftd->cond); + pthread_mutex_unlock(&pftd->lock); + for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) { + pthread_join(c->pfc[n].td.thread, NULL); + pthread_cond_destroy(&c->pfc[n].td.cond); + pthread_mutex_destroy(&c->pfc[n].td.lock); + } + pthread_cond_destroy(&pftd->cond); + pthread_mutex_destroy(&pftd->lock); + } + dav1d_free_aligned(c->pfc); + } + for (unsigned n = 0; c->fc && n < c->n_fc; n++) { Dav1dFrameContext *const f = &c->fc[n]; @@ -534,6 +639,10 @@ pthread_cond_destroy(&ts->tile_thread.cond); pthread_mutex_destroy(&ts->tile_thread.lock); } + if (f->lf.thread.inited) { + freep(&f->lf.thread.tasks); + pthread_cond_destroy(&f->lf.thread.cond); + } dav1d_free_aligned(f->ts); dav1d_free_aligned(f->tc); dav1d_free_aligned(f->ipred_edge[0]); @@ -572,9 +681,25 @@ dav1d_ref_dec(&c->content_light_ref); dav1d_ref_dec(&c->itut_t35_ref); + dav1d_mem_pool_end(c->seq_hdr_pool); + dav1d_mem_pool_end(c->frame_hdr_pool); + dav1d_mem_pool_end(c->segmap_pool); + dav1d_mem_pool_end(c->refmvs_pool); + dav1d_mem_pool_end(c->cdf_pool); + dav1d_mem_pool_end(c->picture_pool); + dav1d_freep_aligned(c_out); } +int dav1d_get_event_flags(Dav1dContext *const c, enum Dav1dEventFlags *const flags) { + validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(flags != NULL, DAV1D_ERR(EINVAL)); + + *flags = c->event_flags; + c->event_flags = 0; + return 0; +} + void dav1d_picture_unref(Dav1dPicture *const p) { dav1d_picture_unref_internal(p); } diff -Nru dav1d-0.7.1/src/looprestoration.h dav1d-0.9.1/src/looprestoration.h --- dav1d-0.7.1/src/looprestoration.h 2020-06-21 11:48:54.992126500 +0000 +++ dav1d-0.9.1/src/looprestoration.h 2021-07-28 
21:38:28.881852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -46,35 +46,37 @@ typedef const void *const_left_pixel_row; #endif -// Although the spec applies restoration filters over 4x4 blocks, the wiener -// filter can be applied to a bigger surface. +typedef union LooprestorationParams { + ALIGN(int16_t filter[2][8], 16); + struct { + uint32_t s0, s1; + int16_t w0, w1; + } sgr; +} LooprestorationParams; + +// Although the spec applies restoration filters over 4x4 blocks, +// they can be applied to a bigger surface. // * w is constrained by the restoration unit size (w <= 256) // * h is constrained by the stripe height (h <= 64) -#define decl_wiener_filter_fn(name) \ -void (name)(pixel *dst, ptrdiff_t dst_stride, \ - const_left_pixel_row left, \ - const pixel *lpf, ptrdiff_t lpf_stride, \ - int w, int h, const int16_t filterh[7], \ - const int16_t filterv[7], enum LrEdgeFlags edges \ - HIGHBD_DECL_SUFFIX) -typedef decl_wiener_filter_fn(*wienerfilter_fn); - -#define decl_selfguided_filter_fn(name) \ +// The filter functions are allowed to do aligned writes past the right +// edge of the buffer, aligned up to the minimum loop restoration unit size +// (which is 32 pixels for subsampled chroma and 64 pixels for luma). +#define decl_lr_filter_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const_left_pixel_row left, \ const pixel *lpf, ptrdiff_t lpf_stride, \ - int w, int h, int sgr_idx, const int16_t sgr_w[2], \ - const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) -typedef decl_selfguided_filter_fn(*selfguided_fn); + int w, int h, const LooprestorationParams *params, \ + enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +typedef decl_lr_filter_fn(*looprestorationfilter_fn); typedef struct Dav1dLoopRestorationDSPContext { - wienerfilter_fn wiener; - selfguided_fn selfguided; + looprestorationfilter_fn wiener[2]; /* 7-tap, 5-tap */ + looprestorationfilter_fn sgr[3]; /* 5x5, 3x3, mix */ } Dav1dLoopRestorationDSPContext; bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc); bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc); -bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c); -bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c); +bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c, int bpc); +bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c, int bpc); #endif /* DAV1D_SRC_LOOPRESTORATION_H */ diff -Nru dav1d-0.7.1/src/looprestoration_tmpl.c dav1d-0.9.1/src/looprestoration_tmpl.c --- dav1d-0.7.1/src/looprestoration_tmpl.c 2020-06-21 11:48:54.996126400 +0000 +++ dav1d-0.9.1/src/looprestoration_tmpl.c 2021-07-28 21:38:28.881852000 +0000 @@ -39,10 +39,10 @@ // TODO Reuse p when no padding is needed (add and remove lpf pixels in p) // TODO Chroma only requires 2 rows of padding. 
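/* [Editor's illustrative note; not part of the upstream diff] The
 * looprestoration.h hunk above folds the old filterh/filterv and
 * sgr_idx/sgr_w arguments into one LooprestorationParams union: the Wiener
 * path reads params->filter[0..1][0..7], the SGR path reads
 * params->sgr.{s0,s1,w0,w1}. A minimal sketch of how a caller might fill it,
 * using a plain unaligned stand-in for the union and placeholder values: */
#include <stdint.h>

typedef union {
    int16_t filter[2][8];                       /* [0] = horizontal, [1] = vertical taps */
    struct { uint32_t s0, s1; int16_t w0, w1; } sgr;
} ExampleLrParams;

static ExampleLrParams make_wiener_params(void) {
    ExampleLrParams p = {0};
    p.filter[0][3] = 128;                       /* placeholder center taps */
    p.filter[1][3] = 128;
    return p;
}

static ExampleLrParams make_sgr_params(void) {
    ExampleLrParams p = {0};
    p.sgr.s0 = 25; p.sgr.s1 = 164;              /* placeholder strengths */
    p.sgr.w0 = 31; p.sgr.w1 = 32;               /* placeholder weights */
    return p;
}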
-static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, - int unit_w, const int stripe_h, const enum LrEdgeFlags edges) +static NOINLINE void +padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride, + const pixel (*left)[4], const pixel *lpf, const ptrdiff_t lpf_stride, + int unit_w, const int stripe_h, const enum LrEdgeFlags edges) { const int have_left = !!(edges & LR_HAVE_LEFT); const int have_right = !!(edges & LR_HAVE_RIGHT); @@ -135,7 +135,7 @@ const pixel (*const left)[4], const pixel *lpf, const ptrdiff_t lpf_stride, const int w, const int h, - const int16_t filterh[7], const int16_t filterv[7], + const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels @@ -150,16 +150,20 @@ uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; uint16_t *hor_ptr = hor; + const int16_t (*const filter)[8] = params->filter; const int bitdepth = bitdepth_from_max(bitdepth_max); const int round_bits_h = 3 + (bitdepth == 12) * 2; const int rounding_off_h = 1 << (round_bits_h - 1); const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h); for (int j = 0; j < h + 6; j++) { for (int i = 0; i < w; i++) { - int sum = (tmp_ptr[i + 3] << 7) + (1 << (bitdepth + 6)); + int sum = (1 << (bitdepth + 6)); +#if BITDEPTH == 8 + sum += tmp_ptr[i + 3] * 128; +#endif for (int k = 0; k < 7; k++) { - sum += tmp_ptr[i + k] * filterh[k]; + sum += tmp_ptr[i + k] * filter[0][k]; } hor_ptr[i] = @@ -174,10 +178,10 @@ const int round_offset = 1 << (bitdepth + (round_bits_v - 1)); for (int j = 0; j < h; j++) { for (int i = 0; i < w; i++) { - int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset; + int sum = -round_offset; for (int k = 0; k < 7; k++) { - sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k]; + sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k]; } p[j * PXSTRIDE(p_stride) + i] = @@ -208,44 +212,58 @@ // i: Pixel summed and stored (between loops) // c: Pixel summed not stored // x: Pixel not summed not stored -static void boxsum3(coef *dst, const pixel *src, const int w, const int h) { +static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src, + const int w, const int h) +{ // We skip the first row, as it is never used src += REST_UNIT_STRIDE; - dst += REST_UNIT_STRIDE; // We skip the first and last columns, as they are never used for (int x = 1; x < w - 1; x++) { - coef *ds = dst + x; + coef *sum_v = sum + x; + int32_t *sumsq_v = sumsq + x; const pixel *s = src + x; - int a = s[0], b = s[REST_UNIT_STRIDE]; + int a = s[0], a2 = a * a; + int b = s[REST_UNIT_STRIDE], b2 = b * b; // We skip the first 2 rows, as they are skipped in the next loop and // we don't need the last 2 row as it is skipped in the next loop for (int y = 2; y < h - 2; y++) { s += REST_UNIT_STRIDE; const int c = s[REST_UNIT_STRIDE]; - ds += REST_UNIT_STRIDE; - *ds = a + b + c; + const int c2 = c * c; + sum_v += REST_UNIT_STRIDE; + sumsq_v += REST_UNIT_STRIDE; + *sum_v = a + b + c; + *sumsq_v = a2 + b2 + c2; a = b; + a2 = b2; b = c; + b2 = c2; } } - // We skip the first 2 rows as they are never read - dst += REST_UNIT_STRIDE; + // We skip the first row as it is never read + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; // We skip the last 2 rows as it is never read for (int y = 2; y < h - 2; y++) { - int a = dst[1], b = dst[2]; + int a = sum[1], a2 = sumsq[1]; + int b = sum[2], b2 = sumsq[2]; // We 
don't store the first column as it is never read and // we don't store the last 2 columns as they are never read for (int x = 2; x < w - 2; x++) { - const int c = dst[x + 1]; - dst[x] = a + b + c; + const int c = sum[x + 1], c2 = sumsq[x + 1]; + sum[x] = a + b + c; + sumsq[x] = a2 + b2 + c2; a = b; + a2 = b2; b = c; + b2 = c2; } - dst += REST_UNIT_STRIDE; + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; } } @@ -271,168 +289,86 @@ // i: Pixel summed and stored (between loops) // c: Pixel summed not stored // x: Pixel not summed not stored -static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) { - // We skip the first row, as it is never used - dst += REST_UNIT_STRIDE; - - for (int x = 0; x < w; x++) { - coef *ds = dst + x; - const pixel *s = src + 3 * REST_UNIT_STRIDE + x; - int a = s[-3 * REST_UNIT_STRIDE]; - int b = s[-2 * REST_UNIT_STRIDE]; - int c = s[-1 * REST_UNIT_STRIDE]; - int d = s[0]; - - // We skip the first 2 rows, as they are skipped in the next loop and - // we don't need the last 2 row as it is skipped in the next loop - for (int y = 2; y < h - 2; y++) { - s += REST_UNIT_STRIDE; - const int e = *s; - ds += REST_UNIT_STRIDE; - *ds = a + b + c + d + e; - a = b; - b = c; - c = d; - d = e; - } - } - - // We skip the first 2 rows as they are never read - dst += REST_UNIT_STRIDE; - for (int y = 2; y < h - 2; y++) { - int a = dst[0]; - int b = dst[1]; - int c = dst[2]; - int d = dst[3]; - - for (int x = 2; x < w - 2; x++) { - const int e = dst[x + 2]; - dst[x] = a + b + c + d + e; - a = b; - b = c; - c = d; - d = e; - } - dst += REST_UNIT_STRIDE; - } -} - -// See boxsum3 function comments for details on row and column skipping -static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) { - // We skip the first row, as it is never used - src += REST_UNIT_STRIDE; - dst += REST_UNIT_STRIDE; - - // We skip the first and last columns, as they are never used - for (int x = 1; x < w - 1; x++) { - int32_t *ds = dst + x; - const pixel *s = src + x; - int a = s[0] * s[0]; - int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE]; - - // We skip the first row, as it is skipped in the next loop and - // we don't need the last row as it is skipped in the next loop - for (int y = 2; y < h - 2; y++) { - s += REST_UNIT_STRIDE; - const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE]; - ds += REST_UNIT_STRIDE; - *ds = a + b + c; - a = b; - b = c; - } - } - - // We skip the first row as it is never read - dst += REST_UNIT_STRIDE; - // We skip the last row as it is never read - for (int y = 2; y < h - 2; y++) { - int a = dst[1], b = dst[2]; - - // We don't store the first column as it is never read and - // we don't store the last 2 columns as they are never read - for (int x = 2; x < w - 2; x++) { - const int c = dst[x + 1]; - dst[x] = a + b + c; - a = b; - b = c; - } - dst += REST_UNIT_STRIDE; - } -} - -// See boxsum5 function comments for details on row and column skipping -static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w, - const int h) +static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src, + const int w, const int h) { - // We skip the first row, as it is never used - dst += REST_UNIT_STRIDE; - for (int x = 0; x < w; x++) { - int32_t *ds = dst + x; + coef *sum_v = sum + x; + int32_t *sumsq_v = sumsq + x; const pixel *s = src + 3 * REST_UNIT_STRIDE + x; - int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE]; - int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE]; - int c = s[-1 * 
REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE]; - int d = s[0] * s[0]; + int a = s[-3 * REST_UNIT_STRIDE], a2 = a * a; + int b = s[-2 * REST_UNIT_STRIDE], b2 = b * b; + int c = s[-1 * REST_UNIT_STRIDE], c2 = c * c; + int d = s[0], d2 = d * d; // We skip the first 2 rows, as they are skipped in the next loop and // we don't need the last 2 row as it is skipped in the next loop for (int y = 2; y < h - 2; y++) { s += REST_UNIT_STRIDE; - const int e = s[0] * s[0]; - ds += REST_UNIT_STRIDE; - *ds = a + b + c + d + e; + const int e = *s, e2 = e * e; + sum_v += REST_UNIT_STRIDE; + sumsq_v += REST_UNIT_STRIDE; + *sum_v = a + b + c + d + e; + *sumsq_v = a2 + b2 + c2 + d2 + e2; a = b; b = c; c = d; d = e; + a2 = b2; + b2 = c2; + c2 = d2; + d2 = e2; } } - // We skip the first 2 rows as they are never read - dst += REST_UNIT_STRIDE; + // We skip the first row as it is never read + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; for (int y = 2; y < h - 2; y++) { - int a = dst[0]; - int b = dst[1]; - int c = dst[2]; - int d = dst[3]; + int a = sum[0], a2 = sumsq[0]; + int b = sum[1], b2 = sumsq[1]; + int c = sum[2], c2 = sumsq[2]; + int d = sum[3], d2 = sumsq[3]; for (int x = 2; x < w - 2; x++) { - const int e = dst[x + 2]; - dst[x] = a + b + c + d + e; + const int e = sum[x + 2], e2 = sumsq[x + 2]; + sum[x] = a + b + c + d + e; + sumsq[x] = a2 + b2 + c2 + d2 + e2; a = b; b = c; c = d; d = e; + a2 = b2; + b2 = c2; + c2 = d2; + d2 = e2; } - dst += REST_UNIT_STRIDE; + sum += REST_UNIT_STRIDE; + sumsq += REST_UNIT_STRIDE; } } -static void selfguided_filter(coef *dst, const pixel *src, - const ptrdiff_t src_stride, const int w, - const int h, const int n, const int s - HIGHBD_DECL_SUFFIX) +static NOINLINE void +selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride, + const int w, const int h, const int n, const unsigned s + HIGHBD_DECL_SUFFIX) { - const int sgr_one_by_x = n == 25 ? 164 : 455; + const unsigned sgr_one_by_x = n == 25 ? 
164 : 455; // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels // of padding above and below - int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; - int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3; + int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE]; + int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3; // By inverting A and B after the boxsums, B can be of size coef instead // of int32_t - coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; - coef *B = B_ + 3 * REST_UNIT_STRIDE + 3; + coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE]; + coef *B = sum + 2 * REST_UNIT_STRIDE + 3; const int step = (n == 25) + 1; - if (n == 25) { - boxsum5(B_, src, w + 6, h + 6); - boxsum5sqr(A_, src, w + 6, h + 6); - } else { - boxsum3(B_, src, w + 6, h + 6); - boxsum3sqr(A_, src, w + 6, h + 6); - } + if (n == 25) + boxsum5(sumsq, sum, src, w + 6, h + 6); + else + boxsum3(sumsq, sum, src, w + 6, h + 6); const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; int32_t *AA = A - REST_UNIT_STRIDE; @@ -511,79 +447,103 @@ #undef EIGHT_NEIGHBORS } -static void selfguided_c(pixel *p, const ptrdiff_t p_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, - const int w, const int h, const int sgr_idx, - const int16_t sgr_w[2], const enum LrEdgeFlags edges - HIGHBD_DECL_SUFFIX) +static void sgr_5x5_c(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], const pixel *lpf, + const ptrdiff_t lpf_stride, const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels // of padding above and below pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; - padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); - // Selfguided filter outputs to a maximum stripe height of 64 and a // maximum restoration width of 384 (256 * 1.5) coef dst[64 * 384]; - // both r1 and r0 can't be zero - if (!dav1d_sgr_params[sgr_idx][0]) { - const int s1 = dav1d_sgr_params[sgr_idx][3]; - selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX); - const int w1 = (1 << 7) - sgr_w[1]; - for (int j = 0; j < h; j++) { - for (int i = 0; i < w; i++) { - const int u = (p[i] << 4); - const int v = (u << 7) + w1 * (dst[j * 384 + i] - u); - p[i] = iclip_pixel((v + (1 << 10)) >> 11); - } - p += PXSTRIDE(p_stride); + padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, + params->sgr.s0 HIGHBD_TAIL_SUFFIX); + + const int w0 = params->sgr.w0; + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + const int u = (p[i] << 4); + const int v = (u << 7) + w0 * (dst[j * 384 + i] - u); + p[i] = iclip_pixel((v + (1 << 10)) >> 11); } - } else if (!dav1d_sgr_params[sgr_idx][1]) { - const int s0 = dav1d_sgr_params[sgr_idx][2]; - selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX); - const int w0 = sgr_w[0]; - for (int j = 0; j < h; j++) { - for (int i = 0; i < w; i++) { - const int u = (p[i] << 4); - const int v = (u << 7) + w0 * (dst[j * 384 + i] - u); - p[i] = iclip_pixel((v + (1 << 10)) >> 11); - } - p += PXSTRIDE(p_stride); + p += PXSTRIDE(p_stride); + } +} + +static void sgr_3x3_c(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], const pixel *lpf, + const ptrdiff_t lpf_stride, const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + pixel tmp[70 /*(64 + 
3 + 3)*/ * REST_UNIT_STRIDE]; + coef dst[64 * 384]; + + padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, + params->sgr.s1 HIGHBD_TAIL_SUFFIX); + + const int w1 = params->sgr.w1; + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + const int u = (p[i] << 4); + const int v = (u << 7) + w1 * (dst[j * 384 + i] - u); + p[i] = iclip_pixel((v + (1 << 10)) >> 11); } - } else { - coef dst1[64 * 384]; - const int s0 = dav1d_sgr_params[sgr_idx][2]; - const int s1 = dav1d_sgr_params[sgr_idx][3]; - const int w0 = sgr_w[0]; - const int w1 = (1 << 7) - w0 - sgr_w[1]; - selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0 HIGHBD_TAIL_SUFFIX); - selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1 HIGHBD_TAIL_SUFFIX); - for (int j = 0; j < h; j++) { - for (int i = 0; i < w; i++) { - const int u = (p[i] << 4); - const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) + - w1 * (dst1[j * 384 + i] - u); - p[i] = iclip_pixel((v + (1 << 10)) >> 11); - } - p += PXSTRIDE(p_stride); + p += PXSTRIDE(p_stride); + } +} + +static void sgr_mix_c(pixel *p, const ptrdiff_t p_stride, + const pixel (*const left)[4], const pixel *lpf, + const ptrdiff_t lpf_stride, const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE]; + coef dst0[64 * 384]; + coef dst1[64 * 384]; + + padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); + selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25, + params->sgr.s0 HIGHBD_TAIL_SUFFIX); + selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, + params->sgr.s1 HIGHBD_TAIL_SUFFIX); + + const int w0 = params->sgr.w0; + const int w1 = params->sgr.w1; + for (int j = 0; j < h; j++) { + for (int i = 0; i < w; i++) { + const int u = (p[i] << 4); + const int v = (u << 7) + w0 * (dst0[j * 384 + i] - u) + + w1 * (dst1[j * 384 + i] - u); + p[i] = iclip_pixel((v + (1 << 10)) >> 11); } + p += PXSTRIDE(p_stride); } } -COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, int bpc) { - c->wiener = wiener_c; - c->selfguided = selfguided_c; +COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, + const int bpc) +{ + c->wiener[0] = c->wiener[1] = wiener_c; + c->sgr[0] = sgr_5x5_c; + c->sgr[1] = sgr_3x3_c; + c->sgr[2] = sgr_mix_c; #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc); #elif ARCH_PPC64LE - bitfn(dav1d_loop_restoration_dsp_init_ppc)(c); + bitfn(dav1d_loop_restoration_dsp_init_ppc)(c, bpc); #elif ARCH_X86 - bitfn(dav1d_loop_restoration_dsp_init_x86)(c); + bitfn(dav1d_loop_restoration_dsp_init_x86)(c, bpc); #endif #endif } diff -Nru dav1d-0.7.1/src/lr_apply_tmpl.c dav1d-0.9.1/src/lr_apply_tmpl.c --- dav1d-0.7.1/src/lr_apply_tmpl.c 2020-06-21 11:48:54.996126400 +0000 +++ dav1d-0.9.1/src/lr_apply_tmpl.c 2021-07-28 21:38:28.881852000 +0000 @@ -48,31 +48,32 @@ const pixel *src, const ptrdiff_t src_stride, const int ss_ver, const int sb128, int row, const int row_h, const int src_w, - const int h, const int ss_hor) + const int h, const int ss_hor, const int pft) { const int dst_w = f->frame_hdr->super_res.enabled ? (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w; // The first stripe of the frame is shorter by 8 luma pixel rows. 
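The three sgr_*_c functions above share one final blend: the source pixel is lifted by 4 fractional bits, the self-guided output(s) are mixed in with 7-bit weights, and the result is rounded back down by 11 bits. A standalone restatement for the 8 bpc case, with iclip_pixel() replaced by an explicit clamp; in the single-filter variants one of the two weight terms is simply absent:

    static int sgr_blend_one(const int px, const int flt0, const int flt1,
                             const int w0, const int w1)
    {
        const int u = px << 4;                  /* pixel lifted into the filter domain */
        const int v = (u << 7) + w0 * (flt0 - u) + w1 * (flt1 - u);
        const int out = (v + (1 << 10)) >> 11;  /* round, drop the 4 + 7 fractional bits */
        return out < 0 ? 0 : out > 255 ? 255 : out;
    }
    /* e.g. sgr_blend_one(100, 100 * 16 + 20, 100 * 16 - 10, 31, 20) == 100 */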
int stripe_h = (64 - 8 * !row) >> ss_ver; + src += (stripe_h - 2) * PXSTRIDE(src_stride); - if (row) { - const int top = 4 << sb128; - // Copy the top part of the stored loop filtered pixels from the - // previous sb row needed above the first stripe of this sb row. - pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], - &dst[PXSTRIDE(dst_stride) * top], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], - &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], - &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], - &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); + if (!pft) { + if (row) { + const int top = 4 << sb128; + // Copy the top part of the stored loop filtered pixels from the + // previous sb row needed above the first stripe of this sb row. + pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], + &dst[PXSTRIDE(dst_stride) * top], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], + &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], + &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], + &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); + } + dst += 4 * PXSTRIDE(dst_stride); } - dst += 4 * PXSTRIDE(dst_stride); - src += (stripe_h - 2) * PXSTRIDE(src_stride); - if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { while (row + stripe_h <= row_h) { const int n_lines = 4 - (row + stripe_h + 1 == h); @@ -107,9 +108,15 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, /*const*/ pixel *const src[3], const int sby) { + const int pft = f->c->n_pfc > 1; const int offset = 8 * !!sby; const ptrdiff_t *const src_stride = f->cur.stride; const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel); + pixel *const dst[3] = { + f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride), + f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride), + f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride) + }; // TODO Also check block level restore type to reduce copying. 
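A note on the new pft parameter threaded through backup_lpf() and dav1d_lr_copy_lpf() above: with in-frame postfilter threading enabled (n_pfc > 1), every superblock row keeps its own copy of the saved loop-filtered rows, so the destination is offset by sby * (4 << sb128) rows instead of recopying the previous row's top lines in place. A sketch of that address computation, with the stride pre-converted to bytes and the helper name invented for illustration:

    static uint8_t *lr_lpf_backup_row(uint8_t *const lpf_line,
                                      const ptrdiff_t stride_bytes,
                                      const int sby, const int sb128,
                                      const int pft)
    {
        const int rows_per_sbrow = 4 << sb128; /* 4 rows per 64-px sb row, 8 per 128-px */
        return lpf_line + (pft ? (ptrdiff_t)sby * rows_per_sbrow * stride_bytes : 0);
    }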
const int restore_planes = f->lf.restore_planes; @@ -119,9 +126,9 @@ const int w = f->bw << 2; const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; - backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride, + backup_lpf(f, dst[0], lr_stride, src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], - 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0); + 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft); } if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) { const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; @@ -130,18 +137,16 @@ const int w = f->bw << (2 - ss_hor); const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); const int offset_uv = offset >> ss_ver; - const int y_stripe = - (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; - + const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; if (restore_planes & LR_RESTORE_U) { - backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride, + backup_lpf(f, dst[1], lr_stride, src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], - ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor); + ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft); } if (restore_planes & LR_RESTORE_V) { - backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride, + backup_lpf(f, dst[2], lr_stride, src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], - ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor); + ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft); } } } @@ -154,43 +159,53 @@ const Dav1dDSPContext *const dsp = f->dsp; const int chroma = !!plane; const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420); - const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM); - const pixel *lpf = f->lf.lr_lpf_line[plane] + x; const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma]; const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31); + const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128); + const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x; // The first stripe of the frame is shorter by 8 luma pixel rows. int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y); - // FIXME [8] might be easier for SIMD - int16_t filterh[7], filterv[7]; + looprestorationfilter_fn lr_fn; + LooprestorationParams params; if (lr->type == DAV1D_RESTORATION_WIENER) { - filterh[0] = filterh[6] = lr->filter_h[0]; - filterh[1] = filterh[5] = lr->filter_h[1]; - filterh[2] = filterh[4] = lr->filter_h[2]; - filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2); - - filterv[0] = filterv[6] = lr->filter_v[0]; - filterv[1] = filterv[5] = lr->filter_v[1]; - filterv[2] = filterv[4] = lr->filter_v[2]; - filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2); + int16_t (*const filter)[8] = params.filter; + filter[0][0] = filter[0][6] = lr->filter_h[0]; + filter[0][1] = filter[0][5] = lr->filter_h[1]; + filter[0][2] = filter[0][4] = lr->filter_h[2]; + filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2; +#if BITDEPTH != 8 + /* For 8-bit SIMD it's beneficial to handle the +128 separately + * in order to avoid overflows. 
*/ + filter[0][3] += 128; +#endif + + filter[1][0] = filter[1][6] = lr->filter_v[0]; + filter[1][1] = filter[1][5] = lr->filter_v[1]; + filter[1][2] = filter[1][4] = lr->filter_v[2]; + filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2; + + lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])]; + } else { + assert(lr->type == DAV1D_RESTORATION_SGRPROJ); + const uint16_t *const sgr_params = dav1d_sgr_params[lr->sgr_idx]; + params.sgr.s0 = sgr_params[0]; + params.sgr.s1 = sgr_params[1]; + params.sgr.w0 = lr->sgr_weights[0]; + params.sgr.w1 = 128 - (lr->sgr_weights[0] + lr->sgr_weights[1]); + + lr_fn = dsp->lr.sgr[!!sgr_params[0] + !!sgr_params[1] * 2 - 1]; } while (y + stripe_h <= row_h) { - // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h) - edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM; - if (lr->type == DAV1D_RESTORATION_WIENER) { - dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h, - filterh, filterv, edges HIGHBD_CALL_SUFFIX); - } else { - assert(lr->type == DAV1D_RESTORATION_SGRPROJ); - dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h, - lr->sgr_idx, lr->sgr_weights, edges HIGHBD_CALL_SUFFIX); - } + // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h) + edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM; + lr_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h, + ¶ms, edges HIGHBD_CALL_SUFFIX); left += stripe_h; y += stripe_h; - if (y + stripe_h > row_h && sbrow_has_bottom) break; p += stripe_h * PXSTRIDE(p_stride); edges |= LR_HAVE_TOP; stripe_h = imin(64 >> ss_ver, row_h - y); @@ -234,8 +249,7 @@ pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4]; const Av1RestorationUnit *lr[2]; - enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT | - (row_h < h ? LR_HAVE_BOTTOM : 0); + enum LrEdgeFlags edges = (y > 0 ? 
LR_HAVE_TOP : 0) | LR_HAVE_RIGHT; int aligned_unit_pos = row_y & ~(unit_size - 1); if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h) @@ -273,11 +287,13 @@ const int offset_y = 8 * !!sby; const ptrdiff_t *const dst_stride = f->sr_cur.p.stride; const int restore_planes = f->lf.restore_planes; + const int not_last = sby + 1 < f->sbh; if (restore_planes & LR_RESTORE_Y) { const int h = f->sr_cur.p.p.h; const int w = f->sr_cur.p.p.w; - const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h); + const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128); + const int row_h = imin(next_row_y - 8 * not_last, h); const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y; lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w, h, row_h, 0); @@ -287,10 +303,10 @@ const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver; const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; - const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h); + const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128); + const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h); const int offset_uv = offset_y >> ss_ver; - const int y_stripe = - (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; + const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; if (restore_planes & LR_RESTORE_U) lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, w, h, row_h, 1); diff -Nru dav1d-0.7.1/src/mc_tmpl.c dav1d-0.9.1/src/mc_tmpl.c --- dav1d-0.7.1/src/mc_tmpl.c 2020-06-21 11:48:54.996126400 +0000 +++ dav1d-0.9.1/src/mc_tmpl.c 2021-07-28 21:38:28.885852000 +0000 @@ -87,9 +87,15 @@ #define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \ ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh)) +#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \ + ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh)) + #define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \ iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh)) +#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \ + iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh)) + #define GET_H_FILTER(mx) \ const int8_t *const fh = !(mx) ? NULL : w > 4 ? 
\ dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \ @@ -111,7 +117,7 @@ const int filter_type HIGHBD_DECL_SUFFIX) { const int intermediate_bits = get_intermediate_bits(bitdepth_max); - const int intermediate_rnd = (1 << intermediate_bits) >> 1; + const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1); GET_FILTERS(); dst_stride = PXSTRIDE(dst_stride); @@ -144,9 +150,8 @@ } else { do { for (int x = 0; x < w; x++) { - const int px = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, - 6 - intermediate_bits); - dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits); + dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1, + intermediate_rnd, 6); } dst += dst_stride; @@ -736,30 +741,16 @@ #undef w_mask_fns -#if ARCH_X86 -#define FILTER_WARP(src, x, F, stride) \ - (F[0] * src[x + -3 * stride] + \ - F[4] * src[x + -2 * stride] + \ - F[1] * src[x + -1 * stride] + \ - F[5] * src[x + +0 * stride] + \ - F[2] * src[x + +1 * stride] + \ - F[6] * src[x + +2 * stride] + \ - F[3] * src[x + +3 * stride] + \ - F[7] * src[x + +4 * stride]) -#else -#define FILTER_WARP(src, x, F, stride) \ - (F[0] * src[x + -3 * stride] + \ - F[1] * src[x + -2 * stride] + \ - F[2] * src[x + -1 * stride] + \ - F[3] * src[x + +0 * stride] + \ - F[4] * src[x + +1 * stride] + \ - F[5] * src[x + +2 * stride] + \ - F[6] * src[x + +3 * stride] + \ - F[7] * src[x + +4 * stride]) -#endif - #define FILTER_WARP_RND(src, x, F, stride, sh) \ - ((FILTER_WARP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh)) + ((F[0] * src[x - 3 * stride] + \ + F[1] * src[x - 2 * stride] + \ + F[2] * src[x - 1 * stride] + \ + F[3] * src[x + 0 * stride] + \ + F[4] * src[x + 1 * stride] + \ + F[5] * src[x + 2 * stride] + \ + F[6] * src[x + 3 * stride] + \ + F[7] * src[x + 4 * stride] + \ + ((1 << (sh)) >> 1)) >> (sh)) #define FILTER_WARP_CLIP(src, x, F, stride, sh) \ iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh)) diff -Nru dav1d-0.7.1/src/mem.c dav1d-0.9.1/src/mem.c --- dav1d-0.7.1/src/mem.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/mem.c 2021-07-28 21:38:28.885852000 +0000 @@ -0,0 +1,119 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * Copyright © 2020, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include + +#include "src/internal.h" + +static COLD void mem_pool_destroy(Dav1dMemPool *const pool) { + pthread_mutex_destroy(&pool->lock); + free(pool); +} + +void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) { + pthread_mutex_lock(&pool->lock); + const int ref_cnt = --pool->ref_cnt; + if (!pool->end) { + buf->next = pool->buf; + pool->buf = buf; + pthread_mutex_unlock(&pool->lock); + assert(ref_cnt > 0); + } else { + pthread_mutex_unlock(&pool->lock); + dav1d_free_aligned(buf->data); + if (!ref_cnt) mem_pool_destroy(pool); + } +} + +Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t size) { + assert(!(size & (sizeof(void*) - 1))); + pthread_mutex_lock(&pool->lock); + Dav1dMemPoolBuffer *buf = pool->buf; + pool->ref_cnt++; + uint8_t *data; + if (buf) { + pool->buf = buf->next; + pthread_mutex_unlock(&pool->lock); + data = buf->data; + if ((uintptr_t)buf - (uintptr_t)data != size) { + /* Reallocate if the size has changed */ + dav1d_free_aligned(data); + goto alloc; + } + } else { + pthread_mutex_unlock(&pool->lock); +alloc: + data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64); + if (!data) { + pthread_mutex_lock(&pool->lock); + const int ref_cnt = --pool->ref_cnt; + pthread_mutex_unlock(&pool->lock); + if (!ref_cnt) mem_pool_destroy(pool); + return NULL; + } + buf = (Dav1dMemPoolBuffer*)(data + size); + buf->data = data; + } + + return buf; +} + +COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) { + Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool)); + if (pool) { + if (!pthread_mutex_init(&pool->lock, NULL)) { + pool->buf = NULL; + pool->ref_cnt = 1; + pool->end = 0; + *ppool = pool; + return 0; + } + free(pool); + } + *ppool = NULL; + return DAV1D_ERR(ENOMEM); +} + +COLD void dav1d_mem_pool_end(Dav1dMemPool *const pool) { + if (pool) { + pthread_mutex_lock(&pool->lock); + Dav1dMemPoolBuffer *buf = pool->buf; + const int ref_cnt = --pool->ref_cnt; + pool->buf = NULL; + pool->end = 1; + pthread_mutex_unlock(&pool->lock); + + while (buf) { + void *const data = buf->data; + buf = buf->next; + dav1d_free_aligned(data); + } + if (!ref_cnt) mem_pool_destroy(pool); + } +} diff -Nru dav1d-0.7.1/src/mem.h dav1d-0.9.1/src/mem.h --- dav1d-0.7.1/src/mem.h 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/mem.h 2021-07-28 21:38:28.885852000 +0000 @@ -0,0 +1,103 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef DAV1D_SRC_MEM_H +#define DAV1D_SRC_MEM_H + +#include + +#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN) +#include +#endif + +#include "common/attributes.h" + +#include "src/thread.h" + +typedef struct Dav1dMemPoolBuffer { + void *data; + struct Dav1dMemPoolBuffer *next; +} Dav1dMemPoolBuffer; + +typedef struct Dav1dMemPool { + pthread_mutex_t lock; + Dav1dMemPoolBuffer *buf; + int ref_cnt; + int end; +} Dav1dMemPool; + +void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf); +Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size); +int dav1d_mem_pool_init(Dav1dMemPool **pool); +void dav1d_mem_pool_end(Dav1dMemPool *pool); + +/* + * Allocate align-byte aligned memory. The return value can be released + * by calling the dav1d_free_aligned() function. + */ +static inline void *dav1d_alloc_aligned(size_t sz, size_t align) { + assert(!(align & (align - 1))); +#ifdef HAVE_POSIX_MEMALIGN + void *ptr; + if (posix_memalign(&ptr, align, sz)) return NULL; + return ptr; +#elif defined(HAVE_ALIGNED_MALLOC) + return _aligned_malloc(sz, align); +#elif defined(HAVE_MEMALIGN) + return memalign(align, sz); +#else +#error Missing aligned alloc implementation +#endif +} + +static inline void dav1d_free_aligned(void* ptr) { +#ifdef HAVE_POSIX_MEMALIGN + free(ptr); +#elif defined(HAVE_ALIGNED_MALLOC) + _aligned_free(ptr); +#elif defined(HAVE_MEMALIGN) + free(ptr); +#endif +} + +static inline void dav1d_freep_aligned(void* ptr) { + void **mem = (void **) ptr; + if (*mem) { + dav1d_free_aligned(*mem); + *mem = NULL; + } +} + +static inline void freep(void *ptr) { + void **mem = (void **) ptr; + if (*mem) { + free(*mem); + *mem = NULL; + } +} + +#endif /* DAV1D_SRC_MEM_H */ diff -Nru dav1d-0.7.1/src/meson.build dav1d-0.9.1/src/meson.build --- dav1d-0.7.1/src/meson.build 2020-06-21 11:48:54.996126400 +0000 +++ dav1d-0.9.1/src/meson.build 2021-07-28 21:38:28.885852000 +0000 @@ -38,6 +38,7 @@ 'itx_1d.c', 'lf_mask.c', 'log.c', + 'mem.c', 'msac.c', 'obu.c', 'picture.c', @@ -82,7 +83,7 @@ ) # ASM specific sources -libdav1d_nasm_objs = [] +libdav1d_asm_objs = [] # Arch-specific flags arch_flags = [] if is_asm_enabled @@ -94,14 +95,16 @@ ) libdav1d_tmpl_sources += files( 'arm/cdef_init_tmpl.c', + 'arm/film_grain_init_tmpl.c', 'arm/ipred_init_tmpl.c', 'arm/itx_init_tmpl.c', 'arm/loopfilter_init_tmpl.c', 'arm/looprestoration_init_tmpl.c', 'arm/mc_init_tmpl.c', ) - if host_machine.cpu_family() == 'aarch64' - libdav1d_sources += files( + if (host_machine.cpu_family() == 'aarch64' or + host_machine.cpu() == 'arm64') + libdav1d_sources_asm = files( # itx.S is used for both 8 and 16 bpc. 
'arm/64/itx.S', 'arm/64/looprestoration_common.S', @@ -109,8 +112,9 @@ ) if dav1d_bitdepths.contains('8') - libdav1d_sources += files( + libdav1d_sources_asm += files( 'arm/64/cdef.S', + 'arm/64/film_grain.S', 'arm/64/ipred.S', 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', @@ -119,8 +123,9 @@ endif if dav1d_bitdepths.contains('16') - libdav1d_sources += files( + libdav1d_sources_asm += files( 'arm/64/cdef16.S', + 'arm/64/film_grain16.S', 'arm/64/ipred16.S', 'arm/64/itx16.S', 'arm/64/loopfilter16.S', @@ -129,15 +134,18 @@ ) endif elif host_machine.cpu_family().startswith('arm') - libdav1d_sources += files( + libdav1d_sources_asm = files( + # itx.S is used for both 8 and 16 bpc. + 'arm/32/itx.S', + 'arm/32/looprestoration_common.S', 'arm/32/msac.S', ) if dav1d_bitdepths.contains('8') - libdav1d_sources += files( + libdav1d_sources_asm += files( 'arm/32/cdef.S', + 'arm/32/film_grain.S', 'arm/32/ipred.S', - 'arm/32/itx.S', 'arm/32/loopfilter.S', 'arm/32/looprestoration.S', 'arm/32/mc.S', @@ -145,10 +153,23 @@ endif if dav1d_bitdepths.contains('16') - libdav1d_sources += files( + libdav1d_sources_asm += files( + 'arm/32/cdef16.S', + 'arm/32/film_grain16.S', + 'arm/32/ipred16.S', + 'arm/32/itx16.S', + 'arm/32/loopfilter16.S', + 'arm/32/looprestoration16.S', + 'arm/32/mc16.S', ) endif endif + + if use_gaspp + libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm) + else + libdav1d_sources += libdav1d_sources_asm + endif elif host_machine.cpu_family().startswith('x86') libdav1d_sources += files( @@ -170,35 +191,50 @@ libdav1d_sources_asm = files( 'x86/cpuid.asm', 'x86/msac.asm', + 'x86/cdef_avx2.asm', + 'x86/itx_avx2.asm', + 'x86/looprestoration_avx2.asm', + 'x86/cdef_sse.asm', + 'x86/itx_sse.asm', ) if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( 'x86/cdef_avx512.asm', - 'x86/cdef_avx2.asm', - 'x86/film_grain.asm', - 'x86/ipred.asm', - 'x86/itx.asm', - 'x86/loopfilter.asm', - 'x86/looprestoration.asm', - 'x86/mc.asm', - 'x86/cdef_sse.asm', - 'x86/film_grain_ssse3.asm', - 'x86/ipred_ssse3.asm', - 'x86/itx_ssse3.asm', - 'x86/loopfilter_ssse3.asm', - 'x86/looprestoration_ssse3.asm', + 'x86/mc_avx512.asm', + 'x86/mc_avx2.asm', + 'x86/film_grain_avx2.asm', + 'x86/ipred_avx2.asm', + 'x86/loopfilter_avx2.asm', + 'x86/film_grain_sse.asm', + 'x86/ipred_sse.asm', + 'x86/loopfilter_sse.asm', + 'x86/looprestoration_sse.asm', 'x86/mc_sse.asm', ) endif if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( + 'x86/cdef16_avx2.asm', + 'x86/film_grain16_avx2.asm', + 'x86/ipred16_avx2.asm', + 'x86/itx16_avx2.asm', + 'x86/loopfilter16_avx2.asm', + 'x86/looprestoration16_avx2.asm', + 'x86/mc16_avx2.asm', + 'x86/cdef16_sse.asm', + 'x86/film_grain16_sse.asm', + 'x86/ipred16_sse.asm', + 'x86/itx16_sse.asm', + 'x86/loopfilter16_sse.asm', + 'x86/looprestoration16_sse.asm', + 'x86/mc16_sse.asm', ) endif # Compile the ASM sources with NASM - libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm) + libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm) elif host_machine.cpu() == 'ppc64le' arch_flags = ['-maltivec', '-mvsx'] libdav1d_sources += files( @@ -220,17 +256,6 @@ # if host_machine.system() == 'windows' and get_option('default_library') != 'static' - rc_version_array = meson.project_version().split('.') - winmod = import('windows') - rc_data = configuration_data() - rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0]) - rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1]) - rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2]) - 
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) - rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) - rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) - rc_data.set('COPYRIGHT_YEARS', '2019') - rc_file = configure_file( input : 'dav1d.rc.in', output : 'dav1d.rc', @@ -257,7 +282,7 @@ rev_target, config_h_target, include_directories : dav1d_inc_dirs, - dependencies: [stdatomic_dependency], + dependencies: [stdatomic_dependencies], c_args : [stackalign_flag, stackrealign_flag, api_export_flags], install : false, build_by_default : false, @@ -270,7 +295,7 @@ 'dav1d_bitdepth_@0@'.format(bitdepth), libdav1d_tmpl_sources, config_h_target, include_directories: dav1d_inc_dirs, - dependencies : [stdatomic_dependency], + dependencies : [stdatomic_dependencies], c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag, install : false, build_by_default : false, @@ -283,7 +308,7 @@ 'dav1d_arch_bitdepth_@0@'.format(bitdepth), libdav1d_arch_tmpl_sources, config_h_target, include_directories: dav1d_inc_dirs, - dependencies : [stdatomic_dependency], + dependencies : [stdatomic_dependencies], c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags, install : false, build_by_default : false, @@ -299,7 +324,7 @@ libdav1d = library('dav1d', libdav1d_sources, - libdav1d_nasm_objs, + libdav1d_asm_objs, libdav1d_rc_obj, objects : [ @@ -309,7 +334,7 @@ include_directories : dav1d_inc_dirs, dependencies : [ - stdatomic_dependency, + stdatomic_dependencies, thread_dependency, thread_compat_dep, libdl_dependency, diff -Nru dav1d-0.7.1/src/msac.c dav1d-0.9.1/src/msac.c --- dav1d-0.7.1/src/msac.c 2020-06-21 11:48:54.996126400 +0000 +++ dav1d-0.9.1/src/msac.c 2021-07-28 21:38:28.885852000 +0000 @@ -101,17 +101,17 @@ } int dav1d_msac_decode_subexp(MsacContext *const s, const int ref, - const int n, const unsigned k) + const int n, unsigned k) { - int i = 0; - int a = 0; - int b = k; - while ((2 << b) < n) { - if (!dav1d_msac_decode_bool_equi(s)) break; - b = k + i++; - a = (1 << b); + assert(n >> k == 8); + + unsigned a = 0; + if (dav1d_msac_decode_bool_equi(s)) { + if (dav1d_msac_decode_bool_equi(s)) + k += dav1d_msac_decode_bool_equi(s) + 1; + a = 1 << k; } - const unsigned v = dav1d_msac_decode_bools(s, b) + a; + const unsigned v = dav1d_msac_decode_bools(s, k) + a; return ref * 2 <= n ? inv_recenter(ref, v) : n - 1 - inv_recenter(n - 1 - ref, v); } diff -Nru dav1d-0.7.1/src/obu.c dav1d-0.9.1/src/obu.c --- dav1d-0.7.1/src/obu.c 2020-06-21 11:48:54.996126400 +0000 +++ dav1d-0.9.1/src/obu.c 2021-07-28 21:38:28.885852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
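The rewritten dav1d_msac_decode_subexp() in the src/msac.c hunk above exploits the fact that every caller passes n == 8 << k (hence the new assert), so the old data-driven loop can only ever take four paths, which the unrolled version encodes directly. The mapping, with a = offset added and b = number of literal bits read:

    /* bool values read   old loop                unrolled version
     * 0                  b = k,     a = 0        k   bits, a = 0
     * 1, 0               b = k,     a = 1 << k   k   bits, a = 1 << k
     * 1, 1, 0            b = k + 1, a = 2 << k   k+1 bits, a = 2 << k
     * 1, 1, 1            b = k + 2, a = 4 << k   k+2 bits, a = 4 << k
     *
     * In every case v = dav1d_msac_decode_bools(s, b) + a, followed by the
     * same inv_recenter() mapping as before. */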
* @@ -33,6 +33,7 @@ #include "dav1d/data.h" +#include "common/frame.h" #include "common/intops.h" #include "src/decode.h" @@ -55,7 +56,7 @@ hdr->profile = dav1d_get_bits(gb, 3); if (hdr->profile > 2) goto error; #if DEBUG_SEQ_HDR - printf("SEQHDR: post-profile: off=%ld\n", + printf("SEQHDR: post-profile: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif @@ -63,7 +64,7 @@ hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1); if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error; #if DEBUG_SEQ_HDR - printf("SEQHDR: post-stillpicture_flags: off=%ld\n", + printf("SEQHDR: post-stillpicture_flags: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif @@ -102,7 +103,7 @@ hdr->decoder_model_info_present = 0; } #if DEBUG_SEQ_HDR - printf("SEQHDR: post-timinginfo: off=%ld\n", + printf("SEQHDR: post-timinginfo: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif @@ -112,6 +113,8 @@ struct Dav1dSequenceHeaderOperatingPoint *const op = &hdr->operating_points[i]; op->idc = dav1d_get_bits(gb, 12); + if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00))) + goto error; op->major_level = 2 + dav1d_get_bits(gb, 3); op->minor_level = dav1d_get_bits(gb, 2); op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0; @@ -136,7 +139,7 @@ c->operating_point < hdr->num_operating_points ? c->operating_point : 0; c->operating_point_idc = hdr->operating_points[op_idx].idc; #if DEBUG_SEQ_HDR - printf("SEQHDR: post-operating-points: off=%ld\n", + printf("SEQHDR: post-operating-points: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif } @@ -146,7 +149,7 @@ hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1; hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1; #if DEBUG_SEQ_HDR - printf("SEQHDR: post-size: off=%ld\n", + printf("SEQHDR: post-size: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif hdr->frame_id_numbers_present = @@ -156,7 +159,7 @@ hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1; } #if DEBUG_SEQ_HDR - printf("SEQHDR: post-frame-id-numbers-present: off=%ld\n", + printf("SEQHDR: post-frame-id-numbers-present: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif @@ -190,7 +193,7 @@ } hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1); #if DEBUG_SEQ_HDR - printf("SEQHDR: post-screentools: off=%ld\n", + printf("SEQHDR: post-screentools: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif hdr->force_integer_mv = hdr->screen_content_tools ? 
@@ -202,7 +205,7 @@ hdr->cdef = dav1d_get_bits(gb, 1); hdr->restoration = dav1d_get_bits(gb, 1); #if DEBUG_SEQ_HDR - printf("SEQHDR: post-featurebits: off=%ld\n", + printf("SEQHDR: post-featurebits: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif @@ -262,13 +265,13 @@ } hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1); #if DEBUG_SEQ_HDR - printf("SEQHDR: post-colorinfo: off=%ld\n", + printf("SEQHDR: post-colorinfo: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif hdr->film_grain_present = dav1d_get_bits(gb, 1); #if DEBUG_SEQ_HDR - printf("SEQHDR: post-filmgrain: off=%ld\n", + printf("SEQHDR: post-filmgrain: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif @@ -365,15 +368,18 @@ hdr->show_existing_frame = !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1); #if DEBUG_FRAME_HDR - printf("HDR: post-show_existing_frame: off=%ld\n", + printf("HDR: post-show_existing_frame: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif if (hdr->show_existing_frame) { hdr->existing_frame_idx = dav1d_get_bits(gb, 3); if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval) hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length); - if (seqhdr->frame_id_numbers_present) + if (seqhdr->frame_id_numbers_present) { hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits); + Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->existing_frame_idx].p.p.frame_hdr; + if (!ref_frame_hdr || ref_frame_hdr->frame_id != hdr->frame_id) return DAV1D_ERR(EINVAL); + } return 0; } @@ -389,7 +395,7 @@ hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH || seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1); #if DEBUG_FRAME_HDR - printf("HDR: post-frametype_bits: off=%ld\n", + printf("HDR: post-frametype_bits: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->disable_cdf_update = dav1d_get_bits(gb, 1); @@ -401,7 +407,7 @@ else hdr->force_integer_mv = 0; - if (!(hdr->frame_type & 1)) + if (IS_KEY_OR_INTRA(hdr)) hdr->force_integer_mv = 1; if (seqhdr->frame_id_numbers_present) @@ -410,12 +416,12 @@ hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 : hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1); #if DEBUG_FRAME_HDR - printf("HDR: post-frame_size_override_flag: off=%ld\n", + printf("HDR: post-frame_size_override_flag: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->frame_offset = seqhdr->order_hint ? dav1d_get_bits(gb, seqhdr->order_hint_n_bits) : 0; - hdr->primary_ref_frame = !hdr->error_resilient_mode && hdr->frame_type & 1 ? + hdr->primary_ref_frame = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE; if (seqhdr->decoder_model_info_present) { @@ -434,9 +440,7 @@ } } - if (hdr->frame_type == DAV1D_FRAME_TYPE_KEY || - hdr->frame_type == DAV1D_FRAME_TYPE_INTRA) - { + if (IS_KEY_OR_INTRA(hdr)) { hdr->refresh_frame_flags = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) ? 
0xff : dav1d_get_bits(gb, 8); if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint) @@ -548,8 +552,12 @@ for (int i = 0; i < 7; i++) { if (!hdr->frame_ref_short_signaling) hdr->refidx[i] = dav1d_get_bits(gb, 3); - if (seqhdr->frame_id_numbers_present) - dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits); + if (seqhdr->frame_id_numbers_present) { + const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits); + const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1); + Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr; + if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error; + } } const int use_ref = !hdr->error_resilient_mode && hdr->frame_size_override; @@ -560,17 +568,17 @@ hdr->switchable_motion_mode = dav1d_get_bits(gb, 1); hdr->use_ref_frame_mvs = !hdr->error_resilient_mode && seqhdr->ref_frame_mvs && seqhdr->order_hint && - hdr->frame_type & 1 && dav1d_get_bits(gb, 1); + IS_INTER_OR_SWITCH(hdr) && dav1d_get_bits(gb, 1); } #if DEBUG_FRAME_HDR - printf("HDR: post-frametype-specific-bits: off=%ld\n", + printf("HDR: post-frametype-specific-bits: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->refresh_context = !seqhdr->reduced_still_picture_header && !hdr->disable_cdf_update && !dav1d_get_bits(gb, 1); #if DEBUG_FRAME_HDR - printf("HDR: post-refresh_context: off=%ld\n", + printf("HDR: post-refresh_context: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -644,7 +652,7 @@ hdr->tiling.n_bytes = hdr->tiling.update = 0; } #if DEBUG_FRAME_HDR - printf("HDR: post-tiling: off=%ld\n", + printf("HDR: post-tiling: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -667,7 +675,7 @@ } } #if DEBUG_FRAME_HDR - printf("HDR: post-quant: off=%ld\n", + printf("HDR: post-quant: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->quant.qm = dav1d_get_bits(gb, 1); @@ -679,7 +687,7 @@ hdr->quant.qm_u; } #if DEBUG_FRAME_HDR - printf("HDR: post-qm: off=%ld\n", + printf("HDR: post-qm: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -764,7 +772,7 @@ hdr->segmentation.seg_data.d[i].ref = -1; } #if DEBUG_FRAME_HDR - printf("HDR: post-segmentation: off=%ld\n", + printf("HDR: post-segmentation: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -776,7 +784,7 @@ hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0; hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0; #if DEBUG_FRAME_HDR - printf("HDR: post-delta_q_lf_flags: off=%ld\n", + printf("HDR: post-delta_q_lf_flags: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -836,7 +844,7 @@ } } #if DEBUG_FRAME_HDR - printf("HDR: post-lpf: off=%ld\n", + printf("HDR: post-lpf: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -855,7 +863,7 @@ hdr->cdef.uv_strength[0] = 0; } #if DEBUG_FRAME_HDR - printf("HDR: post-cdef: off=%ld\n", + printf("HDR: post-cdef: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -897,23 +905,23 @@ hdr->restoration.type[2] = DAV1D_RESTORATION_NONE; } #if DEBUG_FRAME_HDR - printf("HDR: post-restoration: off=%ld\n", + printf("HDR: post-restoration: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY : dav1d_get_bits(gb, 1) ? 
DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST; #if DEBUG_FRAME_HDR - printf("HDR: post-txfmmode: off=%ld\n", + printf("HDR: post-txfmmode: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->switchable_comp_refs = hdr->frame_type & 1 ? dav1d_get_bits(gb, 1) : 0; + hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 1) : 0; #if DEBUG_FRAME_HDR - printf("HDR: post-refmode: off=%ld\n", + printf("HDR: post-refmode: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->skip_mode_allowed = 0; - if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) { + if (hdr->switchable_comp_refs && IS_INTER_OR_SWITCH(hdr) && seqhdr->order_hint) { const unsigned poc = hdr->frame_offset; unsigned off_before = 0xFFFFFFFFU; int off_after = -1; @@ -970,25 +978,25 @@ } hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0; #if DEBUG_FRAME_HDR - printf("HDR: post-extskip: off=%ld\n", + printf("HDR: post-extskip: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->warp_motion = !hdr->error_resilient_mode && hdr->frame_type & 1 && + hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) && seqhdr->warped_motion && dav1d_get_bits(gb, 1); #if DEBUG_FRAME_HDR - printf("HDR: post-warpmotionbit: off=%ld\n", + printf("HDR: post-warpmotionbit: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->reduced_txtp_set = dav1d_get_bits(gb, 1); #if DEBUG_FRAME_HDR - printf("HDR: post-reducedtxtpset: off=%ld\n", + printf("HDR: post-reducedtxtpset: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif for (int i = 0; i < 7; i++) hdr->gmv[i] = dav1d_default_wm_params; - if (hdr->frame_type & 1) { + if (IS_INTER_OR_SWITCH(hdr)) { for (int i = 0; i < 7; i++) { hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY : dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM : @@ -1035,7 +1043,7 @@ } } #if DEBUG_FRAME_HDR - printf("HDR: post-gmv: off=%ld\n", + printf("HDR: post-gmv: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -1119,7 +1127,7 @@ memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data)); } #if DEBUG_FRAME_HDR - printf("HDR: post-filmgrain: off=%ld\n", + printf("HDR: post-filmgrain: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif @@ -1198,7 +1206,6 @@ const unsigned init_bit_pos = dav1d_get_bits_pos(&gb); const unsigned init_byte_pos = init_bit_pos >> 3; - const unsigned pkt_bytelen = init_byte_pos + len; // We must have read a whole number of bytes at this point (1 byte // for the header and whole bytes at a time when reading the @@ -1226,7 +1233,8 @@ switch (type) { case DAV1D_OBU_SEQ_HDR: { - Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader)); + Dav1dRef *ref = dav1d_ref_create_using_pool(c->seq_hdr_pool, + sizeof(Dav1dSequenceHeader)); if (!ref) return DAV1D_ERR(ENOMEM); Dav1dSequenceHeader *seq_hdr = ref->data; memset(seq_hdr, 0, sizeof(*seq_hdr)); @@ -1241,11 +1249,13 @@ // If we have read a sequence header which is different from // the old one, this is a new video sequence and can't use any // previous state. Free that state. 
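The dav1d_ref_create_using_pool() calls introduced above are backed by the Dav1dMemPool added in src/mem.{c,h} earlier in this diff. A minimal standalone use of that pool API, assuming only the declarations from src/mem.h (buffer size and error handling kept deliberately simple):

    #include <string.h>
    #include "src/mem.h"

    static int pool_demo(void) {
        Dav1dMemPool *pool;
        if (dav1d_mem_pool_init(&pool)) return -1;   /* pool starts with ref_cnt 1 */

        /* size must be a multiple of sizeof(void *); the Dav1dMemPoolBuffer
         * header lives at the end of the aligned allocation it describes. */
        Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(pool, 4096);
        if (!buf) { dav1d_mem_pool_end(pool); return -1; }
        memset(buf->data, 0, 4096);                  /* use the memory */

        dav1d_mem_pool_push(pool, buf); /* recycle the buffer instead of freeing it */
        dav1d_mem_pool_end(pool);       /* destroyed once the last reference drops */
        return 0;
    }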
- if (!c->seq_hdr) + + if (!c->seq_hdr) { c->frame_hdr = NULL; + c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE; // see 7.5, operating_parameter_info is allowed to change in // sequence headers of a single sequence - else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) { + } else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) { c->frame_hdr = NULL; c->mastering_display = NULL; c->content_light = NULL; @@ -1258,6 +1268,12 @@ dav1d_ref_dec(&c->refs[i].refmvs); dav1d_cdf_thread_unref(&c->cdf[i]); } + c->frame_flags |= PICTURE_FLAG_NEW_SEQUENCE; + // If operating_parameter_info changed, signal it + } else if (memcmp(seq_hdr->operating_parameter_info, c->seq_hdr->operating_parameter_info, + sizeof(seq_hdr->operating_parameter_info))) + { + c->frame_flags |= PICTURE_FLAG_NEW_OP_PARAMS_INFO; } dav1d_ref_dec(&c->seq_hdr_ref); c->seq_hdr_ref = ref; @@ -1272,7 +1288,8 @@ if (global) break; if (!c->seq_hdr) goto error; if (!c->frame_hdr_ref) { - c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader)); + c->frame_hdr_ref = dav1d_ref_create_using_pool(c->frame_hdr_pool, + sizeof(Dav1dFrameHeader)); if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM); } #ifndef NDEBUG @@ -1342,6 +1359,7 @@ // The current bit position is a multiple of 8 (because we // just aligned it) and less than 8*pkt_bytelen because // otherwise the overrun check would have fired. + const unsigned pkt_bytelen = init_byte_pos + len; const unsigned bit_pos = dav1d_get_bits_pos(&gb); assert((bit_pos & 7) == 0); assert(pkt_bytelen >= (bit_pos >> 3)); @@ -1364,24 +1382,33 @@ break; } case DAV1D_OBU_METADATA: { +#define DEBUG_OBU_METADATA 0 +#if DEBUG_OBU_METADATA + const uint8_t *const init_ptr = gb.ptr; +#endif // obu metadta type field const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb); const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3; if (gb.error) goto error; - Dav1dRef *ref; - Dav1dContentLightLevel *content_light; - Dav1dMasteringDisplay *mastering_display; - Dav1dITUTT35 *itut_t35_metadata; switch (meta_type) { - case OBU_META_HDR_CLL: - ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel)); + case OBU_META_HDR_CLL: { + Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel)); if (!ref) return DAV1D_ERR(ENOMEM); - content_light = ref->data; - memset(content_light, 0, sizeof(*content_light)); + Dav1dContentLightLevel *const content_light = ref->data; content_light->max_content_light_level = dav1d_get_bits(&gb, 16); +#if DEBUG_OBU_METADATA + printf("CLLOBU: max-content-light-level: %d [off=%td]\n", + content_light->max_content_light_level, + (gb.ptr - init_ptr) * 8 - gb.bits_left); +#endif content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16); +#if DEBUG_OBU_METADATA + printf("CLLOBU: max-frame-average-light-level: %d [off=%td]\n", + content_light->max_frame_average_light_level, + (gb.ptr - init_ptr) * 8 - gb.bits_left); +#endif // Skip the trailing bit, align to the next byte boundary and check for overrun. 
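Consumer-side sketch of the event-flag plumbing wired up in this hunk: the internal PICTURE_FLAG_NEW_SEQUENCE / PICTURE_FLAG_NEW_OP_PARAMS_INFO bits set above surface to applications through dav1d_get_event_flags(), added alongside dav1d_flush()/dav1d_close() earlier in this diff. The DAV1D_EVENT_FLAG_* names below are the public counterparts from dav1d.h and are assumed rather than shown here; ctx stands for an open Dav1dContext:

    enum Dav1dEventFlags flags;
    if (!dav1d_get_event_flags(ctx, &flags)) {
        if (flags & DAV1D_EVENT_FLAG_NEW_SEQUENCE) {
            /* a new coded video sequence started: re-read the sequence
             * header and reconfigure the output path if needed */
        }
        if (flags & DAV1D_EVENT_FLAG_NEW_OP_PARAMS_INFO) {
            /* operating_parameter_info changed within the same sequence */
        }
    }
    /* the flags are reported once and then cleared inside the context */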
dav1d_get_bits(&gb, 1); @@ -1395,22 +1422,46 @@ c->content_light = content_light; c->content_light_ref = ref; break; + } case OBU_META_HDR_MDCV: { - ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay)); + Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay)); if (!ref) return DAV1D_ERR(ENOMEM); - mastering_display = ref->data; - memset(mastering_display, 0, sizeof(*mastering_display)); + Dav1dMasteringDisplay *const mastering_display = ref->data; for (int i = 0; i < 3; i++) { mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16); mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16); +#if DEBUG_OBU_METADATA + printf("MDCVOBU: primaries[%d]: (%d, %d) [off=%td]\n", i, + mastering_display->primaries[i][0], + mastering_display->primaries[i][1], + (gb.ptr - init_ptr) * 8 - gb.bits_left); +#endif } mastering_display->white_point[0] = dav1d_get_bits(&gb, 16); +#if DEBUG_OBU_METADATA + printf("MDCVOBU: white-point-x: %d [off=%td]\n", + mastering_display->white_point[0], + (gb.ptr - init_ptr) * 8 - gb.bits_left); +#endif mastering_display->white_point[1] = dav1d_get_bits(&gb, 16); - +#if DEBUG_OBU_METADATA + printf("MDCVOBU: white-point-y: %d [off=%td]\n", + mastering_display->white_point[1], + (gb.ptr - init_ptr) * 8 - gb.bits_left); +#endif mastering_display->max_luminance = dav1d_get_bits(&gb, 32); +#if DEBUG_OBU_METADATA + printf("MDCVOBU: max-luminance: %d [off=%td]\n", + mastering_display->max_luminance, + (gb.ptr - init_ptr) * 8 - gb.bits_left); +#endif mastering_display->min_luminance = dav1d_get_bits(&gb, 32); - +#if DEBUG_OBU_METADATA + printf("MDCVOBU: min-luminance: %d [off=%td]\n", + mastering_display->min_luminance, + (gb.ptr - init_ptr) * 8 - gb.bits_left); +#endif // Skip the trailing bit, align to the next byte boundary and check for overrun. 
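
Editorial note: the MDCV metadata parsed above is stored in fixed-point form; dav1d's public headers document the chromaticity coordinates as 0.16 fixed point and the maximum/minimum luminance as 24.8 and 18.14 fixed point respectively. A hedged sketch of converting those raw fields for display follows; print_mdcv and the sample values (roughly BT.2020 primaries with a D65 white point) are illustrative, not dav1d API.

#include <stdint.h>
#include <stdio.h>

/* Convert raw MDCV fields to floating point, using the fixed-point formats
 * documented in dav1d's public headers (0.16, 24.8, 18.14). */
static void print_mdcv(const uint16_t primaries[3][2],
                       const uint16_t white_point[2],
                       uint32_t max_luminance, uint32_t min_luminance)
{
    for (int i = 0; i < 3; i++)
        printf("primary %d: (%.4f, %.4f)\n", i,
               primaries[i][0] / 65536.0, primaries[i][1] / 65536.0);
    printf("white point: (%.4f, %.4f)\n",
           white_point[0] / 65536.0, white_point[1] / 65536.0);
    printf("luminance: %.2f .. %.6f cd/m^2\n",
           max_luminance / 256.0, min_luminance / 16384.0);
}

int main(void) {
    const uint16_t prim[3][2] = { { 46396, 19136 },
                                  { 11141, 52232 },
                                  {  8585,  3015 } };
    const uint16_t wp[2] = { 20493, 21561 };
    print_mdcv(prim, wp, 1000u << 8 /* 1000 cd/m^2 */, 82 /* ~0.005 cd/m^2 */);
    return 0;
}
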
dav1d_get_bits(&gb, 1); dav1d_bytealign_get_bits(&gb); @@ -1447,9 +1498,9 @@ goto error; } - ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t)); + Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t)); if (!ref) return DAV1D_ERR(ENOMEM); - itut_t35_metadata = ref->data; + Dav1dITUTT35 *const itut_t35_metadata = ref->data; // We need our public headers to be C++ compatible, so payload can't be // a flexible array member @@ -1494,6 +1545,7 @@ dav1d_picture_ref(&c->out, &c->refs[c->frame_hdr->existing_frame_idx].p.p); dav1d_data_props_copy(&c->out.m, &in->m); + c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p); } else { // need to append this to the frame output queue const unsigned next = c->frame_thread.next++; @@ -1510,8 +1562,10 @@ if (out_delayed->p.data[0]) { const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); - if (out_delayed->visible && progress != FRAME_ERROR) + if (out_delayed->visible && progress != FRAME_ERROR) { dav1d_picture_ref(&c->out, &out_delayed->p); + c->event_flags |= dav1d_picture_get_event_flags(out_delayed); + } dav1d_thread_picture_unref(out_delayed); } dav1d_thread_picture_ref(out_delayed, diff -Nru dav1d-0.7.1/src/picture.c dav1d-0.9.1/src/picture.c --- dav1d-0.7.1/src/picture.c 2020-06-21 11:48:54.996126400 +0000 +++ dav1d-0.9.1/src/picture.c 2021-07-28 21:38:28.885852000 +0000 @@ -34,7 +34,6 @@ #include #include "common/intops.h" -#include "common/mem.h" #include "common/validate.h" #include "src/internal.h" @@ -45,7 +44,7 @@ #include "src/thread_task.h" int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) { - assert(cookie == NULL); + assert(sizeof(Dav1dMemPoolBuffer) <= DAV1D_PICTURE_ALIGNMENT); const int hbd = p->p.bpc > 8; const int aligned_w = (p->p.w + 127) & ~127; const int aligned_h = (p->p.h + 127) & ~127; @@ -67,27 +66,24 @@ p->stride[1] = uv_stride; const size_t y_sz = y_stride * aligned_h; const size_t uv_sz = uv_stride * (aligned_h >> ss_ver); - const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT; - uint8_t *const data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT); - if (!data) return DAV1D_ERR(ENOMEM); + const size_t pic_size = y_sz + 2 * uv_sz; + Dav1dMemPoolBuffer *const buf = dav1d_mem_pool_pop(cookie, pic_size + + DAV1D_PICTURE_ALIGNMENT - + sizeof(Dav1dMemPoolBuffer)); + if (!buf) return DAV1D_ERR(ENOMEM); + p->allocator_data = buf; + + uint8_t *const data = buf->data; p->data[0] = data; p->data[1] = has_chroma ? data + y_sz : NULL; p->data[2] = has_chroma ? 
data + y_sz + uv_sz : NULL; -#ifndef NDEBUG /* safety check */ - p->allocator_data = data; -#endif - return 0; } void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) { - assert(cookie == NULL); -#ifndef NDEBUG /* safety check */ - assert(p->allocator_data == p->data[0]); -#endif - dav1d_free_aligned(p->data[0]); + dav1d_mem_pool_push(cookie, p->allocator_data); } struct pic_ctx_context { @@ -198,6 +194,9 @@ dav1d_ref_dec(&c->itut_t35_ref); c->itut_t35 = NULL; + p->flags = c->frame_flags; + c->frame_flags = 0; + p->visible = f->frame_hdr->show_frame; if (p->t) { atomic_init(&p->progress[0], 0); @@ -258,6 +257,7 @@ dst->t = src->t; dst->visible = src->visible; dst->progress = src->progress; + dst->flags = src->flags; } void dav1d_picture_unref_internal(Dav1dPicture *const p) { @@ -326,3 +326,16 @@ pthread_cond_broadcast(&p->t->cond); pthread_mutex_unlock(&p->t->lock); } + +enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *const p) { + if (!p->flags) + return 0; + + enum Dav1dEventFlags flags = 0; + if (p->flags & PICTURE_FLAG_NEW_SEQUENCE) + flags |= DAV1D_EVENT_FLAG_NEW_SEQUENCE; + if (p->flags & PICTURE_FLAG_NEW_OP_PARAMS_INFO) + flags |= DAV1D_EVENT_FLAG_NEW_OP_PARAMS_INFO; + + return flags; +} diff -Nru dav1d-0.7.1/src/picture.h dav1d-0.9.1/src/picture.h --- dav1d-0.7.1/src/picture.h 2020-06-21 11:48:55.000126400 +0000 +++ dav1d-0.9.1/src/picture.h 2021-07-28 21:38:28.885852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -43,15 +43,26 @@ PLANE_TYPE_ALL, }; +enum PictureFlags { + PICTURE_FLAG_NEW_SEQUENCE = 1 << 0, + PICTURE_FLAG_NEW_OP_PARAMS_INFO = 1 << 1, +}; + typedef struct Dav1dThreadPicture { Dav1dPicture p; int visible; + enum PictureFlags flags; struct thread_data *t; // [0] block data (including segmentation map and motion vectors) // [1] pixel data atomic_uint *progress; } Dav1dThreadPicture; +typedef struct Dav1dPictureBuffer { + void *data; + struct Dav1dPictureBuffer *next; +} Dav1dPictureBuffer; + /* * Allocate a picture with custom border size. */ @@ -109,4 +120,9 @@ void dav1d_default_picture_release(Dav1dPicture *p, void *cookie); void dav1d_picture_unref_internal(Dav1dPicture *p); +/** + * Get event flags from picture flags. 
+ */ +enum Dav1dEventFlags dav1d_picture_get_event_flags(const Dav1dThreadPicture *p); + #endif /* DAV1D_SRC_PICTURE_H */ diff -Nru dav1d-0.7.1/src/ppc/looprestoration_init_tmpl.c dav1d-0.9.1/src/ppc/looprestoration_init_tmpl.c --- dav1d-0.7.1/src/ppc/looprestoration_init_tmpl.c 2020-06-21 11:48:55.000126400 +0000 +++ dav1d-0.9.1/src/ppc/looprestoration_init_tmpl.c 2021-07-28 21:38:28.885852000 +0000 @@ -49,7 +49,7 @@ static void wiener_filter_h_vsx(int32_t *hor_ptr, uint8_t *tmp_ptr, - const int16_t filterh[7], + const int16_t filterh[8], const int w, const int h) { static const i32x4 zerov = vec_splats(0); @@ -149,14 +149,10 @@ } while (0) #define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \ - i32x4 v_1 = (i32x4) vec_ld( 0, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ - i32x4 v_2 = (i32x4) vec_ld(16, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ - i32x4 v_3 = (i32x4) vec_ld(32, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ - i32x4 v_4 = (i32x4) vec_ld(48, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \ - i32x4 sum1 = -round_offset_vec; \ - i32x4 sum2 = -round_offset_vec; \ - i32x4 sum3 = -round_offset_vec; \ - i32x4 sum4 = -round_offset_vec; \ + i32x4 sum1 = round_vec; \ + i32x4 sum2 = round_vec; \ + i32x4 sum3 = round_vec; \ + i32x4 sum4 = round_vec; \ APPLY_FILTER_V(0, filterv0); \ APPLY_FILTER_V(1, filterv1); \ APPLY_FILTER_V(2, filterv2); \ @@ -164,31 +160,25 @@ APPLY_FILTER_V(4, filterv4); \ APPLY_FILTER_V(5, filterv5); \ APPLY_FILTER_V(6, filterv6); \ - sum1 = (v_1 << seven_vec) + sum1 + rounding_off_vec; \ - sum2 = (v_2 << seven_vec) + sum2 + rounding_off_vec; \ - sum3 = (v_3 << seven_vec) + sum3 + rounding_off_vec; \ - sum4 = (v_4 << seven_vec) + sum4 + rounding_off_vec; \ sum1 = sum1 >> round_bits_vec; \ sum2 = sum2 >> round_bits_vec; \ sum3 = sum3 >> round_bits_vec; \ sum4 = sum4 >> round_bits_vec; \ - i16x8 sum_short_packed_1 = (i16x8) vec_pack( sum1, sum2 ); \ - i16x8 sum_short_packed_2 = (i16x8) vec_pack( sum3, sum4 ); \ + i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \ + i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \ sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \ sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \ - sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2 ); \ + sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \ } while (0) static inline void wiener_filter_v_vsx(uint8_t *p, const ptrdiff_t p_stride, const int32_t *hor, - const int16_t filterv[7], + const int16_t filterv[8], const int w, const int h) { static const i32x4 round_bits_vec = vec_splats(11); - static const i32x4 rounding_off_vec = vec_splats(1 << 10); - static const i32x4 round_offset_vec = vec_splats(1 << 18); - static const i32x4 seven_vec = vec_splats(7); + static const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18)); i32x4 filterv0 = vec_splats((int32_t) filterv[0]); i32x4 filterv1 = vec_splats((int32_t) filterv[1]); @@ -309,7 +299,6 @@ } } - // FIXME Could split into luma and chroma specific functions, // (since first and last tops are always 0 for chroma) // FIXME Could implement a version that requires less temporary memory @@ -319,31 +308,31 @@ const uint8_t *lpf, const ptrdiff_t lpf_stride, const int w, const int h, - const int16_t filterh[7], - const int16_t filterv[7], + const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { + const int16_t (*const filter)[8] = params->filter; + // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels // of padding above and below 
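
Editorial note: in the VSX Wiener vertical pass above, the separate rounding_off_vec (1 << 10) and round_offset_vec (1 << 18) constants are replaced by a single round_vec equal to (1 << 10) - (1 << 18) that seeds the accumulators, saving vector additions per iteration. The old (v << 7) center-pixel term is dropped from this pass as well; presumably its contribution is now folded into the 8-entry filter tables the caller builds via LooprestorationParams, which is outside the scope of this sketch. A scalar model of just the constant consolidation, with hypothetical names:

#include <assert.h>
#include <stdint.h>

/* Old style: start from -offset, add the taps, add the rounding bias, shift. */
static int32_t filter_v_old(const int32_t taps[7]) {
    int32_t sum = -(1 << 18);               /* old -round_offset_vec */
    for (int i = 0; i < 7; i++) sum += taps[i];
    sum += 1 << 10;                         /* old rounding_off_vec */
    return sum >> 11;
}

/* New style: the two constants are pre-combined into the initial value. */
static int32_t filter_v_new(const int32_t taps[7]) {
    int32_t sum = (1 << 10) - (1 << 18);    /* combined round_vec */
    for (int i = 0; i < 7; i++) sum += taps[i];
    return sum >> 11;
}

int main(void) {
    const int32_t taps[7] = { 1000, -2000, 300000, 5, 7, -9, 42 };
    assert(filter_v_old(taps) == filter_v_new(taps));   /* both yield 18 */
    return 0;
}
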
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,); padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,); - wiener_filter_h_vsx(hor, tmp, filterh, w, h); - wiener_filter_v_vsx(p, p_stride, hor, filterv, w, h); - + wiener_filter_h_vsx(hor, tmp, filter[0], w, h); + wiener_filter_v_vsx(p, p_stride, hor, filter[1], w, h); } #endif -COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc) - (Dav1dLoopRestorationDSPContext *const c) +COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)(Dav1dLoopRestorationDSPContext *const c, + const int bpc) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; #if BITDEPTH == 8 - c->wiener = wiener_filter_vsx; + c->wiener[0] = c->wiener[1] = wiener_filter_vsx; #endif } diff -Nru dav1d-0.7.1/src/qm.c dav1d-0.9.1/src/qm.c --- dav1d-0.7.1/src/qm.c 2020-06-21 11:48:55.004126300 +0000 +++ dav1d-0.9.1/src/qm.c 2021-07-28 21:38:28.885852000 +0000 @@ -3066,7 +3066,6 @@ }; const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES]; -static uint8_t pb_32x32[32 * 32]; static uint8_t qm_tbl_4x4[15][2][16]; static uint8_t qm_tbl_4x8[15][2][32]; static uint8_t qm_tbl_4x16[15][2][64]; @@ -3145,8 +3144,5 @@ dav1d_qm_tbl[i][j][RTX_16X64] = dav1d_qm_tbl[i][j][RTX_16X32]; } - memset(pb_32x32, 32, sizeof(pb_32x32)); - for (int j = 0; j < 2; j++) - for (int k = 0; k < N_RECT_TX_SIZES; k++) - dav1d_qm_tbl[15][j][k] = pb_32x32; + // dav1d_qm_tbl[15][*][*] == NULL } diff -Nru dav1d-0.7.1/src/recon.h dav1d-0.9.1/src/recon.h --- dav1d-0.7.1/src/recon.h 2020-06-21 11:48:55.004126300 +0000 +++ dav1d-0.9.1/src/recon.h 2021-07-28 21:38:28.889852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -65,6 +65,14 @@ decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc); decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc); decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc); decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc); diff -Nru dav1d-0.7.1/src/recon_tmpl.c dav1d-0.9.1/src/recon_tmpl.c --- dav1d-0.7.1/src/recon_tmpl.c 2020-06-21 11:48:55.004126300 +0000 +++ dav1d-0.9.1/src/recon_tmpl.c 2021-07-28 21:38:28.889852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -33,8 +33,8 @@ #include "common/attributes.h" #include "common/bitdepth.h" #include "common/dump.h" +#include "common/frame.h" #include "common/intops.h" -#include "common/mem.h" #include "src/cdef_apply.h" #include "src/ctx.h" @@ -439,34 +439,39 @@ } else { eob = eob_bin; } + assert(eob >= 0); // base tokens uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma]; uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma]; - const uint16_t *const scan = dav1d_scans[tx][tx_class]; - int dc_tok; + unsigned rc, dc_tok; if (eob) { uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma]; uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8); - const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1; /* eob */ - unsigned rc = scan[eob], x = rc >> shift, y = rc & mask; unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4); int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2); int tok = eob_tok + 1; int level_tok = tok * 0x41; unsigned mag; - if (dbg) - printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", - t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); #define DECODE_COEFS_CLASS(tx_class) \ + unsigned x, y; \ + if (tx_class == TX_CLASS_2D) \ + rc = scan[eob], x = rc >> shift, y = rc & mask; \ + else if (tx_class == TX_CLASS_H) \ + /* Transposing reduces the stride and padding requirements */ \ + x = eob & mask, y = eob >> shift, rc = eob; \ + else /* tx_class == TX_CLASS_V */ \ + x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \ + if (dbg) \ + printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ + t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \ if (eob_tok == 2) { \ - ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \ - tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \ + ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \ tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ level_tok = tok + (3 << 6); \ if (dbg) \ @@ -474,40 +479,46 @@ imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \ ts->msac.rng); \ } \ - cf[rc] = tok; \ - if (tx_class == TX_CLASS_H) \ - /* Transposing reduces the stride and padding requirements */ \ - levels[y * stride + x] = (uint8_t) level_tok; \ - else \ - levels[x * stride + y] = (uint8_t) level_tok; \ + cf[rc] = tok << 11; \ + levels[x * stride + y] = (uint8_t) level_tok; \ for (int i = eob - 1; i > 0; i--) { /* ac */ \ - if (tx_class == TX_CLASS_H) \ - rc = i, x = rc & mask, y = rc >> shift; \ - else \ - rc = scan[i], x = rc >> shift, y = rc & mask; \ + unsigned rc_i; \ + if (tx_class == TX_CLASS_2D) \ + rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \ + else if (tx_class == TX_CLASS_H) \ + x = i & mask, y = i >> shift, rc_i = i; \ + else /* tx_class == TX_CLASS_V */ \ + x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \ assert(x < 32 && y < 32); \ uint8_t *const level = levels + x * stride + y; \ ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \ if (tx_class == TX_CLASS_2D) \ y |= x; \ tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \ - level_tok = tok * 0x41; \ if (dbg) \ printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ - t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \ + t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \ if (tok == 3) { \ mag &= 63; \ ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \ (mag > 12 ? 
6 : (mag + 1) >> 1); \ tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \ - level_tok = tok + (3 << 6); \ if (dbg) \ printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \ - imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \ + imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \ ts->msac.rng); \ + *level = (uint8_t) (tok + (3 << 6)); \ + cf[rc_i] = (tok << 11) | rc; \ + rc = rc_i; \ + } else { \ + /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \ + tok *= 0x17ff41; \ + *level = (uint8_t) tok; \ + /* tok ? (tok << 11) | rc : 0 */ \ + tok = (tok >> 9) & (rc + ~0x7ffu); \ + if (tok) rc = rc_i; \ + cf[rc_i] = tok; \ } \ - cf[rc] = tok; \ - *level = (uint8_t) level_tok; \ } \ /* dc */ \ ctx = (tx_class == TX_CLASS_2D) ? 0 : \ @@ -529,27 +540,35 @@ } \ break + const uint16_t *scan; switch (tx_class) { case TX_CLASS_2D: { const unsigned nonsquare_tx = tx >= RTX_4X8; const uint8_t (*const lo_ctx_offsets)[5] = dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)]; + scan = dav1d_scans[tx]; const ptrdiff_t stride = 4 * sh; + const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0; + const unsigned mask = 4 * sh - 1; memset(levels, 0, stride * (4 * sw + 2)); DECODE_COEFS_CLASS(TX_CLASS_2D); } case TX_CLASS_H: { -#define lo_ctx_offsets NULL + const uint8_t (*const lo_ctx_offsets)[5] = NULL; const ptrdiff_t stride = 16; + const unsigned shift = t_dim->lh + 2, shift2 = 0; + const unsigned mask = 4 * sh - 1; memset(levels, 0, stride * (4 * sh + 2)); DECODE_COEFS_CLASS(TX_CLASS_H); } case TX_CLASS_V: { + const uint8_t (*const lo_ctx_offsets)[5] = NULL; const ptrdiff_t stride = 16; + const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2; + const unsigned mask = 4 * sw - 1; memset(levels, 0, stride * (4 * sw + 2)); DECODE_COEFS_CLASS(TX_CLASS_V); } -#undef lo_ctx_offsets #undef DECODE_COEFS_CLASS default: assert(0); } @@ -565,71 +584,137 @@ printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); } + rc = 0; } // residual and sign - int dc_sign = 1 << 6; const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane]; - const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane]; + const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL; const int dq_shift = imax(0, t_dim->ctx - 2); - const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc; - const int cf_max = (1 << (7 + bitdepth)) - 1; - unsigned cul_level = 0; - - if (dc_tok) { // dc - const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l); - uint16_t *const dc_sign_cdf = - ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; - const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); - const unsigned dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5; - if (dbg) - printf("Post-dc_sign[%d][%d][%d]: r=%d\n", - chroma, dc_sign_ctx, sign, ts->msac.rng); - dc_sign = (sign - 1) & (2 << 6); + const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 
8 : f->cur.p.bpc)); + unsigned cul_level, dc_sign_level; + + if (!dc_tok) { + cul_level = 0; + dc_sign_level = 1 << 6; + if (qm_tbl) goto ac_qm; + goto ac_noqm; + } + + const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l); + uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; + const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); + if (dbg) + printf("Post-dc_sign[%d][%d][%d]: r=%d\n", + chroma, dc_sign_ctx, dc_sign, ts->msac.rng); + + unsigned dc_dq = dq_tbl[0]; + dc_sign_level = (dc_sign - 1) & (2 << 6); + + if (qm_tbl) { + dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5; if (dc_tok == 15) { - dc_tok += read_golomb(&ts->msac); + dc_tok = read_golomb(&ts->msac) + 15; if (dbg) printf("Post-dc_residual[%d->%d]: r=%d\n", dc_tok - 15, dc_tok, ts->msac.rng); dc_tok &= 0xfffff; + dc_dq = (dc_dq * dc_tok) & 0xffffff; + } else { + dc_dq *= dc_tok; + assert(dc_dq <= 0xffffff); } + cul_level = dc_tok; + dc_dq >>= dq_shift; + cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign); + + if (rc) ac_qm: { + const unsigned ac_dq = dq_tbl[1]; + do { + const int sign = dav1d_msac_decode_bool_equi(&ts->msac); + if (dbg) + printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); + const unsigned rc_tok = cf[rc]; + unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5; + + if (rc_tok >= (15 << 11)) { + tok = read_golomb(&ts->msac) + 15; + if (dbg) + printf("Post-residual[%d=%d->%d]: r=%d\n", + rc, tok - 15, tok, ts->msac.rng); - cul_level += dc_tok; - dc_tok = ((dq * dc_tok) & 0xffffff) >> dq_shift; - cf[0] = imin(dc_tok - sign, cf_max) ^ -sign; - } - for (int i = 1; i <= eob; i++) { // ac - const int rc = scan[i]; - int tok = cf[rc]; - if (!tok) continue; - - // sign - const int sign = dav1d_msac_decode_bool_equi(&ts->msac); - const unsigned dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5; - if (dbg) - printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng); + tok &= 0xfffff; + dq = (dq * tok) & 0xffffff; + } else { + tok = rc_tok >> 11; + dq *= tok; + assert(dq <= 0xffffff); + } + cul_level += tok; + dq >>= dq_shift; + cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign); - // residual - if (tok == 15) { - tok += read_golomb(&ts->msac); + rc = rc_tok & 0x3ff; + } while (rc); + } + } else { + // non-qmatrix is the common case and allows for additional optimizations + if (dc_tok == 15) { + dc_tok = read_golomb(&ts->msac) + 15; if (dbg) - printf("Post-residual[%d=%d=%d->%d]: r=%d\n", - i, rc, tok - 15, tok, ts->msac.rng); + printf("Post-dc_residual[%d->%d]: r=%d\n", + dc_tok - 15, dc_tok, ts->msac.rng); - // coefficient parsing, see 5.11.39 - tok &= 0xfffff; + dc_tok &= 0xfffff; + dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift; + dc_dq = umin(dc_dq - dc_sign, cf_max); + } else { + dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign; + assert(dc_dq <= cf_max); } + cul_level = dc_tok; + cf[0] = (coef) (dc_dq ^ -dc_sign); - // dequant, see 7.12.3 - cul_level += tok; - tok = ((dq * tok) & 0xffffff) >> dq_shift; - cf[rc] = imin(tok - sign, cf_max) ^ -sign; + if (rc) ac_noqm: { + const unsigned ac_dq = dq_tbl[1]; + do { + const int sign = dav1d_msac_decode_bool_equi(&ts->msac); + if (dbg) + printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng); + const unsigned rc_tok = cf[rc]; + unsigned tok, dq; + + // residual + if (rc_tok >= (15 << 11)) { + tok = read_golomb(&ts->msac) + 15; + if (dbg) + printf("Post-residual[%d=%d->%d]: r=%d\n", + rc, tok - 15, tok, ts->msac.rng); + + // coefficient parsing, see 5.11.39 + tok &= 0xfffff; + + // dequant, see 7.12.3 + dq = ((ac_dq * tok) & 
0xffffff) >> dq_shift; + dq = umin(dq - sign, cf_max); + } else { + // cannot exceed cf_max, so we can avoid the clipping + tok = rc_tok >> 11; + dq = ((ac_dq * tok) >> dq_shift) - sign; + assert(dq <= cf_max); + } + cul_level += tok; + cf[rc] = (coef) (dq ^ -sign); + + rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob + } while (rc); + } } // context - *res_ctx = umin(cul_level, 63) | dc_sign; + *res_ctx = umin(cul_level, 63) | dc_sign_level; return eob; } @@ -1082,11 +1167,11 @@ const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; const int dx = (int) (mvx >> 16) - 4; - const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 - - wmp->beta * 7) & ~0x3f; + const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 - + wmp->u.p.beta * 7) & ~0x3f; const int dy = (int) (mvy >> 16) - 4; - const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 - - wmp->delta * 4) & ~0x3f; + const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 - + wmp->u.p.delta * 4) & ~0x3f; const pixel *ref_ptr; ptrdiff_t ref_stride = refp->p.stride[!!pl]; @@ -1108,10 +1193,10 @@ } if (dst16 != NULL) dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride, - wmp->abcd, mx, my HIGHBD_CALL_SUFFIX); + wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); else dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride, - wmp->abcd, mx, my HIGHBD_CALL_SUFFIX); + wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX); } if (dst8) dst8 += 8 * PXSTRIDE(dstride); else dst16 += 8 * dstride; @@ -1545,7 +1630,7 @@ 4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx); const ptrdiff_t uvdstoff = 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); - if (!(f->frame_hdr->frame_type & 1)) { + if (IS_KEY_OR_INTRA(f->frame_hdr)) { // intrabc assert(!f->frame_hdr->super_res.enabled); res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0, @@ -1966,76 +2051,109 @@ return 0; } -void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { - const int sbsz = f->sb_step, sbh = f->sbh; - - if (f->frame_hdr->loopfilter.level_y[0] || - f->frame_hdr->loopfilter.level_y[1]) - { +void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) { + const int y = sby * f->sb_step * 4; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *const p[3] = { + f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) + }; + Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; + if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) { int start_of_tile_row = 0; if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby) start_of_tile_row = f->lf.tile_row++; - bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby, - start_of_tile_row); + bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row); } - if (f->lf.restore_planes) { // Store loop filtered pixels required by loop restoration - bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby); - } - if (f->seq_hdr->cdef) { - if (sby) { - const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; - pixel *p_up[3] = { - f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]), - f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), - f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), - }; - bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr, - sby * sbsz - 2, sby * sbsz); - } - const int n_blks = sbsz - 2 * (sby + 1 < sbh); - bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz, - 
imin(sby * sbsz + n_blks, f->bh)); - } - if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { - const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; - for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { - const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; - const int h_start = 8 * !!sby >> ss_ver; - const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; - pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride); - const ptrdiff_t src_stride = f->cur.stride[!!pl]; - const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride); - const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver; - const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; - const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; - const int src_w = (4 * f->bw + ss_hor) >> ss_hor; - const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; - - f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, - imin(img_h, h_end) + h_start, src_w, - f->resize_step[!!pl], f->resize_start[!!pl] - HIGHBD_CALL_SUFFIX); - } - } - if (f->lf.restore_planes) { - bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby); + bytefn(dav1d_lr_copy_lpf)(f, p, sby); } +} + +void bytefn(dav1d_filter_sbrow_cdef)(Dav1dFrameContext *const f, const int sby) { + const int sbsz = f->sb_step; + const int y = sby * sbsz * 4; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *const p[3] = { + f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) + }; + Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w; + Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; + const int start = sby * sbsz; + if (sby) { + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *p_up[3] = { + p[0] - 8 * PXSTRIDE(f->cur.stride[0]), + p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + }; + bytefn(dav1d_cdef_brow)(f, p_up, prev_mask, start - 2, start); + } + const int n_blks = sbsz - 2 * (sby + 1 < f->sbh); + const int end = imin(start + n_blks, f->bh); + bytefn(dav1d_cdef_brow)(f, p, mask, start, end); +} +void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) { + const int sbsz = f->sb_step; + const int y = sby * sbsz * 4; const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; - f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]); - f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; - f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; - f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]); - f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver; - f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver; - f->lf.prev_mask_ptr = f->lf.mask_ptr; - if ((sby & 1) || f->seq_hdr->sb128) { - f->lf.mask_ptr += f->sb128w; + const pixel *const p[3] = { + f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) + }; + pixel *const sr_p[3] = { + f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), + f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), + f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) + }; + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; + for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { + const int ss_ver = pl && 
f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int h_start = 8 * !!sby >> ss_ver; + const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; + pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride); + const ptrdiff_t src_stride = f->cur.stride[!!pl]; + const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride); + const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver; + const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; + const int src_w = (4 * f->bw + ss_hor) >> ss_hor; + const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; + + f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, + imin(img_h, h_end) + h_start, src_w, + f->resize_step[!!pl], f->resize_start[!!pl] + HIGHBD_CALL_SUFFIX); } } +void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) { + const int y = sby * f->sb_step * 4; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *const sr_p[3] = { + f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), + f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), + f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) + }; + bytefn(dav1d_lr_sbrow)(f, sr_p, sby); +} + +void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { + bytefn(dav1d_filter_sbrow_deblock)(f, sby); + if (f->seq_hdr->cdef) + bytefn(dav1d_filter_sbrow_cdef)(f, sby); + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) + bytefn(dav1d_filter_sbrow_resize)(f, sby); + if (f->lf.restore_planes) + bytefn(dav1d_filter_sbrow_lr)(f, sby); +} + void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) { const Dav1dFrameContext *const f = t->f; Dav1dTileState *const ts = t->ts; diff -Nru dav1d-0.7.1/src/ref.c dav1d-0.9.1/src/ref.c --- dav1d-0.7.1/src/ref.c 2020-06-21 11:48:55.004126300 +0000 +++ dav1d-0.9.1/src/ref.c 2021-07-28 21:38:28.889852000 +0000 @@ -27,8 +27,6 @@ #include "config.h" -#include "common/mem.h" - #include "src/ref.h" static void default_free_callback(const uint8_t *const data, void *const user_data) { @@ -36,15 +34,39 @@ dav1d_free_aligned(user_data); } -Dav1dRef *dav1d_ref_create(const size_t size) { - void *data = dav1d_alloc_aligned(size, 32); +Dav1dRef *dav1d_ref_create(size_t size) { + size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + + uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64); if (!data) return NULL; - Dav1dRef *const res = dav1d_ref_wrap(data, default_free_callback, data); - if (res) - res->data = data; - else - dav1d_free_aligned(data); + Dav1dRef *const res = (Dav1dRef*)(data + size); + res->const_data = res->user_data = res->data = data; + atomic_init(&res->ref_cnt, 1); + res->free_ref = 0; + res->free_callback = default_free_callback; + + return res; +} + +static void pool_free_callback(const uint8_t *const data, void *const user_data) { + dav1d_mem_pool_push((Dav1dMemPool*)data, user_data); +} + +Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) { + size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + + Dav1dMemPoolBuffer *const buf = + dav1d_mem_pool_pop(pool, size + sizeof(Dav1dRef)); + if (!buf) return NULL; + + Dav1dRef *const res = &((Dav1dRef*)buf)[-1]; + res->data = buf->data; + res->const_data = pool; + atomic_init(&res->ref_cnt, 1); + res->free_ref = 0; + res->free_callback = pool_free_callback; + res->user_data = buf; return res; } @@ -59,6 +81,7 @@ res->data = NULL; res->const_data = ptr; atomic_init(&res->ref_cnt, 1); 
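
Editorial note: the rewritten dav1d_ref_create above over-allocates the data buffer and places the Dav1dRef at its tail, so a reference and its payload share a single allocation; dav1d_ref_create_using_pool applies the same layout to a pooled Dav1dMemPoolBuffer, and the new free_ref flag records whether the ref itself still needs a separate free() (only the dav1d_ref_wrap path does). A minimal sketch of the tail-header idea, using plain malloc instead of dav1d's over-aligned allocator; MiniRef and the helper names are illustrative, not dav1d API.

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

/* The reference header lives at the tail of the data buffer, so creating and
 * destroying a ref costs one malloc/free instead of two. */
typedef struct MiniRef {
    void *data;
    atomic_int ref_cnt;
} MiniRef;

static MiniRef *miniref_create(size_t size) {
    /* Round the payload up so the header that follows it stays aligned. */
    size = (size + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
    uint8_t *const data = malloc(size + sizeof(MiniRef));
    if (!data) return NULL;
    MiniRef *const ref = (MiniRef *) (data + size);
    ref->data = data;
    atomic_init(&ref->ref_cnt, 1);
    return ref;
}

static void miniref_unref(MiniRef **pref) {
    MiniRef *const ref = *pref;
    if (!ref) return;
    *pref = NULL;
    if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1)
        free(ref->data);   /* frees the header too; it lives in the same block */
}

int main(void) {
    MiniRef *ref = miniref_create(100);
    if (!ref) return 1;
    ((uint8_t *) ref->data)[0] = 42;   /* payload is ordinary writable memory */
    miniref_unref(&ref);               /* last reference: one free() releases all */
    return 0;
}
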
+ res->free_ref = 1; res->free_callback = free_callback; res->user_data = user_data; @@ -76,8 +99,9 @@ if (!ref) return; if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) { + const int free_ref = ref->free_ref; ref->free_callback(ref->const_data, ref->user_data); - free(ref); + if (free_ref) free(ref); } *pref = NULL; } diff -Nru dav1d-0.7.1/src/ref.h dav1d-0.9.1/src/ref.h --- dav1d-0.7.1/src/ref.h 2020-06-21 11:48:55.004126300 +0000 +++ dav1d-0.9.1/src/ref.h 2021-07-28 21:38:28.889852000 +0000 @@ -30,6 +30,9 @@ #include "dav1d/dav1d.h" +#include "src/mem.h" +#include "src/thread.h" + #include #include @@ -37,11 +40,13 @@ void *data; const void *const_data; atomic_int ref_cnt; + int free_ref; void (*free_callback)(const uint8_t *data, void *user_data); void *user_data; }; Dav1dRef *dav1d_ref_create(size_t size); +Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size); Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr, void (*free_callback)(const uint8_t *data, void *user_data), void *user_data); diff -Nru dav1d-0.7.1/src/refmvs.c dav1d-0.9.1/src/refmvs.c --- dav1d-0.7.1/src/refmvs.c 2020-06-21 11:48:55.008126500 +0000 +++ dav1d-0.9.1/src/refmvs.c 2021-07-28 21:38:28.889852000 +0000 @@ -51,12 +51,13 @@ const mv cand_mv = ((b->mf & 1) && gmv[0].n != INVALID_MV) ? gmv[0] : b->mv.mv[n]; + *have_refmv_match = 1; + *have_newmv_match |= b->mf >> 1; + const int last = *cnt; for (int m = 0; m < last; m++) if (mvstack[m].mv.mv[0].n == cand_mv.n) { mvstack[m].weight += weight; - *have_refmv_match = 1; - *have_newmv_match |= b->mf >> 1; return; } @@ -65,8 +66,6 @@ mvstack[last].weight = weight; *cnt = last + 1; } - *have_refmv_match = 1; - *have_newmv_match |= b->mf >> 1; return; } } @@ -76,12 +75,13 @@ [1] = ((b->mf & 1) && gmv[1].n != INVALID_MV) ? gmv[1] : b->mv.mv[1], }}; + *have_refmv_match = 1; + *have_newmv_match |= b->mf >> 1; + const int last = *cnt; for (int n = 0; n < last; n++) if (mvstack[n].mv.n == cand_mv.n) { mvstack[n].weight += weight; - *have_refmv_match = 1; - *have_newmv_match |= b->mf >> 1; return; } @@ -90,8 +90,6 @@ mvstack[last].weight = weight; *cnt = last + 1; } - *have_refmv_match = 1; - *have_newmv_match |= b->mf >> 1; } } diff -Nru dav1d-0.7.1/src/scan.c dav1d-0.9.1/src/scan.c --- dav1d-0.7.1/src/scan.c 2020-06-21 11:48:55.008126500 +0000 +++ dav1d-0.9.1/src/scan.c 2021-07-28 21:38:28.889852000 +0000 @@ -30,19 +30,14 @@ #include "common/attributes.h" #include "src/scan.h" -static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = { +static const uint16_t ALIGN(scan_4x4[], 32) = { 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15, }; -static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15, -}; -static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = { + +static const uint16_t ALIGN(scan_4x8[], 32) = { 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, @@ -52,17 +47,8 @@ 14, 7, 29, 22, 15, 30, 23, 31, }; -static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = { - 0, 8, 16, 24, - 1, 9, 17, 25, - 2, 10, 18, 26, - 3, 11, 19, 27, - 4, 12, 20, 28, - 5, 13, 21, 29, - 6, 14, 22, 30, - 7, 15, 23, 31, -}; -static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = { + +static const uint16_t ALIGN(scan_4x16[], 32) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, @@ -80,37 +66,15 @@ 30, 15, 61, 46, 31, 62, 47, 63, }; -static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = { - 0, 16, 32, 48, - 1, 17, 33, 49, - 2, 18, 34, 50, - 3, 19, 35, 51, - 4, 20, 36, 52, - 5, 21, 37, 53, - 6, 22, 38, 54, - 7, 23, 39, 55, - 8, 24, 40, 
56, - 9, 25, 41, 57, - 10, 26, 42, 58, - 11, 27, 43, 59, - 12, 28, 44, 60, - 13, 29, 45, 61, - 14, 30, 46, 62, - 15, 31, 47, 63, -}; -static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = { + +static const uint16_t ALIGN(scan_8x4[], 32) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31, }; -static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = { - 0, 4, 8, 12, 16, 20, 24, 28, - 1, 5, 9, 13, 17, 21, 25, 29, - 2, 6, 10, 14, 18, 22, 26, 30, - 3, 7, 11, 15, 19, 23, 27, 31, -}; -static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = { + +static const uint16_t ALIGN(scan_8x8[], 32) = { 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, @@ -120,17 +84,8 @@ 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63, }; -static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = { - 0, 8, 16, 24, 32, 40, 48, 56, - 1, 9, 17, 25, 33, 41, 49, 57, - 2, 10, 18, 26, 34, 42, 50, 58, - 3, 11, 19, 27, 35, 43, 51, 59, - 4, 12, 20, 28, 36, 44, 52, 60, - 5, 13, 21, 29, 37, 45, 53, 61, - 6, 14, 22, 30, 38, 46, 54, 62, - 7, 15, 23, 31, 39, 47, 55, 63, -}; -static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = { + +static const uint16_t ALIGN(scan_8x16[], 32) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, @@ -148,25 +103,8 @@ 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, }; -static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = { - 0, 16, 32, 48, 64, 80, 96, 112, - 1, 17, 33, 49, 65, 81, 97, 113, - 2, 18, 34, 50, 66, 82, 98, 114, - 3, 19, 35, 51, 67, 83, 99, 115, - 4, 20, 36, 52, 68, 84, 100, 116, - 5, 21, 37, 53, 69, 85, 101, 117, - 6, 22, 38, 54, 70, 86, 102, 118, - 7, 23, 39, 55, 71, 87, 103, 119, - 8, 24, 40, 56, 72, 88, 104, 120, - 9, 25, 41, 57, 73, 89, 105, 121, - 10, 26, 42, 58, 74, 90, 106, 122, - 11, 27, 43, 59, 75, 91, 107, 123, - 12, 28, 44, 60, 76, 92, 108, 124, - 13, 29, 45, 61, 77, 93, 109, 125, - 14, 30, 46, 62, 78, 94, 110, 126, - 15, 31, 47, 63, 79, 95, 111, 127, -}; -static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = { + +static const uint16_t ALIGN(scan_8x32[], 32) = { 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, @@ -200,19 +138,15 @@ 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, }; -static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = { + +static const uint16_t ALIGN(scan_16x4[], 32) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, }; -static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = { - 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, - 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, - 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, - 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, -}; -static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = { + +static const uint16_t ALIGN(scan_16x8[], 32) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, @@ -222,17 +156,8 @@ 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127, 
}; -static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = { - 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, - 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, - 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, - 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, - 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, - 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, - 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, - 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, -}; -static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = { + +static const uint16_t ALIGN(scan_16x16[], 32) = { 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80, 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67, 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114, @@ -250,43 +175,8 @@ 188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190, 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255, }; -static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = { - 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, - 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, - 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, - 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, - 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, - 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, - 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, - 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, - 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, - 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, - 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, - 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, - 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, - 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, - 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, - 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, -}; -static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, - 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, - 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, - 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 
241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, -}; -static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = { + +static const uint16_t ALIGN(scan_16x32[], 32) = { 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, @@ -320,7 +210,8 @@ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, }; -static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = { + +static const uint16_t ALIGN(scan_32x8[], 32) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, @@ -330,7 +221,8 @@ 195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255, }; -static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = { + +static const uint16_t ALIGN(scan_32x16[], 32) = { 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73, @@ -348,7 +240,8 @@ 381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511, }; -static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = { + +static const uint16_t ALIGN(scan_32x32[], 32) = { 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131, 100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292, @@ -383,62 +276,24 @@ 892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023, }; -const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = { - [TX_4X4] = { - [TX_CLASS_2D] = av1_default_scan_4x4, - [TX_CLASS_V] = av1_mrow_scan_4x4, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [TX_8X8] = { - [TX_CLASS_2D] = av1_default_scan_8x8, - [TX_CLASS_V] = av1_mrow_scan_8x8, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [TX_16X16] = { - [TX_CLASS_2D] = av1_default_scan_16x16, - [TX_CLASS_V] = av1_mrow_scan_16x16, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [TX_32X32] = { - [TX_CLASS_2D] = av1_default_scan_32x32, - }, [TX_64X64] = { - 
[TX_CLASS_2D] = av1_default_scan_32x32, - }, [RTX_4X8] = { - [TX_CLASS_2D] = av1_default_scan_4x8, - [TX_CLASS_V] = av1_mrow_scan_4x8, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [RTX_8X4] = { - [TX_CLASS_2D] = av1_default_scan_8x4, - [TX_CLASS_V] = av1_mrow_scan_8x4, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [RTX_8X16] = { - [TX_CLASS_2D] = av1_default_scan_8x16, - [TX_CLASS_V] = av1_mrow_scan_8x16, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [RTX_16X8] = { - [TX_CLASS_2D] = av1_default_scan_16x8, - [TX_CLASS_V] = av1_mrow_scan_16x8, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [RTX_16X32] = { - [TX_CLASS_2D] = av1_default_scan_16x32, - }, [RTX_32X16] = { - [TX_CLASS_2D] = av1_default_scan_32x16, - }, [RTX_32X64] = { - [TX_CLASS_2D] = av1_default_scan_32x32, - }, [RTX_64X32] = { - [TX_CLASS_2D] = av1_default_scan_32x32, - }, [RTX_4X16] = { - [TX_CLASS_2D] = av1_default_scan_4x16, - [TX_CLASS_V] = av1_mrow_scan_4x16, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [RTX_16X4] = { - [TX_CLASS_2D] = av1_default_scan_16x4, - [TX_CLASS_V] = av1_mrow_scan_16x4, - [TX_CLASS_H] = av1_mcol_scan_16x16, - }, [RTX_8X32] = { - [TX_CLASS_2D] = av1_default_scan_8x32, - }, [RTX_32X8] = { - [TX_CLASS_2D] = av1_default_scan_32x8, - }, [RTX_16X64] = { - [TX_CLASS_2D] = av1_default_scan_16x32, - }, [RTX_64X16] = { - [TX_CLASS_2D] = av1_default_scan_32x16, - }, +const uint16_t *const dav1d_scans[N_RECT_TX_SIZES] = { + [ TX_4X4 ] = scan_4x4, + [ TX_8X8 ] = scan_8x8, + [ TX_16X16] = scan_16x16, + [ TX_32X32] = scan_32x32, + [ TX_64X64] = scan_32x32, + [RTX_4X8 ] = scan_4x8, + [RTX_8X4 ] = scan_8x4, + [RTX_8X16 ] = scan_8x16, + [RTX_16X8 ] = scan_16x8, + [RTX_16X32] = scan_16x32, + [RTX_32X16] = scan_32x16, + [RTX_32X64] = scan_32x32, + [RTX_64X32] = scan_32x32, + [RTX_4X16 ] = scan_4x16, + [RTX_16X4 ] = scan_16x4, + [RTX_8X32 ] = scan_8x32, + [RTX_32X8 ] = scan_32x8, + [RTX_16X64] = scan_16x32, + [RTX_64X16] = scan_32x16, }; diff -Nru dav1d-0.7.1/src/scan.h dav1d-0.9.1/src/scan.h --- dav1d-0.7.1/src/scan.h 2020-06-21 11:48:55.008126500 +0000 +++ dav1d-0.9.1/src/scan.h 2021-07-28 21:38:28.889852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -32,6 +32,6 @@ #include "src/levels.h" -extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3]; +extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES]; #endif /* DAV1D_SRC_SCAN_H */ diff -Nru dav1d-0.7.1/src/tables.c dav1d-0.9.1/src/tables.c --- dav1d-0.7.1/src/tables.c 2020-06-21 11:48:55.008126500 +0000 +++ dav1d-0.9.1/src/tables.c 2021-07-28 21:38:28.889852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -391,10 +391,10 @@ 0, 0, 1 << 16, 0, 0, 1 << 16, }, - .alpha = 0, - .beta = 0, - .gamma = 0, - .delta = 0, + .u.p.alpha = 0, + .u.p.beta = 0, + .u.p.gamma = 0, + .u.p.delta = 0, }; const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = { @@ -412,13 +412,11 @@ { 0 * 12 + 1, -1 * 12 + 2 }, // 1 }; -const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1 - { 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 }, - { 2, 1, 80, 1438 }, { 2, 1, 70, 1295 }, { 2, 1, 58, 1177 }, - { 2, 1, 47, 1079 }, { 2, 1, 37, 996 }, { 2, 1, 30, 925 }, - { 2, 1, 25, 863 }, { 0, 1, -1, 2589 }, { 0, 1, -1, 1618 }, - { 0, 1, -1, 1177 }, { 0, 1, -1, 925 }, { 2, 0, 56, -1 }, - { 2, 0, 22, -1 }, +const uint16_t ALIGN(dav1d_sgr_params[16][2], 4) = { + { 140, 3236 }, { 112, 2158 }, { 93, 1618 }, { 80, 1438 }, + { 70, 1295 }, { 58, 1177 }, { 47, 1079 }, { 37, 996 }, + { 30, 925 }, { 25, 863 }, { 0, 2589 }, { 0, 1618 }, + { 0, 1177 }, { 0, 925 }, { 56, 0 }, { 22, 0 }, }; const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = { @@ -548,113 +546,108 @@ } }; -#if ARCH_X86 -#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v2, v4, v6, v1, v3, v5, v7 } -#else -#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v1, v2, v3, v4, v5, v6, v7 } -#endif const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = { - // [-1, 0) - W( 0, 0, 127, 1, 0, 0, 0, 0 ), W( 0, - 1, 127, 2, 0, 0, 0, 0 ), - W( 1, - 3, 127, 4, - 1, 0, 0, 0 ), W( 1, - 4, 126, 6, - 2, 1, 0, 0 ), - W( 1, - 5, 126, 8, - 3, 1, 0, 0 ), W( 1, - 6, 125, 11, - 4, 1, 0, 0 ), - W( 1, - 7, 124, 13, - 4, 1, 0, 0 ), W( 2, - 8, 123, 15, - 5, 1, 0, 0 ), - W( 2, - 9, 122, 18, - 6, 1, 0, 0 ), W( 2, -10, 121, 20, - 6, 1, 0, 0 ), - W( 2, -11, 120, 22, - 7, 2, 0, 0 ), W( 2, -12, 119, 25, - 8, 2, 0, 0 ), - W( 3, -13, 117, 27, - 8, 2, 0, 0 ), W( 3, -13, 116, 29, - 9, 2, 0, 0 ), - W( 3, -14, 114, 32, -10, 3, 0, 0 ), W( 3, -15, 113, 35, -10, 2, 0, 0 ), - W( 3, -15, 111, 37, -11, 3, 0, 0 ), W( 3, -16, 109, 40, -11, 3, 0, 0 ), - W( 3, -16, 108, 42, -12, 3, 0, 0 ), W( 4, -17, 106, 45, -13, 3, 0, 0 ), - W( 4, -17, 104, 47, -13, 3, 0, 0 ), W( 4, -17, 102, 50, -14, 3, 0, 0 ), - W( 4, -17, 100, 52, -14, 3, 0, 0 ), W( 4, -18, 98, 55, -15, 4, 0, 0 ), - W( 4, -18, 96, 58, -15, 3, 0, 0 ), W( 4, -18, 94, 60, -16, 4, 0, 0 ), - W( 4, -18, 91, 63, -16, 4, 0, 0 ), W( 4, -18, 89, 65, -16, 4, 0, 0 ), - W( 4, -18, 87, 68, -17, 4, 0, 0 ), W( 4, -18, 85, 70, -17, 4, 0, 0 ), - W( 4, -18, 82, 73, -17, 4, 0, 0 ), W( 4, -18, 80, 75, -17, 4, 0, 0 ), - W( 4, -18, 78, 78, -18, 4, 0, 0 ), W( 4, -17, 75, 80, -18, 4, 0, 0 ), - W( 4, -17, 73, 82, -18, 4, 0, 0 ), W( 4, -17, 70, 85, -18, 4, 0, 0 ), - W( 4, -17, 68, 87, -18, 4, 0, 0 ), W( 4, -16, 65, 89, -18, 4, 0, 0 ), - W( 4, -16, 63, 91, -18, 4, 0, 0 ), W( 4, -16, 60, 94, -18, 4, 0, 0 ), - W( 3, -15, 58, 96, -18, 4, 0, 0 ), W( 4, -15, 55, 98, -18, 4, 0, 0 ), - W( 3, -14, 52, 100, -17, 4, 0, 0 ), W( 3, -14, 50, 102, -17, 4, 0, 0 ), - W( 3, -13, 47, 104, -17, 4, 0, 0 ), W( 3, -13, 45, 106, -17, 4, 0, 0 ), - W( 3, -12, 42, 108, -16, 3, 0, 0 ), W( 3, -11, 40, 109, -16, 3, 0, 0 ), - W( 3, -11, 37, 111, -15, 3, 0, 0 ), W( 2, -10, 35, 113, -15, 3, 0, 0 ), - W( 3, -10, 32, 114, -14, 3, 0, 0 ), W( 2, - 9, 29, 116, -13, 3, 0, 0 ), - W( 2, - 8, 27, 117, -13, 3, 0, 0 ), W( 2, - 8, 25, 119, -12, 2, 0, 0 ), - W( 2, - 7, 22, 120, -11, 2, 0, 0 ), W( 1, - 6, 20, 121, -10, 2, 0, 0 ), - W( 1, - 6, 18, 122, - 9, 2, 0, 0 ), W( 1, - 5, 15, 123, - 8, 2, 0, 0 ), - W( 1, - 4, 13, 124, - 7, 1, 0, 0 ), W( 1, - 4, 11, 125, - 6, 1, 0, 0 ), - W( 1, - 3, 8, 126, - 5, 1, 0, 0 ), W( 1, - 2, 
6, 126, - 4, 1, 0, 0 ), - W( 0, - 1, 4, 127, - 3, 1, 0, 0 ), W( 0, 0, 2, 127, - 1, 0, 0, 0 ), + // [-1, 0) + { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, -1, 127, 2, 0, 0, 0, 0 }, + { 1, -3, 127, 4, - 1, 0, 0, 0 }, { 1, -4, 126, 6, -2, 1, 0, 0 }, + { 1, -5, 126, 8, - 3, 1, 0, 0 }, { 1, -6, 125, 11, -4, 1, 0, 0 }, + { 1, -7, 124, 13, - 4, 1, 0, 0 }, { 2, -8, 123, 15, -5, 1, 0, 0 }, + { 2, -9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, -6, 1, 0, 0 }, + { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, -8, 2, 0, 0 }, + { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, -9, 2, 0, 0 }, + { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 }, + { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 }, + { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 }, + { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 }, + { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 }, + { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 }, + { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 }, + { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 }, + { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 }, + { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 }, + { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 }, + { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 }, + { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 }, + { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 }, + { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 }, + { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 }, + { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 }, + { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 }, + { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 }, + { 2, -8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 }, + { 2, -7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 }, + { 1, -6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 }, + { 1, -4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 }, + { 1, -3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 }, + { 0, -1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 }, // [0, 1) - W( 0, 0, 0, 127, 1, 0, 0, 0),W( 0, 0, -1, 127, 2, 0, 0, 0), - W( 0, 1, -3, 127, 4, -2, 1, 0),W( 0, 1, -5, 127, 6, -2, 1, 0), - W( 0, 2, -6, 126, 8, -3, 1, 0),W(-1, 2, -7, 126, 11, -4, 2, -1), - W(-1, 3, -8, 125, 13, -5, 2, -1),W(-1, 3, -10, 124, 16, -6, 3, -1), - W(-1, 4, -11, 123, 18, -7, 3, -1),W(-1, 4, -12, 122, 20, -7, 3, -1), - W(-1, 4, -13, 121, 23, -8, 3, -1),W(-2, 5, -14, 120, 25, -9, 4, -1), - W(-1, 5, -15, 119, 27, -10, 4, -1),W(-1, 5, -16, 118, 30, -11, 4, -1), - W(-2, 6, -17, 116, 33, -12, 5, -1),W(-2, 6, -17, 114, 35, -12, 5, -1), - W(-2, 6, -18, 113, 38, -13, 5, -1),W(-2, 7, -19, 111, 41, -14, 6, -2), - W(-2, 7, -19, 110, 43, -15, 6, -2),W(-2, 7, -20, 108, 46, -15, 6, -2), - W(-2, 7, -20, 106, 49, -16, 6, -2),W(-2, 7, -21, 104, 51, -16, 7, -2), - W(-2, 7, -21, 102, 54, -17, 7, -2),W(-2, 8, -21, 100, 56, -18, 7, -2), - W(-2, 8, -22, 98, 59, -18, 7, -2),W(-2, 8, -22, 96, 62, -19, 7, -2), - W(-2, 8, -22, 94, 64, -19, 7, -2),W(-2, 8, -22, 91, 67, -20, 8, -2), - W(-2, 8, -22, 89, 69, -20, 8, -2),W(-2, 8, -22, 87, 72, -21, 8, -2), - W(-2, 8, -21, 84, 74, -21, 8, -2),W(-2, 8, -22, 82, 77, -21, 8, -2), - W(-2, 8, -21, 79, 79, -21, 8, -2),W(-2, 8, -21, 77, 82, 
-22, 8, -2), - W(-2, 8, -21, 74, 84, -21, 8, -2),W(-2, 8, -21, 72, 87, -22, 8, -2), - W(-2, 8, -20, 69, 89, -22, 8, -2),W(-2, 8, -20, 67, 91, -22, 8, -2), - W(-2, 7, -19, 64, 94, -22, 8, -2),W(-2, 7, -19, 62, 96, -22, 8, -2), - W(-2, 7, -18, 59, 98, -22, 8, -2),W(-2, 7, -18, 56, 100, -21, 8, -2), - W(-2, 7, -17, 54, 102, -21, 7, -2),W(-2, 7, -16, 51, 104, -21, 7, -2), - W(-2, 6, -16, 49, 106, -20, 7, -2),W(-2, 6, -15, 46, 108, -20, 7, -2), - W(-2, 6, -15, 43, 110, -19, 7, -2),W(-2, 6, -14, 41, 111, -19, 7, -2), - W(-1, 5, -13, 38, 113, -18, 6, -2),W(-1, 5, -12, 35, 114, -17, 6, -2), - W(-1, 5, -12, 33, 116, -17, 6, -2),W(-1, 4, -11, 30, 118, -16, 5, -1), - W(-1, 4, -10, 27, 119, -15, 5, -1),W(-1, 4, -9, 25, 120, -14, 5, -2), - W(-1, 3, -8, 23, 121, -13, 4, -1),W(-1, 3, -7, 20, 122, -12, 4, -1), - W(-1, 3, -7, 18, 123, -11, 4, -1),W(-1, 3, -6, 16, 124, -10, 3, -1), - W(-1, 2, -5, 13, 125, -8, 3, -1),W(-1, 2, -4, 11, 126, -7, 2, -1), - W( 0, 1, -3, 8, 126, -6, 2, 0),W( 0, 1, -2, 6, 127, -5, 1, 0), - W( 0, 1, -2, 4, 127, -3, 1, 0),W( 0, 0, 0, 2, 127, -1, 0, 0), + { 0, 0, 0, 127, 1, 0, 0, 0 }, { 0, 0, -1, 127, 2, 0, 0, 0 }, + { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -5, 127, 6, -2, 1, 0 }, + { 0, 2, -6, 126, 8, -3, 1, 0 }, { -1, 2, -7, 126, 11, -4, 2, -1 }, + { -1, 3, -8, 125, 13, -5, 2, -1 }, { -1, 3, -10, 124, 16, -6, 3, -1 }, + { -1, 4, -11, 123, 18, -7, 3, -1 }, { -1, 4, -12, 122, 20, -7, 3, -1 }, + { -1, 4, -13, 121, 23, -8, 3, -1 }, { -2, 5, -14, 120, 25, -9, 4, -1 }, + { -1, 5, -15, 119, 27, -10, 4, -1 }, { -1, 5, -16, 118, 30, -11, 4, -1 }, + { -2, 6, -17, 116, 33, -12, 5, -1 }, { -2, 6, -17, 114, 35, -12, 5, -1 }, + { -2, 6, -18, 113, 38, -13, 5, -1 }, { -2, 7, -19, 111, 41, -14, 6, -2 }, + { -2, 7, -19, 110, 43, -15, 6, -2 }, { -2, 7, -20, 108, 46, -15, 6, -2 }, + { -2, 7, -20, 106, 49, -16, 6, -2 }, { -2, 7, -21, 104, 51, -16, 7, -2 }, + { -2, 7, -21, 102, 54, -17, 7, -2 }, { -2, 8, -21, 100, 56, -18, 7, -2 }, + { -2, 8, -22, 98, 59, -18, 7, -2 }, { -2, 8, -22, 96, 62, -19, 7, -2 }, + { -2, 8, -22, 94, 64, -19, 7, -2 }, { -2, 8, -22, 91, 67, -20, 8, -2 }, + { -2, 8, -22, 89, 69, -20, 8, -2 }, { -2, 8, -22, 87, 72, -21, 8, -2 }, + { -2, 8, -21, 84, 74, -21, 8, -2 }, { -2, 8, -22, 82, 77, -21, 8, -2 }, + { -2, 8, -21, 79, 79, -21, 8, -2 }, { -2, 8, -21, 77, 82, -22, 8, -2 }, + { -2, 8, -21, 74, 84, -21, 8, -2 }, { -2, 8, -21, 72, 87, -22, 8, -2 }, + { -2, 8, -20, 69, 89, -22, 8, -2 }, { -2, 8, -20, 67, 91, -22, 8, -2 }, + { -2, 7, -19, 64, 94, -22, 8, -2 }, { -2, 7, -19, 62, 96, -22, 8, -2 }, + { -2, 7, -18, 59, 98, -22, 8, -2 }, { -2, 7, -18, 56, 100, -21, 8, -2 }, + { -2, 7, -17, 54, 102, -21, 7, -2 }, { -2, 7, -16, 51, 104, -21, 7, -2 }, + { -2, 6, -16, 49, 106, -20, 7, -2 }, { -2, 6, -15, 46, 108, -20, 7, -2 }, + { -2, 6, -15, 43, 110, -19, 7, -2 }, { -2, 6, -14, 41, 111, -19, 7, -2 }, + { -1, 5, -13, 38, 113, -18, 6, -2 }, { -1, 5, -12, 35, 114, -17, 6, -2 }, + { -1, 5, -12, 33, 116, -17, 6, -2 }, { -1, 4, -11, 30, 118, -16, 5, -1 }, + { -1, 4, -10, 27, 119, -15, 5, -1 }, { -1, 4, -9, 25, 120, -14, 5, -2 }, + { -1, 3, -8, 23, 121, -13, 4, -1 }, { -1, 3, -7, 20, 122, -12, 4, -1 }, + { -1, 3, -7, 18, 123, -11, 4, -1 }, { -1, 3, -6, 16, 124, -10, 3, -1 }, + { -1, 2, -5, 13, 125, -8, 3, -1 }, { -1, 2, -4, 11, 126, -7, 2, -1 }, + { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -2, 6, 127, -5, 1, 0 }, + { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, 0, 2, 127, -1, 0, 0 }, // [1, 2) - W( 0, 0, 0, 1, 127, 0, 0, 0 ),W( 0, 0, 0, - 1, 127, 2, 0, 0 ), - W( 0, 0, 1, - 3, 127, 4, - 1, 0 ), W( 0, 0, 1, - 4, 
126, 6, - 2, 1 ), - W( 0, 0, 1, - 5, 126, 8, - 3, 1 ), W( 0, 0, 1, - 6, 125, 11, - 4, 1 ), - W( 0, 0, 1, - 7, 124, 13, - 4, 1 ), W( 0, 0, 2, - 8, 123, 15, - 5, 1 ), - W( 0, 0, 2, - 9, 122, 18, - 6, 1 ), W( 0, 0, 2, -10, 121, 20, - 6, 1 ), - W( 0, 0, 2, -11, 120, 22, - 7, 2 ), W( 0, 0, 2, -12, 119, 25, - 8, 2 ), - W( 0, 0, 3, -13, 117, 27, - 8, 2 ), W( 0, 0, 3, -13, 116, 29, - 9, 2 ), - W( 0, 0, 3, -14, 114, 32, -10, 3 ), W( 0, 0, 3, -15, 113, 35, -10, 2 ), - W( 0, 0, 3, -15, 111, 37, -11, 3 ), W( 0, 0, 3, -16, 109, 40, -11, 3 ), - W( 0, 0, 3, -16, 108, 42, -12, 3 ), W( 0, 0, 4, -17, 106, 45, -13, 3 ), - W( 0, 0, 4, -17, 104, 47, -13, 3 ), W( 0, 0, 4, -17, 102, 50, -14, 3 ), - W( 0, 0, 4, -17, 100, 52, -14, 3 ), W( 0, 0, 4, -18, 98, 55, -15, 4 ), - W( 0, 0, 4, -18, 96, 58, -15, 3 ), W( 0, 0, 4, -18, 94, 60, -16, 4 ), - W( 0, 0, 4, -18, 91, 63, -16, 4 ), W( 0, 0, 4, -18, 89, 65, -16, 4 ), - W( 0, 0, 4, -18, 87, 68, -17, 4 ), W( 0, 0, 4, -18, 85, 70, -17, 4 ), - W( 0, 0, 4, -18, 82, 73, -17, 4 ), W( 0, 0, 4, -18, 80, 75, -17, 4 ), - W( 0, 0, 4, -18, 78, 78, -18, 4 ), W( 0, 0, 4, -17, 75, 80, -18, 4 ), - W( 0, 0, 4, -17, 73, 82, -18, 4 ), W( 0, 0, 4, -17, 70, 85, -18, 4 ), - W( 0, 0, 4, -17, 68, 87, -18, 4 ), W( 0, 0, 4, -16, 65, 89, -18, 4 ), - W( 0, 0, 4, -16, 63, 91, -18, 4 ), W( 0, 0, 4, -16, 60, 94, -18, 4 ), - W( 0, 0, 3, -15, 58, 96, -18, 4 ), W( 0, 0, 4, -15, 55, 98, -18, 4 ), - W( 0, 0, 3, -14, 52, 100, -17, 4 ), W( 0, 0, 3, -14, 50, 102, -17, 4 ), - W( 0, 0, 3, -13, 47, 104, -17, 4 ), W( 0, 0, 3, -13, 45, 106, -17, 4 ), - W( 0, 0, 3, -12, 42, 108, -16, 3 ), W( 0, 0, 3, -11, 40, 109, -16, 3 ), - W( 0, 0, 3, -11, 37, 111, -15, 3 ), W( 0, 0, 2, -10, 35, 113, -15, 3 ), - W( 0, 0, 3, -10, 32, 114, -14, 3 ), W( 0, 0, 2, - 9, 29, 116, -13, 3 ), - W( 0, 0, 2, - 8, 27, 117, -13, 3 ), W( 0, 0, 2, - 8, 25, 119, -12, 2 ), - W( 0, 0, 2, - 7, 22, 120, -11, 2 ), W( 0, 0, 1, - 6, 20, 121, -10, 2 ), - W( 0, 0, 1, - 6, 18, 122, - 9, 2 ), W( 0, 0, 1, - 5, 15, 123, - 8, 2 ), - W( 0, 0, 1, - 4, 13, 124, - 7, 1 ), W( 0, 0, 1, - 4, 11, 125, - 6, 1 ), - W( 0, 0, 1, - 3, 8, 126, - 5, 1 ), W( 0, 0, 1, - 2, 6, 126, - 4, 1 ), - W( 0, 0, 0, - 1, 4, 127, - 3, 1 ), W( 0, 0, 0, 0, 2, 127, - 1, 0 ), + { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, -1, 127, 2, 0, 0 }, + { 0, 0, 1, -3, 127, 4, -1, 0 }, { 0, 0, 1, -4, 126, 6, -2, 1 }, + { 0, 0, 1, -5, 126, 8, -3, 1 }, { 0, 0, 1, -6, 125, 11, -4, 1 }, + { 0, 0, 1, -7, 124, 13, -4, 1 }, { 0, 0, 2, -8, 123, 15, -5, 1 }, + { 0, 0, 2, -9, 122, 18, -6, 1 }, { 0, 0, 2, -10, 121, 20, -6, 1 }, + { 0, 0, 2, -11, 120, 22, -7, 2 }, { 0, 0, 2, -12, 119, 25, -8, 2 }, + { 0, 0, 3, -13, 117, 27, -8, 2 }, { 0, 0, 3, -13, 116, 29, -9, 2 }, + { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 }, + { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 }, + { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 }, + { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 }, + { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 }, + { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 }, + { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 }, + { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 }, + { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 }, + { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 }, + { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 }, + { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 }, + { 0, 0, 4, -16, 
63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 }, + { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 }, + { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 }, + { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 }, + { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 }, + { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 }, + { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, -9, 29, 116, -13, 3 }, + { 0, 0, 2, -8, 27, 117, -13, 3 }, { 0, 0, 2, -8, 25, 119, -12, 2 }, + { 0, 0, 2, -7, 22, 120, -11, 2 }, { 0, 0, 1, -6, 20, 121, -10, 2 }, + { 0, 0, 1, -6, 18, 122, -9, 2 }, { 0, 0, 1, -5, 15, 123, -8, 2 }, + { 0, 0, 1, -4, 13, 124, -7, 1 }, { 0, 0, 1, -4, 11, 125, -6, 1 }, + { 0, 0, 1, -3, 8, 126, -5, 1 }, { 0, 0, 1, -2, 6, 126, -4, 1 }, + { 0, 0, 0, -1, 4, 127, -3, 1 }, { 0, 0, 0, 0, 2, 127, -1, 0 }, // dummy (replicate row index 191) - W( 0, 0, 0, 0, 2, 127, - 1, 0 ), + { 0, 0, 0, 0, 2, 127, -1, 0 }, }; const int8_t ALIGN(dav1d_resize_filter[64][8], 8) = { @@ -692,7 +685,7 @@ { 0, -1, 2, -4, -127, 3, -1, 0 }, { 0, 0, 1, -2, -128, 1, 0, 0 }, }; -const uint8_t dav1d_sm_weights[128] = { +const uint8_t ALIGN(dav1d_sm_weights[128], 16) = { // Unused, because we always offset by bs, which is at least 2. 0, 0, // bs = 2 diff -Nru dav1d-0.7.1/src/tables.h dav1d-0.9.1/src/tables.h --- dav1d-0.7.1/src/tables.h 2020-06-21 11:48:55.008126500 +0000 +++ dav1d-0.9.1/src/tables.h 2021-07-28 21:38:28.889852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -107,7 +107,7 @@ extern const int8_t dav1d_cdef_directions[12][2]; -extern const int16_t dav1d_sgr_params[16][4]; +extern const uint16_t dav1d_sgr_params[16][2]; extern const uint8_t dav1d_sgr_x_by_x[256]; extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8]; diff -Nru dav1d-0.7.1/src/thread.h dav1d-0.9.1/src/thread.h --- dav1d-0.7.1/src/thread.h 2020-06-21 11:48:55.008126500 +0000 +++ dav1d-0.9.1/src/thread.h 2021-07-28 21:38:28.889852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
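The tables.c/tables.h hunks above repack dav1d_sgr_params from int16_t [16][4] (r0, r1, e0, e1) to uint16_t [16][2], keeping only the two strengths and storing 0 where the old table used -1 for a disabled pass. A minimal C sketch, under the assumption (visible in the old table) that every row with a usable first pass had r0 == 2 and every row with a usable second pass had r1 == 1, showing that the dropped radii remain recoverable; the helper name is hypothetical and not part of the patch:

#include <stdint.h>

/* Hypothetical helper: recover the old (r0, r1) columns from the repacked
 * strength-only table. A zero strength marks a disabled pass, which is the
 * only case where the old radius differed from 2 (r0) or 1 (r1). */
static void sgr_radii_from_strengths(const uint16_t params[2],
                                     int *const r0, int *const r1)
{
    *r0 = params[0] ? 2 : 0; /* old rows 10-13 had r0 == 0, e0 == -1 */
    *r1 = params[1] ? 1 : 0; /* old rows 14-15 had r1 == 0, e1 == -1 */
}

This is only meant to show that the two dropped columns were redundant; how dav1d 0.9.x actually derives the radii in the loop restoration code is not part of this diff.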
* @@ -169,6 +169,14 @@ pthread_setname_np(pthread_self(), "%s", (void*)name); } +#elif defined(__HAIKU__) + +#include + +static inline void dav1d_set_thread_name(const char *const name) { + rename_thread(find_thread(NULL), name); +} + #else #define dav1d_set_thread_name(name) do {} while (0) diff -Nru dav1d-0.7.1/src/thread_task.c dav1d-0.9.1/src/thread_task.c --- dav1d-0.7.1/src/thread_task.c 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/thread_task.c 2021-07-28 21:38:28.889852000 +0000 @@ -29,6 +29,140 @@ #include "src/thread_task.h" +int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) { + struct PostFilterThreadData *const pftd = f->lf.thread.pftd; + const int frame_idx = (int)(f - f->c->fc); + + const int has_deblock = f->frame_hdr->loopfilter.level_y[0] || + f->frame_hdr->loopfilter.level_y[1] || + f->lf.restore_planes; + const int has_cdef = f->seq_hdr->cdef; + const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; + const int has_lr = !!f->lf.restore_planes; + f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr; + if (f->lf.thread.npf == 0) return 0; + + pthread_mutex_lock(&pftd->lock); + + Dav1dTask *tasks = f->lf.thread.tasks; + int num_tasks = f->sbh * f->lf.thread.npf; + if (num_tasks > f->lf.thread.num_tasks) { + const size_t size = sizeof(Dav1dTask) * num_tasks; + tasks = realloc(f->lf.thread.tasks, size); + if (!tasks) { + pthread_mutex_unlock(&pftd->lock); + return -1; + } + memset(tasks, 0, size); + f->lf.thread.tasks = tasks; + f->lf.thread.num_tasks = num_tasks; + } + +#define create_task(task, ready_cond, start_cond) \ + do { \ + t = &tasks[num_tasks++]; \ + t->status = ready_cond ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \ + t->start = start_cond; \ + t->frame_id = frame_cnt; \ + t->frame_idx = frame_idx; \ + t->sby = sby; \ + t->fn = f->bd_fn.filter_sbrow_##task; \ + t->last_deps[0] = NULL; \ + t->last_deps[1] = NULL; \ + t->next_deps[0] = NULL; \ + t->next_deps[1] = NULL; \ + t->next_exec = NULL; \ + } while (0) + + Dav1dTask *last_sbrow_deblock = NULL; + Dav1dTask *last_sbrow_cdef = NULL; + Dav1dTask *last_sbrow_resize = NULL; + Dav1dTask *last_sbrow_lr = NULL; + num_tasks = 0; + const int frame_cnt = pftd->frame_cnt++; + + for (int sby = 0; sby < f->sbh; ++sby) { + Dav1dTask *t; + Dav1dTask *last = NULL; + if (has_deblock) { + create_task(deblock, sby == 0, 0); + if (sby) { + t->last_deps[1] = last_sbrow_deblock; + last_sbrow_deblock->next_deps[1] = t; + } + last = t; + last_sbrow_deblock = t; + } + if (has_cdef) { + create_task(cdef, sby == 0 && !has_deblock, has_deblock); + if (has_deblock) { + t->last_deps[0] = last; + last->next_deps[0] = t; + } + if (sby) { + t->last_deps[1] = last_sbrow_cdef; + last_sbrow_cdef->next_deps[1] = t; + } + last = t; + last_sbrow_cdef = t; + }; + if (has_resize) { + create_task(resize, sby == 0 && !last, !!last); + if (last) { + t->last_deps[0] = last; + last->next_deps[0] = t; + } + if (sby) { + t->last_deps[1] = last_sbrow_resize; + last_sbrow_resize->next_deps[1] = t; + } + last = t; + last_sbrow_resize = t; + } + if (has_lr) { + create_task(lr, sby == 0 && !last, !!last); + if (last) { + t->last_deps[0] = last; + last->next_deps[0] = t; + } + if (sby) { + t->last_deps[1] = last_sbrow_lr; + last_sbrow_lr->next_deps[1] = t; + } + last_sbrow_lr = t; + } + } + f->lf.thread.done = 0; + pthread_mutex_unlock(&pftd->lock); + + return 0; +} + +void dav1d_task_schedule(struct PostFilterThreadData *const pftd, + Dav1dTask *const t) +{ + Dav1dTask **pt = &pftd->tasks; + while (*pt && + 
((*pt)->sby < t->sby || + ((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id))) + pt = &(*pt)->next_exec; + t->next_exec = *pt; + *pt = t; + pthread_cond_signal(&pftd->cond); +} + +static inline void update_task(Dav1dTask *const t, const int dep_type, + Dav1dFrameContext *const f) +{ + if (!t->last_deps[!dep_type] || + t->last_deps[!dep_type]->status == DAV1D_TASK_DONE) + { + t->status = DAV1D_TASK_READY; + if (t->start) + dav1d_task_schedule(f->lf.thread.pftd, t); + } +} + void *dav1d_frame_task(void *const data) { Dav1dFrameContext *const f = data; @@ -140,3 +274,98 @@ return NULL; } + +static inline int handle_abortion(Dav1dPostFilterContext *const pf, + Dav1dContext *const c, + struct PostFilterThreadData *const pftd) +{ + const int flush = atomic_load_explicit(c->flush, memory_order_acquire); + if (flush) { + pthread_mutex_lock(&pf->td.lock); + pf->flushed = 0; + pthread_mutex_unlock(&pf->td.lock); + } + for (unsigned i = 0; i < c->n_fc; i++) { + Dav1dFrameContext *const f = &c->fc[i]; + int send_signal; + if (flush) // TODO before merge, see if this can be safely merged + send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0; + else + send_signal = f->lf.thread.done == -1; + for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) { + Dav1dTask *const t = &f->lf.thread.tasks[j]; + if (t->status == DAV1D_TASK_RUNNING || + (t->status == DAV1D_TASK_DONE && t->start != -1)) + send_signal = 0; + } + if (send_signal) { + if (!flush) { + Dav1dTask **pt = &pftd->tasks; + while (*pt) { + if ((*pt)->frame_idx == i) + *pt = (*pt)->next_exec; + else + pt = &(*pt)->next_exec; + } + } + f->lf.thread.done = 1; + pthread_cond_signal(&f->lf.thread.cond); + } + } + if (flush) { + pthread_mutex_lock(&pf->td.lock); + pf->flushed = 1; + pthread_cond_signal(&pf->td.cond); + pthread_mutex_unlock(&pf->td.lock); + } + return !flush; +} + +void *dav1d_postfilter_task(void *data) { + Dav1dPostFilterContext *const pf = data; + Dav1dContext *const c = pf->c; + struct PostFilterThreadData *pftd = &c->postfilter_thread; + + dav1d_set_thread_name("dav1d-postfilter"); + + int exec = 1; + pthread_mutex_lock(&pftd->lock); + for (;;) { + if (!exec && !pf->die) + pthread_cond_wait(&pftd->cond, &pftd->lock); + if (!(exec = handle_abortion(pf, c, pftd))) continue; + if (pf->die) break; + + Dav1dTask *const t = pftd->tasks; + if (!t) { exec = 0; continue; } + pftd->tasks = t->next_exec; + t->status = DAV1D_TASK_RUNNING; + + pthread_mutex_unlock(&pftd->lock); + Dav1dFrameContext *const f = &c->fc[t->frame_idx]; + t->fn(f, t->sby); + exec = 1; + pthread_mutex_lock(&pftd->lock); + + if (t->next_deps[0]) + update_task(t->next_deps[0], 0, f); + if (t->next_deps[1]) + update_task(t->next_deps[1], 1, f); + t->status = DAV1D_TASK_DONE; + if (!t->next_deps[0]) { + const enum PlaneType progress_plane_type = + c->n_fc > 1 && f->frame_hdr->refresh_context ? 
+ PLANE_TYPE_Y : PLANE_TYPE_ALL; + const int y = (t->sby + 1) * f->sb_step * 4; + dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type); + if (t->sby + 1 == f->sbh) { + f->lf.thread.done = 1; + pthread_cond_signal(&f->lf.thread.cond); + } + } + t->start = -1; + } + pthread_mutex_unlock(&pftd->lock); + + return NULL; +} diff -Nru dav1d-0.7.1/src/thread_task.h dav1d-0.9.1/src/thread_task.h --- dav1d-0.7.1/src/thread_task.h 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/thread_task.h 2021-07-28 21:38:28.889852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -35,10 +35,33 @@ #define FRAME_ERROR (UINT_MAX - 1) #define TILE_ERROR (INT_MAX - 1) -int dav1d_decode_frame(Dav1dFrameContext *f); +enum TaskStatus { + DAV1D_TASK_DEFAULT, + DAV1D_TASK_READY, + DAV1D_TASK_RUNNING, + DAV1D_TASK_DONE, +}; + +struct Dav1dTask { + enum TaskStatus status; // task status + int start; // frame thread start flag + unsigned frame_idx; // frame thread id + int frame_id; // frame ordering + int sby; // sbrow + filter_sbrow_fn fn; // task work + Dav1dTask *last_deps[2]; // dependencies + Dav1dTask *next_deps[2]; // dependant tasks + Dav1dTask *next_exec; // tasks scheduling +}; + +int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f); +void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t); + void *dav1d_frame_task(void *data); +void *dav1d_tile_task(void *data); +void *dav1d_postfilter_task(void *data); +int dav1d_decode_frame(Dav1dFrameContext *f); int dav1d_decode_tile_sbrow(Dav1dTileContext *t); -void *dav1d_tile_task(void *data); #endif /* DAV1D_SRC_THREAD_TASK_H */ diff -Nru dav1d-0.7.1/src/warpmv.c dav1d-0.9.1/src/warpmv.c --- dav1d-0.7.1/src/warpmv.c 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/warpmv.c 2021-07-28 21:38:28.889852000 +0000 @@ -82,21 +82,21 @@ if (mat[2] <= 0) return 1; - wm->alpha = iclip_wmp(mat[2] - 0x10000); - wm->beta = iclip_wmp(mat[3]); + wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000); + wm->u.p.beta = iclip_wmp(mat[3]); int shift; const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]); const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y; const int rnd = (1 << shift) >> 1; - wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1)); + wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1)); const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y; - wm->delta = iclip_wmp(mat[5] - + wm->u.p.delta = iclip_wmp(mat[5] - apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) - 0x10000); - return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) || - (4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000); + return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) || + (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000); } static int resolve_divisor_64(const uint64_t d, int *const shift) { diff -Nru dav1d-0.7.1/src/wedge.c dav1d-0.9.1/src/wedge.c --- dav1d-0.7.1/src/wedge.c 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/wedge.c 2021-07-28 21:38:28.889852000 +0000 @@ -45,41 +45,41 @@ }; typedef struct { - enum WedgeDirectionType direction; - int x_offset; - int y_offset; + uint8_t /* enum WedgeDirectionType */ direction; + uint8_t x_offset; + uint8_t y_offset; } wedge_code_type; static const wedge_code_type wedge_codebook_16_hgtw[16] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 4 }, { 
WEDGE_OBLIQUE63, 4, 4 }, { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 }, - { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 }, - { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, - { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; static const wedge_code_type wedge_codebook_16_hltw[16] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, - { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 }, - { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 }, - { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, - { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; static const wedge_code_type wedge_codebook_16_heqw[16] = { - { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 }, - { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 }, - { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, - { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; diff -Nru dav1d-0.7.1/src/win32/thread.c dav1d-0.9.1/src/win32/thread.c --- dav1d-0.7.1/src/win32/thread.c 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/win32/thread.c 2021-07-28 21:38:28.893852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -40,9 +40,12 @@ static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR); COLD void dav1d_init_thread(void) { - set_thread_description = - (void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"), - "SetThreadDescription"); +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + HANDLE kernel32 = GetModuleHandleW(L"kernel32.dll"); + if (kernel32) + set_thread_description = + (void*)GetProcAddress(kernel32, "SetThreadDescription"); +#endif } #undef dav1d_set_thread_name diff -Nru dav1d-0.7.1/src/x86/cdef16_avx2.asm dav1d-0.9.1/src/x86/cdef16_avx2.asm --- dav1d-0.7.1/src/x86/cdef16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/cdef16_avx2.asm 2021-07-28 21:38:28.893852000 +0000 @@ -0,0 +1,485 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA + +tap_table: dw 4, 2, 3, 3, 2, 1 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + +dir_shift: times 2 dw 0x4000 + times 2 dw 0x1000 + +pw_2048: times 2 dw 2048 + +cextern cdef_dir_8bpc_avx2.main + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride + ; load p0/p1 + movsx offq, byte [dirq+kq+%1] ; off1 +%if %5 == 4 + movq xm5, [stkq+offq*2+%6*0] ; p0 + movq xm6, [stkq+offq*2+%6*2] + movhps xm5, [stkq+offq*2+%6*1] + movhps xm6, [stkq+offq*2+%6*3] + vinserti128 m5, xm6, 1 +%else + movu xm5, [stkq+offq*2+%6*0] ; p0 + vinserti128 m5, [stkq+offq*2+%6*1], 1 +%endif + neg offq ; -off1 +%if %5 == 4 + movq xm6, [stkq+offq*2+%6*0] ; p1 + movq xm9, [stkq+offq*2+%6*2] + movhps xm6, [stkq+offq*2+%6*1] + movhps xm9, [stkq+offq*2+%6*3] + vinserti128 m6, xm9, 1 +%else + movu xm6, [stkq+offq*2+%6*0] ; p1 + vinserti128 m6, [stkq+offq*2+%6*1], 1 +%endif + ; out of bounds values are set to a value that is a both a large unsigned + ; value and a negative signed value. 
+ ; use signed max and unsigned min to remove them + pmaxsw m7, m5 ; max after p0 + pminuw m8, m5 ; min after p0 + pmaxsw m7, m6 ; max after p1 + pminuw m8, m6 ; min after p1 + + ; accumulate sum[m15] over p0/p1 + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + pabsw m9, m5 + pabsw m10, m6 + psignw m11, %4, m5 + psignw m12, %4, m6 + psrlw m5, m9, %2 + psrlw m6, m10, %2 + psubusw m5, %3, m5 + psubusw m6, %3, m6 + pminuw m5, m9 ; constrain(diff_p0) + pminuw m6, m10 ; constrain(diff_p1) + pmullw m5, m11 ; constrain(diff_p0) * taps + pmullw m6, m12 ; constrain(diff_p1) * taps + paddw m15, m5 + paddw m15, m6 +%endmacro + +%macro cdef_filter_fn 3 ; w, h, stride +INIT_YMM avx2 +%if %1 != 4 || %2 != 8 +cglobal cdef_filter_%1x%2_16bpc, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + dst, stride, left, top, pri, sec, \ + stride3, dst4, edge +%else +cglobal cdef_filter_%1x%2_16bpc, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ + dst, stride, left, top, pri, sec, \ + stride3, dst4, edge +%endif +%define px rsp+2*16+2*%3 + pcmpeqw m14, m14 + psllw m14, 15 ; 0x8000 + mov edged, r8m + + ; prepare pixel buffers - body/right +%if %1 == 4 + INIT_XMM avx2 +%endif +%if %2 == 8 + lea dst4q, [dstq+strideq*4] +%endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + movu m1, [dstq+strideq*0] + movu m2, [dstq+strideq*1] + movu m3, [dstq+strideq*2] + movu m4, [dstq+stride3q] + mova [px+0*%3], m1 + mova [px+1*%3], m2 + mova [px+2*%3], m3 + mova [px+3*%3], m4 +%if %2 == 8 + movu m1, [dst4q+strideq*0] + movu m2, [dst4q+strideq*1] + movu m3, [dst4q+strideq*2] + movu m4, [dst4q+stride3q] + mova [px+4*%3], m1 + mova [px+5*%3], m2 + mova [px+6*%3], m3 + mova [px+7*%3], m4 +%endif + jmp .body_done +.no_right: +%if %1 == 4 + movq xm1, [dstq+strideq*0] + movq xm2, [dstq+strideq*1] + movq xm3, [dstq+strideq*2] + movq xm4, [dstq+stride3q] + movq [px+0*%3], xm1 + movq [px+1*%3], xm2 + movq [px+2*%3], xm3 + movq [px+3*%3], xm4 +%else + mova xm1, [dstq+strideq*0] + mova xm2, [dstq+strideq*1] + mova xm3, [dstq+strideq*2] + mova xm4, [dstq+stride3q] + mova [px+0*%3], xm1 + mova [px+1*%3], xm2 + mova [px+2*%3], xm3 + mova [px+3*%3], xm4 +%endif + movd [px+0*%3+%1*2], xm14 + movd [px+1*%3+%1*2], xm14 + movd [px+2*%3+%1*2], xm14 + movd [px+3*%3+%1*2], xm14 +%if %2 == 8 + %if %1 == 4 + movq xm1, [dst4q+strideq*0] + movq xm2, [dst4q+strideq*1] + movq xm3, [dst4q+strideq*2] + movq xm4, [dst4q+stride3q] + movq [px+4*%3], xm1 + movq [px+5*%3], xm2 + movq [px+6*%3], xm3 + movq [px+7*%3], xm4 + %else + mova xm1, [dst4q+strideq*0] + mova xm2, [dst4q+strideq*1] + mova xm3, [dst4q+strideq*2] + mova xm4, [dst4q+stride3q] + mova [px+4*%3], xm1 + mova [px+5*%3], xm2 + mova [px+6*%3], xm3 + mova [px+7*%3], xm4 + %endif + movd [px+4*%3+%1*2], xm14 + movd [px+5*%3+%1*2], xm14 + movd [px+6*%3+%1*2], xm14 + movd [px+7*%3+%1*2], xm14 +%endif +.body_done: + + ; top + test edgeb, 4 ; have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + movu m1, [topq+strideq*0-%1] + movu m2, [topq+strideq*1-%1] + movu [px-2*%3-%1], m1 + movu [px-1*%3-%1], m2 + jmp .top_done +.top_no_right: + movu m1, [topq+strideq*0-%1*2] + movu m2, [topq+strideq*1-%1*2] + movu [px-2*%3-%1*2], m1 + movu [px-1*%3-%1*2], m2 + movd [px-2*%3+%1*2], xm14 + movd [px-1*%3+%1*2], xm14 + jmp .top_done +.top_no_left: + test edgeb, 2 ; have_right + jz .top_no_left_right + movu m1, [topq+strideq*0] + movu m2, [topq+strideq*1] + mova [px-2*%3+0], m1 + mova [px-1*%3+0], m2 + movd [px-2*%3-4], xm14 + movd [px-1*%3-4], 
xm14 + jmp .top_done +.top_no_left_right: +%if %1 == 4 + movq xm1, [topq+strideq*0] + movq xm2, [topq+strideq*1] + movq [px-2*%3+0], xm1 + movq [px-1*%3+0], xm2 +%else + mova xm1, [topq+strideq*0] + mova xm2, [topq+strideq*1] + mova [px-2*%3+0], xm1 + mova [px-1*%3+0], xm2 +%endif + movd [px-2*%3-4], xm14 + movd [px-1*%3-4], xm14 + movd [px-2*%3+%1*2], xm14 + movd [px-1*%3+%1*2], xm14 + jmp .top_done +.no_top: + movu [px-2*%3-%1], m14 + movu [px-1*%3-%1], m14 +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + mova xm1, [leftq+ 0] +%if %2 == 8 + mova xm2, [leftq+16] +%endif + movd [px+0*%3-4], xm1 + pextrd [px+1*%3-4], xm1, 1 + pextrd [px+2*%3-4], xm1, 2 + pextrd [px+3*%3-4], xm1, 3 +%if %2 == 8 + movd [px+4*%3-4], xm2 + pextrd [px+5*%3-4], xm2, 1 + pextrd [px+6*%3-4], xm2, 2 + pextrd [px+7*%3-4], xm2, 3 +%endif + jmp .left_done +.no_left: + movd [px+0*%3-4], xm14 + movd [px+1*%3-4], xm14 + movd [px+2*%3-4], xm14 + movd [px+3*%3-4], xm14 +%if %2 == 8 + movd [px+4*%3-4], xm14 + movd [px+5*%3-4], xm14 + movd [px+6*%3-4], xm14 + movd [px+7*%3-4], xm14 +%endif +.left_done: + + ; bottom + DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge + test edgeb, 8 ; have_bottom + jz .no_bottom + lea dst8q, [dstq+%2*strideq] + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + movu m1, [dst8q-%1] + movu m2, [dst8q+strideq-%1] + movu [px+(%2+0)*%3-%1], m1 + movu [px+(%2+1)*%3-%1], m2 + jmp .bottom_done +.bottom_no_right: + movu m1, [dst8q-%1*2] + movu m2, [dst8q+strideq-%1*2] + movu [px+(%2+0)*%3-%1*2], m1 + movu [px+(%2+1)*%3-%1*2], m2 +%if %1 == 8 + movd [px+(%2-1)*%3+%1*2], xm14 ; overwritten by previous movu +%endif + movd [px+(%2+0)*%3+%1*2], xm14 + movd [px+(%2+1)*%3+%1*2], xm14 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + movu m1, [dst8q] + movu m2, [dst8q+strideq] + mova [px+(%2+0)*%3+0], m1 + mova [px+(%2+1)*%3+0], m2 + movd [px+(%2+0)*%3-4], xm14 + movd [px+(%2+1)*%3-4], xm14 + jmp .bottom_done +.bottom_no_left_right: +%if %1 == 4 + movq xm1, [dst8q] + movq xm2, [dst8q+strideq] + movq [px+(%2+0)*%3+0], xm1 + movq [px+(%2+1)*%3+0], xm2 +%else + mova xm1, [dst8q] + mova xm2, [dst8q+strideq] + mova [px+(%2+0)*%3+0], xm1 + mova [px+(%2+1)*%3+0], xm2 +%endif + movd [px+(%2+0)*%3-4], xm14 + movd [px+(%2+1)*%3-4], xm14 + movd [px+(%2+0)*%3+%1*2], xm14 + movd [px+(%2+1)*%3+%1*2], xm14 + jmp .bottom_done +.no_bottom: + movu [px+(%2+0)*%3-%1], m14 + movu [px+(%2+1)*%3-%1], m14 +.bottom_done: + + ; actual filter + INIT_YMM avx2 + DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero + %undef edged + movifnidn prid, prim + mov dampingd, r7m + lzcnt pridmpd, prid +%if UNIX64 + movd xm0, prid + movd xm1, secdmpd +%endif + lzcnt secdmpd, secdmpm + sub dampingd, 31 + xor zerod, zerod + add pridmpd, dampingd + cmovs pridmpd, zerod + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + mov [rsp+8], secdmpq ; sec_shift + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, table, dir, pri, sec, stride3 +%if UNIX64 + vpbroadcastw m0, xm0 ; pri_strength + vpbroadcastw m1, xm1 ; sec_strength +%else + vpbroadcastw m0, prim ; pri_strength + vpbroadcastw m1, secm ; sec_strength +%endif + rorx r2d, prid, 2 + cmp dword r9m, 0xfff + cmove prid, r2d + and prid, 4 + lea tableq, [tap_table] + lea priq, [tableq+priq] ; pri_taps + lea secq, [tableq+8] ; sec_taps + + ; off1/2/3[k] [6 total] from [tableq+12+(dir+0/2/6)*2+k] + mov dird, r6m + lea 
tableq, [tableq+dirq*2+12] +%if %1*%2*2/mmsize > 1 + %if %1 == 4 + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k + %else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k + %endif + mov hd, %1*%2*2/mmsize +%else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k +%endif + lea stkq, [px] + pxor m13, m13 +%if %1*%2*2/mmsize > 1 +.v_loop: +%endif + mov kd, 1 +%if %1 == 4 + movq xm4, [stkq+%3*0] + movhps xm4, [stkq+%3*1] + movq xm5, [stkq+%3*2] + movhps xm5, [stkq+%3*3] + vinserti128 m4, xm5, 1 +%else + mova xm4, [stkq+%3*0] ; px + vinserti128 m4, [stkq+%3*1], 1 +%endif + pxor m15, m15 ; sum + mova m7, m4 ; max + mova m8, m4 ; min +.k_loop: + vpbroadcastw m2, [priq+kq*2] ; pri_taps + vpbroadcastw m3, [secq+kq*2] ; sec_taps + + ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3 + ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3 + ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3 + + dec kq + jge .k_loop + + vpbroadcastd m12, [pw_2048] + pcmpgtw m11, m13, m15 + paddw m15, m11 + pmulhrsw m15, m12 + paddw m4, m15 + pminsw m4, m7 + pmaxsw m4, m8 +%if %1 == 4 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movq [dstq+strideq*2], xm5 + movhps [dstq+stride3q], xm5 +%else + mova [dstq+strideq*0], xm4 + vextracti128 [dstq+strideq*1], m4, 1 +%endif + +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, %3*vloop_lines + dec hd + jg .v_loop +%endif + + RET +%endmacro + +cdef_filter_fn 8, 8, 32 +cdef_filter_fn 4, 4, 32 + +INIT_YMM avx2 +cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax + lea r6, [dir_shift] + shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc + vpbroadcastd m4, [r6+bdmaxq*4] + lea r6, [strideq*3] + mova xm0, [srcq+strideq*0] + mova xm1, [srcq+strideq*1] + mova xm2, [srcq+strideq*2] + mova xm3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vinserti128 m0, [srcq+r6 ], 1 + vinserti128 m1, [srcq+strideq*2], 1 + vinserti128 m2, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*0], 1 + REPX {pmulhuw x, m4}, m0, m1, m2, m3 + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/cdef16_sse.asm dav1d-0.9.1/src/x86/cdef16_sse.asm --- dav1d-0.7.1/src/x86/cdef16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/cdef16_sse.asm 2021-07-28 21:38:28.893852000 +0000 @@ -0,0 +1,123 @@ +; Copyright (c) 2017-2021, The rav1e contributors +; Copyright (c) 2021, Nathan Egge +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +dir_shift: times 4 dw 0x4000 + times 4 dw 0x1000 + +pw_128: times 4 dw 128 + +cextern cdef_dir_8bpc_ssse3.main +cextern cdef_dir_8bpc_sse4.main +cextern shufw_6543210x + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro CDEF_DIR 0 +%if ARCH_X86_64 +cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax + lea r6, [dir_shift] + shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc + movddup m7, [r6+bdmaxq*8] + lea r6, [strideq*3] + mova m0, [srcq+strideq*0] + mova m1, [srcq+strideq*1] + mova m2, [srcq+strideq*2] + mova m3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + mova m4, [srcq+strideq*0] + mova m5, [srcq+strideq*1] + mova m6, [srcq+strideq*2] + REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhuw m7, [srcq+r6 ] + pxor m8, m8 + packuswb m9, m0, m1 + packuswb m10, m2, m3 + packuswb m11, m4, m5 + packuswb m12, m6, m7 + REPX {psadbw x, m8}, m9, m10, m11, m12 + packssdw m9, m10 + packssdw m11, m12 + packssdw m9, m11 + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main +%else +cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax + mov bdmaxd, bdmaxm + LEA r2, dir_shift + shr bdmaxd, 11 + movddup m7, [r2+bdmaxq*8] + lea r3, [strideq*3] + pmulhuw m3, m7, [srcq+strideq*0] + pmulhuw m4, m7, [srcq+strideq*1] + pmulhuw m5, m7, [srcq+strideq*2] + pmulhuw m6, m7, [srcq+r3 ] + movddup m1, [r2-dir_shift+pw_128] + lea srcq, [srcq+strideq*4] + pxor m0, m0 + packuswb m2, m3, m4 + psubw m3, m1 + psubw m4, m1 + mova [esp+0x00], m3 + mova [esp+0x10], m4 + packuswb m3, m5, m6 + psadbw m2, m0 + psadbw m3, m0 + psubw m5, m1 + psubw m6, m1 + packssdw m2, m3 + mova [esp+0x20], m5 + mova [esp+0x50], m6 + pmulhuw m4, m7, [srcq+strideq*0] + pmulhuw m5, m7, [srcq+strideq*1] + pmulhuw m6, m7, [srcq+strideq*2] + pmulhuw m7, [srcq+r3 ] + packuswb m3, m4, m5 + packuswb m1, m6, m7 + psadbw m3, m0 + psadbw m1, m0 + packssdw m3, m1 + movddup m1, [r2-dir_shift+pw_128] + LEA r2, shufw_6543210x + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main +%endif +%endmacro + +INIT_XMM ssse3 +CDEF_DIR + +INIT_XMM sse4 +CDEF_DIR diff -Nru dav1d-0.7.1/src/x86/cdef_avx2.asm dav1d-0.9.1/src/x86/cdef_avx2.asm --- dav1d-0.7.1/src/x86/cdef_avx2.asm 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/x86/cdef_avx2.asm 2021-07-28 21:38:28.893852000 +0000 @@ -23,6 +23,7 @@ ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
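Both new 16 bpc cdef_dir entry points above (cdef16_avx2.asm and cdef16_sse.asm) reuse the 8 bpc direction search instead of duplicating it: the samples are scaled into an 8-bit range with pmulhuw against the dir_shift constants (0x4000 for 10 bpc, 0x1000 for 12 bpc, selected by bdmax >> 11), and control then jumps into the exported cdef_dir_8bpc .main label. A rough C sketch of the scaling arithmetic only, with hypothetical names and a stride measured in pixels:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical sketch of the pmulhuw scaling used above:
 * (x * 0x4000) >> 16 == x >> 2 for 10 bpc, (x * 0x1000) >> 16 == x >> 4
 * for 12 bpc, so 10/12-bit samples land in the 0..255 range. */
static void cdef_dir_scale_to_8bit(uint8_t dst[8][8], const uint16_t *src,
                                   const ptrdiff_t stride,
                                   const int bitdepth_max)
{
    const unsigned mul = (bitdepth_max >> 11) ? 0x1000 : 0x4000;
    for (int y = 0; y < 8; y++, src += stride)
        for (int x = 0; x < 8; x++)
            dst[y][x] = (uint8_t)((src[x] * mul) >> 16);
}

In the asm nothing is written back to memory: the scaled rows stay in vector registers and flow straight into the shared 8 bpc code through the cglobal_label .main entry point added in cdef_avx2.asm below; the sketch only illustrates the arithmetic.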
+%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 @@ -38,7 +39,7 @@ %endmacro %macro CDEF_FILTER_JMP_TABLE 1 -JMP_TABLE cdef_filter_%1, \ +JMP_TABLE cdef_filter_%1_8bpc, \ d6k0, d6k1, d7k0, d7k1, \ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \ @@ -93,7 +94,7 @@ %macro PREP_REGS 2 ; w, h ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] mov dird, r6m - lea tableq, [cdef_filter_%1x%2_jmptable] + lea tableq, [cdef_filter_%1x%2_8bpc_jmptable] lea dirq, [tableq+dirq*2*4] %if %1 == 4 %if %2 == 4 @@ -396,7 +397,7 @@ %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 -cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ +cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \ pri, sec, dir, damping, edge %assign stack_offset_entry stack_offset mov edged, edgem @@ -1591,40 +1592,39 @@ CDEF_FILTER 4, 4 INIT_YMM avx2 -cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 +cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3 lea stride3q, [strideq*3] movq xm0, [srcq+strideq*0] movq xm1, [srcq+strideq*1] movq xm2, [srcq+strideq*2] - movq xm3, [srcq+stride3q] + movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - vpbroadcastq m4, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - vpbroadcastq m6, [srcq+strideq*2] - vpbroadcastq m7, [srcq+stride3q] - vpbroadcastd m8, [pw_128] - pxor m9, m9 - - vpblendd m0, m0, m7, 0xf0 - vpblendd m1, m1, m6, 0xf0 - vpblendd m2, m2, m5, 0xf0 - vpblendd m3, m3, m4, 0xf0 - - punpcklbw m0, m9 - punpcklbw m1, m9 - punpcklbw m2, m9 - punpcklbw m3, m9 - - psubw m0, m8 - psubw m1, m8 - psubw m2, m8 - psubw m3, m8 + vpbroadcastq m4, [srcq+stride3q ] + vpbroadcastq m5, [srcq+strideq*2] + vpblendd m0, m4, 0xf0 + vpblendd m1, m5, 0xf0 + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m5, [srcq+strideq*0] + vpblendd m2, m4, 0xf0 + vpblendd m3, m5, 0xf0 + pxor m4, m4 + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 +cglobal_label .main + vpbroadcastd m4, [pw_128] + PROLOGUE 3, 4, 15 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 ; shuffle registers to generate partial_sum_diag[0-1] together - vpermq m7, m0, q1032 - vpermq m6, m1, q1032 - vpermq m5, m2, q1032 - vpermq m4, m3, q1032 + vperm2i128 m7, m0, m0, 0x01 + vperm2i128 m6, m1, m1, 0x01 + vperm2i128 m5, m2, m2, 0x01 + vperm2i128 m4, m3, m3, 0x01 ; start with partial_sum_hv[0-1] paddw m8, m0, m1 diff -Nru dav1d-0.7.1/src/x86/cdef_avx512.asm dav1d-0.9.1/src/x86/cdef_avx512.asm --- dav1d-0.7.1/src/x86/cdef_avx512.asm 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/x86/cdef_avx512.asm 2021-07-28 21:38:28.893852000 +0000 @@ -23,6 +23,7 @@ ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
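The ACCUMULATE_TAP macros in cdef16_avx2.asm above and cdef_sse.asm further below compute the same per-tap core: missing pixels in the padded block are stored as a sentinel (0x8000 where SSE4 is available, which is both a large unsigned and a negative signed value, so pmaxsw/pminuw drop it from the clamp range; 0x7FFF otherwise, detected with pcmpeqw), and each valid neighbour contributes constrain(p - px) * tap to the sum. A plain C sketch of what the psrlw/psubusw/pminuw/psignw sequence computes, offered as an illustration rather than dav1d's own C reference; `shift` stands for the damping-derived pri/sec shift the prologue stores on the stack:

#include <stdlib.h>

/* Hypothetical helper mirroring the tap clamping above: limit the neighbour
 * difference by the CDEF strength, with larger differences attenuated by the
 * damping shift. */
static int cdef_constrain(const int diff, const int strength, const int shift)
{
    const int adiff = abs(diff);
    /* psubusw: strength - (|diff| >> shift), saturating at 0 */
    const int limit = strength > (adiff >> shift) ?
                      strength - (adiff >> shift) : 0;
    /* pminuw: keep the smaller magnitude; psignw re-applies the sign of diff */
    const int mag = adiff < limit ? adiff : limit;
    return diff < 0 ? -mag : mag;
}

Each tap then adds cdef_constrain(p - px, strength, shift) * tap to the accumulator, which is finally rounded (pw_2048 with pmulhrsw, or pw_8 plus psraw on the pre-SSSE3 path), added back to the centre pixel, and clamped to the min/max gathered while the taps were loaded.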
+%include "config.asm" %include "ext/x86/x86inc.asm" %if HAVE_AVX512ICL && ARCH_X86_64 @@ -108,7 +109,8 @@ ; 5e 5f 50 51 52 53 54 55 INIT_ZMM avx512icl -cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge +cglobal cdef_filter_4x4_8bpc, 4, 8, 13, dst, stride, left, top, \ + pri, sec, dir, damping, edge %define base r7-edge_mask movq xmm0, [dstq+strideq*0] movhps xmm0, [dstq+strideq*1] @@ -268,8 +270,7 @@ ; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 ; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 -cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ - pri, sec, dir, damping, edge +cglobal cdef_filter_4x8_8bpc, 4, 9, 22, dst, stride, left, top, pri, sec, dir, damping, edge %define base r8-edge_mask vpbroadcastd ym21, strided mov r6d, edgem @@ -503,8 +504,8 @@ ; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b ; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b -cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ - pri, sec, dir, damping, edge +cglobal cdef_filter_8x8_8bpc, 4, 11, 32, 4*64, dst, stride, left, top, \ + pri, sec, dir, damping, edge %define base r8-edge_mask mov r6d, edgem lea r10, [dstq+strideq*4-2] diff -Nru dav1d-0.7.1/src/x86/cdef_init_tmpl.c dav1d-0.9.1/src/x86/cdef_init_tmpl.c --- dav1d-0.7.1/src/x86/cdef_init_tmpl.c 2020-06-21 11:48:55.012126400 +0000 +++ dav1d-0.9.1/src/x86/cdef_init_tmpl.c 2021-07-28 21:38:28.893852000 +0000 @@ -28,20 +28,20 @@ #include "src/cpu.h" #include "src/cdef.h" -#define decl_cdef_size_fn(sz) \ - decl_cdef_fn(dav1d_cdef_filter_##sz##_avx512icl); \ - decl_cdef_fn(dav1d_cdef_filter_##sz##_avx2); \ - decl_cdef_fn(dav1d_cdef_filter_##sz##_sse4); \ - decl_cdef_fn(dav1d_cdef_filter_##sz##_ssse3); \ - decl_cdef_fn(dav1d_cdef_filter_##sz##_sse2) - -decl_cdef_size_fn(4x4); -decl_cdef_size_fn(4x8); -decl_cdef_size_fn(8x8); - -decl_cdef_dir_fn(dav1d_cdef_dir_avx2); -decl_cdef_dir_fn(dav1d_cdef_dir_sse4); -decl_cdef_dir_fn(dav1d_cdef_dir_ssse3); +#define decl_cdef_fns(ext) \ + decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \ + decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \ + decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext)) + +decl_cdef_fns(avx512icl); +decl_cdef_fns(avx2); +decl_cdef_fns(sse4); +decl_cdef_fns(ssse3); +decl_cdef_fns(sse2); + +decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2)); +decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4)); +decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3)); COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); @@ -49,46 +49,45 @@ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; #if BITDEPTH == 8 - c->fb[0] = dav1d_cdef_filter_8x8_sse2; - c->fb[1] = dav1d_cdef_filter_4x8_sse2; - c->fb[2] = dav1d_cdef_filter_4x4_sse2; + c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2); + c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2); + c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2); #endif if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + c->dir = BF(dav1d_cdef_dir, ssse3); #if BITDEPTH == 8 - c->dir = dav1d_cdef_dir_ssse3; - c->fb[0] = dav1d_cdef_filter_8x8_ssse3; - c->fb[1] = dav1d_cdef_filter_4x8_ssse3; - c->fb[2] = dav1d_cdef_filter_4x4_ssse3; + c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3); + c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3); + c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3); #endif if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + c->dir = BF(dav1d_cdef_dir, sse4); #if BITDEPTH == 8 - c->dir = dav1d_cdef_dir_sse4; - c->fb[0] = dav1d_cdef_filter_8x8_sse4; - c->fb[1] = dav1d_cdef_filter_4x8_sse4; - c->fb[2] = dav1d_cdef_filter_4x4_sse4; + 
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4); + c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4); + c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4); #endif #if ARCH_X86_64 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + c->dir = BF(dav1d_cdef_dir, avx2); + c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2); #if BITDEPTH == 8 - c->dir = dav1d_cdef_dir_avx2; - c->fb[0] = dav1d_cdef_filter_8x8_avx2; - c->fb[1] = dav1d_cdef_filter_4x8_avx2; - c->fb[2] = dav1d_cdef_filter_4x4_avx2; + c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2); #endif + c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2); if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; #if HAVE_AVX512ICL && BITDEPTH == 8 - c->fb[0] = dav1d_cdef_filter_8x8_avx512icl; - c->fb[1] = dav1d_cdef_filter_4x8_avx512icl; - c->fb[2] = dav1d_cdef_filter_4x4_avx512icl; + c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl); + c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl); + c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl); #endif - #endif } diff -Nru dav1d-0.7.1/src/x86/cdef_sse.asm dav1d-0.9.1/src/x86/cdef_sse.asm --- dav1d-0.7.1/src/x86/cdef_sse.asm 2020-06-21 11:48:55.016126400 +0000 +++ dav1d-0.9.1/src/x86/cdef_sse.asm 2021-07-28 21:38:28.893852000 +0000 @@ -24,32 +24,37 @@ ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 -%if ARCH_X86_32 -pb_0: times 16 db 0 -pb_0xFF: times 16 db 0xFF -%endif -pw_8: times 8 dw 8 -pw_128: times 8 dw 128 -pw_256: times 8 dw 256 -pw_2048: times 8 dw 2048 -%if ARCH_X86_32 +%macro DUP8 1-* + %rep %0 + times 8 db %1 + %rotate 1 + %endrep +%endmacro + +div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 + dd 420, 210, 140, 105, 105, 105, 105, 105 +div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 + dw 168, 168, 140, 140, 120, 120, 105, 105 + dw 420, 420, 210, 210, 140, 140, 105, 105 + dw 105, 105, 105, 105, 105, 105, 105, 105 +const shufw_6543210x, \ + db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_8: times 8 dw 8 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 pw_0x7FFF: times 8 dw 0x7FFF pw_0x8000: times 8 dw 0x8000 -%endif -div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 - dd 420, 210, 140, 105, 105, 105, 105, 105 -div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105 - dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105 -shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 -shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 tap_table: ; masks for 8-bit shift emulation - db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 + DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 ; weights - db 4, 2, 3, 3, 2, 1 + DUP8 4, 2, 3, 3, 2, 1 ; taps indices db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 @@ -75,57 +80,17 @@ %endif %endmacro -%macro SAVE_ARG 2 ; varname, argnum - %define %1_stkloc [rsp+%2*gprsize] - %define %1_argnum %2 - mov r2, r%2m - mov %1_stkloc, r2 -%endmacro - -%macro LOAD_ARG 1-2 0 ; varname, load_to_varname_register - %if %2 == 0 - mov r %+ %{1}_argnum, %1_stkloc - %else - mov %1q, %1_stkloc - %endif -%endmacro - -%macro LOAD_ARG32 1-2 ; varname, load_to_varname_register - %if ARCH_X86_32 - %if %0 == 1 - LOAD_ARG %1 - %else - LOAD_ARG %1, %2 - %endif - %endif -%endmacro - -%if ARCH_X86_32 - %define PIC_base_offset $$ - %define 
PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) -%else - %define PIC_sym(sym) sym -%endif - -%macro SAVE_PIC_REG 1 - %if ARCH_X86_32 - mov [esp+%1], PIC_reg - %endif -%endmacro - -%macro LOAD_PIC_REG 1 - %if ARCH_X86_32 - mov PIC_reg, [esp+%1] - %endif -%endmacro - %macro PMOVZXBW 2-3 0 ; %3 = half - %if %3 == 1 - movd %1, %2 + %if cpuflag(sse4) && %3 == 0 + pmovzxbw %1, %2 %else + %if %3 == 1 + movd %1, %2 + %else movq %1, %2 + %endif + punpcklbw %1, m7 %endif - punpcklbw %1, m15 %endmacro %macro PSHUFB_0 2 @@ -138,34 +103,33 @@ %endif %endmacro -%macro LOAD_SEC_TAP 0 - %if ARCH_X86_64 - movd m3, [secq+kq] - PSHUFB_0 m3, m15 - %else - movd m2, [secq+kq] ; sec_taps - pxor m3, m3 - PSHUFB_0 m2, m3 - %endif +%macro MOVDDUP 2 +%if cpuflag(ssse3) + movddup %1, %2 +%else + movq %1, %2 + punpcklqdq %1, %1 +%endif %endmacro -%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride +%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax ; load p0/p1 - movsx offq, byte [dirq+kq+%1] ; off1 + movsx offq, byte [dirq+kq+%1+14*8] ; off1 %if %6 == 4 - movq m5, [stkq+offq*2+%7*0] ; p0 - movhps m5, [stkq+offq*2+%7*1] + movq m5, [stkq+offq*2+32*0] ; p0 + movhps m5, [stkq+offq*2+32*1] %else - movu m5, [stkq+offq*2+%7*0] ; p0 + movu m5, [stkq+offq*2+32*0] ; p0 %endif neg offq ; -off1 %if %6 == 4 - movq m6, [stkq+offq*2+%7*0] ; p1 - movhps m6, [stkq+offq*2+%7*1] + movq m6, [stkq+offq*2+32*0] ; p1 + movhps m6, [stkq+offq*2+32*1] %else - movu m6, [stkq+offq*2+%7*0] ; p1 + movu m6, [stkq+offq*2+32*0] ; p1 %endif - %if cpuflag(sse4) + %if %7 + %if cpuflag(sse4) ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. ; use signed max and unsigned min to remove them @@ -173,40 +137,26 @@ pminuw m8, m5 pmaxsw m7, m6 pminuw m8, m6 - %else - %if ARCH_X86_64 - pcmpeqw m9, m14, m5 - pcmpeqw m10, m14, m6 - pandn m9, m5 - pandn m10, m6 - pmaxsw m7, m9 ; max after p0 - pminsw m8, m5 ; min after p0 - pmaxsw m7, m10 ; max after p1 - pminsw m8, m6 ; min after p1 %else - pcmpeqw m9, m5, OUT_OF_BOUNDS_MEM - pandn m9, m5 - pmaxsw m7, m9 ; max after p0 - pminsw m8, m5 ; min after p0 - pcmpeqw m9, m6, OUT_OF_BOUNDS_MEM - pandn m9, m6 - pmaxsw m7, m9 ; max after p1 - pminsw m8, m6 ; min after p1 + pcmpeqw m3, m14, m5 + pminsw m8, m5 ; min after p0 + pandn m3, m5 + pmaxsw m7, m3 ; max after p0 + pcmpeqw m3, m14, m6 + pminsw m8, m6 ; min after p1 + pandn m3, m6 + pmaxsw m7, m3 ; max after p1 %endif %endif ; accumulate sum[m13] over p0/p1 - psubw m5, m4 ; diff_p0(p0 - px) - psubw m6, m4 ; diff_p1(p1 - px) - packsswb m5, m6 ; convert pixel diff to 8-bit + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + packsswb m5, m6 ; convert pixel diff to 8-bit %if cpuflag(ssse3) - %if ARCH_X86_64 && cpuflag(sse4) - pshufb m5, m14 ; group diffs p0 and p1 into pairs - %else - pshufb m5, [PIC_sym(shufb_lohi)] - %endif + pshufb m5, m13 ; group diffs p0 and p1 into pairs pabsb m6, m5 - psignb m9, %5, m5 + psignb m3, %5, m5 %else movlhps m6, m5 punpckhbw m6, m5 @@ -214,122 +164,124 @@ pcmpgtb m5, m6 paddb m6, m5 pxor m6, m5 - paddb m9, %5, m5 - pxor m9, m5 + paddb m3, %5, m5 + pxor m3, m5 %endif - %if ARCH_X86_64 - psrlw m10, m6, %2 ; emulate 8-bit shift - pand m10, %3 - psubusb m5, %4, m10 - %else - psrlw m5, m6, %2 ; emulate 8-bit shift - pand m5, %3 - paddusb m5, %4 - pxor m5, [PIC_sym(pb_0xFF)] - %endif - pminub m5, m6 ; constrain(diff_p) + pand m9, %3, m6 ; emulate 8-bit shift + psrlw m9, %2 + psubusb m5, %4, m9 + pminub 
m5, m6 ; constrain(diff_p) %if cpuflag(ssse3) - pmaddubsw m5, m9 ; constrain(diff_p) * taps + pmaddubsw m5, m3 ; constrain(diff_p) * taps %else - psrlw m2, m5, 8 - psraw m6, m9, 8 + psrlw m9, m5, 8 + psraw m6, m3, 8 psllw m5, 8 - psllw m9, 8 - pmullw m2, m6 - pmulhw m5, m9 - paddw m5, m2 + psllw m3, 8 + pmullw m9, m6 + pmulhw m5, m3 + paddw m5, m9 %endif - paddw m13, m5 + paddw m0, m5 %endmacro -%macro LOAD_BODY 4 ; dst, src, block_width, tmp_stride +%macro LOAD_BODY 3 ; dst, src, block_width %if %3 == 4 PMOVZXBW m0, [%2+strideq*0] PMOVZXBW m1, [%2+strideq*1] PMOVZXBW m2, [%2+strideq*2] PMOVZXBW m3, [%2+stride3q] + mova [%1+32*0], m0 + mova [%1+32*1], m1 + mova [%1+32*2], m2 + mova [%1+32*3], m3 %else movu m0, [%2+strideq*0] movu m1, [%2+strideq*1] movu m2, [%2+strideq*2] movu m3, [%2+stride3q] - punpckhbw m4, m0, m15 - punpcklbw m0, m15 - punpckhbw m5, m1, m15 - punpcklbw m1, m15 - punpckhbw m6, m2, m15 - punpcklbw m2, m15 - punpckhbw m7, m3, m15 - punpcklbw m3, m15 - %endif - mova [%1+0*%4], m0 - mova [%1+1*%4], m1 - mova [%1+2*%4], m2 - mova [%1+3*%4], m3 - %if %3 == 8 - mova [%1+0*%4+2*8], m4 - mova [%1+1*%4+2*8], m5 - mova [%1+2*%4+2*8], m6 - mova [%1+3*%4+2*8], m7 + punpcklbw m4, m0, m7 + punpckhbw m0, m7 + mova [%1+32*0+ 0], m4 + mova [%1+32*0+16], m0 + punpcklbw m4, m1, m7 + punpckhbw m1, m7 + mova [%1+32*1+ 0], m4 + mova [%1+32*1+16], m1 + punpcklbw m4, m2, m7 + punpckhbw m2, m7 + mova [%1+32*2+ 0], m4 + mova [%1+32*2+16], m2 + punpcklbw m4, m3, m7 + punpckhbw m3, m7 + mova [%1+32*3+ 0], m4 + mova [%1+32*3+16], m3 %endif %endmacro -%macro CDEF_FILTER 3 ; w, h, stride - - %if cpuflag(sse4) - %define OUT_OF_BOUNDS 0x80008000 +%macro CDEF_FILTER_END 2 ; w, minmax + pxor m6, m6 + pcmpgtw m6, m0 + paddw m0, m6 + %if cpuflag(ssse3) + pmulhrsw m0, m15 %else - %define OUT_OF_BOUNDS 0x7FFF7FFF + paddw m0, m15 + psraw m0, 4 %endif - - %if ARCH_X86_64 -cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \ - dst, stride, left, top, pri, sec, stride3, dst4, edge - pcmpeqw m14, m14 - %if cpuflag(sse4) - psllw m14, 15 ; 0x8000 - %else - psrlw m14, 1 ; 0x7FFF - %endif - pxor m15, m15 - - %define px rsp+3*16+2*%3 + paddw m4, m0 + %if %2 + pminsw m4, m7 + pmaxsw m4, m8 + %endif + packuswb m4, m4 + %if %1 == 4 + movd [dstq+strideq*0], m4 + psrlq m4, 32 + movd [dstq+strideq*1], m4 + add stkq, 32*2 + lea dstq, [dstq+strideq*2] %else -cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \ - dst, stride, left, top, stride3, dst4, edge - SAVE_ARG left, 2 - SAVE_ARG top, 3 - SAVE_ARG pri, 4 - SAVE_ARG sec, 5 - SAVE_ARG dir, 6 - SAVE_ARG damping, 7 - - %define PIC_reg r2 - LEA PIC_reg, PIC_base_offset - - %if cpuflag(sse4) - %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)] - %else - %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)] - %endif - - %define m15 [PIC_sym(pb_0)] - - %define px esp+7*16+2*%3 + movq [dstq], m4 + add stkq, 32 + add dstq, strideq %endif +%endmacro +%macro CDEF_FILTER 2 ; w, h + %if ARCH_X86_64 +cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \ + dst, stride, left, top, pri, sec, edge, stride3, dst4 + %define px rsp+3*16+2*32 + %define base 0 + %else +cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ + dst, stride, left, edge, stride3 + %define topq r2 + %define dst4q r2 + LEA r5, tap_table + %define px esp+7*16+2*32 + %define base r5-tap_table + %endif mov edged, r8m + %if cpuflag(sse4) + %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] + %else + %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF] + %endif + mova m6, OUT_OF_BOUNDS_MEM + pxor m7, m7 ; prepare pixel 
buffers - body/right %if %2 == 8 lea dst4q, [dstq+strideq*4] %endif lea stride3q, [strideq*3] - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .no_right - LOAD_BODY px, dstq, %1, %3 + LOAD_BODY px, dstq, %1 %if %2 == 8 - LOAD_BODY px+4*%3, dst4q, %1, %3 + LOAD_BODY px+4*32, dst4q, %1 %endif jmp .body_done .no_right: @@ -337,39 +289,37 @@ PMOVZXBW m1, [dstq+strideq*1], %1 == 4 PMOVZXBW m2, [dstq+strideq*2], %1 == 4 PMOVZXBW m3, [dstq+stride3q ], %1 == 4 + mova [px+32*0], m0 + mova [px+32*1], m1 + mova [px+32*2], m2 + mova [px+32*3], m3 + movd [px+32*0+%1*2], m6 + movd [px+32*1+%1*2], m6 + movd [px+32*2+%1*2], m6 + movd [px+32*3+%1*2], m6 %if %2 == 8 - PMOVZXBW m4, [dst4q+strideq*0], %1 == 4 - PMOVZXBW m5, [dst4q+strideq*1], %1 == 4 - PMOVZXBW m6, [dst4q+strideq*2], %1 == 4 - PMOVZXBW m7, [dst4q+stride3q ], %1 == 4 - %endif - mova [px+0*%3], m0 - mova [px+1*%3], m1 - mova [px+2*%3], m2 - mova [px+3*%3], m3 - %if %2 == 8 - mova [px+4*%3], m4 - mova [px+5*%3], m5 - mova [px+6*%3], m6 - mova [px+7*%3], m7 - mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS - %endif - mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS + PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 + PMOVZXBW m1, [dst4q+strideq*1], %1 == 4 + PMOVZXBW m2, [dst4q+strideq*2], %1 == 4 + PMOVZXBW m3, [dst4q+stride3q ], %1 == 4 + mova [px+32*4], m0 + mova [px+32*5], m1 + mova [px+32*6], m2 + mova [px+32*7], m3 + movd [px+32*4+%1*2], m6 + movd [px+32*5+%1*2], m6 + movd [px+32*6+%1*2], m6 + movd [px+32*7+%1*2], m6 + %endif .body_done: ; top - LOAD_ARG32 top - test edged, 4 ; have_top + movifnidn topq, r3mp + test edgeb, 4 ; have_top jz .no_top - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .top_no_left - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .top_no_right %if %1 == 4 PMOVZXBW m0, [topq+strideq*0-2] @@ -377,39 +327,39 @@ %else movu m0, [topq+strideq*0-4] movu m1, [topq+strideq*1-4] - punpckhbw m2, m0, m15 - punpcklbw m0, m15 - punpckhbw m3, m1, m15 - punpcklbw m1, m15 - movu [px-2*%3+8], m2 - movu [px-1*%3+8], m3 + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu [px-32*2+8], m2 + movu [px-32*1+8], m3 %endif - movu [px-2*%3-%1], m0 - movu [px-1*%3-%1], m1 + movu [px-32*2-%1], m0 + movu [px-32*1-%1], m1 jmp .top_done .top_no_right: %if %1 == 4 PMOVZXBW m0, [topq+strideq*0-%1] PMOVZXBW m1, [topq+strideq*1-%1] - movu [px-2*%3-4*2], m0 - movu [px-1*%3-4*2], m1 + movu [px-32*2-8], m0 + movu [px-32*1-8], m1 %else movu m0, [topq+strideq*0-%1] movu m1, [topq+strideq*1-%2] - punpckhbw m2, m0, m15 - punpcklbw m0, m15 - punpckhbw m3, m1, m15 - punpcklbw m1, m15 - mova [px-2*%3-8*2], m0 - mova [px-2*%3-0*2], m2 - mova [px-1*%3-8*2], m1 - mova [px-1*%3-0*2], m3 + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px-32*2-16], m0 + mova [px-32*2+ 0], m2 + mova [px-32*1-16], m1 + mova [px-32*1+ 0], m3 %endif - mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS - mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 jmp .top_done .top_no_left: - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .top_no_left_right %if %1 == 4 PMOVZXBW m0, [topq+strideq*0] @@ -417,102 +367,92 @@ %else movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] - punpckhbw m2, m0, m15 - punpcklbw m0, 
m15 - punpckhbw m3, m1, m15 - punpcklbw m1, m15 - movd [px-2*%3+8*2], m2 - movd [px-1*%3+8*2], m3 - %endif - mova [px-2*%3], m0 - mova [px-1*%3], m1 - mov dword [px-2*%3-4], OUT_OF_BOUNDS - mov dword [px-1*%3-4], OUT_OF_BOUNDS + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movd [px-32*2+16], m2 + movd [px-32*1+16], m3 + %endif + movd [px-32*2- 4], m6 + movd [px-32*1- 4], m6 + mova [px-32*2+ 0], m0 + mova [px-32*1+ 0], m1 jmp .top_done .top_no_left_right: PMOVZXBW m0, [topq+strideq*0], %1 == 4 PMOVZXBW m1, [topq+strideq*1], %1 == 4 - mova [px-2*%3], m0 - mova [px-1*%3], m1 - mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS - mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS - mov dword [px-2*%3-4], OUT_OF_BOUNDS - mov dword [px-1*%3-4], OUT_OF_BOUNDS + movd [px-32*2-4], m6 + movd [px-32*1-4], m6 + mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 jmp .top_done .no_top: - %if ARCH_X86_64 - SWAP m0, m14 - %else - mova m0, OUT_OF_BOUNDS_MEM - %endif - movu [px-2*%3-4], m0 - movu [px-1*%3-4], m0 + movu [px-32*2- 4], m6 + movu [px-32*1- 4], m6 %if %1 == 8 - movq [px-2*%3+12], m0 - movq [px-1*%3+12], m0 - %endif - %if ARCH_X86_64 - SWAP m0, m14 + movq [px-32*2+12], m6 + movq [px-32*1+12], m6 %endif .top_done: ; left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left - SAVE_PIC_REG 0 - LOAD_ARG32 left + movifnidn leftq, leftmp %if %2 == 4 movq m0, [leftq] %else movu m0, [leftq] %endif - LOAD_PIC_REG 0 %if %2 == 4 - punpcklbw m0, m15 + punpcklbw m0, m7 %else - punpckhbw m1, m0, m15 - punpcklbw m0, m15 + punpckhbw m1, m0, m7 + punpcklbw m0, m7 movhlps m3, m1 - movd [px+4*%3-4], m1 - movd [px+6*%3-4], m3 + movd [px+32*4-4], m1 + movd [px+32*6-4], m3 psrlq m1, 32 psrlq m3, 32 - movd [px+5*%3-4], m1 - movd [px+7*%3-4], m3 + movd [px+32*5-4], m1 + movd [px+32*7-4], m3 %endif movhlps m2, m0 - movd [px+0*%3-4], m0 - movd [px+2*%3-4], m2 + movd [px+32*0-4], m0 + movd [px+32*2-4], m2 psrlq m0, 32 psrlq m2, 32 - movd [px+1*%3-4], m0 - movd [px+3*%3-4], m2 + movd [px+32*1-4], m0 + movd [px+32*3-4], m2 jmp .left_done .no_left: - mov dword [px+0*%3-4], OUT_OF_BOUNDS - mov dword [px+1*%3-4], OUT_OF_BOUNDS - mov dword [px+2*%3-4], OUT_OF_BOUNDS - mov dword [px+3*%3-4], OUT_OF_BOUNDS + movd [px+32*0-4], m6 + movd [px+32*1-4], m6 + movd [px+32*2-4], m6 + movd [px+32*3-4], m6 %if %2 == 8 - mov dword [px+4*%3-4], OUT_OF_BOUNDS - mov dword [px+5*%3-4], OUT_OF_BOUNDS - mov dword [px+6*%3-4], OUT_OF_BOUNDS - mov dword [px+7*%3-4], OUT_OF_BOUNDS + movd [px+32*4-4], m6 + movd [px+32*5-4], m6 + movd [px+32*6-4], m6 + movd [px+32*7-4], m6 %endif .left_done: ; bottom %if ARCH_X86_64 - DEFINE_ARGS dst, stride, dummy1, dst8, pri, sec, stride3, dummy2, edge + DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3 %else - DEFINE_ARGS dst, stride, dummy1, dst8, stride3, dummy2, edge + DEFINE_ARGS dst, stride, dst8, edge, stride3 %endif - test edged, 8 ; have_bottom + test edgeb, 8 ; have_bottom jz .no_bottom lea dst8q, [dstq+%2*strideq] - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .bottom_no_left - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .bottom_no_right %if %1 == 4 PMOVZXBW m0, [dst8q-(%1/2)] @@ -520,40 +460,40 @@ %else movu m0, [dst8q-4] movu m1, [dst8q+strideq-4] - punpckhbw m2, m0, m15 - punpcklbw m0, m15 - punpckhbw m3, m1, m15 - punpcklbw m1, m15 - movu [px+(%2+0)*%3+8], m2 - movu [px+(%2+1)*%3+8], m3 + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu 
[px+32*(%2+0)+8], m2 + movu [px+32*(%2+1)+8], m3 %endif - movu [px+(%2+0)*%3-%1], m0 - movu [px+(%2+1)*%3-%1], m1 + movu [px+32*(%2+0)-%1], m0 + movu [px+32*(%2+1)-%1], m1 jmp .bottom_done .bottom_no_right: %if %1 == 4 PMOVZXBW m0, [dst8q-4] PMOVZXBW m1, [dst8q+strideq-4] - movu [px+(%2+0)*%3-4*2], m0 - movu [px+(%2+1)*%3-4*2], m1 + movu [px+32*(%2+0)-8], m0 + movu [px+32*(%2+1)-8], m1 %else movu m0, [dst8q-8] movu m1, [dst8q+strideq-8] - punpckhbw m2, m0, m15 - punpcklbw m0, m15 - punpckhbw m3, m1, m15 - punpcklbw m1, m15 - mova [px+(%2+0)*%3-8*2], m0 - mova [px+(%2+0)*%3-0*2], m2 - mova [px+(%2+1)*%3-8*2], m1 - mova [px+(%2+1)*%3-0*2], m3 - mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS ; overwritten by first mova + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)-16], m0 + mova [px+32*(%2+0)+ 0], m2 + mova [px+32*(%2+1)-16], m1 + mova [px+32*(%2+1)+ 0], m3 + movd [px+32*(%2-1)+16], m6 ; overwritten by first mova %endif - mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 jmp .bottom_done .bottom_no_left: - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .bottom_no_left_right %if %1 == 4 PMOVZXBW m0, [dst8q] @@ -561,233 +501,245 @@ %else movu m0, [dst8q] movu m1, [dst8q+strideq] - punpckhbw m2, m0, m15 - punpcklbw m0, m15 - punpckhbw m3, m1, m15 - punpcklbw m1, m15 - mova [px+(%2+0)*%3+8*2], m2 - mova [px+(%2+1)*%3+8*2], m3 - %endif - mova [px+(%2+0)*%3], m0 - mova [px+(%2+1)*%3], m1 - mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS - mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)+16], m2 + mova [px+32*(%2+1)+16], m3 + %endif + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 jmp .bottom_done .bottom_no_left_right: PMOVZXBW m0, [dst8q+strideq*0], %1 == 4 PMOVZXBW m1, [dst8q+strideq*1], %1 == 4 - mova [px+(%2+0)*%3], m0 - mova [px+(%2+1)*%3], m1 - mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS - mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS - mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 jmp .bottom_done .no_bottom: - %if ARCH_X86_64 - SWAP m0, m14 - %else - mova m0, OUT_OF_BOUNDS_MEM - %endif - movu [px+(%2+0)*%3-4], m0 - movu [px+(%2+1)*%3-4], m0 + movu [px+32*(%2+0)- 4], m6 + movu [px+32*(%2+1)- 4], m6 %if %1 == 8 - movq [px+(%2+0)*%3+12], m0 - movq [px+(%2+1)*%3+12], m0 - %endif - %if ARCH_X86_64 - SWAP m0, m14 + movq [px+32*(%2+0)+12], m6 + movq [px+32*(%2+1)+12], m6 %endif .bottom_done: ; actual filter - DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp %if ARCH_X86_64 - movifnidn prid, prim - movifnidn secd, secm - mov dampingd, r7m + DEFINE_ARGS dst, stride, pridmp, damping, pri, sec + mova m13, [shufb_lohi] + %if cpuflag(ssse3) + mova m15, [pw_2048] %else - LOAD_ARG pri - LOAD_ARG sec - LOAD_ARG damping, 1 + mova m15, [pw_8] %endif - - SAVE_PIC_REG 8 - mov pridmpd, prid - mov secdmpd, secd - or pridmpd, 1 - or secdmpd, 1 - bsr pridmpd, pridmpd - bsr secdmpd, secdmpd + mova m14, m6 + %else + DEFINE_ARGS dst, pridmp, sec, damping, pri, tap + %xdefine m8 m1 + %xdefine m9 m2 + %xdefine m10 m0 + %xdefine m13 [base+shufb_lohi] + %xdefine m14 
OUT_OF_BOUNDS_MEM + %if cpuflag(ssse3) + %xdefine m15 [base+pw_2048] + %else + %xdefine m15 [base+pw_8] + %endif + %endif + movifnidn prid, r4m + movifnidn secd, r5m + mov dampingd, r7m + movif32 [esp+0x3C], r1d + test prid, prid + jz .sec_only + movd m1, prim + bsr pridmpd, prid + test secd, secd + jz .pri_only + movd m10, r5m + bsr secd, secd + and prid, 1 sub pridmpd, dampingd - sub secdmpd, dampingd + sub secd, dampingd xor dampingd, dampingd + add prid, prid neg pridmpd cmovs pridmpd, dampingd - neg secdmpd - cmovs secdmpd, dampingd + neg secd + cmovs secd, dampingd + PSHUFB_0 m1, m7 + PSHUFB_0 m10, m7 %if ARCH_X86_64 - mov [rsp+ 0], pridmpq ; pri_shift - mov [rsp+16], secdmpq ; sec_shift - %else + DEFINE_ARGS dst, stride, pridmp, tap, pri, sec + lea tapq, [tap_table] + MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask + MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask + mov [rsp+0x00], pridmpq ; pri_shift + mov [rsp+0x10], secq ; sec_shift + DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h + %else + MOVDDUP m2, [tapq+pridmpq*8] + MOVDDUP m3, [tapq+secq*8] + mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw + mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP mov [esp+0x00], pridmpd - mov [esp+0x30], secdmpd - mov dword [esp+0x04], 0 ; zero upper 32 bits of psrlw - mov dword [esp+0x34], 0 ; source operand in ACCUMULATE_TAP - %define PIC_reg r4 - LOAD_PIC_REG 8 + mov [esp+0x30], secd + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h + %define offq dstq + %define kd strided + %define kq strideq + mova [esp+0x10], m2 + mova [esp+0x40], m3 + mova [esp+0x20], m1 + mova [esp+0x50], m10 %endif - - DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp - lea tableq, [PIC_sym(tap_table)] - %if ARCH_X86_64 - SWAP m2, m11 - SWAP m3, m12 + mov dird, r6m + lea stkq, [px] + lea priq, [tapq+8*8+priq*8] ; pri_taps + mov hd, %1*%2/8 + lea dirq, [tapq+dirq*2] +.v_loop: + movif32 [esp+0x38], dstd + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] ; px %endif - movd m2, [tableq+pridmpq] - movd m3, [tableq+secdmpq] - PSHUFB_0 m2, m15 ; pri_shift_mask - PSHUFB_0 m3, m15 ; sec_shift_mask + pxor m0, m0 ; sum + mova m7, m4 ; max + mova m8, m4 ; min +.k_loop: + MOVDDUP m2, [priq+kq*8] %if ARCH_X86_64 - SWAP m2, m11 - SWAP m3, m12 - %else - %define PIC_reg r6 - mov PIC_reg, r4 - DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp - LOAD_ARG pri - LOAD_ARG dir, 1 - mova [esp+0x10], m2 - mova [esp+0x40], m3 + ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1 + ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 + %else + ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 %endif + dec kd + jge .k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 1 + dec hd + jg .v_loop + RET - ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, dummy, tap, pri, sec - movd m0, prid - movd m1, secd - %if ARCH_X86_64 - PSHUFB_0 m0, m15 - PSHUFB_0 m1, m15 +.pri_only: +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero + lea tapq, [tap_table] %else - %if cpuflag(ssse3) - pxor m2, m2 - %endif - mova m3, [PIC_sym(pb_0xFF)] - PSHUFB_0 m0, m2 - PSHUFB_0 m1, m2 - pxor m0, m3 - pxor m1, m3 - mova [esp+0x20], m0 - mova 
[esp+0x50], m1 + DEFINE_ARGS dst, pridmp, zero, damping, pri, tap %endif and prid, 1 - lea priq, [tapq+8+priq*2] ; pri_taps - lea secq, [tapq+12] ; sec_taps - - %if ARCH_X86_64 && cpuflag(sse4) - mova m14, [shufb_lohi] - %endif - - ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - DEFINE_ARGS dst, stride, dir, tap, pri, sec + xor zerod, zerod + sub dampingd, pridmpd + cmovs dampingd, zerod + add prid, prid + PSHUFB_0 m1, m7 + MOVDDUP m7, [tapq+dampingq*8] + mov [rsp+0x00], dampingq %if ARCH_X86_64 - mov dird, r6m - lea dirq, [tapq+14+dirq*2] - DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k + DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h %else - lea dird, [tapd+14+dird*2] - DEFINE_ARGS dst, stride, dir, stk, pri, sec - %define hd dword [esp+8] - %define offq dstq - %define kq strideq + mov [rsp+0x04], zerod + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h %endif - mov hd, %1*%2*2/mmsize + mov dird, r6m lea stkq, [px] - movif32 [esp+0x3C], strided -.v_loop: + lea priq, [tapq+8*8+priq*8] + mov hd, %1*%2/8 + lea dirq, [tapq+dirq*2] +.pri_v_loop: movif32 [esp+0x38], dstd - mov kq, 1 + mov kd, 1 %if %1 == 4 - movq m4, [stkq+%3*0] - movhps m4, [stkq+%3*1] + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] %else - mova m4, [stkq+%3*0] ; px - %endif - - %if ARCH_X86_32 - %xdefine m9 m3 - %xdefine m13 m7 - %xdefine m7 m0 - %xdefine m8 m1 + mova m4, [stkq+32*0] %endif + pxor m0, m0 +.pri_k_loop: + MOVDDUP m2, [priq+kq*8] + ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .pri_k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 + dec hd + jg .pri_v_loop + RET - pxor m13, m13 ; sum - mova m7, m4 ; max - mova m8, m4 ; min -.k_loop: - movd m2, [priq+kq] ; pri_taps +.sec_only: +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero +%else + DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero +%endif + movd m1, r5m + bsr secd, secd + mov dird, r6m + xor zerod, zerod + sub dampingd, secd + cmovs dampingd, zerod + PSHUFB_0 m1, m7 %if ARCH_X86_64 - PSHUFB_0 m2, m15 - %if cpuflag(ssse3) - LOAD_SEC_TAP ; sec_taps - %endif - ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3 - %if notcpuflag(ssse3) - LOAD_SEC_TAP ; sec_taps - %endif - ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3 - ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3 + lea tapq, [tap_table] %else - %if cpuflag(ssse3) - pxor m3, m3 - %endif - PSHUFB_0 m2, m3 - ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3 - LOAD_SEC_TAP ; sec_taps - ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3 - %if notcpuflag(ssse3) - LOAD_SEC_TAP ; sec_taps - %endif - ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3 + mov [rsp+0x04], zerod %endif - - dec kq - jge .k_loop - - pxor m6, m6 - pcmpgtw m6, m13 - paddw m13, m6 - %if cpuflag(ssse3) - pmulhrsw m13, [PIC_sym(pw_2048)] + mov [rsp+0x00], dampingq + MOVDDUP m7, [tapq+dampingq*8] + lea dirq, [tapq+dirq*2] + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h %else - paddw m13, [PIC_sym(pw_8)] - psraw m13, 4 + DEFINE_ARGS dst, stride, off, stk, dir, tap, h %endif - paddw m4, m13 - pminsw m4, m7 - pmaxsw m4, m8 - packuswb m4, m4 - movif32 dstd, [esp+0x38] - movif32 strided, [esp+0x3C] + lea stkq, [px] + mov hd, %1*%2/8 +.sec_v_loop: + mov kd, 1 %if %1 == 4 - movd [dstq+strideq*0], m4 - psrlq m4, 32 - movd [dstq+strideq*1], m4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] %else - movq [dstq], m4 + mova m4, [stkq+32*0] %endif - - %if %1 == 4 - %define vloop_lines 
(mmsize/(%1*2)) - lea dstq, [dstq+strideq*vloop_lines] - add stkq, %3*vloop_lines - %else - lea dstq, [dstq+strideq] - add stkq, %3 + pxor m0, m0 +.sec_k_loop: + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 + %if ARCH_X86_32 + MOVDDUP m2, [tapq+12*8+kq*8] %endif + ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .sec_k_loop + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 dec hd - jg .v_loop - + jg .sec_v_loop RET %endmacro @@ -807,27 +759,26 @@ %macro CDEF_DIR 0 %if ARCH_X86_64 -cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3 - lea stride3q, [strideq*3] +cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var + lea r6, [strideq*3] movq m1, [srcq+strideq*0] movhps m1, [srcq+strideq*1] movq m3, [srcq+strideq*2] - movhps m3, [srcq+stride3q] + movhps m3, [srcq+r6 ] lea srcq, [srcq+strideq*4] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] - movhps m7, [srcq+stride3q] + movhps m7, [srcq+r6 ] pxor m8, m8 - psadbw m0, m1, m8 + psadbw m9, m1, m8 psadbw m2, m3, m8 psadbw m4, m5, m8 psadbw m6, m7, m8 - packssdw m0, m2 + packssdw m9, m2 packssdw m4, m6 - packssdw m0, m4 - SWAP m0, m9 + packssdw m9, m4 punpcklbw m0, m1, m8 punpckhbw m1, m8 @@ -837,7 +788,7 @@ punpckhbw m5, m8 punpcklbw m6, m7, m8 punpckhbw m7, m8 - +cglobal_label .main mova m8, [pw_128] psubw m0, m8 psubw m1, m8 @@ -1067,30 +1018,34 @@ punpckldq m4, m6 psubd m2, m0, m1 psubd m3, m0, m4 - mova [rsp+0x00], m2 ; emulate ymm in stack - mova [rsp+0x10], m3 +%if WIN64 + WIN64_RESTORE_XMM + %define tmp rsp+stack_offset+8 +%else + %define tmp rsp-40 +%endif + mova [tmp+0x00], m2 ; emulate ymm in stack + mova [tmp+0x10], m3 pcmpeqd m1, m0 ; compute best cost mask pcmpeqd m4, m0 packssdw m4, m1 pmovmskb eax, m4 ; get byte-idx from mask tzcnt eax, eax - mov r1d, [rsp+rax*2] ; get idx^4 complement from emulated ymm + mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm shr eax, 1 ; get direction by converting byte-idx to word-idx shr r1d, 10 mov [varq], r1d %else -cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3 - %define PIC_reg r4 - LEA PIC_reg, PIC_base_offset - +cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 +%define base r2-shufw_6543210x + LEA r2, shufw_6543210x pxor m0, m0 - mova m1, [PIC_sym(pw_128)] - lea stride3q, [strideq*3] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] movhps m7, [srcq+stride3q] + mova m1, [base+pw_128] psadbw m2, m5, m0 psadbw m3, m7, m0 packssdw m2, m3 @@ -1114,19 +1069,19 @@ movq m7, [srcq+strideq*2] movhps m7, [srcq+stride3q] psadbw m3, m5, m0 - psadbw m0, m7, m0 + psadbw m0, m7 packssdw m3, m0 pxor m0, m0 - packssdw m2, m3 punpcklbw m4, m5, m0 punpckhbw m5, m0 punpcklbw m6, m7, m0 punpckhbw m7, m0 +cglobal_label .main psubw m4, m1 psubw m5, m1 psubw m6, m1 psubw m7, m1 - + packssdw m2, m3 psllw m1, 3 psubw m2, m1 ; partial_sum_hv[0] pmaddwd m2, m2 @@ -1143,7 +1098,7 @@ pmaddwd m0, m0 phaddd m2, m0 - MULLD m2, [PIC_sym(div_table%+SUFFIX)+48] + MULLD m2, [base+div_table%+SUFFIX+48] mova [esp+0x30], m2 mova m1, [esp+0x10] @@ -1176,13 +1131,13 @@ paddw m0, m2 ; partial_sum_diag[0][0-7] paddw m1, m3 ; partial_sum_diag[0][8-14,zero] mova m3, [esp+0x50] - pshufb m1, [PIC_sym(shufw_6543210x)] + pshufb m1, [base+shufw_6543210x] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 - MULLD m2, [PIC_sym(div_table%+SUFFIX)+16] - MULLD m0, [PIC_sym(div_table%+SUFFIX)+0] + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] paddd m0, m2 ; 
cost[0a-d] mova [esp+0x40], m0 @@ -1217,13 +1172,13 @@ paddw m0, m2 ; partial_sum_diag[1][0-7] paddw m1, m3 ; partial_sum_diag[1][8-14,zero] mova m3, [esp+0x50] - pshufb m1, [PIC_sym(shufw_6543210x)] + pshufb m1, [base+shufw_6543210x] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 - MULLD m2, [PIC_sym(div_table%+SUFFIX)+16] - MULLD m0, [PIC_sym(div_table%+SUFFIX)+0] + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] paddd m0, m2 ; cost[4a-d] phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] phaddd m1, [esp+0x30] ; cost[0,4,2,6] @@ -1259,8 +1214,8 @@ punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 - MULLD m2, [PIC_sym(div_table%+SUFFIX)+48] - MULLD m0, [PIC_sym(div_table%+SUFFIX)+32] + MULLD m2, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m2 ; cost[7a-d] mova [esp+0x40], m0 @@ -1280,8 +1235,8 @@ punpcklwd m0, m2 pmaddwd m7, m7 pmaddwd m0, m0 - MULLD m7, [PIC_sym(div_table%+SUFFIX)+48] - MULLD m0, [PIC_sym(div_table%+SUFFIX)+32] + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m7 ; cost[5a-d] mova [esp+0x50], m0 @@ -1303,8 +1258,8 @@ punpcklwd m0, m2 pmaddwd m7, m7 pmaddwd m0, m0 - MULLD m7, [PIC_sym(div_table%+SUFFIX)+48] - MULLD m0, [PIC_sym(div_table%+SUFFIX)+32] + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m7 ; cost[1a-d] SWAP m0, m4 @@ -1330,8 +1285,8 @@ punpcklwd m4, m2 pmaddwd m0, m0 pmaddwd m4, m4 - MULLD m0, [PIC_sym(div_table%+SUFFIX)+48] - MULLD m4, [PIC_sym(div_table%+SUFFIX)+32] + MULLD m0, [base+div_table%+SUFFIX+48] + MULLD m4, [base+div_table%+SUFFIX+32] paddd m4, m0 ; cost[3a-d] mova m1, [esp+0x00] @@ -1367,6 +1322,7 @@ %endif ; get direction and variance + mov vard, varm punpckhdq m3, m2, m1 punpckldq m2, m1 psubd m1, m0, m3 @@ -1388,18 +1344,18 @@ %endmacro INIT_XMM sse4 -CDEF_FILTER 8, 8, 32 -CDEF_FILTER 4, 8, 32 -CDEF_FILTER 4, 4, 32 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 CDEF_DIR INIT_XMM ssse3 -CDEF_FILTER 8, 8, 32 -CDEF_FILTER 4, 8, 32 -CDEF_FILTER 4, 4, 32 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 CDEF_DIR INIT_XMM sse2 -CDEF_FILTER 8, 8, 32 -CDEF_FILTER 4, 8, 32 -CDEF_FILTER 4, 4, 32 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 diff -Nru dav1d-0.7.1/src/x86/cpuid.asm dav1d-0.9.1/src/x86/cpuid.asm --- dav1d-0.7.1/src/x86/cpuid.asm 2020-06-21 11:48:55.016126400 +0000 +++ dav1d-0.9.1/src/x86/cpuid.asm 2021-07-28 21:38:28.893852000 +0000 @@ -23,6 +23,7 @@ ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +%include "config.asm" %include "ext/x86/x86inc.asm" SECTION .text diff -Nru dav1d-0.7.1/src/x86/film_grain16_avx2.asm dav1d-0.9.1/src/x86/film_grain16_avx2.asm --- dav1d-0.7.1/src/x86/film_grain16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/film_grain16_avx2.asm 2021-07-28 21:38:28.893852000 +0000 @@ -0,0 +1,2375 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pd_16: dd 16 +pd_m65536: dd ~0xffff +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512, 1024 +max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 +min: dw 0, 16*4, 16*16 +pw_27_17_17_27: dw 27, 17, 17, 27 +; these two should be next to each other +pw_4: times 2 dw 4 +pw_16: times 2 dw 16 +pw_23_22: dw 23, 22, 0, 32 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_YMM avx2 +cglobal generate_grain_y_16bpc, 3, 9, 16, buf, fg_data, bdmax + lea r4, [pb_mask] +%define base r4-pb_mask + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r3d, [fg_dataq+FGData.grain_scale_shift] + lea r6d, [bdmaxq+1] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r6 + vpbroadcastw xm8, [base+round+r3*2-2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastd xm9, [base+pd_m65536] + mov r3, -73*82*2 + sub bufq, r3 + lea r6, [gaussian_sequence] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb 
xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r6+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw xm2, xm8 + movq [bufq+r3], xm2 + add r3, 4*2 + jl .loop + + ; auto-regression code + movsxd r3, [fg_dataq+FGData.ar_coeff_lag] + movsxd r3, [base+generate_grain_y_16bpc_avx2_table+r3*4] + lea r3, [r3+base+generate_grain_y_16bpc_avx2_table] + jmp r3 + +.ar1: + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_y] + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 + pinsrb xm4, [pb_1], 3 + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm8, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 + vinserti128 m8, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 + pxor m9, m9 + punpcklwd xm14, xm9 + pcmpgtb m9, m8 + punpcklbw m8, m9 ; cf5-11,0-4 + vpermq m9, m8, q3333 ; cf4 + psrldq xm10, xm8, 6 ; cf8-11 + vpblendw xm9, xm10, 11111110b ; cf4,9-11 + pshufd m12, m8, q0000 ; cf[5,6], cf[0-1] + pshufd m11, m8, q1111 ; cf[7,8], cf[2-3] + pshufd xm13, xm9, q1111 ; cf[10,11] + pshufd xm10, xm9, q0000 ; cf[4,9] + sar bdmaxd, 1 + movd xm15, bdmaxd + pcmpeqd xm7, xm7 + vpbroadcastd xm15, xm15 ; max_grain + pxor xm7, xm15 ; min_grain + sub bufq, 2*(82*73-(82*3+79)) + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] + psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] + psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] + + vextracti128 xm4, m0, 1 ; y=-2,x=[-2,+5] + punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpckhwd xm4, xm0 ; y=-2/-1 interleaved, x=[+2,+5] + punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + + pmaddwd m2, m11 + pmaddwd m0, m12 + pmaddwd xm4, xm10 + + paddd m0, m2 + vextracti128 xm2, m0, 1 + paddd xm4, xm0 + paddd xm2, xm14 + paddd xm2, xm4 + + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd xm4, xm0, q3321 + pmovsxwd xm4, 
xm4 ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd xm3, xm0, xm13 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + paddd xm3, xm4 + pminsd xm3, xm15 + pmaxsd xm3, xm7 + pextrw [bufq+xq*2], xm3, 0 + psrldq xm4, 4 + pslldq xm3, 2 + psrldq xm0, 2 + vpblendw xm0, xm3, 0010b + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~31 + sub rsp, 64 + %define tmp rsp +%elif STACK_ALIGNMENT < 32 + mov r6, rsp + and r6, ~31 + %define tmp r6-64 +%else + %define tmp rsp+stack_offset-88 +%endif + sar bdmaxd, 1 + movd xm15, bdmaxd + pcmpeqd xm13, xm13 + vpbroadcastd xm15, xm15 ; max_grain + pxor xm13, xm15 ; min_grain + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw m14, [base+round_vals+shiftq*2-12] + movq xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 + movd xm1, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 + pinsrb xm1, [pb_1], 3 ; cf14-16,pb_1 + movd xm2, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 + vinserti128 m1, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 + punpcklbw m0, m0 ; sign-extension + punpcklbw m1, m1 ; sign-extension + punpcklbw xm2, xm2 + REPX {psraw x, 8}, m0, m1, xm2 + + pshufd m8, m0, q0000 ; cf[0,1] | cf[7,8] + pshufd m9, m0, q1111 ; cf[2,3] | cf[9,10] + pshufd m10, m0, q2222 ; cf[4,5] | cf[11,12] + pshufd xm11, xm0, q3333 ; cf[6,13] + + pshufd m3, m1, q0000 ; cf[14,15] | cf[17,18] + pshufd m4, m1, q1111 ; cf[16],pw_1 | cf[19,20] + mova [tmp+0*32], m3 + mova [tmp+1*32], m4 + + paddw xm5, xm14, xm14 + vpblendw xm12, xm2, xm5, 00001000b + + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] + + palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] + punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + vextracti128 xm7, m1, 1 + punpcklwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] + + psrldq m3, m2, 2 + psrldq m4, m2, 4 + psrldq m7, m2, 6 + vpblendd m7, m14, 00001111b ; rounding constant + punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + + pmaddwd m0, m8 + pmaddwd m6, m9 + pmaddwd m5, m10 + pmaddwd xm1, xm11 + pmaddwd m2, [tmp+0*32] + pmaddwd m4, [tmp+1*32] + + paddd m0, m6 + paddd m5, m2 + paddd m0, m4 + paddd m0, m5 + vextracti128 xm4, m0, 1 + paddd xm0, xm1 + paddd xm0, xm4 + + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm12 + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + pminsd xm2, 
xm15 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + psrldq xm1, 2 + vpblendw xm1, xm2, 0100b + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%endif + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1_16bpc, 4, 10, 16, buf, bufy, fg_data, uv, bdmax +%define base r8-pb_mask + lea r8, [pb_mask] + movifnidn bdmaxd, bdmaxm + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + lea r6d, [bdmaxq+1] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r5, r6 + vpbroadcastw xm8, [base+round+r5*2-2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] + pxor xm0, xm9 + vpbroadcastd xm9, [base+pd_m65536] + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov r5, -44 +%else + mov r5, -82*73 + add bufq, 2*82*73 +%endif +.loop_x: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r6+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw xm2, xm8 + movq [bufq+r5*2], xm2 + add r5, 4 + jl .loop_x +%if %2 + add bufq, 82*2 + dec r7d + jg .loop_y +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_16bpc_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_avx2_table] + jmp r5 + +.ar0: + INIT_YMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + vpbroadcastw m3, [base+hmul_bits+shiftq*2-10] + sar bdmaxd, 1 + movd xm14, bdmaxd + pcmpeqw m7, m7 + vpbroadcastw m14, xm14 ; max_gain + pxor m7, m14 ; min_grain + DEFINE_ARGS buf, bufy, h, x + pmovsxbw xm4, xm4 +%if %2 + vpbroadcastw m6, [hmul_bits+2+%3*2] +%endif + vpbroadcastw m4, xm4 + pxor m5, m5 +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm8, [bufyq] + movu xm10, [bufyq+ 16] +%if %3 + movu xm9, [bufyq+82*2] + movu xm11, [bufyq+82*2+16] +%endif + vinserti128 m8, [bufyq+ 32], 1 + vinserti128 m10, [bufyq+ 48], 1 +%if %3 + vinserti128 m9, [bufyq+82*2+32], 1 + vinserti128 m11, [bufyq+82*2+48], 1 + paddw m8, m9 + paddw m10, m11 +%endif + phaddw m8, m10 + movu xm10, [bufyq+ 64] + movu xm12, [bufyq+ 80] +%if %3 + movu xm11, [bufyq+82*2+64] + movu xm13, [bufyq+82*2+80] +%endif + vinserti128 m10, [bufyq+ 96], 1 + vinserti128 m12, [bufyq+ 112], 1 +%if %3 + vinserti128 m11, [bufyq+82*2+96], 1 + vinserti128 m13, [bufyq+82*2+112], 1 + paddw m10, m11 + paddw m12, m13 +%endif + phaddw m10, m12 + pmulhrsw m8, m6 + pmulhrsw m10, m6 +%else + xor xd, xd +.x_loop_ar0: + movu m8, [bufyq+xq*2] + movu m10, [bufyq+xq*2+32] +%endif + punpckhwd m9, m8, m5 + 
punpcklwd m8, m5 + punpckhwd m11, m10, m5 + punpcklwd m10, m5 + REPX {pmaddwd x, m4}, m8, m9, m10, m11 + REPX {psrad x, 5}, m8, m9, m10, m11 + packssdw m8, m9 + packssdw m10, m11 + REPX {pmulhrsw x, m3}, m8, m10 +%if %2 + paddw m8, [bufq+ 0] + paddw m10, [bufq+32] +%else + paddw m8, [bufq+xq*2+ 0] + paddw m10, [bufq+xq*2+32] +%endif + pminsw m8, m14 + pminsw m10, m14 + pmaxsw m8, m7 + pmaxsw m10, m7 +%if %2 + movu [bufq+ 0], m8 + movu [bufq+32], m10 + + ; last 6 pixels + movu xm8, [bufyq+32*4] + movu xm10, [bufyq+32*4+16] +%if %3 + paddw xm8, [bufyq+32*4+82*2] + paddw xm10, [bufyq+32*4+82*2+16] +%endif + phaddw xm8, xm10 + pmulhrsw xm8, xm6 + punpckhwd xm9, xm8, xm5 + punpcklwd xm8, xm5 + REPX {pmaddwd x, xm4}, xm8, xm9 + REPX {psrad x, 5}, xm8, xm9 + packssdw xm8, xm9 + pmulhrsw xm8, xm3 + movu xm0, [bufq+32*2] + paddw xm8, xm0 + pminsw xm8, xm14 + pmaxsw xm8, xm7 + vpblendw xm0, xm8, xm0, 11000000b + movu [bufq+32*2], xm0 +%else + movu [bufq+xq*2+ 0], m8 + movu [bufq+xq*2+32], m10 + add xd, 32 + cmp xd, 64 + jl .x_loop_ar0 + + ; last 12 pixels + movu m8, [bufyq+64*2] + punpckhwd m9, m8, m5 + punpcklwd m8, m5 + REPX {pmaddwd x, m4}, m8, m9 + REPX {psrad x, 5}, m8, m9 + packssdw m8, m9 + pmulhrsw m8, m3 + movu m0, [bufq+64*2] + paddw m8, m0 + pminsw m8, m14 + pmaxsw m8, m7 + vpblendd m0, m8, m0, 11000000b + movu [bufq+64*2], m0 +%endif + + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: + INIT_XMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd + vpbroadcastw xm6, [hmul_bits+2+%3*2] + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu xm8, [bufyq+xq*4] +%else + movq xm8, [bufyq+xq*2] +%endif + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right +%if %2 +%if %3 + phaddw xm8, [bufyq+xq*4+82*2] + pshufd xm9, xm8, q3232 + paddw xm8, xm9 +%else + phaddw xm8, xm8 +%endif + pmulhrsw xm8, xm6 +%endif + punpcklwd xm0, xm2 + punpcklwd xm1, xm8 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 + paddd xm0, xm3 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 + RET + + INIT_YMM avx2 +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + sar bdmaxd, 1 + movd xm6, bdmaxd + pcmpeqd xm5, xm5 + vpbroadcastd xm6, xm6 ; max_grain + pxor xm5, xm6 ; min_grain +%if %2 + vpbroadcastw xm7, [base+hmul_bits+2+%3*2] +%endif + vpbroadcastw xm15, 
[base+round_vals-12+shiftq*2] + + movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+5] + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 + pinsrb xm0, [pb_1], 5 + pinsrw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 + movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+9], 13 + pmovsxbw m0, xm0 + + pshufd xm13, xm0, q3333 + pshufd m12, m0, q0000 + pshufd m11, m0, q1111 + pshufd m10, m0, q2222 + + DEFINE_ARGS buf, bufy, fg_data, h, x +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + movu xm0, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + vinserti128 m0, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] + psrldq m1, m0, 2 ; y=-1/-2,x=[-1,+5] + psrldq m2, m0, 4 ; y=-1/-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-1/-2,x=[+1,+5] + +%if %2 + movu xm8, [bufyq+xq*4] +%if %3 + paddw xm8, [bufyq+xq*4+82*2] +%endif + phaddw xm8, xm8 +%else + movq xm8, [bufyq+xq*2] +%endif + + vinserti128 m4, xm0, 1 ; y=-1,x=[-2,+5] + punpcklwd m2, m3 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpckhwd m4, m0, m4 ; y=-2/-1 interleaved, x=[+2,+5] + punpcklwd m0, m1 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + +%if %2 + pmulhrsw xm1, xm8, xm7 + punpcklwd xm1, xm15 ; luma, round interleaved +%else + punpcklwd xm1, xm8, xm15 +%endif + vpblendd m1, m1, m4, 11110000b + + pmaddwd m2, m11 + pmaddwd m0, m12 + pmaddwd m1, m10 + paddd m2, m0 + paddd m2, m1 + vextracti128 xm0, m2, 1 + paddd xm2, xm0 + + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd xm4, xm0, q3321 + pmovsxwd xm4, xm4 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd xm3, xm0, xm13 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd xm3, xm4 + pminsd xm3, xm6 + pmaxsd xm3, xm5 + pextrw [bufq+xq*2], xm3, 0 + psrldq xm0, 2 + pslldq xm3, 2 + psrldq xm4, 4 + vpblendw xm0, xm3, 00000010b + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~31 + sub rsp, 96 + %define tmp rsp +%elif STACK_ALIGNMENT < 32 + mov r6, rsp + and r6, ~31 + %define tmp r6-96 +%else + %define tmp rsp+stack_offset-120 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + sar bdmaxd, 1 + movd xm15, bdmaxd + pcmpeqd xm13, xm13 + vpbroadcastd xm15, xm15 ; max_grain + pxor xm13, xm15 ; min_grain +%if %2 + vpbroadcastw xm12, [base+hmul_bits+2+%3*2] +%endif + + movq xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma + movhps xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] + pmovsxbw m0, xm0 + + pshufd m11, m0, q3333 + pshufd m10, m0, q2222 + pshufd m9, m0, q1111 + pshufd m8, m0, q0000 + + movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] + pinsrb xm0, [pb_1], 3 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 + pmovsxbw m0, xm0 + + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + mova [tmp+32*2], m11 + pshufd xm11, xm0, q3232 + mova [tmp+32*0], m1 + mova [tmp+32*1], m2 + pinsrw xm11, [base+round_vals-10+shiftq*2], 3 + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 
2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] + +%if %2 + movu xm7, [bufyq+xq*4] +%if %3 + paddw xm7, [bufyq+xq*4+82*2] +%endif + phaddw xm7, xm7 +%else + movq xm7, [bufyq+xq*2] +%endif + + palignr m4, m1, m0, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] + punpckhwd m5, m0, m4 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m4 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + palignr m6, m5, m0, 8 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] +%if %2 + pmulhrsw xm7, xm12 +%endif + punpcklwd m1, m7 + + psrldq m3, m2, 2 + psrldq m4, m2, 4 + psrldq m7, m2, 6 + vpblendd m7, m14, 00001111b ; rounding constant + punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m4, m7 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + + pmaddwd m0, m8 + pmaddwd m6, m9 + pmaddwd m5, m10 + pmaddwd m1, [tmp+32*2] + pmaddwd m2, [tmp+32*0] + pmaddwd m4, [tmp+32*1] + + paddd m0, m6 + paddd m5, m2 + paddd m4, m1 + paddd m0, m4 + paddd m0, m5 + vextracti128 xm4, m0, 1 + paddd xm0, xm4 + + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm11 + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; no need to packssdw since we only care about one value + pminsd xm2, xm15 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + psrldq xm1, 2 + vpblendw xm1, xm2, 00000100b + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%endif + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +INIT_YMM avx2 +cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, grain_lut + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, r9m ; bdmax + sar r9d, 11 ; is_12bpc + shlx r10d, r6d, r9d + vpbroadcastw m13, [base+min+r10*2] + lea r9d, [r9d*3] + lea r9d, [r6d*2+r9d] + vpbroadcastw m12, [base+max+r9*2] + vpbroadcastw m10, r9m + pxor m2, m2 + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, byte [fg_dataq+FGData.overlap_flag] + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, 
[offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + pminuw m0, m10, [srcq+ 0] + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m8, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m5, [scalingq+m6-3], m3 + vpgatherdd m6, [scalingq+m7-3], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+offxyq*2+32] + + ; noise = round2(scaling[src] * grain, scaling_shift) + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + + ; r8m = sbym + movq xm15, [pw_27_17_17_27] + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) + vpbroadcastd xm14, [pd_16] +.loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + pminuw m0, m10, [srcq+ 0] + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m8, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m5, [scalingq+m6-3], m3 + vpgatherdd m6, [scalingq+m7-3], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movd xm7, [grain_lutq+left_offxyq*2] + punpcklwd xm7, xm9 + pmaddwd xm7, xm15 + paddd xm7, xm14 + psrad xm7, 5 + packssdw xm7, xm7 + vpblendd m9, m7, 00000001b + pcmpeqw m3, m3 + psraw m7, m10, 1 ; max_grain + pxor m3, m7 ; min_grain + pminsw m9, m7 + pmaxsw m9, m3 + movu m3, [grain_lutq+offxyq*2+32] + + ; noise = round2(scaling[src] * grain, scaling_shift) + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.end: + RET + +.vertical_overlap: + DEFINE_ARGS dst, src, 
stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + + vpbroadcastd m14, [pd_16] +.loop_x_v_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movu m7, [grain_lutq+top_offxyq*2] + punpckhwd m9, m7, m3 + punpcklwd m7, m3 + REPX {pmaddwd x, m15}, m9, m7 + REPX {paddd x, m14}, m9, m7 + REPX {psrad x, 5}, m9, m7 + packssdw m7, m9 + pcmpeqw m0, m0 + psraw m1, m10, 1 ; max_grain + pxor m0, m1 ; min_grain + pminsw m7, m1 + pmaxsw m7, m0 + movu m3, [grain_lutq+offxyq*2+32] + movu m8, [grain_lutq+top_offxyq*2+32] + punpckhwd m9, m8, m3 + punpcklwd m8, m3 + REPX {pmaddwd x, m15}, m9, m8 + REPX {paddd x, m14}, m9, m8 + REPX {psrad x, 5}, m9, m8 + packssdw m8, m9 + pminsw m8, m1 + pmaxsw m8, m0 + + ; src + pminuw m0, m10, [srcq+ 0] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + + ; scaling[src] + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m6, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + REPX {psrld x, 24}, m6, m4 + packssdw m6, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m6, m11 + pmulhrsw m6, m7 + + ; same for the other half + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m9, m1, m2 + punpcklwd m4, m1, m2 ; m4-7: src as dword + pcmpeqw m3, m3 + mova m7, m3 + vpgatherdd m5, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m9-3], m7 + REPX {psrld x, 24}, m5, m4 + packssdw m5, m4 + + pmullw m5, m11 + pmulhrsw m5, m8 + + ; dst = clip_pixel(src, noise) + paddw m0, m6 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82*2 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: + add wq, 32 + jge .end_hv + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; 
h+v overlap + + movq xm15, [pw_27_17_17_27] +.loop_x_hv_overlap: + vpbroadcastd m8, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+32] + lea left_offxyq, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movu m0, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2] + movu m1, [grain_lutq+top_offxyq*2+32] + movd xm4, [grain_lutq+left_offxyq*2] + movd xm7, [grain_lutq+topleft_offxyq*2] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd xm4, xm3 + punpcklwd xm7, xm6 + REPX {pmaddwd x, xm15}, xm4, xm7 + REPX {paddd x, xm14}, xm4, xm7 + REPX {psrad x, 5}, xm4, xm7 + REPX {packssdw x, x}, xm4, xm7 + pcmpeqw m5, m5 + psraw m9, m10, 1 ; max_grain + pxor m5, m9 ; min_grain + REPX {pminsw x, xm9}, xm4, xm7 + REPX {pmaxsw x, xm5}, xm4, xm7 + vpblendd m3, m4, 00000001b + vpblendd m6, m7, 00000001b + ; followed by v interpolation (top | cur -> cur) + punpckhwd m7, m6, m3 + punpcklwd m6, m3 + punpckhwd m3, m1, m0 + punpcklwd m1, m0 + REPX {pmaddwd x, m8}, m7, m6, m3, m1 + REPX {paddd x, m14}, m7, m6, m3, m1 + REPX {psrad x, 5}, m7, m6, m3, m1 + packssdw m7, m6, m7 + packssdw m3, m1, m3 + REPX {pminsw x, m9}, m7, m3 + REPX {pmaxsw x, m5}, m7, m3 + + ; src + pminuw m0, m10, [srcq+ 0] + pminuw m1, m10, [srcq+32] ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + + ; scaling[src] + pcmpeqw m9, m9 + vpgatherdd m6, [scalingq+m4-3], m9 + pcmpeqw m9, m9 + vpgatherdd m4, [scalingq+m5-3], m9 + REPX {psrld x, 24}, m6, m4 + packssdw m6, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m6, m11 + pmulhrsw m7, m6 + + ; other half + punpckhwd m5, m1, m2 + punpcklwd m4, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m6, m6 + vpgatherdd m9, [scalingq+m4-3], m6 + pcmpeqw m6, m6 + vpgatherdd m4, [scalingq+m5-3], m6 + REPX {psrld x, 24}, m9, m4 + packssdw m9, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m9, m11 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m7 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + vpbroadcastd m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82*2 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .loop_x_hv_overlap + +.end_hv: + RET + +%macro 
FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r8-pb_mask + lea r8, [pb_mask] + mov r7d, [fg_dataq+FGData.scaling_shift] + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, r13m ; bdmax + sar r9d, 11 ; is_12bpc + shlx r10d, r6d, r9d + vpbroadcastw m13, [base+min+r10*2] + lea r10d, [r9d*3] + mov r11d, is_idm + shlx r6d, r6d, r11d + add r10d, r6d + vpbroadcastw m12, [base+max+r10*2] + vpbroadcastw m10, r13m + pxor m2, m2 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + +%if %1 + mov r7d, r11m + vpbroadcastw m0, [fg_dataq+FGData.uv_mult+r7*4] + vpbroadcastw m1, [fg_dataq+FGData.uv_luma_mult+r7*4] + punpcklwd m14, m1, m0 + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] + vpbroadcastd m9, [base+pw_4+r9*4] + pmullw m15, m9 +%else + vpbroadcastd m14, [pd_16] +%if %2 + vpbroadcastq m15, [pw_23_22] +%else + vpbroadcastq m15, [pw_27_17_17_27] +%endif +%endif + + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, byte [fg_dataq+FGData.overlap_flag] + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y: + ; src + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] ; m0-1: src as word +%else + mova m1, [srcq+32] +%endif + + ; luma_src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + mova 
m9, m3 + vpgatherdd m8, [scalingq+m4-3], m3 + vpgatherdd m4, [scalingq+m5-3], m9 + pcmpeqw m3, m3 + mova m9, m3 + vpgatherdd m5, [scalingq+m6-3], m3 + vpgatherdd m6, [scalingq+m7-3], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + packssdw m8, m4 + packssdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m8, m5 + pmulhrsw m9, m8 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y + + add wq, 32>>%2 + jge %%end + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] + + ; luma_src + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m1, [srcq+32] + + ; luma_src + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif + movd xm5, [grain_lutq+left_offxyq*2+ 0] +%if %2 + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} + punpckldq xm7, xm9, xm3 ; {cur0, cur1} + punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1} +%else + punpcklwd xm5, xm9 +%endif +%if %1 +%if %2 + vpbroadcastq xm8, [pw_23_22] +%else + movq xm8, [pw_27_17_17_27] +%endif + pmaddwd xm5, xm8 + vpbroadcastd 
xm8, [pd_16] + paddd xm5, xm8 +%else + pmaddwd xm5, xm15 + paddd xm5, xm14 +%endif + psrad xm5, 5 + packssdw xm5, xm5 + pcmpeqw xm8, xm8 + psraw xm7, xm10, 1 + pxor xm8, xm7 + pmaxsw xm5, xm8 + pminsw xm5, xm7 + vpblendd m9, m9, m5, 00000001b +%if %2 + psrldq xm5, 4 + vpblendd m3, m3, m5, 00000001b +%endif + + ; scaling[luma_src] + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + pcmpeqw m7, m7 + vpgatherdd m8, [scalingq+m4-3], m7 + pcmpeqw m7, m7 + vpgatherdd m4, [scalingq+m5-3], m7 + REPX {psrld x, 24}, m8, m4 + packssdw m8, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m8, m11 + pmulhrsw m9, m8 + + ; same for the other half + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + pcmpeqw m8, m8 + mova m4, m8 + vpgatherdd m5, [scalingq+m6-3], m8 + vpgatherdd m6, [scalingq+m7-3], m4 + REPX {psrld x, 24}, m5, m6 + packssdw m5, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y_h_overlap + + add wq, 32>>%2 + jge %%end + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%end: + RET + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, unused3, unused4, unused5, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %2 == 0 + lea r10, [pw_27_17_17_27] +%endif + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_v_overlap: + ; src + mova m0, [srcq] +%if %2 + mova 
m1, [srcq+strideq] + + ; luma_src + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m1, [srcq+32] + + ; luma_src + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m9, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + punpckhwd m7, m5, m9 + punpcklwd m5, m9 ; {top/cur interleaved} +%if %3 + vpbroadcastd m3, [pw_23_22] +%elif %2 + vpbroadcastd m3, [pw_27_17_17_27] +%else + vpbroadcastd m3, [r10] +%endif + REPX {pmaddwd x, m3}, m7, m5 +%if %1 + vpbroadcastd m8, [pd_16] + REPX {paddd x, m8}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m9, m5, m7 +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif +%if %3 == 0 +%if %2 + movu m5, [grain_lutq+top_offxyq*2+82*2] +%else + movu m5, [grain_lutq+top_offxyq*2+32] +%endif + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} +%if %2 + vpbroadcastd m3, [pw_27_17_17_27+4] +%else + vpbroadcastd m3, [r10] +%endif + REPX {pmaddwd x, m3}, m7, m5 +%if %1 + REPX {paddd x, m8}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 +%endif ; %3 == 0 + pcmpeqw m7, m7 + psraw m5, m10, 1 + pxor m7, m5 +%if %3 + pmaxsw m9, m7 + pminsw m9, m5 +%else + REPX {pmaxsw x, m7}, m9, m3 + REPX {pminsw x, m5}, m9, m3 +%endif + + ; scaling[luma_src] + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + pcmpeqw m7, m7 + vpgatherdd m8, [scalingq+m4-3], m7 + pcmpeqw m7, m7 + vpgatherdd m4, [scalingq+m5-3], m7 + REPX {psrld x, 24}, m8, m4 + packssdw m8, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m8, m11 + pmulhrsw m9, m8 + + ; scaling for the other half + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + pcmpeqw m8, m8 + mova m4, m8 + vpgatherdd m5, [scalingq+m6-3], m8 + vpgatherdd m6, [scalingq+m7-3], m4 + REPX {psrld x, 24}, m5, m6 + packssdw m5, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + + sub hb, 2 +%else + mova [dstq+32], m1 + dec hb +%endif + jle %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + jmp %%loop_y +%else + btc hd, 16 + jc %%loop_y + add r10, 4 + jmp %%loop_y_v_overlap +%endif + +%%end_y_v_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; since 
fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + +%if %2 == 0 + lea r12, [pw_27_17_17_27] + mov r13mp, r12 +%endif + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movd xm5, [grain_lutq+left_offxyq*2] +%if %2 + pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 +%if %3 + vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } +%else + ; insert both top/left lines + movd xm9, [grain_lutq+topleft_offxyq*2+82*2] + pinsrw xm9, [grain_lutq+topleft_offxyq*2], 2 + vinserti128 m5, xm9, 1 +%endif +%else + pinsrd xm5, [grain_lutq+topleft_offxyq*2], 1 +%endif + movu m9, [grain_lutq+offxyq*2] +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif + movu m8, [grain_lutq+top_offxyq*2] +%if %2 + punpckldq xm7, xm9, xm3 ; { cur0, cur1 } +%if %3 + vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 } +%else + ; insert both top lines + movu m1, [grain_lutq+top_offxyq*2+82*2] + punpckldq xm0, xm1, xm8 + vinserti128 m7, xm0, 1 +%endif +%else + movu m1, [grain_lutq+top_offxyq*2+32] + punpckldq xm7, xm9, xm8 +%endif + punpcklwd m5, m7 ; { cur/left } interleaved +%if %2 +%if %1 + vpbroadcastq m0, [pw_23_22] + pmaddwd m5, m0 + vpbroadcastd m0, [pd_16] + paddd m5, m0 +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + vextracti128 xm0, m5, 1 + packssdw xm5, xm0 +%else +%if %1 + movddup xm0, [pw_27_17_17_27] + pmaddwd xm5, xm0 + vpbroadcastd m0, [pd_16] + paddd xm5, xm0 +%else + pmaddwd xm5, xm15 + paddd xm5, xm14 +%endif + psrad xm5, 5 + packssdw xm5, xm5 +%endif + pcmpeqw m0, m0 + psraw m7, m10, 1 + pxor m0, m7 + pminsw xm5, xm7 + pmaxsw xm5, xm0 + vpblendd m9, m9, m5, 00000001b +%if %2 + psrldq xm5, 4 + vpblendd m3, m3, m5, 00000001b +%if %3 == 0 + psrldq xm5, 4 + vpblendd m1, m1, m5, 00000001b +%endif +%endif + psrldq xm5, 4 + vpblendd m5, m8, m5, 00000001b + + punpckhwd m8, m5, m9 + punpcklwd m5, m9 ; {top/cur interleaved} +%if %3 + vpbroadcastd m9, [pw_23_22] +%elif %2 + vpbroadcastd m9, [pw_27_17_17_27] +%else + xchg r12, r13mp + vpbroadcastd m9, [r12] +%endif + REPX {pmaddwd x, m9}, m8, m5 +%if %1 + vpbroadcastd m4, [pd_16] + REPX {paddd x, m4}, m8, m5 +%else + REPX {paddd x, m14}, m8, m5 +%endif + REPX {psrad x, 5}, m8, m5 + packssdw m9, m5, m8 +%if %3 + pminsw m9, m7 + pmaxsw m9, m0 +%else + punpckhwd m8, m1, m3 + punpcklwd m1, m3 ; {top/cur interleaved} +%if %2 + vpbroadcastd m3, [pw_27_17_17_27+4] +%else + 
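For reference, the repeated pmaddwd / paddd / psrad-by-5 / clamp pattern in this overlap path is one and the same weighted blend of an old and a new grain value; only the weight constants change. A plain-C sketch follows; blend_grain and its parameter names are illustrative rather than dav1d identifiers, the +16 bias corresponds to pd_16, and the clamp bounds are the ones the asm derives from bdmax (max_grain = bdmax >> 1, min_grain = ~max_grain).

/* Weighted grain blend used at block overlaps (assumed reading of the asm).
 * The two overlapped luma rows/columns use the weight pairs (27,17) and
 * (17,27) from pw_27_17_17_27; the single subsampled chroma row/column
 * uses (23,22) from pw_23_22. */
static int blend_grain(int old_g, int new_g, int w_old, int w_new,
                       int grain_min, int grain_max)
{
    int g = (old_g * w_old + new_g * w_new + 16) >> 5;   /* round2(.., 5) */
    return g < grain_min ? grain_min : g > grain_max ? grain_max : g;
}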
vpbroadcastd m3, [r12] + xchg r12, r13mp +%endif + REPX {pmaddwd x, m3}, m8, m1 +%if %1 + REPX {paddd x, m4}, m8, m1 +%else + REPX {paddd x, m14}, m8, m1 +%endif + REPX {psrad x, 5}, m8, m1 + packssdw m3, m1, m8 + REPX {pminsw x, m7}, m9, m3 + REPX {pmaxsw x, m0}, m9, m3 +%endif + + ; src + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + + ; luma_src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm7, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+32], 1 + vinserti128 m7, [lumaq+lstrideq*0+48], 1 + mova xm6, [lumaq+lstrideq*(1<<%3)+ 0] + mova xm8, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m6, [lumaq+lstrideq*(1<<%3)+32], 1 + vinserti128 m8, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m4, m7 + phaddw m6, m8 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m6, [lumaq+32] +%endif + +%if %1 + punpckhwd m8, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m8, m4, m5, m6 + REPX {psrad x, 6}, m8, m4, m5, m6 + packssdw m4, m8 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, m2}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pminuw x, m10}, m4, m6 +%endif + + ; scaling[luma_src] + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + pcmpeqw m7, m7 + vpgatherdd m8, [scalingq+m4-3], m7 + pcmpeqw m7, m7 + vpgatherdd m4, [scalingq+m5-3], m7 + REPX {psrld x, 24}, m8, m4 + packssdw m8, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m8, m11 + pmulhrsw m9, m8 + + ; same for the other half + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + pcmpeqw m8, m8 + mova m4, m8 + vpgatherdd m5, [scalingq+m6-3], m8 + vpgatherdd m6, [scalingq+m7-3], m4 + REPX {psrld x, 24}, m5, m6 + packssdw m5, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m9 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 + jg %%loop_y_h_overlap +%else + dec hb + jle %%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap + add r13mp, 4 + jmp %%loop_y_hv_overlap +%endif + +%%end_y_hv_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r10mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + jmp %%loop_x_hv_overlap + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/film_grain16_sse.asm dav1d-0.9.1/src/x86/film_grain16_sse.asm --- dav1d-0.7.1/src/x86/film_grain16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/film_grain16_sse.asm 2021-07-28 21:38:28.893852000 +0000 @@ -0,0 +1,3450 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 +pd_16: times 4 dd 16 +pw_1: times 8 dw 1 +pw_16384: times 8 dw 16384 +pw_8192: times 8 dw 8192 +pw_23_22: dw 23, 22 + times 3 dw 0, 32 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +pw_27_17_17_27: dw 27, 17, 17, 27 + times 2 dw 0, 32 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512, 1024 +max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 +min: dw 0, 16*4, 16*16 +; these two should be next to each other +pw_4: times 2 dw 4 +pw_16: times 2 dw 16 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%if ARCH_X86_32 +%undef base +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 8 +%define %%tmp %8 +%endif +%rep (%6/2) +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %6 == 8 +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif +%endif + movzx %4 %+ d, %5 %+ w + 
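The vpgatherdw helper being defined here works around the lack of a word-granularity gather on SSE targets: it peels 16-bit indices out of an XMM register with movd/movzx/shr and re-inserts the fetched table entries with pinsrw. Below is a rough C model of one expansion, assuming gather_words and its arguments are illustrative names only.

#include <stdint.h>
#include <string.h>

/* dst[i] = 16-bit load from base + idx[i] * stride, for cnt packed indices;
 * the macro unrolls this loop and recycles two GPRs plus a scratch XMM reg. */
static void gather_words(uint16_t *dst, const uint16_t *idx,
                         const uint8_t *base, int cnt, int stride)
{
    for (int i = 0; i < cnt; i++)
        memcpy(&dst[i], base + idx[i] * stride, sizeof(*dst));
}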
shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4*%7] +%else + pinsrw %1, [%3+%4*%7], %%idx + 0 +%endif + pinsrw %1, [%3+%5*%7], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +%macro SPLATD 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax + lea r4, [pb_mask] +%define base r4-pb_mask +%else +cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax + LEA r4, $$ +%define base r4-$$ +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r3d, [fg_dataq+FGData.grain_scale_shift] + lea r5d, [bdmaxq+1] + shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r5 + SPLATW m6, [base+round+r3*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] + mov r3, -73*82*2 + sub bufq, r3 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +.loop: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r5, r7, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m6 + movq [bufq+r3], m3 + add r3, 4*2 + jl .loop + + ; auto-regression code + movsxd r3, [fg_dataq+FGData.ar_coeff_lag] + movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] + lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table] + jmp r3 + +.ar1: +%if WIN64 + DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 + lea bufq, [r0-2*(82*73-(82*3+79))] + PUSH r8 +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 + DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 + PUSH r6 +%define shiftd r1d +%endif + sub bufq, 2*(82*73-(82*3+79)) +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] +%if WIN64 + DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 +%elif ARCH_X86_64 + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 +%undef shiftd + DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 +%define hd dword r0m +%define maxd dword minm +%endif +%if cpuflag(sse4) + pmovsxbw m4, m4 +%else + pxor m3, m3 + pcmpgtb m3, m4 + punpcklbw m4, m3 +%endif + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; 
keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +%if WIN64 + POP r8 +%elif ARCH_X86_32 + POP r6 +%undef maxd +%undef hd +%endif +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m0, [base+round_vals-12+shiftq*2] + pshuflw m0, m0, q0000 + movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 + pxor m2, m2 + punpcklwd m0, m2 + pcmpgtb m2, m6 + punpckhbw m3, m6, m2 + punpcklbw m6, m2 + pshufd m2, m6, q3333 + pshufd m1, m6, q2222 + pshufd m7, m6, q1111 + pshufd m6, m6, q0000 + pshufd m4, m3, q1111 + pshufd m3, m3, q0000 +%if ARCH_X86_64 + SWAP 0, 12 + SWAP 1, 8 + SWAP 2, 9 + SWAP 3, 10 + SWAP 4, 11 +%else +%define m12 [rsp+0*16] +%define m8 [rsp+1*16] +%define m9 [rsp+2*16] +%define m10 [rsp+3*16] +%define m11 [rsp+4*16] + mova m12, m0 + mova m8, m1 + mova m9, m2 + mova m10, m3 + mova m11, m4 + mov bdmaxd, bdmaxm +%endif + sar bdmaxd, 1 + SPLATW m0, bdmaxd ; max_grain + pcmpeqw m1, m1 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + psrldq m2, 14 + pslldq m2, 2 + pxor m2, m1 +%endif + pxor m1, m0 ; min_grain +%if ARCH_X86_64 + SWAP 0, 13 + SWAP 1, 14 + SWAP 2, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] + mova m13, m0 + mova m14, m1 +%if !cpuflag(sse4) +%define m15 [rsp+7*16] + mova m15, m2 +%endif +%endif + sub bufq, 2*(82*73-(82*3+79)) + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m2, m0, 2 + psrldq m3, m0, 4 + psrldq m4, m0, 6 + psrldq m5, m0, 8 + punpcklwd m0, m2 + punpcklwd m3, m4 + punpcklwd m5, m1 + psrldq m2, m1, 2 + psrldq m4, m1, 4 + punpcklwd m2, m4 + psrldq m4, m1, 6 + psrldq m1, 8 + punpcklwd m4, m1 + pmaddwd m0, m6 + pmaddwd m3, m7 + pmaddwd m5, m8 + pmaddwd m2, m9 + pmaddwd m4, m10 + paddd m0, m3 + paddd m5, m2 + paddd m0, m4 + paddd m0, m5 ; accumulated top 2 rows + paddd m0, m12 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m4, m1, q3321 + pxor m2, m2 + pcmpgtw m2, m4 + punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd m2, m1, m11 + paddd m2, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + paddd m2, m4 + packssdw m2, m2 + pminsw m2, m13 + pmaxsw m2, m14 + psrldq m4, 4 + pslldq m2, 2 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000010b +%else + pand m1, m15 + pandn m3, m15, m2 + por m1, m3 +%endif + ; overwrite previous pixel, this should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 64 + %define tmp rsp +%elif ARCH_X86_64 + %define tmp rsp+stack_offset-72 +%else +%assign stack_offset stack_offset_old + ALLOC_STACK -16*12 + %define tmp rsp + mov bdmaxd, bdmaxm +%endif + sar bdmaxd, 1 + SPLATW m7, bdmaxd ; max_grain + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m4, m4 + psrldq m4, 14 + pslldq m4, 4 + pxor m4, m6 +%endif + pxor m6, m7 ; min_grain + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + +%if 
ARCH_X86_64 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m14 [rsp+10*16] +%define m15 [esp+11*16] + mova m14, m6 + mova m15, m7 +%endif + + ; build cf0-1 until 18-19 in m5-12 and r0/1 + pxor m1, m1 + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + +%if cpuflag(sse4) + pshufd m4, m2, q3333 +%else + pshufd m5, m2, q3333 + mova [tmp+48], m5 +%endif + pshufd m3, m2, q2222 + pshufd m1, m2, q0000 + pshufd m2, m2, q1111 + pshufd m7, m0, q2222 + pshufd m6, m0, q1111 + pshufd m5, m0, q0000 + pshufd m0, m0, q3333 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+4*16] +%define m9 [esp+5*16] +%define m10 [rsp+6*16] +%define m11 [esp+7*16] +%define m12 [rsp+8*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + + ; build cf20,round in r2 + ; build cf21-23,round*2 in m13 + pxor m1, m1 + movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pcmpgtb m1, m0 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + mova [tmp+ 0], m1 + mova [tmp+16], m2 + psrldq m3, m0, 10 + pinsrw m3, [base+round_vals+shiftq*2-10], 3 + +%if ARCH_X86_64 + SWAP 3, 13 +%else +%define m13 [esp+9*16] + mova m13, m3 +%endif + + pinsrw m0, [base+round_vals+shiftq*2-12], 5 + pshufd m3, m0, q2222 + mova [tmp+32], m3 + + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + paddd m0, m2 + paddd m0, m3 + ; m0 = top line first 6 multiplied by cf, m1 = top line last entry + + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] + punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + + pmaddwd m1, m8 + pmaddwd m4, m9 + pmaddwd m3, m10 + pmaddwd m2, m11 + paddd m1, m4 + paddd m3, m2 + paddd m0, m1 + paddd m0, m3 + ; m0 = top 2 lines multiplied by cf + + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, [base+pw_1] + +%if cpuflag(sse4) + pmaddwd m1, m12 +%else + pmaddwd m1, [tmp+48] +%endif + pmaddwd m3, [tmp+ 0] + pmaddwd m4, [tmp+16] + pmaddwd m2, [tmp+32] + paddd m1, m3 + paddd m4, m2 + paddd m0, m1 + paddd m0, m4 + ; m0 = top 3 lines multiplied by cf plus rounding for downshift + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m15 + pmaxsw m2, m14 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + 
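Taken together, the .ar1/.ar2/.ar3 branches of the luma generator implement the same causal auto-regressive refinement, just unrolled per lag with the left-neighbour term handled serially. The compact C sketch below shows the assumed underlying recurrence; apply_ar, the flat coefficient walk and the omission of the 82-wide border handling are simplifications, not the dav1d reference code.

#include <stdint.h>

/* For every sample, accumulate coef[i] * neighbour over the causal lag
 * window, round, shift by ar_coeff_shift, add to the Gaussian base grain
 * and clamp.  '>>' on the signed sum is assumed to be arithmetic. */
static void apply_ar(int16_t *g, int stride, int w, int h,
                     const int8_t *coef, int lag, int shift,
                     int grain_min, int grain_max)
{
    for (int y = lag; y < h; y++)
        for (int x = lag; x < w - lag; x++) {
            int sum = 1 << (shift - 1), c = 0;      /* rounding term */
            for (int dy = -lag; dy <= 0; dy++)
                for (int dx = -lag; dx <= lag; dx++) {
                    if (!dy && !dx) break;          /* only causal neighbours */
                    sum += coef[c++] * g[(y + dy) * stride + x + dx];
                }
            int v = g[y * stride + x] + (sum >> shift);
            g[y * stride + x] = v < grain_min ? grain_min :
                                v > grain_max ? grain_max : v;
        }
}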
pblendw m1, m2, 00000100b +%else + pand m1, m12 + pandn m3, m12, m2 + por m1, m3 +%endif + ; overwrite a couple of pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg +%define base r8-pb_mask + lea r8, [pb_mask] + movifnidn bdmaxd, bdmaxm + lea r6d, [bdmaxq+1] +%else +cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h +%define base r2-$$ + LEA r2, $$ + mov fg_dataq, r2m + mov r6d, r4m + inc r6d +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r5, r6 + SPLATW m6, [base+round+r5*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] +%if ARCH_X86_64 + SPLATW m2, [base+pw_seed_xor+uvq*4] +%else + mov r5d, r3m + SPLATW m2, [base+pw_seed_xor+r5*4] +%endif + pxor m0, m2 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +%if %2 + mov hd, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov xq, -44 +%else + mov xq, -82*73 + add bufq, 82*73*2 +%endif +.loop_x: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r9, r10, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m6 + movq [bufq+xq*2], m3 + add xq, 4 + jl .loop_x +%if %2 + add bufq, 82*2 + dec hd + jg .loop_y +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] + jmp r5 + +.ar0: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset_old stack_offset + ALLOC_STACK -16*2 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + SPLATW m3, [base+hmul_bits+shiftq*2-10] +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m1, bdmaxd ; max_gain +%else + SPLATW m1, r4m + psraw m1, 1 +%endif + pcmpeqw m7, m7 + pxor m7, m1 ; min_grain +%if ARCH_X86_64 + SWAP 1, 14 + DEFINE_ARGS buf, bufy, h, x +%else +%define m14 [rsp+0*16] + mova m14, m1 + DEFINE_ARGS buf, bufy, pic_reg, h, x +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATW m4, m4 + pxor m5, m5 +%if %2 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + pslldq m2, 12 +%if ARCH_X86_64 + SWAP 2, 12 +%else +%define m12 [rsp+1*16] + mova m12, m2 +%endif +%endif +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + 
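In the lag-0 chroma path entered above, the single ar_coeffs_uv value only weighs the co-located, subsample-averaged luma grain. The asm splits the downshift between the psrad by 5 and a pmulhrsw constant; the sketch below assumes the combined effect is a single round2 by ar_coeff_shift, and chroma_ar0_sample with its argument names is illustrative rather than a dav1d function.

#include <stdint.h>

/* chroma grain += round2(coef * averaged luma grain, ar_coeff_shift),
 * clamped to the same grain range the asm builds from bdmax. */
static int16_t chroma_ar0_sample(int16_t cg, int luma_avg, int coef,
                                 int ar_coeff_shift, int gmin, int gmax)
{
    int v = cg + ((coef * luma_avg + (1 << (ar_coeff_shift - 1))) >> ar_coeff_shift);
    return v < gmin ? gmin : v > gmax ? gmax : v;
}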
add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: + ; first 32 pixels + xor xd, xd +.x_loop_ar0: + movu m0, [bufyq+xq*(2<<%2)] +%if %2 +%if %3 + movu m2, [bufyq+xq*4+82*2] + paddw m0, m2 +%endif + movu m1, [bufyq+xq*4 +16] +%if %3 + movu m2, [bufyq+xq*4+82*2+16] + paddw m1, m2 +%endif + phaddw m0, m1 + pmulhrsw m0, m6 +%endif + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + REPX {pmaddwd x, m4}, m0, m1 + REPX {psrad x, 5}, m0, m1 + packssdw m0, m1 + pmulhrsw m0, m3 + movu m1, [bufq+xq*2] + paddw m0, m1 + pminsw m0, m14 + pmaxsw m0, m7 + cmp xd, 72-40*%2 + je .end + movu [bufq+xq*2], m0 + add xd, 8 + jmp .x_loop_ar0 + + ; last 6/4 pixels +.end: +%if %2 +%if cpuflag(sse4) + pblendw m0, m1, 11000000b +%else + pand m1, m12 + pandn m2, m12, m0 + por m0, m1, m2 +%endif + movu [bufq+xq*2], m0 +%else + movq [bufq+xq*2], m0 +%endif + + add bufq, 82*2 + add bufyq, 82*(2<<%3) + dec hd + jg .y_loop_ar0 +%if ARCH_X86_32 +%undef m12 +%undef m14 +%endif + RET + +.ar1: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x +%else +%assign stack_offset stack_offset_old +%xdefine rstk rsp +%assign stack_size_padded 0 + DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] +%if WIN64 + DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 +%if %2 + lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] +%else + lea bufq, [r0-2*(82*69+3)] +%endif +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 +%else + DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 +%define hd dword r1m +%define mind dword r3m +%define maxd dword r4m +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif +%endif +%if ARCH_X86_64 + mov shiftd, [r2+FGData.ar_coeff_shift] +%else + mov shiftd, [r3+FGData.ar_coeff_shift] +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 ; cf0-4 in words + pshuflw m4, m4, q2100 + psrldq m4, 2 ; cf0-3,4 in words + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pxor m6, m6 + punpcklwd m3, m6 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATD m3, m3 + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 +%if ARCH_X86_64 + mov mind, maxd + xor mind, -1 +%else + DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 + mov r2, maxd + xor r2, -1 + mov mind, r2 +%endif +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu m7, [bufyq+xq*4] +%if %3 + movu m1, [bufyq+xq*4+82*2] + phaddw m7, m1 +%else + phaddw m7, m7 +%endif +%else + movq m7, [bufyq+xq*2] +%endif + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 +%if %2 +%if %3 + pshufd m2, m7, q3232 + paddw m7, m2 +%endif + pmulhrsw m7, m6 +%endif + punpcklwd m1, m7 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 +%if ARCH_X86_32 +%undef maxd 
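The reason .x_loop_ar1_inner (and its ar2/ar3 counterparts) drops to scalar arithmetic is the feedback through the left neighbour: each filtered sample becomes the "left" input of the very next one, so only the top-row contributions can be computed four at a time. A toy C model of that dependency follows, where ar1_row and top[] (the precomputed top-row sums including rounding) are illustrative.

#include <stdint.h>

/* val3 plays the role the asm keeps live in val3d across x iterations;
 * row points inside a bordered buffer, so row[-1] is the left neighbour. */
static void ar1_row(int16_t *row, const int32_t *top, int w, int cf3,
                    int shift, int gmin, int gmax)
{
    int val3 = row[-1];
    for (int x = 0; x < w; x++) {
        int v = row[x] + ((cf3 * val3 + top[x]) >> shift);
        if (v < gmin) v = gmin;
        else if (v > gmax) v = gmax;
        val3 = row[x] = v;                   /* stored value is next left */
    }
}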
+%undef mind +%undef hd +%endif + RET + +.ar2: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift + ALLOC_STACK -16*8 + mov bufyq, r1m + mov uvd, r3m +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m5, bdmaxd ; max_grain +%else + SPLATW m5, r4m + psraw m5, 1 +%endif + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m7, m7 + psrldq m7, 14 + pslldq m7, 2 + pxor m7, m6 +%endif + pxor m6, m5 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m7, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 5, 13 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] +%define m15 [rsp+7*16] + mova m13, m5 + mova m14, m6 + mova m15, m7 +%endif + + ; coef values + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pinsrw m2, [base+round_vals-12+shiftq*2], 5 + + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m1, m0, q3333 + pshufd m0, m0, q2222 + pshufd m3, m2, q1111 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+0*16] +%define m9 [rsp+1*16] +%define m10 [rsp+2*16] +%define m11 [rsp+3*16] +%define m12 [rsp+4*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m4, m0, 2 ; y=-2,x=[-1,+5] + psrldq m1, m0, 4 ; y=-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-2,x=[+1,+5] + psrldq m2, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] + pmaddwd m0, m6 + pmaddwd m1, m7 + pmaddwd m2, m8 + paddd m0, m1 + paddd m0, m2 + psrldq m3, m5, 2 ; y=-1,x=[-1,+5] + psrldq m1, m5, 4 ; y=-1,x=[-0,+5] + psrldq m4, m5, 6 ; y=-1,x=[+1,+5] + psrldq m2, m5, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + punpcklwd m4, m2 + pmaddwd m3, m9 + pmaddwd m4, m10 + paddd m3, m4 + paddd m0, m3 + + ; luma component & rounding +%if %2 + movu m1, [bufyq+xq*4] +%if %3 + movu m2, [bufyq+xq*4+82*2] + phaddw m1, m2 + pshufd m2, m1, q3232 + paddw m1, m2 +%else + phaddw m1, m1 +%endif +%if cpuflag(sse4) + pmulhrsw m1, m15 +%elif %3 + pmulhrsw m1, [base+pw_8192] +%else + pmulhrsw m1, [base+pw_16384] +%endif +%else + movq m1, [bufyq+xq*2] +%endif + punpcklwd m1, [base+pw_1] + pmaddwd m1, m12 + paddd m0, m1 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m2, m1, q3321 + pxor m3, m3 + pcmpgtw m3, m2 + punpcklwd m2, m3 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd m3, m1, m11 + paddd m3, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd m3, m2 + packssdw m3, m3 + pminsw m3, m13 + pmaxsw m3, m14 + psrldq m1, 2 + pslldq m3, 2 + psrldq m2, 4 +%if cpuflag(sse4) + pblendw m1, m3, 00000010b +%else + pand m1, m15 + pandn m4, m15, m3 + por m1, m4 +%endif + ; overwrite previous pixel, should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + 
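The "; luma component & rounding" block above collapses the co-sited luma grain before it enters the chroma AR sum: phaddw adds horizontal pairs, the optional second-row add covers vertical subsampling, and the pmulhrsw by pw_16384 or pw_8192 is a rounded divide by 2 or 4. An assumed C equivalent, with luma_grain_avg and its arguments as illustrative names:

#include <stdint.h>

/* Average of the 1, 2 or 4 luma grain samples that cover one chroma sample. */
static int luma_grain_avg(const int16_t *luma_row0, const int16_t *luma_row1,
                          int x, int ss_hor, int ss_ver)
{
    if (!ss_hor)
        return luma_row0[x];                       /* 4:4:4 */
    int sum = luma_row0[2 * x] + luma_row0[2 * x + 1];
    if (!ss_ver)
        return (sum + 1) >> 1;                     /* 4:2:2 */
    sum += luma_row1[2 * x] + luma_row1[2 * x + 1];
    return (sum + 2) >> 2;                         /* 4:2:0 */
}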
test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 96 + %define tmp rsp +%else + %define tmp rsp+stack_offset-120 +%endif +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 + mov bufyq, r1m + mov uvd, r3m + %define tmp rsp +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + SPLATW m4, [base+round_vals-12+shiftq*2] + pxor m5, m5 + pcmpgtw m5, m4 + punpcklwd m4, m5 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m6, bdmaxd ; max_grain +%else + SPLATW m6, r4m + psraw m6, 1 +%endif + pcmpeqw m7, m7 +%if !cpuflag(sse4) + pcmpeqw m3, m3 + psrldq m3, 14 + pslldq m3, 4 + pxor m3, m7 +%endif + pxor m7, m6 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m3, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 3, 11 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m11 [rsp+ 9*16] +%define m12 [rsp+10*16] +%define m14 [rsp+12*16] +%define m15 [rsp+13*16] + mova m11, m3 + mova m12, m4 + mova m14, m6 + mova m15, m7 +%endif + + ; cf from y=-3,x=-3 until y=-3,x=-2 + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m0, m0, q3333 + pshufd m5, m2, q0000 + pshufd m6, m2, q1111 + mova [tmp+16*0], m1 + mova [tmp+16*1], m3 + mova [tmp+16*2], m4 + mova [tmp+16*3], m0 + mova [tmp+16*4], m5 + mova [tmp+16*5], m6 + pshufd m6, m2, q2222 + pshufd m7, m2, q3333 + + ; cf from y=-1,x=-1 to y=0,x=-1 + luma component + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 ; luma + punpcklbw m0, m1 + pshufd m3, m0, q3232 + psrldq m5, m0, 10 + ; y=0,x=[-3 to -1] + "1.0" for current pixel + pinsrw m5, [base+round_vals-10+shiftq*2], 3 + ; y=-1,x=[-1 to +2] + pshufd m1, m0, q0000 + pshufd m0, m0, q1111 + ; y=-1,x=+3 + luma + punpcklwd m3, m2 + pshufd m3, m3, q0000 + +%if ARCH_X86_64 + SWAP 1, 8 + SWAP 0, 9 + SWAP 3, 10 + SWAP 5, 13 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else +%define m8 [rsp+ 6*16] +%define m9 [rsp+ 7*16] +%define m10 [rsp+ 8*16] +%define m13 [rsp+11*16] + mova m8, m1 + mova m9, m0 + mova m10, m3 + mova m13, m5 + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + ; first line + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, [tmp+0*16] + pmaddwd m2, [tmp+1*16] + pmaddwd m3, [tmp+2*16] + paddd m0, m2 + paddd m0, m3 ; first 6 x of top y + + ; second line [m0/1 are busy] + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] + punpckhwd m5, m4, m3 ; 
y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, [tmp+3*16] + pmaddwd m4, [tmp+4*16] + pmaddwd m3, [tmp+5*16] + pmaddwd m5, m6 + paddd m1, m4 + paddd m3, m5 + paddd m0, m1 + paddd m0, m3 ; top 2 lines + + ; third line [m0 is busy] & luma + round + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] +%if %2 + movu m5, [bufyq+xq*4] +%if %3 + movu m4, [bufyq+xq*4+82*2] + phaddw m5, m4 +%else + phaddw m5, m5 +%endif +%else + movq m5, [bufyq+xq*2] +%endif + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] +%if %3 + pshufd m4, m5, q3232 + paddw m5, m4 +%endif +%if %2 +%if cpuflag(sse4) + pmulhrsw m5, m11 +%elif %3 + pmulhrsw m5, [base+pw_8192] +%else + pmulhrsw m5, [base+pw_16384] +%endif +%endif + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, m5 + pmaddwd m1, m7 + pmaddwd m3, m8 + pmaddwd m4, m9 + pmaddwd m2, m10 + paddd m1, m3 + paddd m4, m2 + paddd m0, m12 ; += round + paddd m1, m4 + paddd m0, m1 + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m14 + pmaxsw m2, m15 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m11 + pandn m3, m11, m2 + por m1, m3 +%endif + ; overwrite previous pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] + + mov r0m, r0 + mov r2m, r1 + mov r4m, r2 + mov r6m, r3 + mov r7m, r4 + mov r8m, r5 +%else +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov scalingq, r5m + mov fg_dataq, r3m +%if STACK_ALIGNMENT < mmsize + mov r6, r9m + +%define r9m [rsp+8*mmsize+ 4*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + + mov r9m, r6 +%endif + LEA r5, $$ +%define base r5-$$ + mov r5m, picptrq +%else +cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r8, 
[pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if ARCH_X86_32 + DECLARE_REG_TMP 0, 3 +%else + DECLARE_REG_TMP 9, 10 +%endif + mov t0d, r9m ; bdmax + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t0d, [t0d*3] + lea t0d, [r6d*2+t0d] + SPLATW m4, [base+max+t0*2] + SPLATW m2, r9m + + pcmpeqw m1, m1 + psraw m7, m2, 1 ; max_grain + pxor m1, m7 ; min_grain + SPLATD m6, [base+pd_16] + + SCRATCH 1, 9, 0 + SCRATCH 2, 10, 1 + SCRATCH 3, 11, 2 + SCRATCH 4, 12, 3 + SCRATCH 5, 13, 4 + SCRATCH 6, 14, 5 + SCRATCH 7, 15, 6 + + mova m6, [base+pw_27_17_17_27] ; for horizontal filter + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + movzx t0d, byte [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz .no_vertical_overlap + test sbyd, sbyd + jnz .vertical_overlap +.no_vertical_overlap: + mov dword r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak +%endif + +.loop_x_odd: + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[src] * grain, scaling_shift) + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp ; src += stride + add grain_lutq, 82*2 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + 
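The inner .loop_y above is the heart of fgy: look the source pixel up in the scaling LUT, scale the grain by it with a rounded shift, and add the result back with clipping. The mul_bits constant only pre-scales the LUT value so that one pmulhrsw performs the round2 by scaling_shift, and the clip bounds come from the min/max tables selected by clip_to_restricted_range and bit depth. A simplified per-pixel C reading (fgy_pixel and its parameters are illustrative, LUT indexing details omitted):

#include <stdint.h>

static uint16_t fgy_pixel(uint16_t src, int16_t grain, const uint8_t *scaling,
                          int scaling_shift, int clip_min, int clip_max)
{
    /* noise = round2(scaling[src] * grain, scaling_shift); the shift on the
     * possibly negative product is assumed to be arithmetic, as in the asm */
    int noise = (scaling[src] * grain + (1 << (scaling_shift - 1))) >> scaling_shift;
    int out   = src + noise;                 /* dst = clip_pixel(src + noise) */
    return out < clip_min ? clip_min : out > clip_max ? clip_max : out;
}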
jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk + add offxyd, 16 + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy +%endif + + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m4, [grain_lutq+r5*2] +%else + movd m4, [grain_lutq+left_offxyq*2] +%endif + punpcklwd m4, m5 + pmaddwd m4, m6 + paddd m4, m14 + psrad m4, 5 + packssdw m4, m4 + pminsw m4, m15 + pmaxsw m4, m9 + shufps m4, m5, q3210 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; noise = round2(scaling[src] * grain, scaling_shift) + movu m5, [grain_lutq+offxyq*2+16] + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused +%else + imul seed, 
[fg_dataq+FGData.seed], 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + mov seed, r3m +%else + SPLATD m7, [pw_27_17_17_27] +%endif + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+1*gprsize] + movu m2, [grain_lutq+r5*2] +%else + movu m2, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + REPX {pmaddwd x, m7}, m4, m2 + REPX {paddd x, m14}, m4, m2 + REPX {psrad x, 5}, m4, m2 + packssdw m2, m4 + pminsw m2, m15 + pmaxsw m2, m9 + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m3, [grain_lutq+r5*2+16] +%else + movu m3, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + REPX {pmaddwd x, m7}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m3, m5 + pminsw m3, m15 + pmaxsw m3, m9 + + ; src + pand m0, m10, [srcq+ 0] ; m0-1: src as word + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m4, m2 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 +%else + vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, 
m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk_v +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + jmp .loop_x_odd_v_overlap + +.next_blk_v: + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r0, [rsp+8*mmsize+1*gprsize] + add r3, 16 + add r0, 16 + mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy + mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy + + mov seed, r3m + xor r0, r0 +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m2, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + movu m4, [grain_lutq+r0*2] + movd m5, [grain_lutq+r5*2] + mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy + movd m3, [grain_lutq+r5*2] +%else + movu m4, [grain_lutq+top_offxyq*2] + movd m5, [grain_lutq+left_offxyq*2] + movd m3, [grain_lutq+topleft_offxyq*2] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd m5, m2 + punpcklwd m3, m4 + REPX {pmaddwd x, m6}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m5, m3 + pminsw m5, m15 + pmaxsw m5, m9 + shufps m3, m5, m2, q3210 + shufps m5, m4, q3232 + ; followed by v interpolation (top | cur -> cur) + 
movu m0, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m1, [grain_lutq+r0*2+16] +%else + movu m1, [grain_lutq+top_offxyq*2+16] +%endif + punpcklwd m2, m5, m3 + punpckhwd m5, m3 + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + REPX {pmaddwd x, m7}, m2, m5, m3, m1 + REPX {paddd x, m14}, m2, m5, m3, m1 + REPX {psrad x, 5}, m2, m5, m3, m1 + packssdw m2, m5 + packssdw m3, m1 + REPX {pminsw x, m15}, m2, m3 + REPX {pmaxsw x, m9}, m2, m3 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m2, m4 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + or dword r8m, 4 +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov r5, r5m + add offxyd, 16 + add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + add offxyd, 16 + add top_offxyd, 16 + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET +%if ARCH_X86_32 + DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +%endif + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r1m + mov r2, r2m + mov r4, r3m + mov r3, r4m + mov r5, r5m +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r1m [rsp+8*mmsize+ 4*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + mov r0m, r0 + mov r2m, r2 + mov r4m, r3 + mov r5m, r5 + + mov r0, r6m + mov r2, r7m + mov r3, r8m + mov r5, r9m +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] +%define r9m [rsp+8*mmsize+12*gprsize] + mov r6m, r0 + mov r7m, r2 + mov r8m, r3 + mov r9m, r5 + + mov r2, r10m + mov r3, r11m + mov r5, r12m + mov r0, r13m +%define r10m [rsp+8*mmsize+13*gprsize] +%define r11m [rsp+8*mmsize+14*gprsize] +%define r12m [rsp+8*mmsize+15*gprsize] + mov r10m, r2 + mov r11m, r3 + mov r12m, r5 + + SPLATW m2, r13m +%else +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused + mov srcq, srcm + mov fg_dataq, r3m +%endif + LEA r5, $$ +%define base r5-$$ + + DECLARE_REG_TMP 0, 2, 3 +%else +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r8-pb_mask + lea r8, [pb_mask] + + DECLARE_REG_TMP 
9, 10, 11 +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if STACK_ALIGNMENT >= mmsize + mov t0d, r13m ; bdmax +%endif + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t1d, [t0d*3] + mov t2d, r12m + inc t2d + imul r6d, t2d + add t1d, r6d + SPLATW m4, [base+max+t1*2] +%if STACK_ALIGNMENT >= mmsize + SPLATW m2, r13m +%endif + + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + +%define mzero m7 + +%if %3 + SPLATD m2, [base+pw_23_22] +%endif + +%if ARCH_X86_32 + mov scalingq, r5m + mov r5m, r5 +%else + mov r13mp, strideq +%endif + + pcmpeqw m0, m0 + psraw m1, m10, 1 + pxor m0, m1 + + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap + + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + DECLARE_REG_TMP 9 +%endif + +%if %1 + mov r6d, r11m + SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] + SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklwd m6, m1, m0 + SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] + SPLATD m7, [base+pw_4+t0*4] + pmullw m5, m7 +%else + SPLATD m6, [base+pd_16] +%if %2 + mova m5, [base+pw_23_22] +%else + mova m5, [base+pw_27_17_17_27] +%endif +%endif + + SCRATCH 6, 14, 6 + SCRATCH 5, 15, 7 + +%if ARCH_X86_32 + DECLARE_REG_TMP 0 +%else + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + mov t0d, [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz %%no_vertical_overlap + test sbyd, sbyd + jnz %%vertical_overlap + +%%no_vertical_overlap: + mov r8m, t0d +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4mp, wq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, 
offxy, see, unused1, unused2, unused3, luma, lstride +%endif + +%if %2 == 0 +%%loop_x_odd: +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src + mova m0, [srcq] + mova m1, [srcq+16] ; m0-1: src as word + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m3, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m6, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m3, m5 + pmulhrsw m4, m3 + pmulhrsw m6, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma + + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0m, dstq + mov r9m, lumaq + mov r4m, wq +%endif +%if %2 == 0 + btc dword r8m, 2 + jc %%next_blk + add offxyd, 16 + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%%next_blk: +%endif + test dword r8m, 1 + je %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, 
src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m5, [grain_lutq+r5*2] +%else + movd m5, [grain_lutq+left_offxyq*2+ 0] +%endif + punpcklwd m5, m7 ; {left0, cur0} +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + pmaddwd m5, [PIC_ptr(pw_23_22)] +%else + pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] +%endif + paddd m5, [PIC_ptr(pd_16)] +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m7, q3210 + movu m3, [grain_lutq+offxyq*2+16] + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 + vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m4 + pmulhrsw m5, m7 + pmulhrsw m3, m4 + + ; dst = clip_pixel(src, noise) + paddw m0, m5 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end: + RET + +%%vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 
+%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov r3m, seed + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, unused3, unused4, unused5, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4m, wq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %2 == 0 +%%loop_x_odd_v_overlap: +%endif +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy + movu m5, [grain_lutq+r0*2] +%else + movu m5, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m5, [grain_lutq+r0*2+16] +%else + movu m5, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m7, m5, m4 + punpcklwd m5, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m4, m5, m7 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, 
[srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m5, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m5, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m5, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m7, m5, m0 + punpcklwd m5, m0 + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + punpckhwd m7, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m5, m6 + REPX {pmaxsw x, mzero}, m5, m6 + REPX {pminsw x, m10}, m5, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m5, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m5 + pmulhrsw m3, m7 + pmulhrsw m4, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + dec hw + jle %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 +%if %3 + jmp %%loop_y +%else + btc hd, 16 + jc %%loop_y +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_v_overlap +%endif + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else + btc dword r8m, 2 + jc %%loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut + + mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy + add offxyd, 16 + add t0d, 16 + mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd + mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + lea 
topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + movd m5, [grain_lutq+r5*2] +%else + movd m5, [grain_lutq+left_offxyq*2] +%endif + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+2*gprsize] + movu m4, [grain_lutq+r0*2] +%if %2 + pinsrw m5, [grain_lutq+r5*2], 2 +%else + movd m3, [grain_lutq+r5*2] +%endif +%else + movu m4, [grain_lutq+top_offxyq*2] +%if %2 + pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } +%else + movd m3, [grain_lutq+topleft_offxyq*2] +%endif +%endif +%if %2 == 0 + punpckldq m5, m3 +%endif + punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } + punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + movddup m0, [PIC_ptr(pw_23_22)] +%else + movddup m0, [PIC_ptr(pw_27_17_17_27)] +%endif +%else + pshufd m0, m15, q1010 +%endif + pmaddwd m5, m0 +%if %1 + paddd m5, [PIC_ptr(pd_16)] +%else + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 + shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter + shufps m5, m4, q3231 ; top0-7 post-h_filter + + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 +%else + REPX {paddd x, m14}, m5, m7 +%endif + REPX {psrad x, 5}, m5, m7 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; right half + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m0, [grain_lutq+r0*2+16] +%else + movu m0, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m1, m0, m4 + punpcklwd m0, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m1, m0 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 +%else + REPX {paddd x, m14}, m1, m0 +%endif + REPX {psrad x, 5}, m1, m0 + packssdw m4, m0, m1 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m6, [lumaq+ 0] + mova m5, [lumaq+(16<<%2)] +%if %2 + phaddw m6, [lumaq+16] + phaddw m5, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m6, mzero + pavgw m5, mzero +%endif + +%if %1 + punpckhwd m7, m6, m0 + punpcklwd m6, m0 + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + punpckhwd m7, m5, m1 + punpcklwd m5, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + 
packssdw m5, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m6, m5 + REPX {pmaxsw x, mzero}, m6, m5 + REPX {pminsw x, m10}, m6, m5 ; clip_pixel() +%else + REPX {pand x, m10}, m6, m5 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 + vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 +%else +%if %3 == 0 + ; register shortage :) + push r12 +%endif + vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 + vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 +%if %3 == 0 + pop r12 +%endif +%endif + REPX {psrlw x, 8}, m7, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m6 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_hv_overlap +%%end_y_hv_overlap: +%endif +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov dstmp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else + or dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 diff -Nru dav1d-0.7.1/src/x86/film_grain.asm dav1d-0.9.1/src/x86/film_grain.asm --- dav1d-0.7.1/src/x86/film_grain.asm 2020-06-21 11:48:55.016126400 +0000 +++ dav1d-0.9.1/src/x86/film_grain.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,2404 +0,0 @@ -; Copyright © 2019, VideoLAN and dav1d authors -; Copyright © 2019, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 32 -pb_8x_27_17_8x_17_27: times 8 db 27, 17 - times 8 db 17, 27 -pw_1024: times 16 dw 1024 -pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 -rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 -byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 -pw_seed_xor: times 2 dw 0xb524 - times 2 dw 0x49d8 -pd_m65536: dd ~0xffff -pb_23_22: times 2 db 23, 22 -pb_1: times 4 db 1 -hmul_bits: dw 32768, 16384, 8192, 4096 -round: dw 2048, 1024, 512 -mul_bits: dw 256, 128, 64, 32, 16 -round_vals: dw 32, 64, 128, 256, 512 -max: dw 255, 240, 235 -min: dw 0, 16 -pb_27_17_17_27: db 27, 17, 17, 27 -pw_1: dw 1 - -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) - %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base - %rotate 1 - %endrep -%endmacro - -ALIGN 4 -JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 - -struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - -SECTION .text - -INIT_XMM avx2 -cglobal generate_grain_y, 2, 9, 16, buf, fg_data - lea r4, [pb_mask] -%define base r4-pb_mask - movq xm1, [base+rnd_next_upperbit_mask] - movq xm4, [base+mul_bits] - movq xm7, [base+hmul_bits] - mov r2d, [fg_dataq+FGData.grain_scale_shift] - vpbroadcastw xm8, [base+round+r2*2] - mova xm5, [base+pb_mask] - vpbroadcastw xm0, [fg_dataq+FGData.seed] - vpbroadcastd xm9, [base+pd_m65536] - mov r2, -73*82 - sub bufq, r2 - lea r3, [gaussian_sequence] -.loop: - pand xm2, xm0, xm1 - psrlw xm3, xm2, 10 - por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw xm2, xm4 ; bits 0x0f00 are set - pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds - psllq xm6, xm2, 30 - por xm2, xm6 - psllq xm6, xm2, 15 - por xm2, xm6 ; aggregate each bit into next seed's high bit - pmulhuw xm3, xm0, xm7 - por xm2, xm3 ; 4 next output seeds - pshuflw xm0, xm2, q3333 - psrlw xm2, 5 - pmovzxwd xm3, xm2 - mova xm6, xm9 - vpgatherdd xm2, [r3+xm3*2], xm6 - pandn xm2, xm9, xm2 - packusdw xm2, xm2 - pmulhrsw xm2, xm8 - packsswb xm2, xm2 - movd [bufq+r2], xm2 - add r2, 4 - jl .loop - - ; auto-regression code - movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_avx2_table+r2*4] - lea r2, [r2+base+generate_grain_y_avx2_table] - jmp r2 - -.ar1: - DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 - mov shiftd, 
[fg_dataq+FGData.ar_coeff_shift] - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] - movd xm4, [fg_dataq+FGData.ar_coeffs_y] - DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 - pinsrb xm4, [pb_1], 3 - pmovsxbw xm4, xm4 - pshufd xm5, xm4, q1111 - pshufd xm4, xm4, q0000 - vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd - sub bufq, 82*73-(82*3+79) - mov hd, 70 - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -76 - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: - pmovsxbw xm0, [bufq+xq-82-1] ; top/left - pmovsxbw xm2, [bufq+xq-82+0] ; top - pmovsxbw xm1, [bufq+xq-82+1] ; top/right - punpcklwd xm0, xm2 - punpcklwd xm1, xm3 - pmaddwd xm0, xm4 - pmaddwd xm1, xm5 - paddd xm0, xm1 -.x_loop_ar1_inner: - movd val0d, xm0 - psrldq xm0, 4 - imul val3d, cf3d - add val3d, val0d -%if WIN64 - sarx val3d, val3d, shiftd -%else - sar val3d, shiftb -%endif - movsx val0d, byte [bufq+xq] - add val3d, val0d - cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 - dec hd - jg .y_loop_ar1 -.ar0: - RET - -.ar2: - DEFINE_ARGS buf, fg_data, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - movq xm15, [base+byte_blend+1] - pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 - movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 - pmovsxbw xm9, xm9 - DEFINE_ARGS buf, fg_data, h, x - pshufd xm12, xm9, q0000 - pshufd xm13, xm9, q1111 - pshufd xm11, xm8, q3333 - pshufd xm10, xm8, q2222 - pshufd xm9, xm8, q1111 - pshufd xm8, xm8, q0000 - pmovzxwd xm14, xm14 - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar2: - mov xq, -76 - -.x_loop_ar2: - pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] - psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] - psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] - punpcklwd xm2, xm0, xm2 - punpcklwd xm3, xm4 - pmaddwd xm2, xm8 - pmaddwd xm3, xm11 - paddd xm2, xm3 - - psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] - psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] - psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] - punpcklwd xm4, xm5 - punpcklwd xm6, xm1 - psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] - psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] - punpcklwd xm7, xm1 - pmaddwd xm4, xm9 - pmaddwd xm6, xm10 - pmaddwd xm7, xm12 - paddd xm4, xm6 - paddd xm2, xm7 - paddd xm2, xm4 - paddd xm2, xm14 - - movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] -.x_loop_ar2_inner: - pmovsxbw xm1, xm0 - pmaddwd xm3, xm1, xm13 - paddd xm3, xm2 - psrldq xm1, 4 ; y=0,x=0 - psrldq xm2, 4 ; shift top to next pixel - psrad xm3, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - paddw xm3, xm1 - packsswb xm3, xm3 - pextrb [bufq+xq], xm3, 0 - pslldq xm3, 2 - pand xm3, xm15 - pandn xm0, xm15, xm0 - por xm0, xm3 - psrldq xm0, 1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - dec hd - jg .y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, fg_data, shift -%if WIN64 - SUB rsp, 16*12 -%assign stack_size_padded (stack_size_padded+16*12) -%assign stack_size (stack_size+16*12) -%else - ALLOC_STACK 16*12 -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - movq xm15, [base+byte_blend] - pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7 - pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 
8] ; cf8-15 - pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 - pshufd xm9, xm0, q1111 - pshufd xm10, xm0, q2222 - pshufd xm11, xm0, q3333 - pshufd xm0, xm0, q0000 - pshufd xm6, xm1, q1111 - pshufd xm7, xm1, q2222 - pshufd xm8, xm1, q3333 - pshufd xm1, xm1, q0000 - pshufd xm3, xm2, q1111 - psrldq xm13, xm2, 10 - pinsrw xm2, [pw_1], 5 - pshufd xm4, xm2, q2222 - pshufd xm2, xm2, q0000 - pinsrw xm13, [base+round_vals+shiftq*2-10], 3 - mova [rsp+ 0*16], xm0 - mova [rsp+ 1*16], xm9 - mova [rsp+ 2*16], xm10 - mova [rsp+ 3*16], xm11 - mova [rsp+ 4*16], xm1 - mova [rsp+ 5*16], xm6 - mova [rsp+ 6*16], xm7 - mova [rsp+ 7*16], xm8 - mova [rsp+ 8*16], xm2 - mova [rsp+ 9*16], xm3 - mova [rsp+10*16], xm4 - DEFINE_ARGS buf, fg_data, h, x - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar3: - mov xq, -76 - -.x_loop_ar3: - movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor xm3, xm3 - pcmpgtb xm6, xm3, xm2 - pcmpgtb xm5, xm3, xm1 - pcmpgtb xm4, xm3, xm0 - punpckhbw xm3, xm0, xm4 - punpcklbw xm0, xm4 - punpckhbw xm4, xm1, xm5 - punpcklbw xm1, xm5 - punpckhbw xm5, xm2, xm6 - punpcklbw xm2, xm6 - - psrldq xm6, xm0, 2 - psrldq xm7, xm0, 4 - psrldq xm8, xm0, 6 - psrldq xm9, xm0, 8 - palignr xm10, xm3, xm0, 10 - palignr xm11, xm3, xm0, 12 - - punpcklwd xm0, xm6 - punpcklwd xm7, xm8 - punpcklwd xm9, xm10 - punpcklwd xm11, xm1 - pmaddwd xm0, [rsp+ 0*16] - pmaddwd xm7, [rsp+ 1*16] - pmaddwd xm9, [rsp+ 2*16] - pmaddwd xm11, [rsp+ 3*16] - paddd xm0, xm7 - paddd xm9, xm11 - paddd xm0, xm9 - - psrldq xm6, xm1, 2 - psrldq xm7, xm1, 4 - psrldq xm8, xm1, 6 - psrldq xm9, xm1, 8 - palignr xm10, xm4, xm1, 10 - palignr xm11, xm4, xm1, 12 - psrldq xm12, xm2, 2 - - punpcklwd xm6, xm7 - punpcklwd xm8, xm9 - punpcklwd xm10, xm11 - punpcklwd xm12, xm2, xm12 - pmaddwd xm6, [rsp+ 4*16] - pmaddwd xm8, [rsp+ 5*16] - pmaddwd xm10, [rsp+ 6*16] - pmaddwd xm12, [rsp+ 7*16] - paddd xm6, xm8 - paddd xm10, xm12 - paddd xm6, xm10 - paddd xm0, xm6 - - psrldq xm6, xm2, 4 - psrldq xm7, xm2, 6 - psrldq xm8, xm2, 8 - palignr xm9, xm5, xm2, 10 - palignr xm5, xm5, xm2, 12 - - punpcklwd xm6, xm7 - punpcklwd xm8, xm9 - punpcklwd xm5, xm14 - pmaddwd xm6, [rsp+ 8*16] - pmaddwd xm8, [rsp+ 9*16] - pmaddwd xm5, [rsp+10*16] - paddd xm0, xm6 - paddd xm8, xm5 - paddd xm0, xm8 - - movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmovsxbw xm2, xm1 - pmaddwd xm2, xm13 - pshufd xm3, xm2, q1111 - paddd xm2, xm3 ; left+cur - paddd xm2, xm0 ; add top - psrldq xm0, 4 - psrad xm2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - packsswb xm2, xm2 - pextrb [bufq+xq], xm2, 0 - pslldq xm2, 3 - pand xm2, xm15 - pandn xm1, xm15, xm1 - por xm1, xm2 - psrldq xm1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - dec hd - jg .y_loop_ar3 - RET - -%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y -INIT_XMM avx2 -cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv - lea r4, [pb_mask] -%define base r4-pb_mask - movq xm1, [base+rnd_next_upperbit_mask] - movq xm4, [base+mul_bits] - movq xm7, [base+hmul_bits] - mov r5d, [fg_dataq+FGData.grain_scale_shift] - vpbroadcastw xm8, [base+round+r5*2] - mova xm5, [base+pb_mask] - vpbroadcastw xm0, [fg_dataq+FGData.seed] - vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] - pxor xm0, xm9 - vpbroadcastd xm9, [base+pd_m65536] - lea r6, [gaussian_sequence] -%if %2 - mov r7d, 73-35*%3 - add bufq, 44 -.loop_y: - 
mov r5, -44 -.loop_x: -%else - mov r5, -73*82 - sub bufq, r5 -.loop: -%endif - pand xm2, xm0, xm1 - psrlw xm3, xm2, 10 - por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw xm2, xm4 ; bits 0x0f00 are set - pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds - psllq xm6, xm2, 30 - por xm2, xm6 - psllq xm6, xm2, 15 - por xm2, xm6 ; aggregate each bit into next seed's high bit - pmulhuw xm3, xm0, xm7 - por xm2, xm3 ; 4 next output seeds - pshuflw xm0, xm2, q3333 - psrlw xm2, 5 - pmovzxwd xm3, xm2 - mova xm6, xm9 - vpgatherdd xm2, [r6+xm3*2], xm6 - pandn xm2, xm9, xm2 - packusdw xm2, xm2 - pmulhrsw xm2, xm8 - packsswb xm2, xm2 - movd [bufq+r5], xm2 - add r5, 4 -%if %2 - jl .loop_x - add bufq, 82 - dec r7d - jg .loop_y -%else - jl .loop -%endif - - ; auto-regression code - movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_%1_avx2_table] - jmp r5 - -.ar0: - INIT_YMM avx2 - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] - movd xm3, [base+hmul_bits+shiftq*2] - DEFINE_ARGS buf, bufy, h - pmovsxbw xm4, xm4 -%if %2 - vpbroadcastd m7, [pb_1] - vpbroadcastw m6, [hmul_bits+2+%3*2] -%endif - vpbroadcastw m4, xm4 - vpbroadcastw m3, xm3 - pxor m12, m12 -%if %2 - sub bufq, 82*(73-35*%3)+82-(82*3+41) -%else - sub bufq, 82*70-3 -%endif - add bufyq, 3+82*3 - mov hd, 70-35*%3 -.y_loop_ar0: -%if %2 - ; first 32 pixels - movu xm8, [bufyq] -%if %3 - movu xm9, [bufyq+82] -%endif - movu xm10, [bufyq+16] -%if %3 - movu xm11, [bufyq+82+16] -%endif - vinserti128 m8, [bufyq+32], 1 -%if %3 - vinserti128 m9, [bufyq+82+32], 1 -%endif - vinserti128 m10, [bufyq+48], 1 -%if %3 - vinserti128 m11, [bufyq+82+48], 1 -%endif - pmaddubsw m8, m7, m8 -%if %3 - pmaddubsw m9, m7, m9 -%endif - pmaddubsw m10, m7, m10 -%if %3 - pmaddubsw m11, m7, m11 - paddw m8, m9 - paddw m10, m11 -%endif - pmulhrsw m8, m6 - pmulhrsw m10, m6 -%else - xor r3d, r3d - ; first 32x2 pixels -.x_loop_ar0: - movu m8, [bufyq+r3] - pcmpgtb m9, m12, m8 - punpckhbw m10, m8, m9 - punpcklbw m8, m9 -%endif - pmullw m8, m4 - pmullw m10, m4 - pmulhrsw m8, m3 - pmulhrsw m10, m3 -%if %2 - movu m0, [bufq] -%else - movu m0, [bufq+r3] -%endif - pcmpgtb m1, m12, m0 - punpckhbw m9, m0, m1 - punpcklbw m0, m1 - paddw m0, m8 - paddw m9, m10 - packsswb m0, m9 -%if %2 - movu [bufq], m0 -%else - movu [bufq+r3], m0 - add r3d, 32 - cmp r3d, 64 - jl .x_loop_ar0 -%endif - - ; last 6/12 pixels - movu xm8, [bufyq+32*2] -%if %2 -%if %3 - movu xm9, [bufyq+32*2+82] -%endif - pmaddubsw xm8, xm7, xm8 -%if %3 - pmaddubsw xm9, xm7, xm9 - paddw xm8, xm9 -%endif - pmulhrsw xm8, xm6 - pmullw xm8, xm4 - pmulhrsw xm8, xm3 - movq xm0, [bufq+32] - pcmpgtb xm9, xm12, xm0 - punpcklbw xm9, xm0, xm9 - paddw xm8, xm9 - packsswb xm8, xm8 - vpblendw xm0, xm8, xm0, 1000b - movq [bufq+32], xm0 -%else - pcmpgtb xm9, xm12, xm8 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - pmullw xm10, xm4 - pmullw xm8, xm4 - pmulhrsw xm10, xm3 - pmulhrsw xm8, xm3 - movu xm0, [bufq+64] - pcmpgtb xm9, xm12, xm0 - punpcklbw xm1, xm0, xm9 - punpckhbw xm9, xm0, xm9 - paddw xm1, xm8 - paddw xm9, xm10 - packsswb xm1, xm9 - vpblendw xm0, xm1, xm0, 11000000b - movu [bufq+64], xm0 -%endif - - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar0 - RET - -.ar1: - INIT_XMM avx2 - DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movsx cf3d, byte 
[fg_dataq+FGData.ar_coeffs_uv+uvq+3] - movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] - pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 - DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift - pmovsxbw xm4, xm4 - pshufd xm5, xm4, q1111 - pshufd xm4, xm4, q0000 - pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd -%if %2 - vpbroadcastd xm7, [pb_1] - vpbroadcastw xm6, [hmul_bits+2+%3*2] -%endif - vpbroadcastd xm3, xm3 -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*70-(82-3) -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -(76>>%2) - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: - pmovsxbw xm0, [bufq+xq-82-1] ; top/left -%if %2 - movq xm8, [bufyq+xq*2] -%if %3 - movq xm9, [bufyq+xq*2+82] -%endif -%endif - psrldq xm2, xm0, 2 ; top - psrldq xm1, xm0, 4 ; top/right -%if %2 - pmaddubsw xm8, xm7, xm8 -%if %3 - pmaddubsw xm9, xm7, xm9 - paddw xm8, xm9 -%endif - pmulhrsw xm8, xm6 -%else - pmovsxbw xm8, [bufyq+xq] -%endif - punpcklwd xm0, xm2 - punpcklwd xm1, xm8 - pmaddwd xm0, xm4 - pmaddwd xm1, xm5 - paddd xm0, xm1 - paddd xm0, xm3 -.x_loop_ar1_inner: - movd val0d, xm0 - psrldq xm0, 4 - imul val3d, cf3d - add val3d, val0d - sarx val3d, val3d, shiftd - movsx val0d, byte [bufq+xq] - add val3d, val0d - cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar1 - RET - -.ar2: - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - vpbroadcastw xm15, [base+round_vals-12+shiftq*2] - pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 - pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 - pinsrw xm9, [base+pw_1], 5 -%if %2 - vpbroadcastw xm7, [base+hmul_bits+2+%3*2] - vpbroadcastd xm6, [base+pb_1] -%endif - DEFINE_ARGS buf, bufy, fg_data, h, unused, x - pshufd xm12, xm9, q0000 - pshufd xm13, xm9, q1111 - pshufd xm14, xm9, q2222 - pshufd xm11, xm8, q3333 - pshufd xm10, xm8, q2222 - pshufd xm9, xm8, q1111 - pshufd xm8, xm8, q0000 -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*70-(82-3) -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar2: - mov xq, -(76>>%2) - -.x_loop_ar2: - pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] - psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] - psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] - punpcklwd xm2, xm0, xm2 - punpcklwd xm3, xm4 - pmaddwd xm2, xm8 - pmaddwd xm3, xm11 - paddd xm2, xm3 - - psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] - psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] - psrldq xm0, 8 ; y=-2,x=[+2,+5] - punpcklwd xm4, xm5 - punpcklwd xm0, xm1 - psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5] - psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] - punpcklwd xm3, xm1 - pmaddwd xm4, xm9 - pmaddwd xm0, xm10 - pmaddwd xm3, xm12 - paddd xm4, xm0 - paddd xm2, xm3 - paddd xm2, xm4 - -%if %2 - movq xm0, [bufyq+xq*2] -%if %3 - movq xm3, [bufyq+xq*2+82] -%endif - pmaddubsw xm0, xm6, xm0 -%if %3 - pmaddubsw xm3, xm6, xm3 - paddw xm0, xm3 -%endif - pmulhrsw xm0, xm7 -%else - pmovsxbw xm0, [bufyq+xq] -%endif - punpcklwd xm0, xm15 - pmaddwd xm0, xm14 - paddd xm2, xm0 - - movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] -.x_loop_ar2_inner: - pmovsxbw xm0, xm0 - pmaddwd xm3, xm0, xm13 - paddd xm3, xm2 - psrldq xm2, 4 ; 
shift top to next pixel - psrad xm3, [fg_dataq+FGData.ar_coeff_shift] - pslldq xm3, 2 - psrldq xm0, 2 - paddw xm3, xm0 - vpblendw xm0, xm3, 00000010b - packsswb xm0, xm0 - pextrb [bufq+xq], xm0, 1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - SUB rsp, 16*12 -%assign stack_size_padded (stack_size_padded+16*12) -%assign stack_size (stack_size+16*12) - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - vpbroadcastw xm14, [base+round_vals-12+shiftq*2] - pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7 - pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15 - pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 - pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] - pshufd xm9, xm0, q1111 - pshufd xm10, xm0, q2222 - pshufd xm11, xm0, q3333 - pshufd xm0, xm0, q0000 - pshufd xm6, xm1, q1111 - pshufd xm7, xm1, q2222 - pshufd xm8, xm1, q3333 - pshufd xm1, xm1, q0000 - pshufd xm3, xm2, q1111 - pshufd xm4, xm2, q2222 - vpbroadcastw xm5, xm5 - vpblendw xm4, xm5, 10101010b ; interleave luma cf - psrldq xm5, xm2, 10 - pshufd xm2, xm2, q0000 - pinsrw xm5, [base+round_vals+shiftq*2-10], 3 - pmovzxwd xm14, xm14 - mova [rsp+ 0*16], xm0 - mova [rsp+ 1*16], xm9 - mova [rsp+ 2*16], xm10 - mova [rsp+ 3*16], xm11 - mova [rsp+ 4*16], xm1 - mova [rsp+ 5*16], xm6 - mova [rsp+ 6*16], xm7 - mova [rsp+ 7*16], xm8 - mova [rsp+ 8*16], xm2 - mova [rsp+ 9*16], xm3 - mova [rsp+10*16], xm4 - mova [rsp+11*16], xm5 -%if %2 - vpbroadcastd xm13, [base+pb_1] - vpbroadcastw xm15, [base+hmul_bits+2+%3*2] -%endif - DEFINE_ARGS buf, bufy, fg_data, h, unused, x -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*70-(82-3) -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar3: - mov xq, -(76>>%2) - -.x_loop_ar3: - movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor xm3, xm3 - pcmpgtb xm6, xm3, xm2 - pcmpgtb xm5, xm3, xm1 - pcmpgtb xm4, xm3, xm0 - punpckhbw xm3, xm0, xm4 - punpcklbw xm0, xm4 - punpckhbw xm4, xm1, xm5 - punpcklbw xm1, xm5 - punpckhbw xm5, xm2, xm6 - punpcklbw xm2, xm6 - - psrldq xm6, xm0, 2 - psrldq xm7, xm0, 4 - psrldq xm8, xm0, 6 - psrldq xm9, xm0, 8 - palignr xm10, xm3, xm0, 10 - palignr xm11, xm3, xm0, 12 - - punpcklwd xm0, xm6 - punpcklwd xm7, xm8 - punpcklwd xm9, xm10 - punpcklwd xm11, xm1 - pmaddwd xm0, [rsp+ 0*16] - pmaddwd xm7, [rsp+ 1*16] - pmaddwd xm9, [rsp+ 2*16] - pmaddwd xm11, [rsp+ 3*16] - paddd xm0, xm7 - paddd xm9, xm11 - paddd xm0, xm9 - - psrldq xm6, xm1, 2 - psrldq xm7, xm1, 4 - psrldq xm8, xm1, 6 - psrldq xm9, xm1, 8 - palignr xm10, xm4, xm1, 10 - palignr xm11, xm4, xm1, 12 - psrldq xm12, xm2, 2 - - punpcklwd xm6, xm7 - punpcklwd xm8, xm9 - punpcklwd xm10, xm11 - punpcklwd xm12, xm2, xm12 - pmaddwd xm6, [rsp+ 4*16] - pmaddwd xm8, [rsp+ 5*16] - pmaddwd xm10, [rsp+ 6*16] - pmaddwd xm12, [rsp+ 7*16] - paddd xm6, xm8 - paddd xm10, xm12 - paddd xm6, xm10 - paddd xm0, xm6 - - psrldq xm6, xm2, 4 - psrldq xm7, xm2, 6 - psrldq xm8, xm2, 8 - palignr xm9, xm5, xm2, 10 - palignr xm5, xm5, xm2, 12 - -%if %2 - movq xm1, [bufyq+xq*2] -%if %3 - movq xm2, [bufyq+xq*2+82] -%endif - pmaddubsw xm1, xm13, xm1 -%if %3 - pmaddubsw xm2, xm13, xm2 - paddw xm1, xm2 -%endif - pmulhrsw xm1, xm15 -%else - pmovsxbw xm1, [bufyq+xq] -%endif - - punpcklwd xm6, xm7 - punpcklwd 
xm8, xm9 - punpcklwd xm5, xm1 - pmaddwd xm6, [rsp+ 8*16] - pmaddwd xm8, [rsp+ 9*16] - pmaddwd xm5, [rsp+10*16] - paddd xm0, xm6 - paddd xm8, xm5 - paddd xm0, xm8 - paddd xm0, xm14 - - movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pmovsxbw xm1, xm1 - pmaddwd xm2, xm1, [rsp+16*11] - pshufd xm3, xm2, q1111 - paddd xm2, xm3 ; left+cur - paddd xm2, xm0 ; add top - psrldq xm0, 4 - psrad xm2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw, we only care about one value - pslldq xm2, 6 - vpblendw xm1, xm2, 1000b - packsswb xm1, xm1 - pextrb [bufq+xq], xm1, 3 - psrldq xm1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar3 - RET -%endmacro - -generate_grain_uv_fn 420, 1, 1 -generate_grain_uv_fn 422, 1, 0 -generate_grain_uv_fn 444, 0, 0 - -INIT_YMM avx2 -cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut - pcmpeqw m10, m10 - psrld m10, 24 - mov r7d, [fg_dataq+FGData.scaling_shift] - lea r8, [pb_mask] -%define base r8-pb_mask - vpbroadcastw m11, [base+mul_bits+r7*2-14] - mov r7d, [fg_dataq+FGData.clip_to_restricted_range] - vpbroadcastw m12, [base+max+r7*4] - vpbroadcastw m13, [base+min+r7*2] - - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap - - mov overlapd, [fg_dataq+FGData.overlap_flag] - movifnidn sbyd, sbym - test sbyd, sbyd - setnz r7b - test r7b, overlapb - jnz .vertical_overlap - - imul seed, sbyd, (173 << 24) | 37 - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - unused1, unused2, see, overlap - - lea src_bakq, [srcq+wq] - neg wq - sub dstq, srcq - -.loop_x: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, overlap - - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, overlap - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - add srcq, strideq - add grain_lutq, 82 - dec hd - jg .loop_y - - add wq, 32 - jge .end - lea srcq, [src_bakq+wq] - test overlapd, overlapd - jz .loop_x - - ; r8m = sbym - movd xm15, [pb_27_17_17_27] - cmp dword r8m, 0 - jne .loop_x_hv_overlap - - ; horizontal 
overlap (without vertical overlap) - movd xm14, [pw_1024] -.loop_x_h_overlap: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy - - lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_h_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - movd xm4, [grain_lutq+left_offxyq] - punpcklbw xm4, xm3 - pmaddubsw xm4, xm15, xm4 - pmulhrsw xm4, xm14 - packsswb xm4, xm4 - vpblendw xm4, xm3, 11111110b - vpblendd m3, m4, 00001111b - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - add srcq, strideq - add grain_lutq, 82 - dec hd - jg .loop_y_h_overlap - - add wq, 32 - jge .end - lea srcq, [src_bakq+wq] - - ; r8m = sbym - cmp dword r8m, 0 - jne .loop_x_hv_overlap - jmp .loop_x_h_overlap - -.end: - RET - -.vertical_overlap: - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap - - movzx sbyd, sbyb - imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, r7d - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - unused1, unused2, see, overlap - - lea src_bakq, [srcq+wq] - neg wq - sub dstq, srcq - - vpbroadcastd m14, [pw_1024] -.loop_x_v_overlap: - vpbroadcastw m15, [pb_27_17_17_27] - - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, overlap, top_offxy - - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, overlap, top_offxy - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, 
grain_lutmp -.loop_y_v_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - movu m4, [grain_lutq+top_offxyq] - punpckhbw m6, m4, m3 - punpcklbw m4, m3 - pmaddubsw m6, m15, m6 - pmaddubsw m4, m15, m4 - pmulhrsw m6, m14 - pmulhrsw m4, m14 - packsswb m3, m4, m6 - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line - add srcq, strideq - add grain_lutq, 82 - dec hw - jz .end_y_v_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_v_overlap - jmp .loop_y - -.end_y_v_overlap: - add wq, 32 - jge .end_hv - lea srcq, [src_bakq+wq] - - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap - - movd xm15, [pb_27_17_17_27] -.loop_x_hv_overlap: - vpbroadcastw m8, [pb_27_17_17_27] - - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy - - lea topleft_offxyq, [top_offxyq+32] - lea left_offxyq, [offyq+32] - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -.loop_y_hv_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - punpckhwd m5, m0, m2 - punpcklwd m4, m0, m2 - punpckhwd m7, m1, m2 - punpcklwd m6, m1, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m3, m3 - ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel - vpgatherdd m9, [scalingq+m4], m3 - pcmpeqw m3, m3 - vpgatherdd m4, [scalingq+m5], m3 - pcmpeqw m3, m3 - vpgatherdd m5, [scalingq+m6], m3 - pcmpeqw m3, m3 - vpgatherdd m6, [scalingq+m7], m3 - pand m9, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m9, m4 - packusdw m5, m6 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - movu m6, [grain_lutq+top_offxyq] - movd xm4, [grain_lutq+left_offxyq] 
- movd xm7, [grain_lutq+topleft_offxyq] - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw xm4, xm3 - punpcklbw xm7, xm6 - pmaddubsw xm4, xm15, xm4 - pmaddubsw xm7, xm15, xm7 - pmulhrsw xm4, xm14 - pmulhrsw xm7, xm14 - packsswb xm4, xm4 - packsswb xm7, xm7 - vpblendw xm4, xm3, 11111110b - vpblendw xm7, xm6, 11111110b - vpblendd m3, m4, 00001111b - vpblendd m6, m7, 00001111b - ; followed by v interpolation (top | cur -> cur) - punpckhbw m7, m6, m3 - punpcklbw m6, m3 - pmaddubsw m7, m8, m7 - pmaddubsw m6, m8, m6 - pmulhrsw m7, m14 - pmulhrsw m6, m14 - packsswb m3, m6, m7 - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m9 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq+srcq], m0 - - vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line - add srcq, strideq - add grain_lutq, 82 - dec hw - jz .end_y_hv_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_hv_overlap - jmp .loop_y_h_overlap - -.end_y_hv_overlap: - add wq, 32 - lea srcq, [src_bakq+wq] - jl .loop_x_hv_overlap - -.end_hv: - RET - -%macro FGUV_FN 3 ; name, ss_hor, ss_ver -cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, h, sby, luma, lstride, uv_pl, is_id - pcmpeqw m10, m10 - psrld m10, 24 - mov r7d, [fg_dataq+FGData.scaling_shift] - lea r8, [pb_mask] -%define base r8-pb_mask - vpbroadcastw m11, [base+mul_bits+r7*2-14] - mov r7d, [fg_dataq+FGData.clip_to_restricted_range] - mov r9d, dword is_idm - vpbroadcastw m13, [base+min+r7*2] - shlx r7d, r7d, r9d - vpbroadcastw m12, [base+max+r7*2] - - cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 - jne .csfl - -%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap - -%if %1 - mov r7d, dword r11m - vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4] - vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4] - punpcklbw m14, m1, m0 - vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] -%else - vpbroadcastd m14, [pw_1024] -%if %2 - vpbroadcastd m15, [pb_23_22] -%else - vpbroadcastd xm15, [pb_27_17_17_27] -%endif -%endif - - mov overlapd, [fg_dataq+FGData.overlap_flag] - movifnidn sbyd, sbym - test sbyd, sbyd - setnz r7b - test r7b, overlapb - jnz %%vertical_overlap - - imul seed, sbyd, (173 << 24) | 37 - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - unused2, unused3, see, overlap, unused4, unused5, lstride - - mov lumaq, r9mp - lea r12, [srcq+wq] - lea r13, [dstq+wq] - lea r14, [lumaq+wq*(1+%2)] - mov r11mp, r12 - mov r12mp, r13 - mov lstrideq, r10mp - neg wq - -%%loop_x: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, unused1, unused2, lstride - - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, 
see, overlap, unused1, unused2, lstride - - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - pxor m2, m2 - mova m4, [lumaq] - mova m0, [srcq] -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - - ; scaling[luma_src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] -%if %2 - movu xm3, [grain_lutq+offxyq+ 0] - vinserti128 m3, [grain_lutq+offxyq+82], 1 -%else - movu m3, [grain_lutq+offxyq] -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 -%if %2 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - mova [dstq], m0 -%endif - -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82<<%2 - sub hb, 1+%2 - jg %%loop_y - - add wq, 32>>%2 - jge %%end - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - test overlapd, overlapd - jz %%loop_x - - ; r8m = sbym - cmp dword r8m, 0 - jne %%loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -%%loop_x_h_overlap: - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, unused1, unused2, lstride - - lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx - mov offxd, seed - rorx offyd, seed, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, unused1, unused2, lstride - - mov hd, hm - mov grain_lutq, grain_lutmp -%%loop_y_h_overlap: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - 
pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - - ; scaling[luma_src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] -%if %2 -%if %1 - vpbroadcastd m6, [pb_23_22] ; FIXME -%endif - movu xm3, [grain_lutq+offxyq+ 0] - movd xm4, [grain_lutq+left_offxyq+ 0] - vinserti128 m3, [grain_lutq+offxyq+82], 1 - vinserti128 m4, [grain_lutq+left_offxyq+82], 1 - punpcklbw m4, m3 -%if %1 - pmaddubsw m4, m6, m4 - pmulhrsw m4, [pw_1024] -%else - pmaddubsw m4, m15, m4 - pmulhrsw m4, m14 -%endif - packsswb m4, m4 - pcmpeqw m6, m6 ; FIXME - psrldq m6, 15 ; FIXME - vpblendvb m3, m3, m4, m6 -%else -%if %1 - vpbroadcastd xm6, [pb_27_17_17_27] -%endif - movu m3, [grain_lutq+offxyq] - movd xm4, [grain_lutq+left_offxyq] - punpcklbw xm4, xm3 -%if %1 - pmaddubsw xm4, xm6, xm4 - pmulhrsw xm4, [pw_1024] -%else - pmaddubsw xm4, xm15, xm4 - pmulhrsw xm4, xm14 -%endif - packsswb xm4, xm4 - pcmpeqw xm6, xm6 - psrldq xm6, 14 - vpblendvb m3, m3, m4, m6 -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 -%if %2 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - mova [dstq], m0 -%endif - -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82*(1+%2) - sub hb, 1+%2 - jg %%loop_y_h_overlap - - add wq, 32>>%2 - jge %%end - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - - ; r8m = sbym - cmp dword r8m, 0 - jne %%loop_x_hv_overlap - jmp %%loop_x_h_overlap - -%%end: - RET - -%%vertical_overlap: - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ - sby, see, overlap, unused1, unused2, lstride - - movzx sbyd, sbyb - imul seed, [fg_dataq+FGData.seed], 0x00010001 - imul r7d, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add r7d, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and r7d, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, r7d - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - unused1, unused2, see, overlap, unused3, unused4, lstride - - mov lumaq, r9mp - lea r12, [srcq+wq] - lea r13, [dstq+wq] - lea r14, [lumaq+wq*(1+%2)] - mov r11mp, r12 - mov r12mp, r13 - mov lstrideq, r10mp - neg wq - -%%loop_x_v_overlap: - ; we assume from the block above that bits 8-15 
of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, top_offxy, unused, lstride - - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, overlap, top_offxy, unused, lstride - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -%if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] -%endif -%%loop_y_v_overlap: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: luma_src as dword - - ; scaling[luma_src] - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m8, [scalingq+m4], m3 - vpgatherdd m4, [scalingq+m5], m9 - pcmpeqw m3, m3 - pcmpeqw m9, m9 - vpgatherdd m5, [scalingq+m6], m3 - vpgatherdd m6, [scalingq+m7], m9 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - -%if %2 - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word -%endif - - ; grain = grain_lut[offy+y][offx+x] -%if %3 == 0 -%if %2 - mova m6, [pb_8x_27_17_8x_17_27] - movu xm3, [grain_lutq+offxyq] - movu xm4, [grain_lutq+top_offxyq] - vinserti128 m3, [grain_lutq+offxyq+82], 1 - vinserti128 m4, [grain_lutq+top_offxyq+82], 1 -%else - movu m3, [grain_lutq+offxyq] - movu m4, [grain_lutq+top_offxyq] -%endif - punpckhbw m9, m4, m3 - punpcklbw m4, m3 -%if %2 - pmaddubsw m9, m6, m9 - pmaddubsw m4, m6, m4 -%else - pmaddubsw m9, m1, m9 - pmaddubsw m4, m1, m4 -%endif -%if %1 - pmulhrsw m9, [pw_1024] - pmulhrsw m4, [pw_1024] -%else - pmulhrsw m9, m14 - pmulhrsw m4, m14 -%endif - packsswb m3, m4, m9 -%else -%if %1 - vpbroadcastd m6, [pb_23_22] -%endif - movq xm3, [grain_lutq+offxyq] - movq xm4, [grain_lutq+top_offxyq] - vinserti128 m3, [grain_lutq+offxyq+8], 1 - vinserti128 m4, [grain_lutq+top_offxyq+8], 1 - punpcklbw m4, m3 -%if %1 - pmaddubsw m4, m6, m4 - pmulhrsw m4, [pw_1024] -%else - pmaddubsw m4, m15, m4 - pmulhrsw m4, m14 -%endif - packsswb m4, m4 - vpermq m4, m4, q3120 - ; only interpolate first line, insert second line unmodified - vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - 
pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) -%if %2 - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - pxor m6, m6 - punpckhbw m9, m0, m6 - punpcklbw m0, m6 ; m0-1: src as word - - paddw m0, m2 - paddw m9, m3 - pmaxsw m0, m13 - pmaxsw m9, m13 - pminsw m0, m12 - pminsw m9, m12 - packuswb m0, m9 - mova [dstq], m0 -%endif - - sub hb, 1+%2 - jl %%end_y_v_overlap -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82<<%2 -%if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] - btc hd, 16 - jnc %%loop_y_v_overlap -%endif - jmp %%loop_y - -%%end_y_v_overlap: - add wq, 32>>%2 - jge %%end_hv - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap - -%%loop_x_hv_overlap: - ; we assume from the block above that bits 8-15 of r7d are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp r7b ; parity of top_seed - shr seed, 16 - shl r7d, 16 - test seeb, seeh - setp r7b ; parity of cur_seed - or r6d, 0x00010001 - xor r7d, r6d - rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride - - lea topleft_offxyq, [top_offxyq+(32>>%2)] - lea left_offxyq, [offyq+(32>>%2)] - rorx offyd, seed, 8 - rorx offxd, seed, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride - - movzx top_offxyd, offxyw - shr offxyd, 16 - - mov hd, hm - mov grain_lutq, grain_lutmp -%if %2 == 0 - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] -%endif -%%loop_y_hv_overlap: - ; src -%if %2 - mova xm4, [lumaq+lstrideq*0+ 0] - mova xm6, [lumaq+lstrideq*0+16] - mova xm0, [srcq] - vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 - vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 - vinserti128 m0, [srcq+strideq], 1 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - punpckhwd m5, m4, m2 - punpcklwd m4, m2 - punpckhwd m7, m6, m2 - punpcklwd m6, m2 ; m4-7: src as dword - - ; scaling[src] - pcmpeqw m9, m9 - pcmpeqw m3, m3 - vpgatherdd m8, [scalingq+m4], m9 - vpgatherdd m4, [scalingq+m5], m3 - pcmpeqw m9, m9 - pcmpeqw m3, m3 - vpgatherdd m5, [scalingq+m6], m9 - vpgatherdd m6, [scalingq+m7], m3 - pand m8, m10 - pand m4, m10 - pand m5, m10 - pand m6, m10 - packusdw m8, m4 - packusdw m5, m6 - -%if %2 - ; unpack chroma source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as 
word -%endif - - ; grain = grain_lut[offy+y][offx+x] -%if %1 -%if %2 - vpbroadcastd m9, [pb_23_22] -%else - vpbroadcastd xm9, [pb_27_17_17_27] -%endif -%endif - -%if %2 - movu xm3, [grain_lutq+offxyq] -%if %3 - movq xm6, [grain_lutq+top_offxyq] -%else - movu xm6, [grain_lutq+top_offxyq] -%endif - vinserti128 m3, [grain_lutq+offxyq+82], 1 -%if %3 - vinserti128 m6, [grain_lutq+top_offxyq+8], 1 -%else - vinserti128 m6, [grain_lutq+top_offxyq+82], 1 -%endif -%else - movu m3, [grain_lutq+offxyq] - movu m6, [grain_lutq+top_offxyq] -%endif - movd xm4, [grain_lutq+left_offxyq] - movd xm7, [grain_lutq+topleft_offxyq] -%if %2 - vinserti128 m4, [grain_lutq+left_offxyq+82], 1 -%if %3 == 0 - vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1 -%endif -%endif - - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) -%if %2 - punpcklbw m4, m3 -%if %3 - punpcklbw xm7, xm6 -%else - punpcklbw m7, m6 -%endif - punpcklwd m4, m7 -%if %1 - pmaddubsw m4, m9, m4 - pmulhrsw m4, [pw_1024] -%else - pmaddubsw m4, m15, m4 - pmulhrsw m4, m14 -%endif - packsswb m4, m4 - pcmpeqw m9, m9 ; this is kind of ugly - psrldq m9, 15 - vpblendvb m3, m3, m4, m9 - psrldq m4, 1 -%if %3 - shufpd m9, m9, m9, 1110b ; clear upper lane -%endif - vpblendvb m6, m6, m4, m9 -%else - punpcklbw xm4, xm3 - punpcklbw xm7, xm6 - punpckldq xm4, xm7 -%if %1 - pmaddubsw xm4, xm9, xm4 - pmulhrsw xm4, [pw_1024] -%else - pmaddubsw xm4, xm15, xm4 - pmulhrsw xm4, xm14 -%endif - packsswb xm4, xm4 - pcmpeqw xm9, xm9 ; this is kind of ugly - psrldq xm9, 14 - vpblendvb m3, m3, m4, m9 - psrldq xm4, 2 - vpblendvb m6, m6, m4, m9 -%endif - - ; followed by v interpolation (top | cur -> cur) -%if %3 - vpermq m9, m3, q3120 - punpcklbw m6, m9 -%if %1 - vpbroadcastd m9, [pb_23_22] - pmaddubsw m6, m9, m6 - pmulhrsw m6, [pw_1024] -%else - pmaddubsw m6, m15, m6 - pmulhrsw m6, m14 -%endif - packsswb m6, m6 - vpermq m6, m6, q3120 - vpblendd m3, m3, m6, 00001111b -%else - punpckhbw m9, m6, m3 - punpcklbw m6, m3 -%if %2 - mova m3, [pb_8x_27_17_8x_17_27] - pmaddubsw m9, m3, m9 - pmaddubsw m6, m3, m6 -%else - pmaddubsw m9, m1, m9 - pmaddubsw m6, m1, m6 -%endif -%if %1 - pmulhrsw m9, [pw_1024] - pmulhrsw m6, [pw_1024] -%else - pmulhrsw m9, m14 - pmulhrsw m6, m14 -%endif - packsswb m3, m6, m9 -%endif - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m8 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) -%if %2 - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - mova [dstq], xm0 - vextracti128 [dstq+strideq], m0, 1 -%else - pxor m6, m6 - punpckhbw m9, m0, m6 - punpcklbw m0, m6 ; m0-1: src as word - paddw m0, m2 - paddw m9, m3 - pmaxsw m0, m13 - pmaxsw m9, m13 - pminsw m0, m12 - pminsw m9, m12 - packuswb m0, m9 - mova [dstq], m0 -%endif - -%if %2 - lea srcq, [srcq+strideq*2] - lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*(2<<%3)] -%else - add srcq, strideq - add dstq, strideq - add lumaq, lstrideq -%endif - add grain_lutq, 82<<%2 - sub hb, 1+%2 -%if %2 - jg %%loop_y_h_overlap -%else - je %%end_y_hv_overlap - vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] - btc hd, 16 - jnc %%loop_y_hv_overlap - jmp %%loop_y_h_overlap -%endif - -%%end_y_hv_overlap: - add wq, 32>>%2 - jge %%end_hv - mov srcq, r11mp - mov dstq, r12mp - lea lumaq, [r14+wq*(1+%2)] - add srcq, wq - add dstq, wq - jmp %%loop_x_hv_overlap - -%%end_hv: - RET -%endmacro - - %%FGUV_32x32xN_LOOP 1, %2, %3 -.csfl: - 
%%FGUV_32x32xN_LOOP 0, %2, %3
-%endmacro
-
-FGUV_FN 420, 1, 1
-FGUV_FN 422, 1, 0
-FGUV_FN 444, 0, 0
-
-%endif ; ARCH_X86_64
diff -Nru dav1d-0.7.1/src/x86/film_grain_avx2.asm dav1d-0.9.1/src/x86/film_grain_avx2.asm
--- dav1d-0.7.1/src/x86/film_grain_avx2.asm 1970-01-01 00:00:00.000000000 +0000
+++ dav1d-0.9.1/src/x86/film_grain_avx2.asm 2021-07-28 21:38:28.893852000 +0000
@@ -0,0 +1,2378 @@
+; Copyright © 2019-2021, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
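For orientation before reading the new file's SIMD: every fgy/fguv inner loop below computes the per-pixel blend spelled out in its own comments, namely noise = round2(scaling[src] * grain, scaling_shift) followed by clipping src + noise to the [min, max] pair selected by clip_to_restricted_range. A minimal scalar sketch in C, with illustrative names rather than dav1d's actual C reference code:

    #include <stdint.h>

    static inline int round2(int x, int shift)
    {
        return (x + (1 << (shift - 1))) >> shift;
    }

    /* Roughly one pixel of the grain application loops in this file:
     * scale the grain sample by the source-dependent scaling LUT,
     * add the noise to the source, then clamp to the allowed range. */
    static inline uint8_t apply_grain_px(uint8_t src, int8_t grain,
                                         const uint8_t scaling[256],
                                         int scaling_shift, int mn, int mx)
    {
        const int noise = round2(scaling[src] * grain, scaling_shift);
        const int px = src + noise;
        return (uint8_t)(px < mn ? mn : px > mx ? mx : px);
    }

The asm reaches the same result with pmullw plus pmulhrsw against the mul_bits constant for the rounded shift, and pmaxsw/pminsw against the min/max constants loaded in each function prologue.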
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_8x_27_17_8x_17_27: times 8 db 27, 17
+                      times 8 db 17, 27
+pw_1024: times 16 dw 1024
+pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
+rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
+byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
+pw_seed_xor: times 2 dw 0xb524
+             times 2 dw 0x49d8
+pd_m65536: dd ~0xffff
+pb_23_22: db 23, 22
+          times 3 db 0, 32
+pb_1: times 4 db 1
+hmul_bits: dw 32768, 16384, 8192, 4096
+round: dw 2048, 1024, 512
+mul_bits: dw 256, 128, 64, 32, 16
+round_vals: dw 32, 64, 128, 256, 512
+max: dw 255, 240, 235
+min: dw 0, 16
+pb_27_17_17_27: db 27, 17, 17, 27
+                times 2 db 0, 32
+pw_1: dw 1
+
+%macro JMP_TABLE 2-*
+    %xdefine %1_8bpc_%2_table %%table
+    %xdefine %%base %1_8bpc_%2_table
+    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%prefix %+ .ar%3 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+ALIGN 4
+JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3
+
+struc FGData
+    .seed: resd 1
+    .num_y_points: resd 1
+    .y_points: resb 14 * 2
+    .chroma_scaling_from_luma: resd 1
+    .num_uv_points: resd 2
+    .uv_points: resb 2 * 10 * 2
+    .scaling_shift: resd 1
+    .ar_coeff_lag: resd 1
+    .ar_coeffs_y: resb 24
+    .ar_coeffs_uv: resb 2 * 28 ; includes padding
+    .ar_coeff_shift: resq 1
+    .grain_scale_shift: resd 1
+    .uv_mult: resd 2
+    .uv_luma_mult: resd 2
+    .uv_offset: resd 2
+    .overlap_flag: resd 1
+    .clip_to_restricted_range: resd 1
+endstruc
+
+cextern gaussian_sequence
+
+SECTION .text
+
+%macro REPX 2-*
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)
+%endrep
+%endmacro
+
+INIT_XMM avx2
+cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data
+    lea r4, [pb_mask]
+%define base r4-pb_mask
+    movq xm1, [base+rnd_next_upperbit_mask]
+    movq xm4, [base+mul_bits]
+    movq xm7, [base+hmul_bits]
+    mov r2d, [fg_dataq+FGData.grain_scale_shift]
+    vpbroadcastw xm8, [base+round+r2*2]
+    mova xm5, [base+pb_mask]
+    vpbroadcastw xm0, [fg_dataq+FGData.seed]
+    vpbroadcastd xm9, [base+pd_m65536]
+    mov r2, -73*82
+    sub bufq, r2
+    lea r3, [gaussian_sequence]
+.loop:
+    pand xm2, xm0, xm1
+    psrlw xm3, xm2, 10
+    por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
+    pmullw xm2, xm4 ; bits 0x0f00 are set
+    pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds
+    psllq xm6, xm2, 30
+    por xm2, xm6
+    psllq xm6, xm2, 15
+    por xm2, xm6 ; aggregate each bit into next seed's high bit
+    pmulhuw xm3, xm0, xm7
+    por xm2, xm3 ; 4 next output seeds
+    pshuflw xm0, xm2, q3333
+    psrlw xm2, 5
+    pmovzxwd xm3, xm2
+    mova xm6, xm9
+    vpgatherdd xm2, [r3+xm3*2], xm6
+    pandn xm2, xm9, xm2
+    packusdw xm2, xm2
+    pmulhrsw xm2, xm8
+    packsswb xm2, xm2
+    movd [bufq+r2], xm2
+    add r2, 4
+    jl .loop
+
+    ; auto-regression code
+    movsxd r2, [fg_dataq+FGData.ar_coeff_lag]
+    movsxd r2, [base+generate_grain_y_8bpc_avx2_table+r2*4]
+    lea r2, [r2+base+generate_grain_y_8bpc_avx2_table]
+    jmp r2
+
+.ar1:
+    DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0
+    mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
+    movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3]
+    movd xm4, [fg_dataq+FGData.ar_coeffs_y]
+    DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0
+    pinsrb xm4, [pb_1], 3
+    pmovsxbw xm4, xm4
+    pshufd xm5, xm4, q1111
+    pshufd xm4, xm4, q0000
+    vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd
+    sub bufq, 82*73-(82*3+79)
+
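The .ar1 code that starts above and continues below is the lag-1 auto-regression pass over the 73x82 luma grain buffer: the .loop above has already filled it with scaled Gaussian noise, and each sample then receives a filtered contribution from its already-computed top-left/top/top-right/left neighbours (ar_coeffs_y[0..3]), rounded via round_vals and clamped to the int8_t range. A hedged scalar sketch of the same recurrence in C, assuming the 82-byte row stride and the 3-sample border used by the asm (not dav1d's actual reference code):

    /* Scalar equivalent of .y_loop_ar1 / .x_loop_ar1 as read from the asm. */
    static void ar1_luma_sketch(int8_t buf[73][82], const int8_t coeff[4],
                                int ar_coeff_shift)
    {
        const int rnd = 1 << (ar_coeff_shift - 1);          /* round_vals[shift] */
        for (int y = 3; y < 73; y++)                        /* 70 rows           */
            for (int x = 3; x < 79; x++) {                  /* 76 columns        */
                const int sum = coeff[0] * buf[y - 1][x - 1]   /* top-left  */
                              + coeff[1] * buf[y - 1][x    ]   /* top       */
                              + coeff[2] * buf[y - 1][x + 1]   /* top-right */
                              + coeff[3] * buf[y    ][x - 1];  /* left      */
                const int v = buf[y][x] + ((sum + rnd) >> ar_coeff_shift);
                buf[y][x] = (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
            }
    }

The serial dependency on the left neighbour is why the asm only vectorizes the top-row products and keeps val3 live across .x_loop_ar1_inner, consuming the precomputed sums four pixels at a time.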
mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left + pmovsxbw xm2, [bufq+xq-82+0] ; top + pmovsxbw xm1, [bufq+xq-82+1] ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d +%if WIN64 + sarx val3d, val3d, shiftd +%else + sar val3d, shiftb +%endif + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm15, [base+byte_blend+1] + pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pmovsxbw xm9, xm9 + DEFINE_ARGS buf, fg_data, h, x + pshufd xm12, xm9, q0000 + pshufd xm13, xm9, q1111 + pshufd xm11, xm8, q3333 + pshufd xm10, xm8, q2222 + pshufd xm9, xm8, q1111 + pshufd xm8, xm8, q0000 + pmovzxwd xm14, xm14 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] + psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] + psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] + punpcklwd xm2, xm0, xm2 + punpcklwd xm3, xm4 + pmaddwd xm2, xm8 + pmaddwd xm3, xm11 + paddd xm2, xm3 + + psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] + psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] + psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5] + punpcklwd xm4, xm5 + punpcklwd xm6, xm1 + psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5] + psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] + punpcklwd xm7, xm1 + pmaddwd xm4, xm9 + pmaddwd xm6, xm10 + pmaddwd xm7, xm12 + paddd xm4, xm6 + paddd xm2, xm7 + paddd xm2, xm4 + paddd xm2, xm14 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm1, xm0 + pmaddwd xm3, xm1, xm13 + paddd xm3, xm2 + psrldq xm1, 4 ; y=0,x=0 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw xm3, xm1 + packsswb xm3, xm3 + pextrb [bufq+xq], xm3, 0 + pslldq xm3, 2 + pand xm3, xm15 + pandn xm0, xm15, xm0 + por xm0, xm3 + psrldq xm0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if WIN64 + SUB rsp, 16*12 +%assign stack_size_padded (stack_size_padded+16*12) +%assign stack_size (stack_size+16*12) +%else + ALLOC_STACK 16*12 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + movq xm15, [base+byte_blend] + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-7 + pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_y+ 8] ; cf8-15 + pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 + pshufd xm11, xm0, q3333 + pshufd xm0, xm0, q0000 + pshufd xm6, xm1, q1111 + pshufd xm7, xm1, q2222 + pshufd xm8, xm1, q3333 + pshufd xm1, xm1, q0000 + pshufd xm3, xm2, q1111 + psrldq xm13, xm2, 10 + pinsrw xm2, [pw_1], 5 + pshufd xm4, xm2, 
q2222 + pshufd xm2, xm2, q0000 + pinsrw xm13, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 0*16], xm0 + mova [rsp+ 1*16], xm9 + mova [rsp+ 2*16], xm10 + mova [rsp+ 3*16], xm11 + mova [rsp+ 4*16], xm1 + mova [rsp+ 5*16], xm6 + mova [rsp+ 6*16], xm7 + mova [rsp+ 7*16], xm8 + mova [rsp+ 8*16], xm2 + mova [rsp+ 9*16], xm3 + mova [rsp+10*16], xm4 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor xm3, xm3 + pcmpgtb xm6, xm3, xm2 + pcmpgtb xm5, xm3, xm1 + pcmpgtb xm4, xm3, xm0 + punpckhbw xm3, xm0, xm4 + punpcklbw xm0, xm4 + punpckhbw xm4, xm1, xm5 + punpcklbw xm1, xm5 + punpckhbw xm5, xm2, xm6 + punpcklbw xm2, xm6 + + psrldq xm6, xm0, 2 + psrldq xm7, xm0, 4 + psrldq xm8, xm0, 6 + psrldq xm9, xm0, 8 + palignr xm10, xm3, xm0, 10 + palignr xm11, xm3, xm0, 12 + + punpcklwd xm0, xm6 + punpcklwd xm7, xm8 + punpcklwd xm9, xm10 + punpcklwd xm11, xm1 + pmaddwd xm0, [rsp+ 0*16] + pmaddwd xm7, [rsp+ 1*16] + pmaddwd xm9, [rsp+ 2*16] + pmaddwd xm11, [rsp+ 3*16] + paddd xm0, xm7 + paddd xm9, xm11 + paddd xm0, xm9 + + psrldq xm6, xm1, 2 + psrldq xm7, xm1, 4 + psrldq xm8, xm1, 6 + psrldq xm9, xm1, 8 + palignr xm10, xm4, xm1, 10 + palignr xm11, xm4, xm1, 12 + psrldq xm12, xm2, 2 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm10, xm11 + punpcklwd xm12, xm2, xm12 + pmaddwd xm6, [rsp+ 4*16] + pmaddwd xm8, [rsp+ 5*16] + pmaddwd xm10, [rsp+ 6*16] + pmaddwd xm12, [rsp+ 7*16] + paddd xm6, xm8 + paddd xm10, xm12 + paddd xm6, xm10 + paddd xm0, xm6 + + psrldq xm6, xm2, 4 + psrldq xm7, xm2, 6 + psrldq xm8, xm2, 8 + palignr xm9, xm5, xm2, 10 + palignr xm5, xm5, xm2, 12 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm5, xm14 + pmaddwd xm6, [rsp+ 8*16] + pmaddwd xm8, [rsp+ 9*16] + pmaddwd xm5, [rsp+10*16] + paddd xm0, xm6 + paddd xm8, xm5 + paddd xm0, xm8 + + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmovsxbw xm2, xm1 + pmaddwd xm2, xm13 + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb xm2, xm2 + pextrb [bufq+xq], xm2, 0 + pslldq xm2, 3 + pand xm2, xm15 + pandn xm1, xm15, xm1 + por xm1, xm2 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv + lea r4, [pb_mask] +%define base r4-pb_mask + movq xm1, [base+rnd_next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + vpbroadcastw xm8, [base+round+r5*2] + mova xm5, [base+pb_mask] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + vpbroadcastw xm9, [base+pw_seed_xor+uvq*4] + pxor xm0, xm9 + vpbroadcastd xm9, [base+pd_m65536] + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -73*82 + sub bufq, r5 +.loop: +%endif + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds + psllq xm6, xm2, 30 + por xm2, xm6 + psllq xm6, xm2, 15 + por xm2, xm6 ; aggregate each bit 
into next seed's high bit + pmulhuw xm3, xm0, xm7 + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + pmovzxwd xm3, xm2 + mova xm6, xm9 + vpgatherdd xm2, [r6+xm3*2], xm6 + pandn xm2, xm9, xm2 + packusdw xm2, xm2 + pmulhrsw xm2, xm8 + packsswb xm2, xm2 + movd [bufq+r5], xm2 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 + dec r7d + jg .loop_y +%else + jl .loop +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_8bpc_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_avx2_table] + jmp r5 + +.ar0: + INIT_YMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd xm3, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h + pmovsxbw xm4, xm4 +%if %2 + vpbroadcastd m7, [pb_1] + vpbroadcastw m6, [hmul_bits+2+%3*2] +%endif + vpbroadcastw m4, xm4 + vpbroadcastw m3, xm3 + pxor m12, m12 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm8, [bufyq] +%if %3 + movu xm9, [bufyq+82] +%endif + movu xm10, [bufyq+16] +%if %3 + movu xm11, [bufyq+82+16] +%endif + vinserti128 m8, [bufyq+32], 1 +%if %3 + vinserti128 m9, [bufyq+82+32], 1 +%endif + vinserti128 m10, [bufyq+48], 1 +%if %3 + vinserti128 m11, [bufyq+82+48], 1 +%endif + pmaddubsw m8, m7, m8 +%if %3 + pmaddubsw m9, m7, m9 +%endif + pmaddubsw m10, m7, m10 +%if %3 + pmaddubsw m11, m7, m11 + paddw m8, m9 + paddw m10, m11 +%endif + pmulhrsw m8, m6 + pmulhrsw m10, m6 +%else + xor r3d, r3d + ; first 32x2 pixels +.x_loop_ar0: + movu m8, [bufyq+r3] + pcmpgtb m9, m12, m8 + punpckhbw m10, m8, m9 + punpcklbw m8, m9 +%endif + pmullw m8, m4 + pmullw m10, m4 + pmulhrsw m8, m3 + pmulhrsw m10, m3 +%if %2 + movu m0, [bufq] +%else + movu m0, [bufq+r3] +%endif + pcmpgtb m1, m12, m0 + punpckhbw m9, m0, m1 + punpcklbw m0, m1 + paddw m0, m8 + paddw m9, m10 + packsswb m0, m9 +%if %2 + movu [bufq], m0 +%else + movu [bufq+r3], m0 + add r3d, 32 + cmp r3d, 64 + jl .x_loop_ar0 +%endif + + ; last 6/12 pixels + movu xm8, [bufyq+32*2] +%if %2 +%if %3 + movu xm9, [bufyq+32*2+82] +%endif + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 + pmullw xm8, xm4 + pmulhrsw xm8, xm3 + movq xm0, [bufq+32] + pcmpgtb xm9, xm12, xm0 + punpcklbw xm9, xm0, xm9 + paddw xm8, xm9 + packsswb xm8, xm8 + vpblendw xm0, xm8, xm0, 1000b + movq [bufq+32], xm0 +%else + pcmpgtb xm9, xm12, xm8 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + pmullw xm10, xm4 + pmullw xm8, xm4 + pmulhrsw xm10, xm3 + pmulhrsw xm8, xm3 + movu xm0, [bufq+64] + pcmpgtb xm9, xm12, xm0 + punpcklbw xm1, xm0, xm9 + punpckhbw xm9, xm0, xm9 + paddw xm1, xm8 + paddw xm9, xm10 + packsswb xm1, xm9 + vpblendw xm0, xm1, xm0, 11000000b + movu [bufq+64], xm0 +%endif + + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: + INIT_XMM avx2 + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + vpbroadcastd xm7, 
[pb_1] + vpbroadcastw xm6, [hmul_bits+2+%3*2] +%endif + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left +%if %2 + movq xm8, [bufyq+xq*2] +%if %3 + movq xm9, [bufyq+xq*2+82] +%endif +%endif + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right +%if %2 + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 +%else + pmovsxbw xm8, [bufyq+xq] +%endif + punpcklwd xm0, xm2 + punpcklwd xm1, xm8 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 + paddd xm0, xm3 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar1 + RET + +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm15, [base+round_vals-12+shiftq*2] + pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 + pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 + pinsrw xm9, [base+pw_1], 5 +%if %2 + vpbroadcastw xm7, [base+hmul_bits+2+%3*2] + vpbroadcastd xm6, [base+pb_1] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd xm12, xm9, q0000 + pshufd xm13, xm9, q1111 + pshufd xm14, xm9, q2222 + pshufd xm11, xm8, q3333 + pshufd xm10, xm8, q2222 + pshufd xm9, xm8, q1111 + pshufd xm8, xm8, q0000 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5] + psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5] + psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5] + punpcklwd xm2, xm0, xm2 + punpcklwd xm3, xm4 + pmaddwd xm2, xm8 + pmaddwd xm3, xm11 + paddd xm2, xm3 + + psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5] + psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5] + psrldq xm0, 8 ; y=-2,x=[+2,+5] + punpcklwd xm4, xm5 + punpcklwd xm0, xm1 + psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5] + psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5] + punpcklwd xm3, xm1 + pmaddwd xm4, xm9 + pmaddwd xm0, xm10 + pmaddwd xm3, xm12 + paddd xm4, xm0 + paddd xm2, xm3 + paddd xm2, xm4 + +%if %2 + movq xm0, [bufyq+xq*2] +%if %3 + movq xm3, [bufyq+xq*2+82] +%endif + pmaddubsw xm0, xm6, xm0 +%if %3 + pmaddubsw xm3, xm6, xm3 + paddw xm0, xm3 +%endif + pmulhrsw xm0, xm7 +%else + pmovsxbw xm0, [bufyq+xq] +%endif + punpcklwd xm0, xm15 + pmaddwd xm0, xm14 + paddd xm2, xm0 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm0, xm0 + pmaddwd xm3, xm0, xm13 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + pslldq xm3, 2 + psrldq xm0, 2 + paddw xm3, xm0 + vpblendw xm0, xm3, 00000010b + packsswb xm0, xm0 + pextrb [bufq+xq], xm0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg 
.y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + SUB rsp, 16*12 +%assign stack_size_padded (stack_size_padded+16*12) +%assign stack_size (stack_size+16*12) + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm14, [base+round_vals-12+shiftq*2] + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7 + pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15 + pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 + pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 + pshufd xm11, xm0, q3333 + pshufd xm0, xm0, q0000 + pshufd xm6, xm1, q1111 + pshufd xm7, xm1, q2222 + pshufd xm8, xm1, q3333 + pshufd xm1, xm1, q0000 + pshufd xm3, xm2, q1111 + pshufd xm4, xm2, q2222 + vpbroadcastw xm5, xm5 + vpblendw xm4, xm5, 10101010b ; interleave luma cf + psrldq xm5, xm2, 10 + pshufd xm2, xm2, q0000 + pinsrw xm5, [base+round_vals+shiftq*2-10], 3 + pmovzxwd xm14, xm14 + mova [rsp+ 0*16], xm0 + mova [rsp+ 1*16], xm9 + mova [rsp+ 2*16], xm10 + mova [rsp+ 3*16], xm11 + mova [rsp+ 4*16], xm1 + mova [rsp+ 5*16], xm6 + mova [rsp+ 6*16], xm7 + mova [rsp+ 7*16], xm8 + mova [rsp+ 8*16], xm2 + mova [rsp+ 9*16], xm3 + mova [rsp+10*16], xm4 + mova [rsp+11*16], xm5 +%if %2 + vpbroadcastd xm13, [base+pb_1] + vpbroadcastw xm15, [base+hmul_bits+2+%3*2] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor xm3, xm3 + pcmpgtb xm6, xm3, xm2 + pcmpgtb xm5, xm3, xm1 + pcmpgtb xm4, xm3, xm0 + punpckhbw xm3, xm0, xm4 + punpcklbw xm0, xm4 + punpckhbw xm4, xm1, xm5 + punpcklbw xm1, xm5 + punpckhbw xm5, xm2, xm6 + punpcklbw xm2, xm6 + + psrldq xm6, xm0, 2 + psrldq xm7, xm0, 4 + psrldq xm8, xm0, 6 + psrldq xm9, xm0, 8 + palignr xm10, xm3, xm0, 10 + palignr xm11, xm3, xm0, 12 + + punpcklwd xm0, xm6 + punpcklwd xm7, xm8 + punpcklwd xm9, xm10 + punpcklwd xm11, xm1 + pmaddwd xm0, [rsp+ 0*16] + pmaddwd xm7, [rsp+ 1*16] + pmaddwd xm9, [rsp+ 2*16] + pmaddwd xm11, [rsp+ 3*16] + paddd xm0, xm7 + paddd xm9, xm11 + paddd xm0, xm9 + + psrldq xm6, xm1, 2 + psrldq xm7, xm1, 4 + psrldq xm8, xm1, 6 + psrldq xm9, xm1, 8 + palignr xm10, xm4, xm1, 10 + palignr xm11, xm4, xm1, 12 + psrldq xm12, xm2, 2 + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm10, xm11 + punpcklwd xm12, xm2, xm12 + pmaddwd xm6, [rsp+ 4*16] + pmaddwd xm8, [rsp+ 5*16] + pmaddwd xm10, [rsp+ 6*16] + pmaddwd xm12, [rsp+ 7*16] + paddd xm6, xm8 + paddd xm10, xm12 + paddd xm6, xm10 + paddd xm0, xm6 + + psrldq xm6, xm2, 4 + psrldq xm7, xm2, 6 + psrldq xm8, xm2, 8 + palignr xm9, xm5, xm2, 10 + palignr xm5, xm5, xm2, 12 + +%if %2 + movq xm1, [bufyq+xq*2] +%if %3 + movq xm2, [bufyq+xq*2+82] +%endif + pmaddubsw xm1, xm13, xm1 +%if %3 + pmaddubsw xm2, xm13, xm2 + paddw xm1, xm2 +%endif + pmulhrsw xm1, xm15 +%else + pmovsxbw xm1, [bufyq+xq] +%endif + + punpcklwd xm6, xm7 + punpcklwd xm8, xm9 + punpcklwd xm5, xm1 + pmaddwd xm6, [rsp+ 8*16] + pmaddwd xm8, [rsp+ 9*16] + pmaddwd xm5, [rsp+10*16] + paddd xm0, xm6 + paddd xm8, xm5 + paddd xm0, xm8 + paddd xm0, xm14 + + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmovsxbw xm1, xm1 + pmaddwd xm2, xm1, [rsp+16*11] + pshufd xm3, xm2, q1111 + paddd xm2, xm3 ; left+cur + paddd 
xm2, xm0 ; add top + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + pslldq xm2, 6 + vpblendw xm1, xm2, 1000b + packsswb xm1, xm1 + pextrb [bufq+xq], xm1, 3 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +INIT_YMM avx2 +cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut + pcmpeqw m10, m10 + psrld m10, 24 + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + vpbroadcastw m12, [base+max+r7*4] + vpbroadcastw m13, [base+min+r7*2] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + mov overlapd, [fg_dataq+FGData.overlap_flag] + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, overlapb + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + +.loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + + ; r8m = sbym + movq xm15, [pb_27_17_17_27] + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) + movq xm14, [pw_1024] +.loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+32] ; previous column's 
offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm3 + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 + packsswb xm4, xm4 + vpblendd m3, m3, m4, 00000001b + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.end: + RET + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + + vpbroadcastd m14, [pw_1024] +.loop_x_v_overlap: + vpbroadcastw m15, [pb_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq+m4], m3 + vpgatherdd m4, [scalingq+m5], m9 + 
pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq+m6], m3 + vpgatherdd m6, [scalingq+m7], m9 + pand m8, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m8, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] + punpckhbw m6, m4, m3 + punpcklbw m4, m3 + pmaddubsw m6, m15, m6 + pmaddubsw m4, m15, m4 + pmulhrsw m6, m14 + pmulhrsw m4, m14 + packsswb m3, m4, m6 + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + vpbroadcastw m15, [pb_27_17_17_27+2] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: + add wq, 32 + jge .end_hv + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + + movq xm15, [pb_27_17_17_27] +.loop_x_hv_overlap: + vpbroadcastw m8, [pb_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+32] + lea left_offxyq, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + punpckhwd m5, m0, m2 + punpcklwd m4, m0, m2 + punpckhwd m7, m1, m2 + punpcklwd m6, m1, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m3, m3 + ; FIXME it would be nice to have another register here to do 2 vpgatherdd's in parallel + vpgatherdd m9, [scalingq+m4], m3 + pcmpeqw m3, m3 + vpgatherdd m4, [scalingq+m5], m3 + pcmpeqw m3, m3 + vpgatherdd m5, [scalingq+m6], m3 + pcmpeqw m3, m3 + vpgatherdd m6, [scalingq+m7], m3 + pand m9, m10 + pand m4, m10 + pand m5, m10 + pand m6, m10 + packusdw m9, m4 + packusdw m5, m6 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + movu m6, [grain_lutq+top_offxyq] + movd xm4, [grain_lutq+left_offxyq] + movd xm7, [grain_lutq+topleft_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw xm4, xm3 + punpcklbw xm7, xm6 + pmaddubsw xm4, xm15, xm4 + pmaddubsw xm7, xm15, xm7 + pmulhrsw xm4, xm14 + pmulhrsw xm7, xm14 + packsswb xm4, xm4 + packsswb xm7, xm7 + vpblendd m3, m4, 00000001b + vpblendd m6, m7, 00000001b + ; followed by v 
interpolation (top | cur -> cur) + punpckhbw m7, m6, m3 + punpcklbw m6, m3 + pmaddubsw m7, m8, m7 + pmaddubsw m6, m8, m6 + pmulhrsw m7, m14 + pmulhrsw m6, m14 + packsswb m3, m6, m7 + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m9 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq+srcq], m0 + + vpbroadcastw m8, [pb_27_17_17_27+2] ; swap weights for second v-overlap line + add srcq, strideq + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq] + jl .loop_x_hv_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id + mov r7d, [fg_dataq+FGData.scaling_shift] + lea r8, [pb_mask] +%define base r8-pb_mask + vpbroadcastw m11, [base+mul_bits+r7*2-14] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, dword is_idm + vpbroadcastw m13, [base+min+r7*2] + shlx r7d, r7d, r9d + vpbroadcastw m12, [base+max+r7*2] + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + +%if %1 + mov r7d, dword r11m + vpbroadcastb m0, [fg_dataq+FGData.uv_mult+r7*4] + vpbroadcastb m1, [fg_dataq+FGData.uv_luma_mult+r7*4] + punpcklbw m14, m1, m0 + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] +%else + vpbroadcastd m14, [pw_1024] +%if %2 + vpbroadcastq m15, [pb_23_22] +%else + vpbroadcastq xm15, [pb_27_17_17_27] +%endif +%endif +%if %3 + vpbroadcastw m10, [pb_23_22] +%elif %2 + mova m10, [pb_8x_27_17_8x_17_27] +%endif + + mov overlapd, [fg_dataq+FGData.overlap_flag] + movifnidn sbyd, sbym + test sbyd, sbyd + setnz r7b + test r7b, overlapb + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, unused5, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + 
pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + pxor m2, m2 + mova m4, [lumaq] + mova m0, [srcq] +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + packusdw m8, m4 + packusdw m5, m6 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm3, [grain_lutq+offxyq+ 0] + vinserti128 m3, [grain_lutq+offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 + jg %%loop_y + + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx + mov offxd, seed + rorx offyd, seed, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride + + mov hd, hm + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 
+ punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + packusdw m8, m4 + packusdw m5, m6 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] +%if %2 +%if %1 + vpbroadcastq m6, [pb_23_22] +%endif + movu xm3, [grain_lutq+offxyq+ 0] + movd xm4, [grain_lutq+left_offxyq+ 0] + vinserti128 m3, [grain_lutq+offxyq+82], 1 + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 + punpcklbw m4, m3 +%if %1 + pmaddubsw m4, m6, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + vpblendd m3, m3, m4, 00010001b +%else +%if %1 + movq xm6, [pb_27_17_17_27] +%endif + movu m3, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm3 +%if %1 + pmaddubsw xm4, xm6, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + vpblendd m3, m3, m4, 00000001b +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(1+%2) + sub hb, 1+%2 + jg %%loop_y_h_overlap + + add wq, 32>>%2 + jge %%end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%end: + RET + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, overlap, unused1, unused2, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused1, unused2, see, overlap, unused3, unused4, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + 
imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] +%endif +%%loop_y_v_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: luma_src as dword + + ; scaling[luma_src] + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m8, [scalingq-3+m4], m3 + vpgatherdd m4, [scalingq-3+m5], m9 + pcmpeqw m3, m3 + pcmpeqw m9, m9 + vpgatherdd m5, [scalingq-3+m6], m3 + vpgatherdd m6, [scalingq-3+m7], m9 + REPX {psrld x, 24}, m8, m4, m5, m6 + packusdw m8, m4 + packusdw m5, m6 + +%if %2 + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word +%endif + + ; grain = grain_lut[offy+y][offx+x] +%if %3 == 0 +%if %2 + movu xm3, [grain_lutq+offxyq] + movu xm4, [grain_lutq+top_offxyq] + vinserti128 m3, [grain_lutq+offxyq+82], 1 + vinserti128 m4, [grain_lutq+top_offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m9, m4, m3 + punpcklbw m4, m3 + pmaddubsw m9, m10, m9 + pmaddubsw m4, m10, m4 +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m4, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m4, m14 +%endif + packsswb m3, m4, m9 +%else + movq xm3, [grain_lutq+offxyq] + movq xm4, [grain_lutq+top_offxyq] + vinserti128 m3, [grain_lutq+offxyq+8], 1 + vinserti128 m4, [grain_lutq+top_offxyq+8], 1 + punpcklbw m4, m3 + pmaddubsw m4, m10, m4 +%if %1 + pmulhrsw m4, [pw_1024] +%else + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + vpermq m4, m4, q3120 + ; only interpolate first line, insert second line unmodified + vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) +%if %2 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word + + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif + + sub hb, 1+%2 + jle %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, 
lstrideq +%endif + add grain_lutq, 82<<%2 +%if %2 == 0 + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_v_overlap +%endif + jmp %%loop_y + +%%end_y_v_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + + movzx top_offxyd, offxyw + shr offxyd, 16 + + mov hd, hm + mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27] +%endif +%%loop_y_hv_overlap: + ; src +%if %2 + mova xm4, [lumaq+lstrideq*0+ 0] + mova xm6, [lumaq+lstrideq*0+16] + mova xm0, [srcq] + vpbroadcastd m7, [pb_1] + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 + vinserti128 m0, [srcq+strideq], 1 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + punpckhwd m5, m4, m2 + punpcklwd m4, m2 + punpckhwd m7, m6, m2 + punpcklwd m6, m2 ; m4-7: src as dword + + ; scaling[src] + pcmpeqw m9, m9 + pcmpeqw m3, m3 + vpgatherdd m8, [scalingq-3+m4], m9 + vpgatherdd m4, [scalingq-3+m5], m3 + pcmpeqw m9, m9 + pcmpeqw m3, m3 + vpgatherdd m5, [scalingq-3+m6], m9 + vpgatherdd m6, [scalingq-3+m7], m3 + REPX {psrld x, 24}, m8, m4, m5, m6 + packusdw m8, m4 + packusdw m5, m6 + +%if %2 + ; unpack chroma source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word +%endif + + ; grain = grain_lut[offy+y][offx+x] +%if %1 +%if %2 + vpbroadcastq m9, [pb_23_22] +%else + vpbroadcastq xm9, [pb_27_17_17_27] +%endif +%endif + +%if %2 + movu xm3, [grain_lutq+offxyq] +%if %3 + movq xm6, [grain_lutq+top_offxyq] +%else + movu xm6, [grain_lutq+top_offxyq] +%endif + vinserti128 m3, [grain_lutq+offxyq+82], 1 +%if %3 + vinserti128 m6, [grain_lutq+top_offxyq+8], 1 +%else + vinserti128 m6, [grain_lutq+top_offxyq+82], 1 +%endif +%else + movu m3, [grain_lutq+offxyq] + movu m6, [grain_lutq+top_offxyq] +%endif + movd xm4, [grain_lutq+left_offxyq] + movd xm7, [grain_lutq+topleft_offxyq] +%if %2 + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 +%if %3 == 0 + 
vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1 +%endif +%endif + + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) +%if %2 + punpcklbw m4, m3 +%if %3 + punpcklbw xm7, xm6 +%else + punpcklbw m7, m6 +%endif + punpcklqdq m4, m7 +%if %1 + pmaddubsw m4, m9, m4 + pmulhrsw m4, [pw_1024] +%else + pmaddubsw m4, m15, m4 + pmulhrsw m4, m14 +%endif + packsswb m4, m4 + vpblendd m3, m4, 00010001b + psrldq m4, 4 +%if %3 + vpblendd m6, m6, m4, 00000001b +%else + vpblendd m6, m6, m4, 00010001b +%endif +%else + punpcklbw xm4, xm3 + punpcklbw xm7, xm6 + punpcklqdq xm4, xm7 +%if %1 + pmaddubsw xm4, xm9, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + vpblendd m3, m3, m4, 00000001b + psrldq xm4, 4 + vpblendd m6, m6, m4, 00000001b +%endif + + ; followed by v interpolation (top | cur -> cur) +%if %3 + vpermq m9, m3, q3120 + punpcklbw m6, m9 + pmaddubsw m6, m10, m6 +%if %1 + pmulhrsw m6, [pw_1024] +%else + pmulhrsw m6, m14 +%endif + packsswb m6, m6 + vpermq m6, m6, q3120 + vpblendd m3, m3, m6, 00001111b +%else + punpckhbw m9, m6, m3 + punpcklbw m6, m3 + pmaddubsw m9, m10, m9 + pmaddubsw m6, m10, m6 +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m6, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m6, m14 +%endif + packsswb m3, m6, m9 +%endif + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m8 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) +%if %2 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 +%if %2 + jg %%loop_y_h_overlap +%else + je %%end_y_hv_overlap + vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_hv_overlap + jmp %%loop_y_h_overlap +%endif + +%%end_y_hv_overlap: + add wq, 32>>%2 + jge %%end_hv + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + jmp %%loop_x_hv_overlap + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/film_grain_init_tmpl.c dav1d-0.9.1/src/x86/film_grain_init_tmpl.c --- dav1d-0.7.1/src/x86/film_grain_init_tmpl.c 2020-06-21 11:48:55.016126400 +0000 +++ dav1d-0.9.1/src/x86/film_grain_init_tmpl.c 2021-07-28 21:38:28.897852200 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
 *
@@ -28,50 +28,48 @@
 #include "src/cpu.h"
 #include "src/film_grain.h"
 
-decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
-decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
-decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
-decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
-decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
-decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
-decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
-decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
-
-decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
-decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
-decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2);
-decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
-decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
-decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
-decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
-decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ssse3));
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ssse3));
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ssse3));
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ssse3));
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ssse3));
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ssse3));
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ssse3));
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ssse3));
+
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, avx2));
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, avx2));
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, avx2));
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, avx2));
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, avx2));
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, avx2));
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, avx2));
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, avx2));
 
 COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
-#if BITDEPTH == 8
-    c->generate_grain_y = dav1d_generate_grain_y_ssse3;
-    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
-    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3;
-    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
-    c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
-#endif
+    c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
+    c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
 
+#if ARCH_X86_64
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
-#if BITDEPTH == 8 && ARCH_X86_64
-    c->generate_grain_y = dav1d_generate_grain_y_avx2;
-    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
-    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2;
-    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
-    c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
-    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
+    c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
+    c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
 #endif
 }
diff -Nru dav1d-0.7.1/src/x86/film_grain_sse.asm dav1d-0.9.1/src/x86/film_grain_sse.asm
--- dav1d-0.7.1/src/x86/film_grain_sse.asm 1970-01-01 00:00:00.000000000 +0000
+++ dav1d-0.9.1/src/x86/film_grain_sse.asm 2021-07-28 21:38:28.897852200 +0000
@@ -0,0 +1,3262 @@
+; Copyright © 2019-2021, VideoLAN and dav1d authors
+; Copyright © 2019, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
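The inline comments in these loops, both in the AVX2 code above and in the SSSE3 file that follows, describe one scalar model: grain is first synthesised by auto-regressively filtering values drawn from gaussian_sequence, then applied per pixel as noise = round2(scaling[src] * grain, scaling_shift) and clipped. The C below is only an illustrative sketch of that model for the 8-bit path; the helper names (round2, ar1_px, fgy_apply_px, fguv_scaling_index) are hypothetical and are not dav1d API, and the AR example covers only the lag-1 (.ar1) case.

#include <stdint.h>

/* round2(x, s): shift right by s with rounding to nearest. */
static inline int round2(const int x, const int shift) {
    return (x + ((1 << shift) >> 1)) >> shift;
}

/* Grain synthesis, lag-1 case (.ar1): a gaussian_sequence sample already in
 * the buffer is refined from its previously generated neighbours and clamped
 * to the signed 8-bit grain range. cf[0..3] correspond to ar_coeffs_y[0..3]. */
static inline int8_t ar1_px(const int8_t noise,
                            const int8_t tl, const int8_t t, const int8_t tr,
                            const int8_t l, const int8_t cf[4],
                            const int ar_coeff_shift)
{
    const int sum = cf[0] * tl + cf[1] * t + cf[2] * tr + cf[3] * l;
    int g = noise + round2(sum, ar_coeff_shift);
    if (g < -128) g = -128;
    if (g >  127) g =  127;
    return (int8_t)g;
}

/* Grain application for one luma pixel (fgy_32x32xn):
 *   noise = round2(scaling[src] * grain, scaling_shift)
 *   dst   = clip(src + noise) to [0, 255], or to the restricted range taken
 *           from the min/max tables when clip_to_restricted_range is set. */
static inline uint8_t fgy_apply_px(const uint8_t src, const int8_t grain,
                                   const uint8_t scaling[256],
                                   const int scaling_shift,
                                   const int clip_min, const int clip_max)
{
    const int noise = round2(scaling[src] * grain, scaling_shift);
    int dst = src + noise;
    if (dst < clip_min) dst = clip_min;
    if (dst > clip_max) dst = clip_max;
    return (uint8_t)dst;
}

/* Chroma (fguv, chroma_scaling_from_luma not set): the value indexing the
 * scaling LUT mixes the subsample-averaged luma with the chroma sample via
 * uv_luma_mult/uv_mult and uv_offset; the pack+unpack in the asm is the clip. */
static inline int fguv_scaling_index(const int luma_avg, const int chroma,
                                     const int uv_luma_mult, const int uv_mult,
                                     const int uv_offset)
{
    int val = ((luma_avg * uv_luma_mult + chroma * uv_mult) >> 6) + uv_offset;
    if (val <   0) val = 0;
    if (val > 255) val = 255;
    return val;
}

In the vectorised code the round2() step corresponds to pmulhrsw against the mul_bits constants, the scaling[] lookups are vpgatherdd gathers on AVX2 (or the vpgatherdw emulation macro on SSSE3), and the final clip is the pmaxsw/pminsw pair against the min/max tables.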
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +pw_1024: times 8 dw 1024 +pb_27_17_17_27: db 27, 17, 17, 27 + times 6 db 0, 32 +pb_23_22_h: db 23, 22 + times 7 db 0, 32 +pb_27_17: times 8 db 27, 17 +pb_17_27: times 8 db 17, 27 +pb_23_22: times 8 db 23, 22 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +max: dw 255, 240, 235 +min: dw 0, 16 +pw_1: dw 1 + +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%if ARCH_X86_32 +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r2d, [fg_dataq+FGData.grain_scale_shift] + movd m2, [base+round+r2*2] + movd m0, [fg_dataq+FGData.seed] + mova m5, [base+pb_mask] + pshuflw m2, m2, q0000 + pshuflw m0, m0, q0000 + mov r2, -73*82 + sub bufq, r2 + lea r3, [base+gaussian_sequence] +.loop: + pand m6, m0, m1 + psrlw m3, m6, 10 + por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m6, m4 ; bits 0x0f00 are set + pshufb m3, m5, m6 ; set 15th bit for next 4 seeds + psllq m6, m3, 30 + por m3, m6 + psllq m6, m3, 15 + por m3, m6 ; aggregate each bit into next seed's high bit + pmulhuw m6, m0, m7 + por m3, m6 ; 4 next output seeds + pshuflw m0, m3, q3333 + psrlw m3, 5 +%if ARCH_X86_64 + movq r6, m3 + mov r8, r6 + movzx r5d, r6w + shr r6d, 16 + shr r8, 32 + movzx r7, r8w + shr r8, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + pinsrw m6, [r3+r7*2], 2 + pinsrw m6, [r3+r8*2], 3 +%else + movd r6, m3 + pshuflw m3, m3, q3232 + movzx r5, r6w + shr r6, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + + movd r6, m3 + movzx r5, r6w + shr r6, 16 + + pinsrw m6, [r3+r5*2], 2 + pinsrw m6, [r3+r6*2], 3 +%endif + pmulhrsw m6, m2 + packsswb m6, m6 + movd [bufq+r2], m6 + add r2, 4 + jl .loop + + ; auto-regression code + movsxd r2, [fg_dataq+FGData.ar_coeff_lag] + movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] + lea r2, 
[r2+base+generate_grain_y_8bpc_ssse3_table] + jmp r2 + +.ar1: +%if ARCH_X86_32 + DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max +%elif WIN64 + DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 + mov bufq, r0 +%else + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov ecx, [fg_dataq+FGData.ar_coeff_shift] +%if ARCH_X86_32 + mov r1m, cf3d + DEFINE_ARGS buf, shift, val3, min, max, x, val0 +%define hd r0mp +%define cf3d r1mp +%elif WIN64 + DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 +%else + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 +%endif + pxor m6, m6 + pcmpgtb m7, m6, m4 + punpcklbw m4, m7 + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pshuflw m3, m3, q0000 + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + movq m0, [bufq+xq-82-1] ; top/left + pcmpgtb m7, m6, m0 + punpcklbw m0, m7 + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend+1] + SCRATCH 7, 15, 7 + movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pxor m7, m7 + pshuflw m6, m6, q0000 + punpcklwd m6, m7 + pcmpgtb m4, m7, m0 + pcmpgtb m5, m7, m1 + punpcklbw m0, m4 + punpcklbw m1, m5 + DEFINE_ARGS buf, fg_data, h, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m7, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m6, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m6, m1 + psrldq m5, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m5, m1 + pmaddwd m4, m9 + pmaddwd m6, m10 + pmaddwd m5, m12 + paddd m4, m6 + paddd m2, m5 + paddd m2, m4 + paddd m2, m14 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pcmpgtb m4, m7, m0 + punpcklbw m1, m0, m4 + pmaddwd m3, m1, m13 + paddd m3, m2 + psrldq m1, 4 ; y=0,x=0 + psrldq m2, 4 ; shift top to next 
pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw m3, m1 + packsswb m3, m3 + pslldq m3, 2 + pand m3, m15 + pandn m1, m15, m0 + por m0, m1, m3 + psrldq m0, 1 + ; overwrite 2 pixels, but that's ok + movd [bufq+xq-1], m0 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if ARCH_X86_32 +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 +%elif WIN64 + SUB rsp, 16*6 +%assign stack_size_padded (stack_size_padded+16*6) +%assign stack_size (stack_size+16*6) +%else + ALLOC_STACK -16*6 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend] + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pxor m3, m3 + pcmpgtb m4, m3, m0 + pcmpgtb m3, m2 + pshuflw m6, m6, q0000 + SCRATCH 6, 14, 12 + SCRATCH 7, 15, 13 + punpckhbw m1, m0, m4 + punpcklbw m0, m4 + punpcklbw m2, m3 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m5, m0, q3333 + pshufd m0, m0, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m3 + mova [rsp+ 2*16], m4 + mova [rsp+ 3*16], m5 + pshufd m6, m1, q1111 + pshufd m7, m1, q2222 + pshufd m5, m1, q3333 + pshufd m1, m1, q0000 + pshufd m3, m2, q1111 + psrldq m0, m2, 10 + pinsrw m2, [base+pw_1], 5 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + pinsrw m0, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m6 + SCRATCH 7, 8, 6 + SCRATCH 5, 9, 7 + SCRATCH 2, 10, 8 + SCRATCH 3, 11, 9 + SCRATCH 4, 12, 10 + SCRATCH 0, 13, 11 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m2, m0, m3 + punpcklbw m0, m3 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m5, m5 + pcmpgtb m5, m1 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + palignr m6, m2, m0, 10 + palignr m7, m2, m0, 12 + psrldq m0, 8 + punpcklwd m0, m6 + punpcklwd m7, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m7, [rsp+ 3*16] + paddd m0, m7 + paddd m0, m4 + + psrldq m4, m1, 2 + psrldq m5, m1, 4 + psrldq m6, m1, 6 + psrldq m7, m1, 8 + punpcklwd m4, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 4*16] + pmaddwd m6, [rsp+ 5*16] + paddd m4, m6 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m5, m2, m7 + punpcklbw m2, m7 + palignr m7, m3, m1, 10 + palignr m3, m1, 12 + psrldq m1, m2, 2 + punpcklwd m7, m3 + punpcklwd m3, m2, m1 + pmaddwd m7, m8 + pmaddwd m3, m9 + paddd m7, m3 + paddd m0, m7 + + psrldq m6, m2, 4 + psrldq m1, m2, 6 + psrldq m3, m2, 8 + palignr m4, m5, m2, 10 + palignr m5, m5, m2, 12 + + punpcklwd m6, m1 + punpcklwd m3, m4 + punpcklwd m5, m14 + pmaddwd m6, m10 + pmaddwd m3, m11 + pmaddwd m5, m12 + paddd m0, m6 + paddd m3, m5 + paddd m0, m3 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pxor m5, m5 + pcmpgtb m5, m1 + punpcklbw m2, m1, m5 + pmaddwd m2, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb m2, m2 + pslldq m2, 3 + pand m2, m15 + pandn 
m3, m15, m1 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv + movifnidn r2, r2mp + movifnidn r3, r3mp + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movd m6, [base+round+r5*2] + mova m5, [base+pb_mask] + movd m0, [fg_dataq+FGData.seed] + movd m2, [base+pw_seed_xor+uvq*4] + pxor m0, m2 + pshuflw m6, m6, q0000 + pshuflw m0, m0, q0000 + lea r6, [base+gaussian_sequence] +%if %2 +%if ARCH_X86_64 + mov r7d, 73-35*%3 +%else + mov r3mp, 73-35*%3 +%endif + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -82*73 + sub bufq, r5 +.loop: +%endif + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m3, m2 + psllq m2, m3, 15 + por m3, m2 ; aggregate each bit into next seed's high bit + pmulhuw m2, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + movd r9d, m2 + pshuflw m2, m2, q3232 + movzx r8, r9w + shr r9, 16 + + movd m3, [r6+r8*2] + pinsrw m3, [r6+r9*2], 1 + + movd r9d, m2 + movzx r8, r9w + shr r9, 16 + + pinsrw m3, [r6+r8*2], 2 + pinsrw m3, [r6+r9*2], 3 +%else + movd r2, m2 + pshuflw m2, m2, q3232 + movzx r1, r2w + shr r2, 16 + + movd m3, [r6+r1*2] + pinsrw m3, [r6+r2*2], 1 + + movd r2, m2 + movzx r1, r2w + shr r2, 16 + + pinsrw m3, [r6+r1*2], 2 + pinsrw m3, [r6+r2*2], 3 +%endif + pmulhrsw m3, m6 + packsswb m3, m3 + movd [bufq+r5], m3 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 +%if ARCH_X86_64 + dec r7d +%else + dec r3mp +%endif + jg .loop_y +%else + jl .loop +%endif + +%if ARCH_X86_32 + mov r2, r2mp +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] + jmp r5 + +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -2*16 +%endif + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd m4, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h, x + pxor m0, m0 + pcmpgtb m0, m5 + punpcklbw m5, m0 + movd m7, [base+pb_1] +%if %2 + movd m6, [base+hmul_bits+2+%3*2] +%endif + pshuflw m5, m5, q0000 + pshuflw m4, m4, q0000 + pshufd m7, m7, q0000 +%if %2 + pshuflw m6, m6, q0000 +%endif + punpcklqdq m5, m5 + punpcklqdq m4, m4 +%if %2 + punpcklqdq m6, m6 +%endif + pcmpeqw m1, m1 + pslldq m1, 12>>%2 + SCRATCH 1, 8, 0 + SCRATCH 4, 9, 1 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: + xor xd, xd +.x_loop_ar0: + ; first 32 pixels +%if %2 + movu m1, [bufyq+xq*2] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + movu m3, [bufyq+xq*2+16] +%if %3 + movu m4, [bufyq+xq*2+82+16] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 +%endif + pmaddubsw m2, m7, m3 +%if %3 + pmaddubsw m3, m7, m4 + paddw m0, m1 + paddw m2, m3 +%endif + pmulhrsw m0, m6 + pmulhrsw m2, m6 +%else + movu 
m0, [bufyq+xq] + pxor m6, m6 + pcmpgtb m6, m0 + punpckhbw m2, m0, m6 + punpcklbw m0, m6 +%endif + pmullw m0, m5 + pmullw m2, m5 + pmulhrsw m0, m9 + pmulhrsw m2, m9 + movu m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpckhbw m3, m1, m4 +%if %2 + punpcklbw m1, m4 + paddw m2, m3 + paddw m0, m1 +%else + punpcklbw m6, m1, m4 + paddw m2, m3 + paddw m0, m6 +%endif + packsswb m0, m2 +%if %2 + movu [bufq+xq], m0 + add xd, 16 + cmp xd, 32 + jl .x_loop_ar0 + + ; last 6/12 pixels + movu m1, [bufyq+xq*(1+%2)] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 + paddw m0, m1 +%endif + pmulhrsw m0, m6 + pmullw m0, m5 + pmulhrsw m0, m9 + movq m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + paddw m0, m2 + packsswb m0, m0 + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movq [bufq+xq], m2 +%else + add xd, 16 + cmp xd, 80 + je .y_loop_final_ar0 + movu [bufq+xq-16], m0 + jmp .x_loop_ar0 +.y_loop_final_ar0: + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movu [bufq+xq-16], m2 +%endif + + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] + pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 +%if ARCH_X86_32 + mov r3mp, cf3d + DEFINE_ARGS buf, shift, fg_data, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x + mov bufq, r0 +%else + DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + movd m7, [base+pb_1] + movd m6, [base+hmul_bits+2+%3*2] +%endif + psrldq m4, 1 +%if ARCH_X86_32 + DEFINE_ARGS buf, shift, val0, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 +%else + DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 +%endif + pxor m5, m5 + punpcklwd m3, m5 +%if %2 + punpcklwd m6, m6 +%endif + pcmpgtb m5, m4 + punpcklbw m4, m5 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + pshufd m3, m3, q0000 +%if %2 + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif +%if ARCH_X86_32 + add r1mp, 79+82*3 + mov r0mp, 70-35*%3 +%else + add bufyq, 79+82*3 + mov hd, 70-35*%3 +%endif + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: +%if %2 +%if ARCH_X86_32 + mov r2, r1mp + movq m0, [r2+xq*2] +%if %3 + movq m1, [r2+xq*2+82] +%endif +%else + movq m0, [bufyq+xq*2] +%if %3 + movq m1, [bufyq+xq*2+82] +%endif +%endif + pmaddubsw m2, m7, m0 +%if %3 + pmaddubsw m0, m7, m1 + paddw m2, m0 +%endif + pmulhrsw m2, m6 +%else +%if ARCH_X86_32 + mov r2, r1mp + movd m2, [r2+xq] +%else + movd m2, [bufyq+xq] +%endif + pxor m0, m0 + pcmpgtb m0, m2 + punpcklbw m2, m0 +%endif + + movq m0, [bufq+xq-82-1] ; top/left + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 + psrldq m1, m0, 4 ; top/right + punpcklwd m1, m2 + psrldq m2, m0, 2 ; top + punpcklwd m0, m2 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 +%if ARCH_X86_32 + imul val3d, r3mp +%else + imul val3d, cf3d +%endif + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + 
cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 +%if ARCH_X86_32 + add r1mp, 82<<%3 + dec r0mp +%else + add bufyq, 82<<%3 + dec hd +%endif + jg .y_loop_ar1 + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp + ALLOC_STACK -8*16 +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + movd m7, [base+round_vals-12+shiftq*2] + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 + pxor m2, m2 + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + pinsrw m1, [base+pw_1], 5 + punpcklwd m7, m7 + pshufd m7, m7, q0000 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + SCRATCH 7, 15, 7 +%if %2 + movd m7, [base+hmul_bits+2+%3*2] + movd m6, [base+pb_1] + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pxor m2, m2 + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m0, m1 + psrldq m3, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + pmaddwd m4, m9 + pmaddwd m0, m10 + pmaddwd m3, m12 + paddd m4, m0 + paddd m2, m3 + paddd m2, m4 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m6, m1 +%if %3 + pmaddubsw m1, m6, m3 + paddw m0, m1 +%endif + pmulhrsw m0, m7 +%else + movd m0, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 +%endif + punpcklwd m0, m15 + pmaddwd m0, m14 + paddd m2, m0 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] + pxor m4, m4 + movd m5, [base+byte_blend+1] + punpcklbw m5, m5 +.x_loop_ar2_inner: + pcmpgtb m1, m4, m0 + punpcklbw m0, m1 + pmaddwd m3, m0, m13 + paddd m3, m2 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + pslldq m3, 4 + pand m3, m5 + paddw m0, m3 + packsswb m0, m0 + movd [bufq+xq-2], m0 + psrldq m0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 + ALLOC_STACK -15*16 +%else + SUB rsp, 16*7 +%assign stack_size_padded (stack_size_padded+16*7) +%assign stack_size (stack_size+16*7) +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 
+ + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + pshufd m2, m0, q1111 + pshufd m3, m0, q2222 + pshufd m4, m0, q3333 + pshufd m0, m0, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m7, m1, q3333 + pshufd m1, m1, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m2 + mova [rsp+ 2*16], m3 + mova [rsp+ 3*16], m4 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m5 + mova [rsp+ 6*16], m6 + SCRATCH 7, 8, 7 + + movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] + pxor m4, m4 + pcmpgtb m4, m2 + punpckhbw m5, m2, m4 + punpcklbw m2, m4 + pshufd m4, m2, q3232 + punpcklwd m3, m4, m5 + pshuflw m5, m4, q3321 + pshufd m4, m3, q0000 + pshufd m3, m2, q1111 + pshufd m2, m2, q0000 + pinsrw m5, [base+round_vals+shiftq*2-10], 3 + SCRATCH 2, 9, 8 + SCRATCH 3, 10, 9 + SCRATCH 4, 11, 10 + SCRATCH 5, 12, 11 + + movd m2, [base+round_vals-12+shiftq*2] +%if %2 + movd m1, [base+pb_1] + movd m3, [base+hmul_bits+2+%3*2] +%endif + pxor m0, m0 + punpcklwd m2, m0 +%if %2 + punpcklwd m3, m3 +%endif + pshufd m2, m2, q0000 +%if %2 + pshufd m1, m1, q0000 + pshufd m3, m3, q0000 + SCRATCH 1, 13, 12 +%endif + SCRATCH 2, 14, 13 +%if %2 + SCRATCH 3, 15, 14 +%endif + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m4, m4 + pcmpgtb m4, m0 + punpckhbw m3, m0, m4 + punpcklbw m0, m4 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + palignr m2, m3, m0, 10 + palignr m3, m0, 12 + psrldq m0, 8 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m6, m6 + pcmpgtb m6, m1 + punpckhbw m5, m1, m6 + punpcklbw m1, m6 + + punpcklwd m0, m2 + punpcklwd m3, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m3, [rsp+ 3*16] + paddd m0, m3 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m6, m2, m7 + punpcklbw m2, m7 + + palignr m3, m5, m1, 10 + palignr m5, m1, 12 + psrldq m4, m2, 2 + + punpcklwd m3, m5 + punpcklwd m5, m2, m4 + pmaddwd m3, [rsp+ 6*16] + pmaddwd m5, m8 + paddd m3, m5 + paddd m0, m3 + + psrldq m3, m1, 2 + psrldq m4, m1, 4 + psrldq m5, m1, 6 + psrldq m1, 8 + + punpcklwd m3, m4 + punpcklwd m5, m1 + pmaddwd m3, [rsp+ 4*16] + pmaddwd m5, [rsp+ 5*16] + paddd m3, m5 + paddd m0, m3 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m7, m13, m1 +%if %3 + pmaddubsw m5, m13, m3 + paddw m7, m5 +%endif + pmulhrsw m7, m15 +%else + movd m7, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m7 + punpcklbw m7, m1 +%endif + + psrldq m1, m2, 4 + psrldq m3, m2, 6 + palignr m4, m6, m2, 10 + palignr m6, m2, 12 + psrldq m2, 8 + + punpcklwd m1, m3 + punpcklwd m2, m4 + punpcklwd m6, m7 + pmaddwd m1, m9 + pmaddwd m2, m10 + pmaddwd m6, m11 + paddd m1, m2 + paddd m0, m6 + paddd m0, m1 + paddd m0, m14 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] + pxor m4, m4 + movd m5, [base+byte_blend] +.x_loop_ar3_inner: + pcmpgtb m2, m4, m1 + punpcklbw m3, m1, m2 + pmaddwd m2, m3, m12 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + packsswb m2, m2 + pandn m3, m5, m1 + pslld m2, 24 + pand m2, m5 + por m1, m2, m3 + movd [bufq+xq-3], 
m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 6 +%define %%tmp %6 +%endif +%rep 4 +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4] +%else + pinsrw %1, [%3+%4], %%idx + 0 +%endif + pinsrw %1, [%3+%5], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +INIT_XMM ssse3 +; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + + mov [rsp+5*mmsize+ 4*gprsize], r0 + mov [rsp+5*mmsize+ 6*gprsize], r1 + mov [rsp+5*mmsize+ 8*gprsize], r2 + mov [rsp+5*mmsize+10*gprsize], r3 + mov [rsp+5*mmsize+11*gprsize], r4 + mov [rsp+5*mmsize+12*gprsize], r5 +%else +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+5*mmsize+ 4*gprsize] +%define r1m [rsp+5*mmsize+ 5*gprsize] +%define r2m [rsp+5*mmsize+ 6*gprsize] +%define r3m [rsp+5*mmsize+ 7*gprsize] +%define r4m [rsp+5*mmsize+ 8*gprsize] +%define r5m [rsp+5*mmsize+ 9*gprsize] +%define r6m [rsp+5*mmsize+10*gprsize] +%define r7m [rsp+5*mmsize+11*gprsize] +%define r8m [rsp+5*mmsize+12*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, picptrq +%else +cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r7, [pb_mask] +%define base r7-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + movd m4, [base+max+r6*4] + movd m5, [base+min+r6*2] + punpcklwd m3, m3 + punpcklwd m4, m4 + punpcklwd m5, m5 + pshufd m3, m3, q0000 + pshufd m4, m4, q0000 + pshufd m5, m5, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz .no_vertical_overlap + mova m6, [base+pw_1024] + mova m7, [base+pb_27_17_17_27] + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 + test sbyd, sbyd + jnz .vertical_overlap + ; fall-through + +.no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, 
picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused +%endif + +.loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .next_blk + + add offxyd, 16 + test dword r8m, 2 ; r8m & 2 = have_top_overlap + jz .loop_x_odd + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jnz .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 + + add offxyd, 16 ; left_offxyd + mov [rsp+5*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + mov offyd, seed + mov offxd, seed +%endif + ror 
offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+0*gprsize] + movd m7, [grain_lutq+r5] +%else + movd m7, [grain_lutq+left_offxyq] +%endif + punpcklbw m7, m3 + pmaddubsw m6, m15, m7 + pmulhrsw m6, m14 + packsswb m6, m6 + shufps m6, m3, q3210 + pcmpgtb m2, m6 + punpcklbw m7, m6, m2 + punpckhbw m6, m2 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m7, m4 + pmullw m6, m5 + pmulhrsw m7, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m7 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 + + ; since this half-block had left-overlap, the next does not + test dword r8m, 2 ; have_top_overlap + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed, + ; because of the 'and tmpd, 0x00ff00ff' above + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 
16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+5*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 +%else + mova m8, [pb_27_17] +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] + movu m7, [grain_lutq+r5] +%else + movu m7, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m7, m3 + punpcklbw m7, m3 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m3, [r5], m6 + pmaddubsw m6, [r5], m7 +%else + pmaddubsw m3, m8, m6 + pmaddubsw m6, m8, m7 +%endif + pmulhrsw m3, m14 + pmulhrsw m6, m14 + packsswb m6, m3 + pcmpgtb m7, m2, m6 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m6, m5 + pmulhrsw m2, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 + + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak + + mov r5, [rsp+5*mmsize+1*gprsize] + mov r4, offxyd + add r5, 16 + add r4, 16 + mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy + mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak + + xor tmpd, tmpd + mov seed, r3m +%else + mova m8, [pb_27_17] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 + + ; we assume 
from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut + + movzx r5, offxyw ; top_offxy + mov [rsp+5*mmsize+1*gprsize], r5 +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw +%endif + shr offxyd, 16 + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy + mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy + movu m6, [grain_lutq+r5] + mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy + movd m4, [grain_lutq+r0] + movd m7, [grain_lutq+r5] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] + movd m7, [grain_lutq+topleft_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m4, m3 + punpcklbw m7, m6 + pmaddubsw m2, m15, m4 + pmaddubsw m4, m15, m7 + pmulhrsw m2, m14 + pmulhrsw m4, m14 + packsswb m2, m2 + packsswb m4, m4 + shufps m2, m3, q3210 + shufps m4, m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m3, m4, m2 + punpckhbw m4, m2 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m7, [r5], m4 + pmaddubsw m4, [r5], m3 +%else + pmaddubsw m7, m8, m4 + pmaddubsw m4, m8, m3 +%endif + pmulhrsw m7, m14 + pmulhrsw m4, m14 + packsswb m4, m7 + pxor m2, m2 + pcmpgtb m7, m2, m4 + punpcklbw m3, m4, m7 + punpckhbw m4, m7 + + ; src + mova m0, [srcq] + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m5, m0, scalingq-1, r0, r5, m7 + vpgatherdw m6, m1, scalingq-1, r0, r5, m7 +%else + vpgatherdw m5, m0, scalingq-1, r13, r14, m7 + vpgatherdw m6, m1, scalingq-1, r13, r14, m7 +%endif + REPX {psrlw x, 8}, m5, m6 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m3, m5 + pmullw m4, m6 + pmulhrsw m3, m11 + pmulhrsw m4, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + 
lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, +; sby, luma, lstride, uv_pl, is_id) +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov [rsp+7*mmsize+3*gprsize], r0 + mov [rsp+7*mmsize+5*gprsize], r1 + mov [rsp+7*mmsize+7*gprsize], r2 + mov [rsp+7*mmsize+9*gprsize], r3 + mov [rsp+7*mmsize+10*gprsize], r4 + + mov r0, r8m + mov r1, r9m + mov r2, r10m + mov r4, r11m + mov r3, r12m + mov [rsp+7*mmsize+11*gprsize], r0 + mov [rsp+7*mmsize+12*gprsize], r1 + mov [rsp+7*mmsize+13*gprsize], r2 + mov [rsp+7*mmsize+14*gprsize], r4 +%else +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+7*mmsize+ 3*gprsize] +%define r1m [rsp+7*mmsize+ 4*gprsize] +%define r2m [rsp+7*mmsize+ 5*gprsize] +%define r3m [rsp+7*mmsize+ 6*gprsize] +%define r4m [rsp+7*mmsize+ 7*gprsize] +%define r5m [rsp+7*mmsize+ 8*gprsize] +%define r6m [rsp+7*mmsize+ 9*gprsize] +%define r7m [rsp+7*mmsize+10*gprsize] +%define r8m [rsp+7*mmsize+11*gprsize] +%define r9m [rsp+7*mmsize+12*gprsize] +%define r10m [rsp+7*mmsize+13*gprsize] +%define r11m [rsp+7*mmsize+14*gprsize] +%define r12m [rsp+7*mmsize+15*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, r5 +%else +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, tmp, sby, luma, lstride, uv_pl, is_id + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + lea tmpd, [r6d*2] +%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize + test r3, r3 +%else + cmp dword r12m, 0 ; is_idm +%endif + movd m5, [base+min+r6*2] + cmovne r6d, tmpd + movd m4, [base+max+r6*2] + punpcklwd m3, m3 + punpcklwd m5, m5 + punpcklwd m4, m4 + pshufd m3, m3, q0000 + pshufd m5, m5, q0000 + pshufd m4, m4, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + +%if %1 + mov r6d, dword r11m + movd m0, [fg_dataq+FGData.uv_mult+r6*4] + movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklbw m6, m1, m0 + movd m7, [fg_dataq+FGData.uv_offset+r6*4] + punpcklwd m6, m6 + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz %%no_vertical_overlap +%if ARCH_X86_32 +%if %2 + mova m1, [base+pb_23_22_h] +%else + mova m1, [base+pb_27_17_17_27] +%endif + mova m0, [base+pw_1024] +%else +%if %2 + mova m1, [pb_23_22_h] +%else + mova m1, [pb_27_17_17_27] 
+%endif + mova m0, [pw_1024] +%endif + SCRATCH 0, 8, 5 + SCRATCH 1, 9, 6 + test sbyd, sbyd + jnz %%vertical_overlap + ; fall-through + +%%no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak +%define luma_bakq lumaq + + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride, luma_bak +%endif + +%%loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq+ 0] + pcmpgtb m6, m2, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw 
m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; we already incremented lumaq above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + ; adjust top_offxy +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jc %%loop_x_even + test dword r8m, 2 + jz %%loop_x_odd + jmp %%loop_x_odd_v_overlap +%%loop_x_even: +%endif + test dword r8m, 1 + jz %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 +%if %2 + lea r6, [offxyd+16] + mov [rsp+7*mmsize+0*gprsize], r6 +%else + mov [rsp+7*mmsize+0*gprsize], offxyd +%endif + + DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + +%if %2 + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%else + mov left_offxyd, offyd +%endif +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack 
chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq+ 0] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+0*gprsize] + movd m2, [grain_lutq+r0+ 0] +%else + movd m2, [grain_lutq+left_offxyq+ 0] +%endif + punpcklbw m2, m4 + pmaddubsw m3, m9, m2 + pmulhrsw m3, m8 + packsswb m3, m3 + shufps m3, m4, q3210 + pxor m4, m4 + pcmpgtb m4, m3 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + xor dword r8m, 4 + ; adjust top_offxyd +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 +%endif + + ; r8m = sbym + test dword r8m, 2 +%if %2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + jne %%loop_x_odd_v_overlap + jmp %%loop_x_odd +%endif + +%%end: + RET + +%%vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor tmpd, tmpd +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; 
updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+7*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%endif + +%%loop_x_odd_v_overlap: + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %3 + mova m1, [PIC_ptr(pb_23_22)] +%else + mova m1, [PIC_ptr(pb_27_17)] +%endif +%%loop_y_v_overlap: +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+1*gprsize] + movu m4, [grain_lutq+r0] +%else + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, m1, m6 + pmaddubsw m3, m1, m4 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + packsswb m3, m2 + pxor m6, m6 + pcmpgtb m6, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; unpack chroma_source + pxor m4, m4 + punpckhbw m6, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m6, m3 + pmaxsw m0, m13 + pmaxsw m6, m13 + pminsw m0, m12 + pminsw m6, m12 + packuswb m0, m6 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + dec hw + je %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 +%if %3 == 0 + btc hd, 16 +%if ARCH_X86_32 + mov r5, r5m +%endif + mova m1, [PIC_ptr(pb_17_27)] + jnc %%loop_y_v_overlap +%endif + 
jmp %%loop_y + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif + +%if %2 + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jnc %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused + + mov r6, [rsp+7*mmsize+1*gprsize] +%if %2 + lea r0, [r3d+16] + add r6, 16 + mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy +%else + mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy +%endif + mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused + + mov seed, r3m + xor tmpd, tmpd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + +%if %2 + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offxyq+16] +%else + mov topleft_offxyq, top_offxyq + mov left_offxyq, offxyq +%endif + + ; we assume from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+7*mmsize+1*gprsize], top_offxyd +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %3 + mova m3, [PIC_ptr(pb_23_22)] +%else + mova m3, [PIC_ptr(pb_27_17)] +%endif +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy + movd m1, [grain_lutq+r0] + mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy +%else + movd m1, [grain_lutq+topleft_offxyq] +%endif + movu m2, [grain_lutq+offxyq] +%if ARCH_X86_32 + movu m6, [grain_lutq+r5] + movd m4, [grain_lutq+r0] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m1, m6 + punpcklbw m4, m2 + pmaddubsw m0, m9, m1 + pmaddubsw m1, m9, m4 + REPX {pmulhrsw x, m8}, m0, m1 + packsswb m0, m1 + shufps m4, m0, m2, q3232 + shufps m0, 
m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m2, m0, m4 + punpckhbw m0, m4 + pmaddubsw m4, m3, m0 + pmaddubsw m1, m3, m2 + pmulhrsw m4, m8 + pmulhrsw m1, m8 + packsswb m1, m4 + + ; src +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else +%if %3 + vpgatherdw m7, m4, scalingq-1, r2, r12 + vpgatherdw m5, m6, scalingq-1, r2, r12 +%else + vpgatherdw m7, m4, scalingq-1, r2, r13 + vpgatherdw m5, m6, scalingq-1, r2, r13 +%endif +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack grain + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + punpckhbw m1, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m1, m5 + pmulhrsw m2, m11 + pmulhrsw m1, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; unpack chroma source + pxor m4, m4 + punpckhbw m5, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m5, m1 + pmaxsw m0, m13 + pmaxsw m5, m13 + pminsw m0, m12 + pminsw m5, m12 + packuswb m0, m5 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has been adjusted above already +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*(1+%2)] +%else + add lumaq, r10mp +%endif +%endif + add grain_lutq, 82 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + mova m3, [PIC_ptr(pb_17_27)] + btc hd, 16 + jnc %%loop_y_hv_overlap +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif + jmp %%loop_y_h_overlap +%%end_y_hv_overlap: +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif +%endif + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + xor dword r8m, 4 + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 422, 1, 0 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 444, 0, 0 diff -Nru dav1d-0.7.1/src/x86/film_grain_ssse3.asm 
dav1d-0.9.1/src/x86/film_grain_ssse3.asm --- dav1d-0.7.1/src/x86/film_grain_ssse3.asm 2020-06-21 11:48:55.016126400 +0000 +++ dav1d-0.9.1/src/x86/film_grain_ssse3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,3300 +0,0 @@ -; Copyright © 2019, VideoLAN and dav1d authors -; Copyright © 2019, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "ext/x86/x86inc.asm" - -SECTION_RODATA - -pw_1024: times 8 dw 1024 -pb_27_17: times 8 db 27, 17 -pb_17_27: times 8 db 17, 27 -pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 -rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 -byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 -pw_seed_xor: times 2 dw 0xb524 - times 2 dw 0x49d8 -pb_23_22: times 2 db 23, 22 -pb_1: times 4 db 1 -hmul_bits: dw 32768, 16384, 8192, 4096 -round: dw 2048, 1024, 512 -mul_bits: dw 256, 128, 64, 32, 16 -round_vals: dw 32, 64, 128, 256, 512 -max: dw 255, 240, 235 -min: dw 0, 16 -pw_1: dw 1 - -%define pb_27_17_17_27 pb_17_27 - 2 - -%macro JMP_TABLE 1-* - %xdefine %1_table %%table - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) - %%table: - %rep %0 - 1 - dd %%prefix %+ .ar%2 - %%base - %rotate 1 - %endrep -%endmacro - -JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3 -JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3 - -struc FGData - .seed: resd 1 - .num_y_points: resd 1 - .y_points: resb 14 * 2 - .chroma_scaling_from_luma: resd 1 - .num_uv_points: resd 2 - .uv_points: resb 2 * 10 * 2 - .scaling_shift: resd 1 - .ar_coeff_lag: resd 1 - .ar_coeffs_y: resb 24 - .ar_coeffs_uv: resb 2 * 28 ; includes padding - .ar_coeff_shift: resq 1 - .grain_scale_shift: resd 1 - .uv_mult: resd 2 - .uv_luma_mult: resd 2 - .uv_offset: resd 2 - .overlap_flag: resd 1 - .clip_to_restricted_range: resd 1 -endstruc - -cextern gaussian_sequence - -SECTION .text - -%macro SCRATCH 3 -%if ARCH_X86_32 - mova [rsp+%3*mmsize], m%1 -%define m%2 [rsp+%3*mmsize] -%else - SWAP %1, %2 -%endif -%endmacro - -INIT_XMM ssse3 -cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data - LEA r4, $$ -%define base r4-$$ - movq m1, [base+rnd_next_upperbit_mask] - movq m4, [base+mul_bits] - movq 
m7, [base+hmul_bits] - mov r2d, [fg_dataq+FGData.grain_scale_shift] - movd m2, [base+round+r2*2] - movd m0, [fg_dataq+FGData.seed] - mova m5, [base+pb_mask] - pshuflw m2, m2, q0000 - pshuflw m0, m0, q0000 - mov r2, -73*82 - sub bufq, r2 - lea r3, [base+gaussian_sequence] -.loop: - pand m6, m0, m1 - psrlw m3, m6, 10 - por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw m6, m4 ; bits 0x0f00 are set - pshufb m3, m5, m6 ; set 15th bit for next 4 seeds - psllq m6, m3, 30 - por m3, m6 - psllq m6, m3, 15 - por m3, m6 ; aggregate each bit into next seed's high bit - pmulhuw m6, m0, m7 - por m3, m6 ; 4 next output seeds - pshuflw m0, m3, q3333 - psrlw m3, 5 -%if ARCH_X86_64 - movq r6, m3 - mov r8, r6 - movzx r5d, r6w - shr r6d, 16 - shr r8, 32 - movzx r7, r8w - shr r8, 16 - - movd m6, [r3+r5*2] - pinsrw m6, [r3+r6*2], 1 - pinsrw m6, [r3+r7*2], 2 - pinsrw m6, [r3+r8*2], 3 -%else - movd r6, m3 - pshuflw m3, m3, q3232 - movzx r5, r6w - shr r6, 16 - - movd m6, [r3+r5*2] - pinsrw m6, [r3+r6*2], 1 - - movd r6, m3 - movzx r5, r6w - shr r6, 16 - - pinsrw m6, [r3+r5*2], 2 - pinsrw m6, [r3+r6*2], 3 -%endif - pmulhrsw m6, m2 - packsswb m6, m6 - movd [bufq+r2], m6 - add r2, 4 - jl .loop - - ; auto-regression code - movsxd r2, [fg_dataq+FGData.ar_coeff_lag] - movsxd r2, [base+generate_grain_y_ssse3_table+r2*4] - lea r2, [r2+base+generate_grain_y_ssse3_table] - jmp r2 - -.ar1: -%if ARCH_X86_32 - DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max -%elif WIN64 - DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 - mov bufq, r0 -%else - DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 -%endif - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] - movd m4, [fg_dataq+FGData.ar_coeffs_y] - mov ecx, [fg_dataq+FGData.ar_coeff_shift] -%if ARCH_X86_32 - mov r1m, cf3d - DEFINE_ARGS buf, shift, val3, min, max, x, val0 -%define hd r0mp -%define cf3d r1mp -%elif WIN64 - DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 -%else - DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 -%endif - pxor m6, m6 - pcmpgtb m7, m6, m4 - punpcklbw m4, m7 - pinsrw m4, [base+pw_1], 3 - pshufd m5, m4, q1111 - pshufd m4, m4, q0000 - movd m3, [base+round_vals+shiftq*2-12] ; rnd - pshuflw m3, m3, q0000 - sub bufq, 82*73-(82*3+79) - mov hd, 70 - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -76 - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: - movq m0, [bufq+xq-82-1] ; top/left - pcmpgtb m7, m6, m0 - punpcklbw m0, m7 - psrldq m2, m0, 2 ; top - psrldq m1, m0, 4 ; top/right - punpcklwd m0, m2 - punpcklwd m1, m3 - pmaddwd m0, m4 - pmaddwd m1, m5 - paddd m0, m1 -.x_loop_ar1_inner: - movd val0d, m0 - psrldq m0, 4 - imul val3d, cf3d - add val3d, val0d - sar val3d, shiftb - movsx val0d, byte [bufq+xq] - add val3d, val0d - cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 - dec hd - jg .y_loop_ar1 -.ar0: - RET - -.ar2: -%if ARCH_X86_32 -%assign stack_offset_old stack_offset - ALLOC_STACK -16*8 -%endif - DEFINE_ARGS buf, fg_data, shift - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m6, [base+round_vals-12+shiftq*2] - movd m7, [base+byte_blend+1] - SCRATCH 7, 15, 7 - movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 - movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 - pxor m7, m7 - pshuflw m6, m6, q0000 - punpcklwd m6, m7 - pcmpgtb m4, m7, m0 - pcmpgtb m5, m7, m1 - punpcklbw m0, 
m4 - punpcklbw m1, m5 - DEFINE_ARGS buf, fg_data, h, x - pshufd m4, m1, q0000 - pshufd m5, m1, q1111 - pshufd m3, m0, q3333 - pshufd m2, m0, q2222 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - SCRATCH 0, 8, 0 - SCRATCH 1, 9, 1 - SCRATCH 2, 10, 2 - SCRATCH 3, 11, 3 - SCRATCH 4, 12, 4 - SCRATCH 5, 13, 5 - SCRATCH 6, 14, 6 - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar2: - mov xq, -76 - -.x_loop_ar2: - movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - pcmpgtb m2, m7, m0 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 - psrldq m5, m0, 2 ; y=-2,x=[-1,+5] - psrldq m3, m1, 2 ; y=-1,x=[-1,+5] - psrldq m4, m1, 4 ; y=-1,x=[+0,+5] - punpcklwd m2, m0, m5 - punpcklwd m3, m4 - pmaddwd m2, m8 - pmaddwd m3, m11 - paddd m2, m3 - - psrldq m4, m0, 4 ; y=-2,x=[+0,+5] - psrldq m5, m0, 6 ; y=-2,x=[+1,+5] - psrldq m6, m0, 8 ; y=-2,x=[+2,+5] - punpcklwd m4, m5 - punpcklwd m6, m1 - psrldq m5, m1, 6 ; y=-1,x=[+1,+5] - psrldq m1, m1, 8 ; y=-1,x=[+2,+5] - punpcklwd m5, m1 - pmaddwd m4, m9 - pmaddwd m6, m10 - pmaddwd m5, m12 - paddd m4, m6 - paddd m2, m5 - paddd m2, m4 - paddd m2, m14 - - movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] -.x_loop_ar2_inner: - pcmpgtb m4, m7, m0 - punpcklbw m1, m0, m4 - pmaddwd m3, m1, m13 - paddd m3, m2 - psrldq m1, 4 ; y=0,x=0 - psrldq m2, 4 ; shift top to next pixel - psrad m3, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - paddw m3, m1 - packsswb m3, m3 - pslldq m3, 2 - pand m3, m15 - pandn m1, m15, m0 - por m0, m1, m3 - psrldq m0, 1 - ; overwrite 2 pixels, but that's ok - movd [bufq+xq-1], m0 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - dec hd - jg .y_loop_ar2 - RET - -.ar3: - DEFINE_ARGS buf, fg_data, shift -%if ARCH_X86_32 -%assign stack_offset stack_offset_old - ALLOC_STACK -16*14 -%elif WIN64 - SUB rsp, 16*6 -%assign stack_size_padded (stack_size_padded+16*6) -%assign stack_size (stack_size+16*6) -%else - ALLOC_STACK -16*6 -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m6, [base+round_vals-12+shiftq*2] - movd m7, [base+byte_blend] - movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 - movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 - pxor m3, m3 - pcmpgtb m4, m3, m0 - pcmpgtb m3, m2 - pshuflw m6, m6, q0000 - SCRATCH 6, 14, 12 - SCRATCH 7, 15, 13 - punpckhbw m1, m0, m4 - punpcklbw m0, m4 - punpcklbw m2, m3 - pshufd m3, m0, q1111 - pshufd m4, m0, q2222 - pshufd m5, m0, q3333 - pshufd m0, m0, q0000 - mova [rsp+ 0*16], m0 - mova [rsp+ 1*16], m3 - mova [rsp+ 2*16], m4 - mova [rsp+ 3*16], m5 - pshufd m6, m1, q1111 - pshufd m7, m1, q2222 - pshufd m5, m1, q3333 - pshufd m1, m1, q0000 - pshufd m3, m2, q1111 - psrldq m0, m2, 10 - pinsrw m2, [base+pw_1], 5 - pshufd m4, m2, q2222 - pshufd m2, m2, q0000 - pinsrw m0, [base+round_vals+shiftq*2-10], 3 - mova [rsp+ 4*16], m1 - mova [rsp+ 5*16], m6 - SCRATCH 7, 8, 6 - SCRATCH 5, 9, 7 - SCRATCH 2, 10, 8 - SCRATCH 3, 11, 9 - SCRATCH 4, 12, 10 - SCRATCH 0, 13, 11 - DEFINE_ARGS buf, fg_data, h, x - sub bufq, 82*73-(82*3+79) - mov hd, 70 -.y_loop_ar3: - mov xq, -76 - -.x_loop_ar3: - movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - pxor m3, m3 - pcmpgtb m3, m0 - punpckhbw m2, m0, m3 - punpcklbw m0, m3 - - psrldq m5, m0, 2 - psrldq m6, m0, 4 - psrldq m7, m0, 6 - punpcklwd m4, m0, m5 - punpcklwd m6, m7 - pmaddwd m4, [rsp+ 0*16] - pmaddwd m6, [rsp+ 1*16] - paddd m4, m6 - - movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - pxor m5, m5 - pcmpgtb m5, m1 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - 
palignr m6, m2, m0, 10 - palignr m7, m2, m0, 12 - psrldq m0, 8 - punpcklwd m0, m6 - punpcklwd m7, m1 - pmaddwd m0, [rsp+ 2*16] - pmaddwd m7, [rsp+ 3*16] - paddd m0, m7 - paddd m0, m4 - - psrldq m4, m1, 2 - psrldq m5, m1, 4 - psrldq m6, m1, 6 - psrldq m7, m1, 8 - punpcklwd m4, m5 - punpcklwd m6, m7 - pmaddwd m4, [rsp+ 4*16] - pmaddwd m6, [rsp+ 5*16] - paddd m4, m6 - paddd m0, m4 - - movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor m7, m7 - pcmpgtb m7, m2 - punpckhbw m5, m2, m7 - punpcklbw m2, m7 - palignr m7, m3, m1, 10 - palignr m3, m1, 12 - psrldq m1, m2, 2 - punpcklwd m7, m3 - punpcklwd m3, m2, m1 - pmaddwd m7, m8 - pmaddwd m3, m9 - paddd m7, m3 - paddd m0, m7 - - psrldq m6, m2, 4 - psrldq m1, m2, 6 - psrldq m3, m2, 8 - palignr m4, m5, m2, 10 - palignr m5, m5, m2, 12 - - punpcklwd m6, m1 - punpcklwd m3, m4 - punpcklwd m5, m14 - pmaddwd m6, m10 - pmaddwd m3, m11 - pmaddwd m5, m12 - paddd m0, m6 - paddd m3, m5 - paddd m0, m3 - - movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] -.x_loop_ar3_inner: - pxor m5, m5 - pcmpgtb m5, m1 - punpcklbw m2, m1, m5 - pmaddwd m2, m13 - pshufd m3, m2, q1111 - paddd m2, m3 ; left+cur - paddd m2, m0 ; add top - psrldq m0, 4 - psrad m2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw since we only care about one value - packsswb m2, m2 - pslldq m2, 3 - pand m2, m15 - pandn m3, m15, m1 - por m1, m2, m3 - movd [bufq+xq-3], m1 - psrldq m1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - dec hd - jg .y_loop_ar3 - RET - -%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y -INIT_XMM ssse3 -cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv - movifnidn r2, r2mp - movifnidn r3, r3mp - LEA r4, $$ -%define base r4-$$ - movq m1, [base+rnd_next_upperbit_mask] - movq m4, [base+mul_bits] - movq m7, [base+hmul_bits] - mov r5d, [fg_dataq+FGData.grain_scale_shift] - movd m6, [base+round+r5*2] - mova m5, [base+pb_mask] - movd m0, [fg_dataq+FGData.seed] - movd m2, [base+pw_seed_xor+uvq*4] - pxor m0, m2 - pshuflw m6, m6, q0000 - pshuflw m0, m0, q0000 - lea r6, [base+gaussian_sequence] -%if %2 -%if ARCH_X86_64 - mov r7d, 73-35*%3 -%else - mov r3mp, 73-35*%3 -%endif - add bufq, 44 -.loop_y: - mov r5, -44 -.loop_x: -%else - mov r5, -82*73 - sub bufq, r5 -.loop: -%endif - pand m2, m0, m1 - psrlw m3, m2, 10 - por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set - pmullw m2, m4 ; bits 0x0f00 are set - pshufb m3, m5, m2 ; set 15th bit for next 4 seeds - psllq m2, m3, 30 - por m3, m2 - psllq m2, m3, 15 - por m3, m2 ; aggregate each bit into next seed's high bit - pmulhuw m2, m0, m7 - por m2, m3 ; 4 next output seeds - pshuflw m0, m2, q3333 - psrlw m2, 5 -%if ARCH_X86_64 - movd r9d, m2 - pshuflw m2, m2, q3232 - movzx r8, r9w - shr r9, 16 - - movd m3, [r6+r8*2] - pinsrw m3, [r6+r9*2], 1 - - movd r9d, m2 - movzx r8, r9w - shr r9, 16 - - pinsrw m3, [r6+r8*2], 2 - pinsrw m3, [r6+r9*2], 3 -%else - movd r2, m2 - pshuflw m2, m2, q3232 - movzx r1, r2w - shr r2, 16 - - movd m3, [r6+r1*2] - pinsrw m3, [r6+r2*2], 1 - - movd r2, m2 - movzx r1, r2w - shr r2, 16 - - pinsrw m3, [r6+r1*2], 2 - pinsrw m3, [r6+r2*2], 3 -%endif - pmulhrsw m3, m6 - packsswb m3, m3 - movd [bufq+r5], m3 - add r5, 4 -%if %2 - jl .loop_x - add bufq, 82 -%if ARCH_X86_64 - dec r7d -%else - dec r3mp -%endif - jg .loop_y -%else - jl .loop -%endif - -%if ARCH_X86_32 - mov r2, r2mp -%endif - - ; auto-regression code - movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4] - lea r5, 
[r5+base+generate_grain_uv_%1_ssse3_table] - jmp r5 - -.ar0: - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - movifnidn bufyq, bufymp -%if ARCH_X86_32 -%assign stack_offset_old stack_offset - ALLOC_STACK -2*16 -%endif - imul uvd, 28 - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] - movd m4, [base+hmul_bits+shiftq*2] - DEFINE_ARGS buf, bufy, h, x - pxor m0, m0 - pcmpgtb m0, m5 - punpcklbw m5, m0 - movd m7, [base+pb_1] -%if %2 - movd m6, [base+hmul_bits+2+%3*2] -%endif - pshuflw m5, m5, q0000 - pshuflw m4, m4, q0000 - pshufd m7, m7, q0000 -%if %2 - pshuflw m6, m6, q0000 -%endif - punpcklqdq m5, m5 - punpcklqdq m4, m4 -%if %2 - punpcklqdq m6, m6 -%endif - pcmpeqw m1, m1 - pslldq m1, 12>>%2 - SCRATCH 1, 8, 0 - SCRATCH 4, 9, 1 -%if %2 - sub bufq, 82*(73-35*%3)+82-(82*3+41) -%else - sub bufq, 82*70-3 -%endif - add bufyq, 3+82*3 - mov hd, 70-35*%3 -.y_loop_ar0: - xor xd, xd -.x_loop_ar0: - ; first 32 pixels -%if %2 - movu m1, [bufyq+xq*2] -%if %3 - movu m2, [bufyq+xq*2+82] -%endif - movu m3, [bufyq+xq*2+16] -%if %3 - movu m4, [bufyq+xq*2+82+16] -%endif - pmaddubsw m0, m7, m1 -%if %3 - pmaddubsw m1, m7, m2 -%endif - pmaddubsw m2, m7, m3 -%if %3 - pmaddubsw m3, m7, m4 - paddw m0, m1 - paddw m2, m3 -%endif - pmulhrsw m0, m6 - pmulhrsw m2, m6 -%else - movu m0, [bufyq+xq] - pxor m6, m6 - pcmpgtb m6, m0 - punpckhbw m2, m0, m6 - punpcklbw m0, m6 -%endif - pmullw m0, m5 - pmullw m2, m5 - pmulhrsw m0, m9 - pmulhrsw m2, m9 - movu m1, [bufq+xq] - pxor m4, m4 - pcmpgtb m4, m1 - punpckhbw m3, m1, m4 -%if %2 - punpcklbw m1, m4 - paddw m2, m3 - paddw m0, m1 -%else - punpcklbw m6, m1, m4 - paddw m2, m3 - paddw m0, m6 -%endif - packsswb m0, m2 -%if %2 - movu [bufq+xq], m0 - add xd, 16 - cmp xd, 32 - jl .x_loop_ar0 - - ; last 6/12 pixels - movu m1, [bufyq+xq*(1+%2)] -%if %3 - movu m2, [bufyq+xq*2+82] -%endif - pmaddubsw m0, m7, m1 -%if %3 - pmaddubsw m1, m7, m2 - paddw m0, m1 -%endif - pmulhrsw m0, m6 - pmullw m0, m5 - pmulhrsw m0, m9 - movq m1, [bufq+xq] - pxor m4, m4 - pcmpgtb m4, m1 - punpcklbw m2, m1, m4 - paddw m0, m2 - packsswb m0, m0 - pandn m2, m8, m0 - pand m1, m8 - por m2, m1 - movq [bufq+xq], m2 -%else - add xd, 16 - cmp xd, 80 - je .y_loop_final_ar0 - movu [bufq+xq-16], m0 - jmp .x_loop_ar0 -.y_loop_final_ar0: - pandn m2, m8, m0 - pand m1, m8 - por m2, m1 - movu [bufq+xq-16], m2 -%endif - - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar0 - RET - -.ar1: -%if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp -%endif - DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x - imul uvd, 28 - movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] - movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] - pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 -%if ARCH_X86_32 - mov r3mp, cf3d - DEFINE_ARGS buf, shift, fg_data, val3, min, max, x -%elif WIN64 - DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x - mov bufq, r0 -%else - DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - movd m3, [base+round_vals+shiftq*2-12] ; rnd -%if %2 - movd m7, [base+pb_1] - movd m6, [base+hmul_bits+2+%3*2] -%endif - psrldq m4, 1 -%if ARCH_X86_32 - DEFINE_ARGS buf, shift, val0, val3, min, max, x -%elif WIN64 - DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 -%else - DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 -%endif - pxor m5, m5 - punpcklwd m3, m5 -%if %2 - punpcklwd m6, m6 -%endif - pcmpgtb m5, m4 - punpcklbw m4, 
m5 - pshufd m5, m4, q1111 - pshufd m4, m4, q0000 - pshufd m3, m3, q0000 -%if %2 - pshufd m7, m7, q0000 - pshufd m6, m6, q0000 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*69+3 -%endif -%if ARCH_X86_32 - add r1mp, 79+82*3 - mov r0mp, 70-35*%3 -%else - add bufyq, 79+82*3 - mov hd, 70-35*%3 -%endif - mov mind, -128 - mov maxd, 127 -.y_loop_ar1: - mov xq, -(76>>%2) - movsx val3d, byte [bufq+xq-1] -.x_loop_ar1: -%if %2 -%if ARCH_X86_32 - mov r2, r1mp - movq m0, [r2+xq*2] -%if %3 - movq m1, [r2+xq*2+82] -%endif -%else - movq m0, [bufyq+xq*2] -%if %3 - movq m1, [bufyq+xq*2+82] -%endif -%endif - pmaddubsw m2, m7, m0 -%if %3 - pmaddubsw m0, m7, m1 - paddw m2, m0 -%endif - pmulhrsw m2, m6 -%else -%if ARCH_X86_32 - mov r2, r1mp - movd m2, [r2+xq] -%else - movd m2, [bufyq+xq] -%endif - pxor m0, m0 - pcmpgtb m0, m2 - punpcklbw m2, m0 -%endif - - movq m0, [bufq+xq-82-1] ; top/left - pxor m1, m1 - pcmpgtb m1, m0 - punpcklbw m0, m1 - psrldq m1, m0, 4 ; top/right - punpcklwd m1, m2 - psrldq m2, m0, 2 ; top - punpcklwd m0, m2 - pmaddwd m0, m4 - pmaddwd m1, m5 - paddd m0, m1 - paddd m0, m3 -.x_loop_ar1_inner: - movd val0d, m0 - psrldq m0, 4 -%if ARCH_X86_32 - imul val3d, r3mp -%else - imul val3d, cf3d -%endif - add val3d, val0d - sar val3d, shiftb - movsx val0d, byte [bufq+xq] - add val3d, val0d - cmp val3d, maxd - cmovns val3d, maxd - cmp val3d, mind - cmovs val3d, mind - mov byte [bufq+xq], val3b - ; keep val3d in-place as left for next x iteration - inc xq - jz .x_loop_ar1_end - test xq, 3 - jnz .x_loop_ar1_inner - jmp .x_loop_ar1 - -.x_loop_ar1_end: - add bufq, 82 -%if ARCH_X86_32 - add r1mp, 82<<%3 - dec r0mp -%else - add bufyq, 82<<%3 - dec hd -%endif - jg .y_loop_ar1 - RET - -.ar2: -%if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp - ALLOC_STACK -8*16 -%endif - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - movifnidn bufyq, bufymp - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - movd m7, [base+round_vals-12+shiftq*2] - movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 - pxor m2, m2 - pcmpgtb m2, m0 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 - pinsrw m1, [base+pw_1], 5 - punpcklwd m7, m7 - pshufd m7, m7, q0000 - DEFINE_ARGS buf, bufy, fg_data, h, unused, x - pshufd m4, m1, q0000 - pshufd m5, m1, q1111 - pshufd m6, m1, q2222 - pshufd m3, m0, q3333 - pshufd m2, m0, q2222 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - SCRATCH 0, 8, 0 - SCRATCH 1, 9, 1 - SCRATCH 2, 10, 2 - SCRATCH 3, 11, 3 - SCRATCH 4, 12, 4 - SCRATCH 5, 13, 5 - SCRATCH 6, 14, 6 - SCRATCH 7, 15, 7 -%if %2 - movd m7, [base+hmul_bits+2+%3*2] - movd m6, [base+pb_1] - punpcklwd m7, m7 - pshufd m6, m6, q0000 - pshufd m7, m7, q0000 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*69+3 -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar2: - mov xq, -(76>>%2) - -.x_loop_ar2: - pxor m2, m2 - movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] - movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] - pcmpgtb m2, m0 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 - psrldq m5, m0, 2 ; y=-2,x=[-1,+5] - psrldq m3, m1, 2 ; y=-1,x=[-1,+5] - psrldq m4, m1, 4 ; y=-1,x=[+0,+5] - punpcklwd m2, m0, m5 - punpcklwd m3, m4 - pmaddwd m2, m8 - pmaddwd m3, m11 - paddd m2, m3 - - psrldq m4, m0, 4 ; y=-2,x=[+0,+5] - psrldq m5, m0, 6 ; y=-2,x=[+1,+5] - psrldq m0, 8 ; y=-2,x=[+2,+5] - punpcklwd m4, m5 - punpcklwd m0, m1 - psrldq m3, m1, 6 ; y=-1,x=[+1,+5] - psrldq m1, m1, 8 ; y=-1,x=[+2,+5] - punpcklwd m3, m1 - pmaddwd m4, m9 - pmaddwd m0, m10 - pmaddwd m3, m12 - paddd m4, m0 - 
paddd m2, m3 - paddd m2, m4 - -%if %2 - movq m1, [bufyq+xq*2] -%if %3 - movq m3, [bufyq+xq*2+82] -%endif - pmaddubsw m0, m6, m1 -%if %3 - pmaddubsw m1, m6, m3 - paddw m0, m1 -%endif - pmulhrsw m0, m7 -%else - movd m0, [bufyq+xq] - pxor m1, m1 - pcmpgtb m1, m0 - punpcklbw m0, m1 -%endif - punpcklwd m0, m15 - pmaddwd m0, m14 - paddd m2, m0 - - movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] - pxor m4, m4 - movd m5, [base+byte_blend+1] - punpcklbw m5, m5 -.x_loop_ar2_inner: - pcmpgtb m1, m4, m0 - punpcklbw m0, m1 - pmaddwd m3, m0, m13 - paddd m3, m2 - psrldq m2, 4 ; shift top to next pixel - psrad m3, [fg_dataq+FGData.ar_coeff_shift] - pslldq m3, 4 - pand m3, m5 - paddw m0, m3 - packsswb m0, m0 - movd [bufq+xq-2], m0 - psrldq m0, 1 - inc xq - jz .x_loop_ar2_end - test xq, 3 - jnz .x_loop_ar2_inner - jmp .x_loop_ar2 - -.x_loop_ar2_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar2 - RET - -.ar3: -%if ARCH_X86_32 -%assign stack_offset stack_offset_old -%assign stack_size_padded 0 -%xdefine rstk rsp -%endif - DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift - movifnidn bufyq, bufymp -%if ARCH_X86_32 - ALLOC_STACK -15*16 -%else - SUB rsp, 16*7 -%assign stack_size_padded (stack_size_padded+16*7) -%assign stack_size (stack_size+16*7) -%endif - mov shiftd, [fg_dataq+FGData.ar_coeff_shift] - imul uvd, 28 - - movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 - pxor m3, m3 - pcmpgtb m3, m0 - punpckhbw m1, m0, m3 - punpcklbw m0, m3 - pshufd m2, m0, q1111 - pshufd m3, m0, q2222 - pshufd m4, m0, q3333 - pshufd m0, m0, q0000 - pshufd m5, m1, q1111 - pshufd m6, m1, q2222 - pshufd m7, m1, q3333 - pshufd m1, m1, q0000 - mova [rsp+ 0*16], m0 - mova [rsp+ 1*16], m2 - mova [rsp+ 2*16], m3 - mova [rsp+ 3*16], m4 - mova [rsp+ 4*16], m1 - mova [rsp+ 5*16], m5 - mova [rsp+ 6*16], m6 - SCRATCH 7, 8, 7 - - movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] - pxor m4, m4 - pcmpgtb m4, m2 - punpckhbw m5, m2, m4 - punpcklbw m2, m4 - pshufd m4, m2, q3232 - punpcklwd m3, m4, m5 - pshuflw m5, m4, q3321 - pshufd m4, m3, q0000 - pshufd m3, m2, q1111 - pshufd m2, m2, q0000 - pinsrw m5, [base+round_vals+shiftq*2-10], 3 - SCRATCH 2, 9, 8 - SCRATCH 3, 10, 9 - SCRATCH 4, 11, 10 - SCRATCH 5, 12, 11 - - movd m2, [base+round_vals-12+shiftq*2] -%if %2 - movd m1, [base+pb_1] - movd m3, [base+hmul_bits+2+%3*2] -%endif - pxor m0, m0 - punpcklwd m2, m0 -%if %2 - punpcklwd m3, m3 -%endif - pshufd m2, m2, q0000 -%if %2 - pshufd m1, m1, q0000 - pshufd m3, m3, q0000 - SCRATCH 1, 13, 12 -%endif - SCRATCH 2, 14, 13 -%if %2 - SCRATCH 3, 15, 14 -%endif - - DEFINE_ARGS buf, bufy, fg_data, h, unused, x -%if %2 - sub bufq, 82*(73-35*%3)+44-(82*3+41) -%else - sub bufq, 82*69+3 -%endif - add bufyq, 79+82*3 - mov hd, 70-35*%3 -.y_loop_ar3: - mov xq, -(76>>%2) - -.x_loop_ar3: - movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] - pxor m4, m4 - pcmpgtb m4, m0 - punpckhbw m3, m0, m4 - punpcklbw m0, m4 - - psrldq m5, m0, 2 - psrldq m6, m0, 4 - psrldq m7, m0, 6 - punpcklwd m4, m0, m5 - punpcklwd m6, m7 - pmaddwd m4, [rsp+ 0*16] - pmaddwd m6, [rsp+ 1*16] - paddd m4, m6 - - palignr m2, m3, m0, 10 - palignr m3, m0, 12 - psrldq m0, 8 - - movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] - pxor m6, m6 - pcmpgtb m6, m1 - punpckhbw m5, m1, m6 - punpcklbw m1, m6 - - punpcklwd m0, m2 - punpcklwd m3, m1 - pmaddwd m0, [rsp+ 2*16] - pmaddwd m3, [rsp+ 3*16] - paddd m0, m3 - paddd m0, m4 - - movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] - pxor m7, m7 - pcmpgtb m7, m2 - punpckhbw m6, m2, m7 - punpcklbw m2, m7 - - palignr m3, m5, m1, 10 - palignr m5, m1, 12 - 
psrldq m4, m2, 2 - - punpcklwd m3, m5 - punpcklwd m5, m2, m4 - pmaddwd m3, [rsp+ 6*16] - pmaddwd m5, m8 - paddd m3, m5 - paddd m0, m3 - - psrldq m3, m1, 2 - psrldq m4, m1, 4 - psrldq m5, m1, 6 - psrldq m1, 8 - - punpcklwd m3, m4 - punpcklwd m5, m1 - pmaddwd m3, [rsp+ 4*16] - pmaddwd m5, [rsp+ 5*16] - paddd m3, m5 - paddd m0, m3 - -%if %2 - movq m1, [bufyq+xq*2] -%if %3 - movq m3, [bufyq+xq*2+82] -%endif - pmaddubsw m7, m13, m1 -%if %3 - pmaddubsw m5, m13, m3 - paddw m7, m5 -%endif - pmulhrsw m7, m15 -%else - movd m7, [bufyq+xq] - pxor m1, m1 - pcmpgtb m1, m7 - punpcklbw m7, m1 -%endif - - psrldq m1, m2, 4 - psrldq m3, m2, 6 - palignr m4, m6, m2, 10 - palignr m6, m2, 12 - psrldq m2, 8 - - punpcklwd m1, m3 - punpcklwd m2, m4 - punpcklwd m6, m7 - pmaddwd m1, m9 - pmaddwd m2, m10 - pmaddwd m6, m11 - paddd m1, m2 - paddd m0, m6 - paddd m0, m1 - paddd m0, m14 - - movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] - pxor m4, m4 - movd m5, [base+byte_blend] -.x_loop_ar3_inner: - pcmpgtb m2, m4, m1 - punpcklbw m3, m1, m2 - pmaddwd m2, m3, m12 - pshufd m3, m2, q1111 - paddd m2, m3 ; left+cur - paddd m2, m0 ; add top - psrldq m0, 4 - psrad m2, [fg_dataq+FGData.ar_coeff_shift] - ; don't packssdw, we only care about one value - packsswb m2, m2 - pandn m3, m5, m1 - pslld m2, 24 - pand m2, m5 - por m1, m2, m3 - movd [bufq+xq-3], m1 - psrldq m1, 1 - inc xq - jz .x_loop_ar3_end - test xq, 3 - jnz .x_loop_ar3_inner - jmp .x_loop_ar3 - -.x_loop_ar3_end: - add bufq, 82 - add bufyq, 82<<%3 - dec hd - jg .y_loop_ar3 - RET -%endmacro - -generate_grain_uv_fn 420, 1, 1 -generate_grain_uv_fn 422, 1, 0 -generate_grain_uv_fn 444, 0, 0 - -%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg -%assign %%idx 0 -%define %%tmp %2 -%if %0 == 6 -%define %%tmp %6 -%endif -%rep 4 -%if %%idx == 0 - movd %5 %+ d, %2 - pshuflw %%tmp, %2, q3232 -%else - movd %5 %+ d, %%tmp -%if %%idx == 2 - punpckhqdq %%tmp, %%tmp -%elif %%idx == 4 - psrlq %%tmp, 32 -%endif -%endif - movzx %4 %+ d, %5 %+ w - shr %5 %+ d, 16 - -%if %%idx == 0 - movd %1, [%3+%4] -%else - pinsrw %1, [%3+%4], %%idx + 0 -%endif - pinsrw %1, [%3+%5], %%idx + 1 -%assign %%idx %%idx+2 -%endrep -%endmacro - -INIT_XMM ssse3 -; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize -cglobal fgy_32x32xn, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \ - dst, src, scaling, unused1, fg_data, picptr, unused2 - ; copy stack arguments to new position post-alignment, so that we - ; don't have to keep the old stack location in a separate register - mov r0, r0m - mov r1, r2m - mov r2, r4m - mov r3, r6m - mov r4, r7m - mov r5, r8m - - mov [rsp+6*mmsize+ 3*gprsize], r0 - mov [rsp+6*mmsize+ 5*gprsize], r1 - mov [rsp+6*mmsize+ 7*gprsize], r2 - mov [rsp+6*mmsize+ 9*gprsize], r3 - mov [rsp+6*mmsize+10*gprsize], r4 - mov [rsp+6*mmsize+11*gprsize], r5 -%else -cglobal fgy_32x32xn, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \ - dst, src, scaling, unused1, fg_data, picptr, unused2 -%endif - mov srcq, srcm - mov fg_dataq, r3m - mov scalingq, r5m -%if STACK_ALIGNMENT < mmsize -%define r0m [rsp+6*mmsize+ 3*gprsize] -%define r1m [rsp+6*mmsize+ 4*gprsize] -%define r2m [rsp+6*mmsize+ 5*gprsize] -%define r3m [rsp+6*mmsize+ 6*gprsize] -%define r4m [rsp+6*mmsize+ 7*gprsize] -%define r5m [rsp+6*mmsize+ 8*gprsize] -%define r6m [rsp+6*mmsize+ 9*gprsize] -%define r7m [rsp+6*mmsize+10*gprsize] -%define r8m [rsp+6*mmsize+11*gprsize] -%endif - LEA r5, pb_mask -%define base r5-pb_mask - mov r5m, picptrq -%else -cglobal fgy_32x32xn, 6, 15, 16, dst, 
src, stride, fg_data, w, scaling, grain_lut - lea r7, [pb_mask] -%define base r7-pb_mask -%endif - mov r6d, [fg_dataq+FGData.scaling_shift] - movd m3, [base+mul_bits+r6*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - pcmpeqw m2, m2 - psrldq m2, 14 - movd m4, [base+max+r6*4] - movd m5, [base+min+r6*2] - punpcklwd m3, m3 - punpcklwd m4, m4 - punpcklwd m5, m5 - pshufd m3, m3, q0000 - pshufd m4, m4, q0000 - pshufd m5, m5, q0000 - SCRATCH 2, 10, 0 - SCRATCH 3, 11, 1 - SCRATCH 4, 12, 2 - SCRATCH 5, 13, 3 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap -%endif - - mov sbyd, r8m - mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 - test overlapd, overlapd - jz .no_vertical_overlap - mova m6, [base+pw_1024] - movd m7, [base+pb_27_17_17_27] - SCRATCH 6, 14, 4 - SCRATCH 7, 15, 5 - test sbyd, sbyd - jnz .vertical_overlap - ; fall-through - -.no_vertical_overlap: - mov r8m, overlapd -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused - imul seed, (173 << 24) | 37 -%else - imul seed, sbyd, (173 << 24) | 37 -%endif - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak - - mov r3m, seed - mov wq, r4m -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - unused1, unused2, see, unused3 -%endif - - lea src_bakq, [srcq+wq] - neg wq - sub dstmp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r4m, wq - DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 -%endif - -.loop_x: -%if ARCH_X86_32 - mov seed, r3m -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, unused - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - -%if ARCH_X86_32 - ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, - ; r6m=grain_lut, r7m=h, r8m=overlap_v|h - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, unused -%endif - -.loop_x_odd: - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 -%else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 -%endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] - pcmpgtb m7, m2, m3 - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m4 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - - add srcq, r2mp - add grain_lutq, 82 - dec hd - jg .loop_y - -%if ARCH_X86_32 - 
add r4mp, 16 -%else - add wq, 16 -%endif - jge .end -%if ARCH_X86_32 - mov srcq, r1mp - add srcq, r4mp -%else - lea srcq, [src_bakq+wq] -%endif - btc dword r8m, 2 - jc .next_blk - - add offxyd, 16 - test dword r8m, 2 ; r8m & 2 = have_top_overlap - jz .loop_x_odd - -%if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 -%else - add r11d, 16 ; top_offxyd -%endif - jnz .loop_x_odd_v_overlap - -.next_blk: - test dword r8m, 1 - jz .loop_x - - test dword r8m, 2 - jnz .loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -.loop_x_h_overlap: -%if ARCH_X86_32 - ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, - ; r6m=grain_lut, r7m=h, r8m=overlap_v|h - DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 - - add offxyd, 16 ; left_offxyd - mov [rsp+6*mmsize+0*gprsize], offxyd - - DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 - - mov seed, r3m -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy - - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx -%endif - - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164 - lea offyq, [offyq+offxq*2+747] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y_h_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 -%else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 -%endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r5, [rsp+6*mmsize+0*gprsize] - movd m7, [grain_lutq+r5] -%else - movd m7, [grain_lutq+left_offxyq] -%endif - punpcklbw m7, m3 - pmaddubsw m6, m15, m7 - pmulhrsw m6, m14 - packsswb m6, m6 - pand m6, m10 - pandn m7, m10, m3 - por m6, m7 - pcmpgtb m2, m6 - punpcklbw m7, m6, m2 - punpckhbw m6, m2 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m7, m4 - pmullw m6, m5 - pmulhrsw m7, m11 - pmulhrsw m6, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m7 - paddw m1, m6 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - - add srcq, r2mp - add grain_lutq, 82 - dec hd - jg .loop_y_h_overlap - -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end -%if ARCH_X86_32 - mov srcq, r1m - add srcq, r4m -%else - lea srcq, [src_bakq+wq] -%endif - xor dword r8m, 4 - add offxyd, 16 - - ; since this half-block had left-overlap, the next does not - test dword r8m, 2 ; have_top_overlap - jz .loop_x_odd -%if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 -%else - add r11d, 16 ; top_offxyd -%endif - jmp .loop_x_odd_v_overlap - -.end: - RET - -.vertical_overlap: -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, 
grain_lut, tmp, sby, see, overlap -%endif - - or overlapd, 2 ; top_overlap: overlap & 2 - mov r8m, overlapd - movzx sbyd, sbyb -%if ARCH_X86_32 - imul r4, [fg_dataq+FGData.seed], 0x00010001 - DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused -%else - imul seed, [fg_dataq+FGData.seed], 0x00010001 -%endif - imul tmpd, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add tmpd, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and tmpd, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, tmpd -%if ARCH_X86_32 - xor sbyd, seed ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak - - mov r3m, seed - mov wq, r4m -%else - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - tmp, unused2, see, unused3 -%endif - - lea src_bakq, [srcq+wq] - neg wq - sub dstmp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r4m, wq - DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 -%endif - -.loop_x_v_overlap: -%if ARCH_X86_32 - mov seed, r3m -%endif - ; we assume from the block above that bits 8-15 of tmpd are zero'ed, - ; because of the 'and tmpd, 0x00ff00ff' above - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; updated (cur_seed << 16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, unused, top_offxy - - mov offyd, seed - mov offxd, seed -%endif - - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - -%if ARCH_X86_32 - DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, unused, top_offxy -%endif - - movzx top_offxyd, offxyw -%if ARCH_X86_32 - mov [rsp+6*mmsize+1*gprsize], top_offxyd - - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - shr offxyd, 16 - -.loop_x_odd_v_overlap: -%if ARCH_X86_32 - mov r5, r5m - lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+8], r5 -%else - mova m8, [pb_27_17] -%endif - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y_v_overlap: - ; src - mova m0, [srcq] - pxor m2, m2 - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m4, m0, scalingq, r0, r5, m3 - vpgatherdw m5, m1, scalingq, r0, r5, m3 -%else - vpgatherdw m4, m0, scalingq, r12, r13, m3 - vpgatherdw m5, m1, scalingq, r12, r13, m3 -%endif - pcmpeqw m3, m3 - psrlw m3, 8 - pand m4, m3 - pand m5, m3 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r5, [rsp+6*mmsize+1*gprsize] - movu m7, [grain_lutq+r5] -%else - movu m7, [grain_lutq+top_offxyq] -%endif - punpckhbw m6, m7, m3 - punpcklbw m7, m3 -%if ARCH_X86_32 - mov r5, [rsp+5*mmsize+8] - pmaddubsw m3, [r5], m6 - pmaddubsw m6, [r5], m7 -%else - pmaddubsw m3, m8, m6 - pmaddubsw m6, m8, m7 -%endif - pmulhrsw m3, m14 - pmulhrsw m6, m14 - packsswb m6, m3 - pcmpgtb m7, m2, m6 - punpcklbw m2, m6, m7 - punpckhbw m6, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m4 - pmullw m6, m5 - pmulhrsw m2, m11 - pmulhrsw m6, m11 - - ; 
dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m6 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add dword [rsp+5*mmsize+8], mmsize -%else - mova m8, [pb_17_27] -%endif - add srcq, r2mp - add grain_lutq, 82 - dec hw - jz .end_y_v_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_v_overlap - jmp .loop_y - -.end_y_v_overlap: -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end_hv -%if ARCH_X86_32 - mov srcq, r1mp - add srcq, r4mp -%else - lea srcq, [src_bakq+wq] -%endif - btc dword r8m, 2 - jc .loop_x_hv_overlap - add offxyd, 16 -%if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - jmp .loop_x_odd_v_overlap - -.loop_x_hv_overlap: -%if ARCH_X86_32 - mov r5, r5m - lea r5, [base+pb_27_17] - mov [rsp+5*mmsize+8], r5 - - DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak - - mov r5, [rsp+6*mmsize+1*gprsize] - mov r4, offxyd - add r5, 16 - add r4, 16 - mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy - mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy - - DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak - - xor tmpd, tmpd - mov seed, r3m -%else - mova m8, [pb_27_17] - - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - tmp, unused2, see, unused3 - - ; we assume from the block above that bits 8-15 of tmpd are zero'ed -%endif - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; updated (cur_seed << 16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy - - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offyq+16] - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*2+0x10001*747+32*82] - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut - - movzx r5, offxyw ; top_offxy - mov [rsp+6*mmsize+1*gprsize], r5 -%else - DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy - - movzx top_offxyd, offxyw -%endif - shr offxyd, 16 - - mov hd, r7m - mov grain_lutq, grain_lutmp -.loop_y_hv_overlap: - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy - mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy - movu m6, [grain_lutq+r5] - mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy - movd m4, [grain_lutq+r0] - movd m7, [grain_lutq+r5] -%else - movu m6, [grain_lutq+top_offxyq] - movd m4, [grain_lutq+left_offxyq] - movd m7, [grain_lutq+topleft_offxyq] -%endif - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw m4, m3 - punpcklbw m7, m6 - pmaddubsw m2, m15, m4 - pmaddubsw m4, m15, m7 - pmulhrsw m2, m14 - pmulhrsw m4, m14 - packsswb m2, m2 - packsswb m4, m4 - pand m2, m10 - pand m4, m10 - pandn m7, m10, m3 - pandn m3, m10, m6 - por m7, m2 - por m3, m4 - ; followed by v 
interpolation (top | cur -> cur) - punpckhbw m4, m3, m7 - punpcklbw m3, m7 -%if ARCH_X86_32 - mov r5, [rsp+5*mmsize+8] - pmaddubsw m7, [r5], m4 - pmaddubsw m4, [r5], m3 -%else - pmaddubsw m7, m8, m4 - pmaddubsw m4, m8, m3 -%endif - pmulhrsw m7, m14 - pmulhrsw m4, m14 - packsswb m4, m7 - pxor m2, m2 - pcmpgtb m7, m2, m4 - punpcklbw m3, m4, m7 - punpckhbw m4, m7 - - ; src - mova m0, [srcq] - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m5, m0, scalingq, r0, r5, m7 - vpgatherdw m6, m1, scalingq, r0, r5, m7 -%else - vpgatherdw m5, m0, scalingq, r13, r14, m7 - vpgatherdw m6, m1, scalingq, r13, r14, m7 -%endif - pcmpeqw m7, m7 - psrlw m7, 8 - pand m5, m7 - pand m6, m7 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m3, m5 - pmullw m4, m6 - pmulhrsw m3, m11 - pmulhrsw m4, m11 - - ; dst = clip_pixel(src, noise) - paddw m0, m3 - paddw m1, m4 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add dword [rsp+5*mmsize+8], mmsize -%else - mova m8, [pb_17_27] -%endif - add srcq, r2mp - add grain_lutq, 82 - dec hw - jz .end_y_hv_overlap - ; 2 lines get vertical overlap, then fall back to non-overlap code for - ; remaining (up to) 30 lines - btc hd, 16 - jnc .loop_y_hv_overlap - jmp .loop_y_h_overlap - -.end_y_hv_overlap: -%if ARCH_X86_32 - add r4mp, 16 -%else - add wq, 16 -%endif - jge .end_hv -%if ARCH_X86_32 - mov srcq, r1m - add srcq, r4m -%else - lea srcq, [src_bakq+wq] -%endif - xor dword r8m, 4 - add offxyd, 16 -%if ARCH_X86_32 - add dword [rsp+6*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - jmp .loop_x_odd_v_overlap - -.end_hv: - RET - -%macro FGUV_FN 3 ; name, ss_hor, ss_ver -INIT_XMM ssse3 -%if ARCH_X86_32 -; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, -; sby, luma, lstride, uv_pl, is_id) -%if STACK_ALIGNMENT < mmsize -DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 -cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ - tmp, src, scaling, h, fg_data, picptr, unused - mov r0, r0m - mov r1, r2m - mov r2, r4m - mov r3, r6m - mov r4, r7m - mov [rsp+8*mmsize+3*gprsize], r0 - mov [rsp+8*mmsize+5*gprsize], r1 - mov [rsp+8*mmsize+7*gprsize], r2 - mov [rsp+8*mmsize+9*gprsize], r3 - mov [rsp+8*mmsize+10*gprsize], r4 - - mov r0, r8m - mov r1, r9m - mov r2, r10m - mov r4, r11m - mov r3, r12m - mov [rsp+8*mmsize+11*gprsize], r0 - mov [rsp+8*mmsize+12*gprsize], r1 - mov [rsp+8*mmsize+13*gprsize], r2 - mov [rsp+8*mmsize+14*gprsize], r4 -%else -cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ - tmp, src, scaling, h, fg_data, picptr, unused -%endif - mov srcq, srcm - mov fg_dataq, r3m - mov scalingq, r5m -%if STACK_ALIGNMENT < mmsize -%define r0m [rsp+8*mmsize+ 3*gprsize] -%define r1m [rsp+8*mmsize+ 4*gprsize] -%define r2m [rsp+8*mmsize+ 5*gprsize] -%define r3m [rsp+8*mmsize+ 6*gprsize] -%define r4m [rsp+8*mmsize+ 7*gprsize] -%define r5m [rsp+8*mmsize+ 8*gprsize] -%define r6m [rsp+8*mmsize+ 9*gprsize] -%define r7m [rsp+8*mmsize+10*gprsize] -%define r8m [rsp+8*mmsize+11*gprsize] -%define r9m [rsp+8*mmsize+12*gprsize] -%define r10m [rsp+8*mmsize+13*gprsize] -%define r11m [rsp+8*mmsize+14*gprsize] -%define r12m [rsp+8*mmsize+15*gprsize] -%endif - LEA r5, pb_mask -%define base r5-pb_mask - mov r5m, r5 -%else -cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, tmp, sby, luma, lstride, uv_pl, is_id - lea r8, [pb_mask] 
-%define base r8-pb_mask -%endif - mov r6d, [fg_dataq+FGData.scaling_shift] - pcmpeqw m2, m2 - movd m3, [base+mul_bits+r6*2-14] - mov r6d, [fg_dataq+FGData.clip_to_restricted_range] - lea tmpd, [r6d*2] -%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize - test r3, r3 -%else - cmp dword r12m, 0 ; is_idm -%endif - movd m5, [base+min+r6*2] - cmovne r6d, tmpd - movd m4, [base+max+r6*2] - psrldq m2, 14+%2 - punpcklwd m3, m3 - punpcklwd m5, m5 - punpcklwd m4, m4 - pshufd m3, m3, q0000 - pshufd m5, m5, q0000 - pshufd m4, m4, q0000 - SCRATCH 2, 10, 0 - SCRATCH 3, 11, 1 - SCRATCH 4, 12, 2 - SCRATCH 5, 13, 3 - - cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 - jne .csfl - -%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap -%endif - -%if %1 - mov r6d, dword r11m - movd m0, [fg_dataq+FGData.uv_mult+r6*4] - movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] - punpcklbw m6, m1, m0 - movd m7, [fg_dataq+FGData.uv_offset+r6*4] - punpcklwd m6, m6 - punpcklwd m7, m7 - pshufd m6, m6, q0000 - pshufd m7, m7, q0000 - SCRATCH 6, 14, 4 - SCRATCH 7, 15, 5 -%endif - - mov sbyd, r8m - mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 - test overlapd, overlapd - jz %%no_vertical_overlap -%if ARCH_X86_32 -%if %2 - movd m1, [base+pb_23_22] -%else - movd m1, [base+pb_27_17_17_27] -%endif - mova m0, [base+pw_1024] -%else -%if %2 - movd m1, [pb_23_22] -%else - movd m1, [pb_27_17_17_27] -%endif - mova m0, [pw_1024] -%endif - pshufd m1, m1, q0000 - SCRATCH 0, 8, 6 - SCRATCH 1, 9, 7 - test sbyd, sbyd - jnz %%vertical_overlap - ; fall-through - -%%no_vertical_overlap: - mov r8m, overlapd -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap - imul seed, (173 << 24) | 37 -%else - imul seed, sbyd, (173 << 24) | 37 -%endif - add seed, (105 << 24) | 178 - rol seed, 8 - movzx seed, seew - xor seed, [fg_dataq+FGData.seed] - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak -%define luma_bakq lumaq - - mov wq, r4m -%if %3 - shl r10mp, 1 -%endif -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak - - mov lstrideq, r10mp -%endif - - mov lumaq, r9mp - lea src_bakq, [srcq+wq] - lea luma_bakq, [lumaq+wq*(1+%2)] - neg wq - sub r0mp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r11m, luma_bakq - mov r4m, wq - - DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 -%else - mov r11mp, src_bakq - mov r12mp, strideq -%endif - -%%loop_x: -%if ARCH_X86_32 - mov seed, r3m -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, unused1, unused2, lstride - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, overlap, unused1, unused2, lstride, luma_bak -%endif - -%%loop_x_odd: - mov hd, r7m - mov 
grain_lutq, grain_lutmp -%%loop_y: - ; src -%if ARCH_X86_32 - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 -%else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 -%endif - pcmpeqw m1, m1 - psrlw m1, 8 - pand m7, m1 - pand m5, m1 - - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq+ 0] - pcmpgtb m6, m2, m3 - punpcklbw m2, m3, m6 - punpckhbw m3, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add srcq, r2mp - ; we already incremented lumaq above -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*2] -%else - add lumaq, lstrideq -%endif -%endif - add grain_lutq, 82 - dec hw - jg %%loop_y - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif -%if %2 == 0 - ; adjust top_offxy -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r11d, 16 -%endif - add offxyd, 16 - btc dword r8m, 2 - jc %%loop_x_even - test dword r8m, 2 - jz %%loop_x_odd - jmp %%loop_x_odd_v_overlap -%%loop_x_even: -%endif - test dword r8m, 1 - jz %%loop_x - - ; r8m = sbym - test dword r8m, 2 - jne %%loop_x_hv_overlap - - ; horizontal overlap (without vertical overlap) -%%loop_x_h_overlap: -%if ARCH_X86_32 -%if %2 - lea r6, [offxyd+16] - mov [rsp+8*mmsize+0*gprsize], r6 -%else - mov [rsp+8*mmsize+0*gprsize], offxyd -%endif - - DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut - - mov seed, r3m -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, unused1, unused2, lstride - -%if %2 - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx -%else - mov left_offxyd, offyd -%endif -%endif - mov r6d, seed - or seed, 0xEFF4 - shr r6d, 1 - test seeb, seeh - lea seed, [r6+0x8000] - cmovp seed, r6d ; updated seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - 
offx, offy, see, left_offxy, unused1, unused2, lstride - - mov offyd, seed - mov offxd, seed -%endif - ror offyd, 8 - shr offxd, 12 - and offyd, 0xf - imul offyd, 164>>%3 - lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -%%loop_y_h_overlap: - ; src -%if ARCH_X86_32 - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 -%else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 -%endif - pcmpeqw m1, m1 - psrlw m1, 8 - pand m7, m1 - pand m5, m1 - - ; unpack chroma_source - punpckhbw m1, m0, m2 - punpcklbw m0, m2 ; m0-1: src as word - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq+ 0] -%if ARCH_X86_32 - mov r0, [rsp+8*mmsize+0*gprsize] - movd m4, [grain_lutq+r0+ 0] -%else - movd m4, [grain_lutq+left_offxyq+ 0] -%endif - punpcklbw m2, m4, m3 - pmaddubsw m4, m9, m2 - pmulhrsw m4, m8 - packsswb m4, m4 - pand m4, m10 - pandn m2, m10, m3 - por m3, m4, m2 - pxor m4, m4 - pcmpgtb m4, m3 - punpcklbw m2, m3, m4 - punpckhbw m3, m4 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m1, m3 - pmaxsw m0, m13 - pmaxsw m1, m13 - pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add srcq, r2mp - ; lumaq has already been incremented above -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*2] -%else - add lumaq, lstrideq -%endif -%endif - add grain_lutq, 82 - dec hw - jg %%loop_y_h_overlap - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif -%if %2 == 0 - xor dword r8m, 4 - ; adjust top_offxyd -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add r11d, 16 -%endif - add offxyd, 16 -%endif - - ; r8m = sbym - test dword r8m, 2 -%if %2 - jne %%loop_x_hv_overlap - jmp %%loop_x_h_overlap -%else - jne %%loop_x_odd_v_overlap - jmp %%loop_x_odd -%endif - -%%end: - RET - -%%vertical_overlap: -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, 
sby, fg_data, picptr, overlap -%else - DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap -%endif - - or overlapd, 2 ; top_overlap: overlap & 2 - mov r8m, overlapd - movzx sbyd, sbyb -%if ARCH_X86_32 - imul r4, [fg_dataq+FGData.seed], 0x00010001 - DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused -%else - imul seed, [fg_dataq+FGData.seed], 0x00010001 -%endif - imul tmpd, sbyd, 173 * 0x00010001 - imul sbyd, 37 * 0x01000100 - add tmpd, (105 << 16) | 188 - add sbyd, (178 << 24) | (141 << 8) - and tmpd, 0x00ff00ff - and sbyd, 0xff00ff00 - xor seed, tmpd -%if ARCH_X86_32 - xor sbyd, seed ; (cur_seed << 16) | top_seed - - DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak - - mov r3m, seed - mov wq, r4m -%if %3 - shl r10mp, 1 -%endif -%else - xor seed, sbyd ; (cur_seed << 16) | top_seed - - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak - - mov lstrideq, r10mp -%endif - - mov lumaq, r9mp - lea src_bakq, [srcq+wq] - lea luma_bakq, [lumaq+wq*(1+%2)] - neg wq - sub r0mp, srcq -%if ARCH_X86_32 - mov r1m, src_bakq - mov r11m, luma_bakq - mov r4m, wq - - DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 -%else - mov r11mp, src_bakq - mov r12mp, strideq -%endif - -%%loop_x_v_overlap: -%if ARCH_X86_32 - mov seed, r3m - xor tmpd, tmpd -%endif - ; we assume from the block above that bits 8-15 of tmpd are zero'ed - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; updated (cur_seed << 16) | top_seed - -%if ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, overlap, top_offxy, unused, lstride - - mov offxd, seed - mov offyd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - -%if ARCH_X86_32 - DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak -%endif - - movzx top_offxyd, offxyw - shr offxyd, 16 -%if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd - - DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut -%endif - -%%loop_x_odd_v_overlap: - mov hd, r7m - mov grain_lutq, grain_lutmp -%if ARCH_X86_32 - mov r5, r5m - mova m1, [base+pb_27_17] -%else - mova m1, [pb_27_17] -%endif -%%loop_y_v_overlap: -%if ARCH_X86_32 - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip 
- punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[luma_src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 -%else - vpgatherdw m7, m4, scalingq, r12, r2 - vpgatherdw m5, m6, scalingq, r12, r2 -%endif - pcmpeqw m4, m4 - psrlw m4, 8 - pand m7, m4 - pand m5, m4 - - ; grain = grain_lut[offy+y][offx+x] - movu m3, [grain_lutq+offxyq] -%if ARCH_X86_32 - mov r0, [rsp+8*mmsize+1*gprsize] - movu m4, [grain_lutq+r0] -%else - movu m4, [grain_lutq+top_offxyq] -%endif - punpckhbw m6, m4, m3 - punpcklbw m4, m3 -%if %3 - pmaddubsw m2, m9, m6 - pmaddubsw m3, m9, m4 -%else - pmaddubsw m2, m1, m6 - pmaddubsw m3, m1, m4 -%endif - pmulhrsw m2, m8 - pmulhrsw m3, m8 - packsswb m3, m2 - pxor m6, m6 - pcmpgtb m6, m3 - punpcklbw m2, m3, m6 - punpckhbw m3, m6 - - ; noise = round2(scaling[luma_src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m3, m5 - pmulhrsw m2, m11 - pmulhrsw m3, m11 - - ; unpack chroma_source - pxor m4, m4 - punpckhbw m6, m0, m4 - punpcklbw m0, m4 ; m0-1: src as word - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m6, m3 - pmaxsw m0, m13 - pmaxsw m6, m13 - pminsw m0, m12 - pminsw m6, m12 - packuswb m0, m6 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - - dec hw - je %%end_y_v_overlap -%if ARCH_X86_32 - add srcq, r2mp - ; lumaq has already been incremented above -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*2] -%else - add lumaq, lstrideq -%endif -%endif - add grain_lutq, 82 -%if %3 == 0 - btc hd, 16 -%if ARCH_X86_32 - mov r5, r5m - mova m1, [base+pb_17_27] -%else - mova m1, [pb_17_27] -%endif - jnc %%loop_y_v_overlap -%endif - jmp %%loop_y - -%%end_y_v_overlap: -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end_hv -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif - -%if %2 - ; since fg_dataq.overlap is guaranteed to be set, we never jump - ; back to .loop_x_v_overlap, and instead always fall-through to - ; h+v overlap -%else -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - add offxyd, 16 - btc dword r8m, 2 - jnc %%loop_x_odd_v_overlap -%endif - -%%loop_x_hv_overlap: -%if ARCH_X86_32 - DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused - - mov r6, [rsp+8*mmsize+1*gprsize] -%if %2 - lea r0, [r3d+16] - add r6, 16 - mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy -%else - mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy -%endif - mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy - - DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused - - mov seed, r3m - xor tmpd, tmpd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride - -%if %2 - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offxyq+16] -%else - mov topleft_offxyq, top_offxyq - mov left_offxyq, offxyq -%endif - - ; we assume from the block above that bits 8-15 of tmpd are zero'ed -%endif - mov r6d, seed - or seed, 0xeff4eff4 - test seeb, seeh - setp tmpb ; parity of top_seed - shr seed, 16 - shl tmpd, 16 - test seeb, seeh - setp tmpb ; parity of cur_seed - or r6d, 0x00010001 - xor tmpd, r6d - mov seed, tmpd - ror seed, 1 ; updated (cur_seed << 16) | top_seed - -%if 
ARCH_X86_32 - mov r3m, seed - - DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx - - mov offxd, offyd -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride - - mov offxd, seed - mov offyd, seed -%endif - ror offyd, 8 - ror offxd, 12 - and offyd, 0xf000f - and offxd, 0xf000f - imul offyd, 164>>%3 - ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] - -%if ARCH_X86_32 - DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut -%else - DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ - h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak -%endif - - movzx top_offxyd, offxyw - shr offxyd, 16 -%if ARCH_X86_32 - mov [rsp+8*mmsize+1*gprsize], top_offxyd -%endif - - mov hd, r7m - mov grain_lutq, grain_lutmp -%if ARCH_X86_32 - mov r5, r5m - mova m3, [base+pb_27_17] -%else - mova m3, [pb_27_17] -%endif -%%loop_y_hv_overlap: - ; src -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov lumaq, r9mp -%endif -%if %2 - mova m4, [lumaq+ 0] - mova m6, [lumaq+16] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq - mov r5, r5m - movd m7, [base+pb_1] -%else - movd m7, [pb_1] -%endif - pshufd m7, m7, q0000 - pxor m2, m2 - pmaddubsw m4, m7 - pmaddubsw m6, m7 - pavgw m4, m2 - pavgw m6, m2 -%else - mova m4, [lumaq] - mova m0, [srcq] -%if ARCH_X86_32 - add lumaq, r10mp - mov r9mp, lumaq -%endif - pxor m2, m2 -%endif - -%if %1 -%if %2 - packuswb m4, m6 ; luma -%endif - punpckhbw m6, m4, m0 - punpcklbw m4, m0 ; { luma, chroma } - pmaddubsw m6, m14 - pmaddubsw m4, m14 - psraw m6, 6 - psraw m4, 6 - paddw m6, m15 - paddw m4, m15 - packuswb m4, m6 ; pack+unpack = clip - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%elif %2 == 0 - punpckhbw m6, m4, m2 - punpcklbw m4, m2 -%endif - - ; scaling[src] -%if ARCH_X86_32 - vpgatherdw m7, m4, scalingq, r0, r5 - vpgatherdw m5, m6, scalingq, r0, r5 -%else - movd m1, [grain_lutq+topleft_offxyq] -%if %3 - vpgatherdw m7, m4, scalingq, r2, r12 - vpgatherdw m5, m6, scalingq, r2, r12 -%else - vpgatherdw m7, m4, scalingq, r2, r13 - vpgatherdw m5, m6, scalingq, r2, r13 -%endif -%endif - pcmpeqw m2, m2 - psrlw m2, 8 - pand m7, m2 - pand m5, m2 - - ; grain = grain_lut[offy+y][offx+x] -%if ARCH_X86_32 - mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy - mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy - movd m1, [grain_lutq+r0] - mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy -%endif - movu m2, [grain_lutq+offxyq] -%if ARCH_X86_32 - movu m6, [grain_lutq+r5] - movd m4, [grain_lutq+r0] -%else - movu m6, [grain_lutq+top_offxyq] - movd m4, [grain_lutq+left_offxyq] -%endif - ; do h interpolation first (so top | top/left -> top, left | cur -> cur) - punpcklbw m1, m6 - punpcklbw m4, m2 -%if %2 - punpcklwd m4, m1 -%else - punpckldq m4, m1 -%endif - pmaddubsw m1, m9, m4 - pmulhrsw m1, m8 - packsswb m1, m1 - pandn m4, m10, m2 - pandn m2, m10, m6 - psrldq m6, m1, 2-%2 - pand m1, m10 - pand m6, m10 - por m4, m1 - por m2, m6 - ; followed by v interpolation (top | cur -> cur) - punpckhbw m1, m2, m4 - punpcklbw m2, m4 -%if %3 - pmaddubsw m4, m9, m1 - pmaddubsw m1, m9, m2 -%else - pmaddubsw m4, m3, m1 - pmaddubsw m1, m3, m2 -%endif - pmulhrsw m4, m8 - pmulhrsw m1, m8 - packsswb m1, m4 - pxor m4, m4 - pcmpgtb m4, m1 - punpcklbw m2, m1, m4 - punpckhbw m1, m4 - - ; noise = round2(scaling[src] * grain, scaling_shift) - pmullw m2, m7 - pmullw m1, m5 - pmulhrsw 
m2, m11 - pmulhrsw m1, m11 - -%if ARCH_X86_32 - DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut -%endif - - ; unpack chroma source - pxor m4, m4 - punpckhbw m5, m0, m4 - punpcklbw m0, m4 ; m0-1: src as word - - ; dst = clip_pixel(src, noise) - paddw m0, m2 - paddw m5, m1 - pmaxsw m0, m13 - pmaxsw m5, m13 - pminsw m0, m12 - pminsw m5, m12 - packuswb m0, m5 - movifnidn dstq, dstmp - mova [dstq+srcq], m0 - -%if ARCH_X86_32 - add srcq, r2mp - ; lumaq has been adjusted above already -%else - add srcq, r12mp -%if %3 - lea lumaq, [lumaq+lstrideq*(1+%2)] -%else - add lumaq, r10mp -%endif -%endif - add grain_lutq, 82 - dec hw -%if %3 - jg %%loop_y_h_overlap -%else - jle %%end_y_hv_overlap -%if ARCH_X86_32 - mov r5, r5m - mova m3, [base+pb_17_27] -%else - mova m3, [pb_17_27] -%endif - btc hd, 16 - jnc %%loop_y_hv_overlap -%if ARCH_X86_64 - mov lstrideq, r10mp -%endif - jmp %%loop_y_h_overlap -%%end_y_hv_overlap: -%if ARCH_X86_64 - mov lstrideq, r10mp -%endif -%endif - -%if ARCH_X86_32 - DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut - - mov wq, r4m -%endif - add wq, 16 - jge %%end_hv -%if ARCH_X86_32 - mov srcq, r1mp - mov lumaq, r11mp -%else - mov srcq, r11mp -%endif - lea lumaq, [luma_bakq+wq*(1+%2)] - add srcq, wq -%if ARCH_X86_32 - mov r4m, wq - mov r9m, lumaq -%endif -%if %2 - jmp %%loop_x_hv_overlap -%else -%if ARCH_X86_32 - add dword [rsp+8*mmsize+1*gprsize], 16 -%else - add top_offxyd, 16 -%endif - add offxyd, 16 - xor dword r8m, 4 - jmp %%loop_x_odd_v_overlap -%endif - -%%end_hv: - RET -%endmacro - - %%FGUV_32x32xN_LOOP 1, %2, %3 -.csfl: - %%FGUV_32x32xN_LOOP 0, %2, %3 -%endmacro - -FGUV_FN 420, 1, 1 - -%if STACK_ALIGNMENT < mmsize -DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 -%endif - -FGUV_FN 422, 1, 0 - -%if STACK_ALIGNMENT < mmsize -DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 -%endif - -FGUV_FN 444, 0, 0 diff -Nru dav1d-0.7.1/src/x86/ipred16_avx2.asm dav1d-0.9.1/src/x86/ipred16_avx2.asm --- dav1d-0.7.1/src/x86/ipred16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/ipred16_avx2.asm 2021-07-28 21:38:28.897852200 +0000 @@ -0,0 +1,4909 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 32 + +%macro SMOOTH_WEIGHTS 1-* +const smooth_weights_1d_16bpc ; sm_weights[] << 7 + %rep %0 + dw %1*128 + %rotate 1 + %endrep +const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[] + %rep %0 + dw %1, 256-%1 + %rotate 1 + %endrep +%endmacro + +SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +%if ARCH_X86_64 + +ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 + db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 +filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 +filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 +filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 +pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 + dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 +pw_m1024: times 2 dw -1024 +pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4 +z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8 +pb_90: times 4 db 90 +z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4 +z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11 +z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9 +z_filter_k: dw 4, 4, 5, 5, 4, 4 + dw 8, 8, 6, 6, 4, 4 + dw 0, 0, 0, 0, 2, 2 + +%define pw_2 (z_filter_k+32) +%define pw_4 (z_filter_k+ 0) +%define pw_16 (z2_ymul8 +20) + +pw_3: times 2 dw 3 +pw_62: times 2 dw 62 +pw_512: times 2 dw 512 +pw_2048: times 2 dw 2048 +pd_8: dd 8 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4) +%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4) + +JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_16bpc, avx2, 
w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 +JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_420_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 +JMP_TABLE ipred_cfl_ac_422_16bpc, avx2, w16_wpad_pad1, w16_wpad_pad2, w16_wpad_pad3 +JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 + +cextern dr_intra_derivative +cextern filter_intra_taps + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +INIT_YMM avx2 +cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h + movifnidn hd, hm + add tlq, 2 + movd xm4, wd + pxor xm3, xm3 + pavgw xm4, xm3 + tzcnt wd, wd + movd xm5, wd + movu m0, [tlq] + lea r5, [ipred_dc_left_16bpc_avx2_table] + movsxd r6, [r5+wq*4] + add r6, r5 + add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + mov hd, hm + sub tlq, hq + movd xm4, hd + sub tlq, hq + pxor xm3, xm3 + pavgw xm4, xm3 + tzcnt r6d, hd + movd xm5, r6d + movu m0, [tlq] + lea r5, [ipred_dc_left_16bpc_avx2_table] + movsxd r6, [r5+r6*4] + add r6, r5 + add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + paddw m0, [tlq+96] + paddw m0, [tlq+64] +.h32: + paddw m0, [tlq+32] +.h16: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h8: + psrldq xm1, xm0, 8 + paddw xm0, xm1 +.h4: + punpcklwd xm0, xm3 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + paddd xm0, xm4 + psrld xm0, xm5 + lea stride3q, [strideq*3] + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp wq + +cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm4, r5d + tzcnt r5d, r5d + movd xm5, r5d + lea r5, [ipred_dc_16bpc_avx2_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pxor m3, m3 + psrlw xm4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movq xm0, [tlq-8] + jmp wq +.w4: + movq xm1, [tlq+2] + paddw m0, m4 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrld m1, m0, 16 + paddw m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + lea r2d, [hq*2] + mov r6d, 0xAAAB6667 + shrx r6d, r6d, r2d + punpckhwd xm1, xm0, xm3 + punpcklwd xm0, xm3 + paddd xm0, xm1 + movd xm1, r6d + psrld xm0, 2 + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w4_end: + vpbroadcastw xm0, xm0 +.s4: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + mova xm0, [tlq-16] + jmp wq +.w8: + vextracti128 xm1, m0, 1 + paddw xm0, [tlq+2] + paddw xm0, xm4 + paddw xm0, xm1 + psrld xm1, xm0, 16 + paddw xm0, xm1 + pblendw xm0, xm3, 0xAA + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w8_end: + vpbroadcastw xm0, xm0 +.s8: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + 
mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova m0, [tlq-32] + jmp wq +.w16: + paddw m0, [tlq+2] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpckhwd xm1, xm0, xm3 + punpcklwd xm0, xm3 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-32] + jmp wq +.w32: + paddw m0, [tlq+ 2] + paddw m0, [tlq+34] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm3 + punpckhwd xm0, xm3 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x6667AAAB + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w32_end: + vpbroadcastw m0, xm0 + mova m1, m0 +.s32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m0 + mova [dstq+stride3q +32*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-128] + mova m1, [tlq- 96] + paddw m0, [tlq- 64] + paddw m1, [tlq- 32] + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 2] + paddw m0, [tlq+34] + paddw m1, [tlq+66] + paddw m0, [tlq+98] + paddw m0, m1 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm3 + punpckhwd xm0, xm3 + paddd xm1, xm4 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 64 + je .w64_end + mov r6d, 0x6667AAAB + shrx r6d, r6d, hd + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w64_end: + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s64: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s64 + RET + +cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + mov r6d, r8m + shr r6d, 11 + lea r5, [ipred_dc_splat_16bpc_avx2_table] + tzcnt wd, wd + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4] + mova m1, m0 + mova m2, m0 + mova m3, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movu m0, [tlq+ 2] + movu m1, [tlq+34] + movu m2, [tlq+66] + movu m3, [tlq+98] + lea r5, [ipred_dc_splat_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +%macro IPRED_H 2 ; w, store_type + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + vpbroadcastw m2, [tlq-6] + vpbroadcastw m3, [tlq-8] + sub tlq, 8 + mov%2 [dstq+strideq*0], m0 + mov%2 [dstq+strideq*1], m1 + mov%2 [dstq+strideq*2], m2 + mov%2 [dstq+stride3q ], m3 + lea 
dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +ALIGN function_align +%endmacro + +cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + lea r5, [ipred_h_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +INIT_XMM avx2 +.w4: + IPRED_H 4, q +.w8: + IPRED_H 8, a +INIT_YMM avx2 +.w16: + IPRED_H 16, a +.w32: + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + vpbroadcastw m2, [tlq-6] + vpbroadcastw m3, [tlq-8] + sub tlq, 8 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m2 + mova [dstq+strideq*2+32*1], m2 + mova [dstq+stride3q +32*0], m3 + mova [dstq+stride3q +32*1], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +.w64: + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + sub tlq, 4 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*0+32*2], m0 + mova [dstq+strideq*0+32*3], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m1 + mova [dstq+strideq*1+32*3], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64 + RET + +%macro PAETH 3 ; top, signed_ldiff, ldiff + paddw m0, m%2, m1 + psubw m7, m3, m0 ; tldiff + psubw m0, m%1 ; tdiff + pabsw m7, m7 + pabsw m0, m0 + pminsw m7, m0 + pcmpeqw m0, m7 + pcmpgtw m7, m%3, m7 + vpblendvb m0, m3, m%1, m0 + vpblendvb m0, m1, m0, m7 +%endmacro + +cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h +%define base r5-ipred_paeth_16bpc_avx2_table + movifnidn hd, hm + lea r5, [ipred_paeth_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + vpbroadcastw m3, [tlq] ; topleft + add wq, r5 + jmp wq +.w4: + vpbroadcastq m2, [tlq+2] ; top + movsldup m6, [base+ipred_hv_shuf] + lea r3, [strideq*3] + psubw m4, m2, m3 + pabsw m5, m4 +.w4_loop: + sub tlq, 8 + vpbroadcastq m1, [tlq] + pshufb m1, m6 ; left + PAETH 2, 4, 5 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m2, [tlq+2] + movsldup m6, [base+ipred_hv_shuf] + psubw m4, m2, m3 + pabsw m5, m4 +.w8_loop: + sub tlq, 4 + vpbroadcastd m1, [tlq] + pshufb m1, m6 + PAETH 2, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + movu m2, [tlq+2] + psubw m4, m2, m3 + pabsw m5, m4 +.w16_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m2, [tlq+2] + movu m6, [tlq+34] +%if WIN64 + movaps r4m, xmm8 + movaps r6m, xmm9 +%endif + psubw m4, m2, m3 + psubw m8, m6, m3 + pabsw m5, m4 + pabsw m9, m8 +.w32_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps xmm8, r4m + movaps xmm9, r6m +%endif + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 16 + movu m2, [tlq+ 2] + movu m6, [tlq+34] + movu m10, [tlq+66] + movu m13, [tlq+98] + psubw m4, m2, m3 + psubw m8, m6, m3 + psubw m11, m10, m3 + psubw m14, m13, m3 + pabsw m5, m4 + pabsw m9, m8 + pabsw m12, m11 + pabsw m15, m14 +.w64_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova 
[dstq+32*1], m0 + PAETH 10, 11, 12 + mova [dstq+32*2], m0 + PAETH 13, 14, 15 + mova [dstq+32*3], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights +%define base r6-ipred_smooth_v_16bpc_avx2_table + lea r6, [ipred_smooth_v_16bpc_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] + neg hq + vpbroadcastw m5, [tlq+hq*2] ; bottom + add wq, r6 + jmp wq +.w4: + vpbroadcastq m4, [tlq+2] ; top + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 ; top - bottom +.w4_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m3 + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 + movhps [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movq [dstq+r6 ], xm0 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop +.ret: + RET +.w8: + vbroadcasti128 m4, [tlq+2] + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 +.w8_loop: + vpbroadcastd m0, [weightsq+hq*2+0] + vpbroadcastd m1, [weightsq+hq*2+4] + pshufb m0, m3 + pshufb m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + vextracti128 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], xm0 + vextracti128 [dstq+strideq*2], m1, 1 + mova [dstq+r6 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +.w16: + movu m4, [tlq+2] + lea r6, [strideq*3] + psubw m4, m5 +.w16_loop: + vpbroadcastw m0, [weightsq+hq*2+0] + vpbroadcastw m1, [weightsq+hq*2+2] + vpbroadcastw m2, [weightsq+hq*2+4] + vpbroadcastw m3, [weightsq+hq*2+6] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r6 ], m3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + WIN64_SPILL_XMM 7 + movu m4, [tlq+ 2] + movu m6, [tlq+34] + psubw m4, m5 + psubw m6, m5 +.w32_loop: + vpbroadcastw m1, [weightsq+hq*2+0] + vpbroadcastw m3, [weightsq+hq*2+2] + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w32_loop + RET +.w64: + WIN64_SPILL_XMM 8 + movu m3, [tlq+ 2] + movu m4, [tlq+34] + movu m6, [tlq+66] + movu m7, [tlq+98] + REPX {psubw x, m5}, m3, m4, m6, m7 +.w64_loop: + vpbroadcastw m2, [weightsq+hq*2] + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + inc hq + jl .w64_loop + RET + +cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 +%define base r6-ipred_smooth_h_16bpc_avx2_table + lea r6, [ipred_smooth_h_16bpc_avx2_table] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m5, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [r6+wq*4] + sub tlq, hq + lea stride3q, [strideq*3] + add wq, r6 + jmp wq +.w4: + vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] + movsldup m3, [base+ipred_hv_shuf] +.w4_loop: + vpbroadcastq m0, [tlq+hq-8] ; left + pshufb m0, m3 + psubw m0, m5 ; left - right + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps 
[dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w4_loop + RET +.w8: + vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2] + movsldup m3, [base+ipred_hv_shuf] +.w8_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastd m1, [tlq+hq-8] + pshufb m0, m3 + pshufb m1, m3 + psubw m0, m5 + psubw m1, m5 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w8_loop + RET +.w16: + movu m4, [base+smooth_weights_1d_16bpc+16*2] +.w16_loop: + vpbroadcastq m3, [tlq+hq-8] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w16_loop + RET +.w32: + WIN64_SPILL_XMM 7 + movu m4, [base+smooth_weights_1d_16bpc+32*2] + movu m6, [base+smooth_weights_1d_16bpc+32*3] +.w32_loop: + vpbroadcastw m1, [tlq+hq-2] + vpbroadcastw m3, [tlq+hq-4] + psubw m1, m5 + psubw m3, m5 + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w32_loop + RET +.w64: + WIN64_SPILL_XMM 8 + movu m3, [base+smooth_weights_1d_16bpc+32*4] + movu m4, [base+smooth_weights_1d_16bpc+32*5] + movu m6, [base+smooth_weights_1d_16bpc+32*6] + movu m7, [base+smooth_weights_1d_16bpc+32*7] +.w64_loop: + vpbroadcastw m2, [tlq+hq-2] + psubw m2, m5 + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + sub hq, 1*2 + jg .w64_loop + RET + +%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] + pmaddwd m0, m%1, m%3 + pmaddwd m1, m%2, m%4 + paddd m0, m%5 + paddd m1, m%6 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m5 +%endmacro + +cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_16bpc_avx2_table + lea r6, [ipred_smooth_16bpc_avx2_table] + mov wd, wm + vpbroadcastw m4, [tlq+wq*2] ; right + tzcnt wd, wd + mov hd, hm + sub tlq, hq + sub tlq, hq + movsxd wq, [r6+wq*4] + pxor m5, m5 + add wq, r6 + lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] + jmp wq +.w4: + WIN64_SPILL_XMM 11 + vpbroadcastw m0, [tlq] ; bottom + vpbroadcastq m6, [tlq+hq*2+2] + movsldup m7, [base+ipred_hv_shuf] + movshdup m9, [base+ipred_hv_shuf] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] + punpcklwd m6, m0 ; top, bottom + punpcklqdq m8, m9, m9 + punpckhqdq m9, m9 + lea r3, [strideq*3] +.w4_loop: + vpbroadcastq m3, [tlq+hq*2-8] + vbroadcasti128 m1, [v_weightsq] + pshufb m3, m7 + punpcklwd m2, m3, m4 ; left, right + punpckhwd m3, m4 + pmaddwd m2, m10 + pmaddwd m3, m10 + pshufb m0, m1, m8 + pshufb m1, m9 + SMOOTH_2D_END 0, 1, 6, 6, 2, 3 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + add v_weightsq, 16 + sub hd, 4 + jg .w4_loop + RET +.w8: 
+%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + vpbroadcastw m0, [tlq] ; bottom + vbroadcasti128 m7, [tlq+hq*2+2] + movsldup m8, [base+ipred_hv_shuf] + movshdup m9, [base+ipred_hv_shuf] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] + vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1] + punpcklwd m6, m7, m0 ; top, bottom + punpckhwd m7, m0 +.w8_loop: + vpbroadcastd m3, [tlq+hq*2-4] + vpbroadcastq m1, [v_weightsq] + pshufb m3, m8 + punpcklwd m2, m3, m4 ; left, right + punpckhwd m3, m4 + pmaddwd m2, m10 + pmaddwd m3, m11 + pshufb m1, m9 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add v_weightsq, 8 + sub hd, 2 + jg .w8_loop + RET +.w16: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 11 + vpbroadcastw m0, [tlq] ; bottom + movu m7, [tlq+hq*2+2] + mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] + mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1] + vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1 + vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1 + punpcklwd m6, m7, m0 ; top, bottom + punpckhwd m7, m0 +.w16_loop: + vpbroadcastd m3, [tlq+hq*2-4] + vpbroadcastd m1, [v_weightsq+0] + punpcklwd m3, m4 ; left, right + pshufd m2, m3, q1111 + pmaddwd m10, m8, m2 + pmaddwd m2, m9 + pshufd m3, m3, q0000 + SMOOTH_2D_END 1, 1, 6, 7, 10, 2 + vpbroadcastd m1, [v_weightsq+4] + pmaddwd m2, m8, m3 + pmaddwd m3, m9 + mova [dstq+strideq*0], m0 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 + mova [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add v_weightsq, 8 + sub hq, 2 + jg .w16_loop + RET +.w32: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastw m0, [tlq] ; bottom + movu m7, [tlq+hq*2+ 2] + movu m9, [tlq+hq*2+34] + mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4] + mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1 + punpcklwd m6, m7, m0 + punpckhwd m7, m0 + punpcklwd m8, m9, m0 + punpckhwd m9, m0 +.w32_loop: + vpbroadcastw m3, [tlq+hq*2-2] + vpbroadcastd m14, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m1, m10, m3 + pmaddwd m2, m11, m3 + pmaddwd m0, m6, m14 + paddd m0, m1 + pmaddwd m1, m7, m14 + paddd m1, m2 + pmaddwd m2, m12, m3 + pmaddwd m3, m13 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m5 + mova [dstq+32*0], m0 + SMOOTH_2D_END 14, 14, 8, 9, 2, 3 + mova [dstq+32*1], m0 + add dstq, strideq + add v_weightsq, 4 + dec hd + jg .w32_loop + RET +.w64: +%assign stack_offset stack_offset - stack_size_padded + PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base + mov dst_baseq, dstq + mov tl_baseq, tlq + mov v_weights_baseq, v_weightsq + xor xq, xq +.w64_loop_x: + mov yq, hq + lea tlq, [tl_baseq+hq*2] + vpbroadcastw m0, [tl_baseq] ; bottom + movu m7, [tlq+xq*2+ 2] + movu m9, [tlq+xq*2+34] + mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4] + mova 
xm13, [base+smooth_weights_2d_16bpc+64*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1 + punpcklwd m6, m7, m0 + punpckhwd m7, m0 + punpcklwd m8, m9, m0 + punpckhwd m9, m0 + lea tlq, [tl_baseq-2] +.w64_loop_y: + vpbroadcastw m3, [tlq+yq*2] + vpbroadcastd m1, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m14, m10, m3 + pmaddwd m15, m11, m3 + pmaddwd m2, m12, m3 + pmaddwd m3, m13 + pmaddwd m0, m6, m1 + paddd m0, m14 + pmaddwd m14, m7, m1 + paddd m14, m15 + psrld m0, 8 + psrld m14, 8 + packssdw m0, m14 + pavgw m0, m5 + mova [dstq+32*0], m0 + SMOOTH_2D_END 8, 9, 1, 1, 2, 3 + mova [dstq+32*1], m0 + add dstq, strideq + add v_weightsq, 4 + dec yq + jg .w64_loop_y + lea dstq, [dst_baseq+32*2] + add r6, 16*8 + mov v_weightsq, v_weights_baseq + add xq, 32 + test xb, 64 + jz .w64_loop_x + RET + +cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z1_16bpc_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea r7, [dr_intra_derivative] + movsxd wq, [r6+wq*4] + add tlq, 2 + add wq, r6 + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [r7+dxq] + xor angled, 0x4ff ; d = 90 - angle + vpbroadcastd m5, [pw_62] + jmp wq +.w4: + ALLOC_STACK -64, 7 + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + vpbroadcastw xm3, [tlq+14] + movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 + paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + add dxd, dxd + palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 + paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d + psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 + psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 + pxor xm4, xm4 + paddw xm2, xm0 + vpbroadcastw xm0, r8m ; pixel_max + mova [rsp+32], xm3 + movd xm3, dxd + pmaxsw xm2, xm4 + mov r3d, dxd + pavgw xm2, xm4 + vpbroadcastw m3, xm3 + pminsw xm2, xm0 + punpcklwd xm0, xm1, xm2 + punpckhwd xm1, xm2 + lea r5, [strideq*3] + pslldq m2, m3, 8 + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + paddw m3, m2 + vpblendd m4, m6, 0xf0 + paddw m6, m6 + paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3 + vbroadcasti128 m4, [z_upsample] +.w4_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + movu xm2, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [rsp+r3*2], 1 ; 0 2 + lea r3d, [r2+dxq] + shr r2d, 6 ; base3 + vinserti128 m2, [rsp+r2*2], 1 ; 1 3 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 ; frac + psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 + psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) + pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) + paddw m3, m6 ; xpos += dx + paddw m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; w4/w8/w16 +%define base r3-z_filter_t0 + movd xm0, maxbased + lea r3, [z_filter_t0] + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + mova xm2, [r3+angleq*8] + pand m0, m1 + pcmpgtb m0, m2 + pmovmskb r5d, m0 
+ ret +.w4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea maxbased, [hq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastw xm3, [tlq+14] + mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 + pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + paddw xm2, xm0 + pmullw xm2, xm4 + movd [rsp+16], xm3 + cmp r5d, 3 + jne .w4_3tap + paddw xm1, xm2 + palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 + pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 + movzx r3d, word [tlq+14] + movzx r2d, word [tlq+12] + inc maxbased + paddw xm2, xm0 + sub r2d, r3d + paddw xm2, xm2 + lea r2d, [r2+r3*8+4] + shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 + mov [rsp+16], r2w +.w4_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + mov tlq, rsp + psrlw xm1, 3 + cmp hd, 8 + sbb maxbased, -1 + pavgw xm0, xm1 + mova [tlq], xm0 +.w4_main: + movd xm3, dxd + vpbroadcastq m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd ; xpos + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 ; -max_base_x + vpblendd m3, m4, 0xcc + paddw m0, m4, m3 + vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 + paddw m4, m4 + paddw m3, m1 +.w4_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [tlq+r3*2] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [tlq+r3*2], 1 ; 0 2 + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 ; 1 3 + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + vpblendd m1, m2, 0xcc + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; xpos < max_base_x + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w4_loop + lea r6, [strideq*3] +.w4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + movq [dstq+strideq*2], xm6 + movq [dstq+r6 ], xm6 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_end_loop +.w4_end: + RET +.w8: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 7 + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ + movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ + movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 4 + jne .w8_upsample_h8 ; awkward single-pixel edge case + vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ +.w8_upsample_h8: + paddw m2, m1 + paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + add dxd, dxd + psubw m0, m2, m0 + psraw m0, 3 + pxor m4, m4 + paddw m2, m0 + vpbroadcastw m0, r8m + movd xm3, dxd + pmaxsw m2, m4 + mov r3d, dxd + pavgw m2, m4 + vpbroadcastw m3, xm3 + pminsw m2, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vbroadcasti128 m4, [z_upsample] + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + vextracti128 [rsp+32], m0, 1 + vextracti128 [rsp+48], m1, 1 + vpblendd m3, m6, 0xf0 ; xpos0 xpos1 +.w8_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [rsp+r3*2] + movu xm2, 
[rsp+r3*2+16] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + vinserti128 m1, [rsp+r2*2], 1 + vinserti128 m2, [rsp+r2*2+16], 1 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m6 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_upsample_loop + RET +.w8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(h+7, 15) + jmp .w8_main +.w8_no_upsample: + lea maxbased, [hq+7] + test angled, 0x400 + jnz .w8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w8_main + popcnt r5d, r5d + vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m2 + cmp hd, 8 + jl .w8_filter_h4 + punpckhwd m2, m2 + vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + je .w8_filter_end ; 8x4 and 8x8 are always 3-tap + movzx r3d, word [tlq+30] + mov maxbased, 16 + mov [rsp+32], r3d + cmp r5d, 3 + jne .w8_filter_end + punpcklwd xm6, xm0, xm0 + vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g + vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq+28] + mov [rsp+34], r3w + paddw m2, m6 + sub r5d, r3d + inc maxbased + paddw m2, m2 + lea r3d, [r5+r3*8+4] + paddw m1, m2 + shr r3d, 3 + mov [rsp+32], r3w + jmp .w8_filter_end +.w8_filter_h4: + pshuflw m3, m2, q3321 + vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ +.w8_filter_end: + paddw m0, m3 + pmullw m0, m4 + mov tlq, rsp + pxor m2, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + mova [tlq], m0 +.w8_main: + movd xm3, dxd + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 + vpblendd m3, m4, 0xf0 ; xpos0 xpos1 + paddw m3, m1 +.w8_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu xm0, [tlq+r3*2] + movu xm1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + vinserti128 m0, [tlq+r5*2], 1 + vinserti128 m1, [tlq+r5*2+2], 1 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w8_loop +.w8_end_loop: + mova [dstq+strideq*0], xm6 + mova [dstq+strideq*1], xm6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_end_loop +.w8_end: + RET +.w16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(h+15, 31) + jmp .w16_main +.w16: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 7 + lea maxbased, [hq+15] + test angled, 0x400 + jnz .w16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w16_main + popcnt r5d, r5d + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + cmp r5d, 3 + jne .w16_filter_3tap + vpbroadcastd m2, [base+pw_3] + punpcklwd xm0, xm0 + vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m0, m2 + pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m0, m1 + psrlw m0, 2 + movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m3, [tlq+30] ; 
1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 8 + jl .w16_filter_5tap_h4 + punpckhwd m3, m3 + je .w16_filter_5tap_h8 + vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r3d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m4 + sub r2d, r3d + paddw m1, m3 + lea r2d, [r2+r3*8+4] + paddw m1, m2 + shr r2d, 3 + psrlw m1, 2 + mov [rsp+66], r3w + mov [rsp+64], r2w + mov tlq, rsp + mov r3d, 33 + cmp hd, 16 + cmovg maxbased, r3d + jmp .w16_filter_end2 +.w16_filter_5tap_h8: + vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_5tap_h4: + pshuflw xm4, xm3, q3332 ; 4 5 5 5 + pshuflw xm3, xm3, q3321 ; 3 4 5 5 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_3tap: + vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m4 + pmullw m3, m2 + paddw m0, m1 + cmp hd, 8 + je .w16_filter_3tap_h8 + jl .w16_filter_3tap_h4 + punpckhwd m2, m2 + vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + jmp .w16_filter_end +.w16_filter_3tap_h4: + pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ + jmp .w16_filter_end +.w16_filter_3tap_h8: + psrldq xm2, 2 + pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 +.w16_filter_end: + paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m2, m4 + psrlw m0, 3 + pxor m1, m1 + paddw m2, m3 + psrlw m2, 3 + pavgw m0, m1 + pavgw m1, m2 +.w16_filter_end2: + mov tlq, rsp + mova [tlq+ 0], m0 + mova [tlq+32], m1 +.w16_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w16_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m1, m0 + movu m0, [tlq+r5*2] + vpblendvb m2, m6, m1, m2 + movu m1, [tlq+r5*2+2] + mova [dstq+strideq*0], m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*1], m0 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w16_loop +.w16_end_loop: + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_end_loop +.w16_end: + RET +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 8 + lea maxbased, [hq+31] + mov r3d, 63 + cmp hd, 32 + cmova maxbased, r3d + test angled, 0x400 + jnz .w32_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r3], m0 +.w32_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], 
m0 + sub r5d, 16 + jg .w32_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + jl .w32_filter_h8 + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r5d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r3+32], m0 + mov [r3+66], r5w + mov [r3+64], r2w + mov tlq, rsp + mov r3d, 65 + cmp hd, 64 + cmove maxbased, r3d + jmp .w32_main +.w32_filter_h8: + vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm3 + paddw xm0, xm1 + mov tlq, rsp + paddw xm0, xm2 + psrlw xm0, 2 + mova [r3+32], xm0 +.w32_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w32_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] + movu m1, [tlq+r3*2+34] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m7, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*1], m0 + dec hd + jz .w32_end + add dstq, strideq + cmp r5d, maxbased + jb .w32_loop +.w32_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + add dstq, strideq + dec hd + jg .w32_end_loop +.w32_end: + RET +.w64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [hq+63] + test angled, 0x400 + jnz .w64_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [hq+32] + psrlw m0, 2 + mova [r3], m0 +.w64_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], m0 + sub r5d, 16 + jg .w64_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + mov tlq, rsp + psrlw m0, 2 + mova [r3+32], m0 +.w64_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + paddw m8, m7, m7 ; -32 * 64 + psubw m3, m0 + paddw m9, m8, m7 ; -48 * 64 +.w64_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] 
+ movu m1, [tlq+r3*2+34] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*1], m0 + movu m0, [tlq+r3*2+64] + movu m1, [tlq+r3*2+66] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*2], m0 + movu m0, [tlq+r3*2+96] + movu m1, [tlq+r3*2+98] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m9, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*3], m0 + dec hd + jz .w64_end + add dstq, strideq + cmp r5d, maxbased + jb .w64_loop +.w64_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + mova [dstq+32*2], m6 + mova [dstq+32*3], m6 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + +cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy +%define base r9-z_filter_t0 + lea r9, [ipred_z2_16bpc_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea dxq, [dr_intra_derivative-90] + movsxd wq, [r9+wq*4] + mova m1, [tlq- 0] + movzx dyd, angleb + xor angled, 0x400 + mova m2, [tlq- 32] + mov r8, dxq + sub dxq, dyq + mova m3, [tlq- 64] + add wq, r9 + add r9, z_filter_t0-ipred_z2_16bpc_avx2_table + mova m4, [tlq- 96] + and dyd, ~1 + mova m5, [tlq-128] + and dxq, ~1 + movzx dyd, word [r8+dyq] ; angle - 90 + movzx dxd, word [dxq+270] ; 180 - angle + vpbroadcastd m11, [base+pw_62] + mova [rsp+128], m1 + mova [rsp+ 96], m2 + mova [rsp+ 64], m3 + neg dxd + mova [rsp+ 32], m4 + neg dyq + mova [rsp+ 0], m5 + jmp wq +.w4: + vbroadcasti128 m10, [base+z2_x_shuf] + vpbroadcastq m6, [base+z_base_inc+2] + lea r8d, [dxq+(65<<6)] ; xpos + mov r10d, (63-4)<<6 + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + movq xm0, [tlq+2] ; 1 2 3 4 + movq xm1, [tlq+0] ; 0 1 2 3 + pshuflw xm2, xm0, q3321 ; 2 3 4 4 + pshuflw xm3, xm1, q2100 ; 0 0 1 2 + vpbroadcastw xm4, r8m ; pixel_max + vbroadcasti128 m10, [base+z_upsample] + paddw xm1, xm0 + paddw xm2, xm3 + lea r8d, [r8+dxq+(1<<6)] + psubw xm2, xm1, xm2 + add dxd, dxd + psraw xm2, 3 + pxor xm3, xm3 + sub r10d, 3<<6 + paddw xm1, xm2 + paddw m6, m6 + pmaxsw xm1, xm3 + sub angled, 1075 ; angle - 53 + pavgw xm1, xm3 + lea r3d, [hq+3] + pminsw xm1, xm4 + xor angled, 0x7f ; 180 - angle + punpcklwd xm1, xm0 + movu [rsp+130], xm1 + call .filter_strength + jmp .w4_filter_left +ALIGN function_align +.filter_strength: + movd xm8, r3d + mov r3d, angled + movd xm7, angled + vpbroadcastb m8, xm8 + shr r3d, 8 ; is_sm << 1 + vpbroadcastb m7, xm7 + pcmpeqb m8, [base+z_filter_wh] + mova xm9, [r9+r3*8] + pand m0, m8, m7 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + ret +ALIGN function_align +.upsample_left: ; h4/h8 + mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 + movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 +%if STACK_ALIGNMENT < 32 + vpbroadcastw xm4, r8m ; pixel_max +%else + vpbroadcastw xm4, r9m ; r8m -> r9m due to call +%endif + cmp hd, 8 + je .upsample_left_h8 + pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 + pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 + jmp .upsample_left_end +.upsample_left_h8: + pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 + pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 +.upsample_left_end: + paddw xm1, xm0 + paddw xm2, xm3 + psubw xm2, xm1, xm2 + add dyq, dyq + psraw xm2, 3 + pxor xm3, xm3 + paddw xm1, xm2 + pmaxsw xm1, xm3 + pavgw xm1, xm3 + pminsw xm1, xm4 + punpcklwd xm2, xm0, xm1 + punpckhwd xm0, xm1 + mova [rsp+ 
96+gprsize], xm2 + mova [rsp+112+gprsize], xm0 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] + psrldq xm0, xm1, 2 ; 1 2 3 4 + pshuflw xm2, xm1, q2100 ; 0 0 1 2 + pmullw xm4, xm0 + pshuflw xm3, xm0, q3321 ; 2 3 4 4 + paddw xm1, xm3 + pshuflw xm3, xm0, q3332 ; 3 4 4 4 + pmullw xm1, xm5 + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] + paddw xm2, xm3 + vpbroadcastd xm3, r6m ; max_width + pmullw xm2, xm5 + packssdw xm3, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm3, [base+pw_1to16] + pxor xm4, xm4 + psrlw xm1, 3 + pminsw xm3, xm11 ; clip to byte range since there's no variable word blend + pavgw xm1, xm4 + vpblendvb xm1, xm0, xm3 + movq [rsp+130], xm1 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm0, [base+pb_90] + psubb xm0, xm7 ; 180 - angle + pand xm0, xm8 ; reuse from previous filter_strength call + pcmpgtb xm0, xm9 + pmovmskb r3d, xm0 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + vpbroadcastd m5, r7m ; max_height + cmp r3d, 3 + je .w4_filter_left_s3 + vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pmullw m2, m0 + cmp hd, 8 + jl .w4_filter_left_h4 + movu m4, [tlq-34] + punpcklwd m1, m0, m0 + vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + je .w4_filter_left_end + vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + jmp .w4_filter_left_end +.w4_upsample_left: + call .upsample_left + mov r11, -16 + vbroadcasti128 m9, [base+z_upsample] + jmp .w4_main_upsample_left +.w4_filter_left_s3: ; can only be h16 + movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m4, [base+pw_3] + paddw m1, m0, m2 + punpckhwd m2, m2 + vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + punpcklwd xm3, xm0, xm0 + paddw m2, m4 + vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d + paddw m1, m4 + pavgw m2, m3 + paddw m1, m2 + psrlw m1, 2 + jmp .w4_filter_left_end2 +.w4_filter_left_h4: + pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e +.w4_filter_left_end: + paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m3 + paddw m1, m2 + pxor m2, m2 + psrlw m1, 3 + pavgw m1, m2 +.w4_filter_left_end2: + packssdw m5, m5 + psubw m5, [base+pw_16to1] + pminsw m5, m11 + vpblendvb m1, m0, m5 + mova [rsp+96], m1 +.w4_main: + vbroadcasti128 m9, [base+z2_x_shuf] + mov r11, -8 +.w4_main_upsample_left: + movd xm5, dyd + mova m4, [base+z2_y_shuf_h4] + mov r2d, r8d + movd xm0, dxd + vpbroadcastw m5, xm5 + rorx r5, dyq, 5 + lea r8d, [dyq*3] + pmullw m5, [base+z2_ymul] + rorx r9, dyq, 4 + sar dyd, 6 + vpbroadcastw m0, xm0 + sar r8d, 6 + pand m5, m11 ; frac_y + neg dyd + psllw m5, 9 + add r5d, dyd + add r8d, dyd + add r9d, dyd + paddw m7, m0, m0 + lea dyq, [rsp+dyq*2+126] + vpblendd m0, m7, 0xcc + add dyq, r11 + neg r5d + paddw m1, m0, m7 + neg r8d + vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 + neg r9d + paddw m7, m7 + paddw m6, m0 +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm1, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movu xm3, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + 
vinserti128 m1, [rsp+r2*2], 1 + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + vinserti128 m3, [rsp+r3*2], 1 + pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 + pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 + pand m2, m11, m6 + punpcklqdq m0, m1, m3 + punpckhqdq m1, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + cmp r3d, 64 + jge .w4_toponly + movu xm2, [dyq] + vinserti128 m2, [dyq+r8*2], 1 + movu xm3, [dyq+r5*2] + vinserti128 m3, [dyq+r9*2], 1 + pshufb m2, m9 + pshufb m3, m9 + punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m2, m3 + psubw m2, m1 + pmulhrsw m2, m5 + psraw m3, m6, 15 ; base_x < topleft + paddw m1, m2 + vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 + vpblendvb m0, m1, m3 +.w4_toponly: + paddw m6, m7 ; xpos += dx + lea r3, [strideq*3] + add dyq, r11 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r2d, r10d + jge .w4_loop +.w4_leftonly_loop: + movu xm1, [dyq] + vinserti128 m1, [dyq+r8*2], 1 + movu xm2, [dyq+r5*2] + vinserti128 m2, [dyq+r9*2], 1 + add dyq, r11 + pshufb m1, m9 + pshufb m2, m9 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + psubw m1, m0 + pmulhrsw m1, m5 + paddw m0, m1 + vpermd m0, m4, m0 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + RET +.w8: + mov r10d, hd + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + xor r8d, r8d + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 + mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 + pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 + pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 + vpbroadcastw xm4, r8m ; pixel_max + paddw xm1, xm0 + paddw xm2, xm3 + not r8d + psubw xm2, xm1, xm2 + add dxd, dxd + psraw xm2, 3 + sub angled, 53 ; angle - 53 + pxor xm3, xm3 + paddw xm2, xm1 + lea r3d, [hq+7] + pmaxsw xm2, xm3 + xor angled, 0x7f ; 180 - angle + pavgw xm2, xm3 + pminsw xm2, xm4 + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 + movu [rsp+130], xm1 + movu [rsp+146], xm2 + call .filter_strength + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] + movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x + pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x + pmullw xm4, xm0 + pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x + paddw xm1, xm3 + vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x + paddw xm2, xm3 + vpbroadcastd xm3, r6m ; max_width + pmullw xm1, xm5 + pmullw xm2, xm6 + packssdw xm3, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm3, [base+pw_1to16] + pxor xm4, xm4 + psrlw xm1, 3 + pminsw xm3, xm11 + pavgw xm1, xm4 + vpblendvb xm1, xm0, xm3 + movu [rsp+130], xm1 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 +.w8_filter_left: + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + cmp r3d, 3 + jne .w8_filter_left_s12 + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m7, 
[base+pw_16] + cmp hd, 16 ; flags needed for later + jmp .filter_left_s3b +.w8_upsample_left: + call .upsample_left + vbroadcasti128 m7, [base+z2_y_shuf_us] + lea r11, [rsp+118] + mov r8, -8 + jmp .w8_main_upsample_left +.w16_filter_left_s12: + xor r8d, r8d +.w8_filter_left_s12: + mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + vpbroadcastd m5, r7m ; max_height + vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pmullw m2, m0 + cmp hd, 8 + jl .w8_filter_left_h4 + movu m4, [tlq-34] + punpcklwd m1, m0, m0 + vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + je .w8_filter_left_end + vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + jmp .w8_filter_left_end +.w8_filter_left_h4: + pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e +.w8_filter_left_end: + paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m3 + paddw m1, m2 + pxor m2, m2 + psrlw m1, 3 + pavgw m1, m2 + packssdw m5, m5 + psubw m5, [base+pw_16to1] + pminsw m5, m11 + vpblendvb m1, m0, m5 + mova [rsp+96], m1 + test r8d, r8d + jz .w8_main +; upsample_main + vbroadcasti128 m10, [base+z_upsample] + vbroadcasti128 m7, [base+z2_y_shuf] + lea r5, [rsp+120] + movd xm1, dyd + vbroadcasti128 m4, [base+z_base_inc+2] + movd xm2, dxd + vpbroadcastw m1, xm1 + vpbroadcastw m2, xm2 + mov r7, dstq + paddw m4, m4 + pmullw m0, m1, [base+z2_ymul8] + paddw m5, m2, m2 + psllw xm1, 3 + vpblendd m2, m5, 0xf0 + lea r2d, [dxq+(66<<6)] ; xpos + paddw m4, m2 + pshufd m6, m0, q2020 + psraw xm0, 6 + pxor xm1, xm1 + psubw xm8, xm1, xm0 + pand m6, m11 + punpckhwd xm9, xm8, xm1 + psllw m6, 9 + punpcklwd xm8, xm1 +.w8_upsample_above_loop: + lea r3d, [r2+dxq] + shr r2d, 6 + movu xm1, [rsp+r2*2] + movu xm2, [rsp+r2*2+16] + lea r2d, [r3+dxq] + shr r3d, 6 + vinserti128 m1, [rsp+r3*2], 1 + vinserti128 m2, [rsp+r3*2+16], 1 + pshufb m1, m10 + pshufb m2, m10 + punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 + punpckhqdq m1, m2 + pand m2, m11, m4 + psubw m1, m0 + psllw m2, 9 + pmulhrsw m1, m2 + paddw m0, m1 + cmp r3d, 64 + jge .w8_upsample_above_toponly + mova m1, m5 + vpgatherdq m3, [r5+xm9*2], m5 + mova m5, m1 + vpgatherdq m2, [r5+xm8*2], m1 + pshufb m3, m7 + pshufb m2, m7 + punpckldq m1, m2, m3 + punpckhdq m2, m3 + psubw m2, m1 + pmulhrsw m2, m6 + paddw m1, m2 + vpermq m1, m1, q3120 + psraw m2, m4, 15 + vpblendvb m0, m1, m2 +.w8_upsample_above_toponly: + paddw m4, m5 + sub r5, 4 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_ret + lea dstq, [dstq+strideq*2] + jmp .w8_upsample_above_loop +.w8_main: + vbroadcasti128 m7, [base+z2_y_shuf] + lea r11, [rsp+120] + mov r8, -4 +.w8_main_upsample_left: + movd xm1, dyd + vbroadcasti128 m4, [base+z_base_inc+2] + movd xm2, dxd + vpbroadcastw m1, xm1 + vpbroadcastw m2, xm2 + mov r7, dstq + pmullw m0, m1, [base+z2_ymul8] + paddw m5, m2, m2 + psllw xm1, 3 + vpblendd m2, m5, 0xf0 ; xpos0 xpos1 + lea r9d, [dxq+(65<<6)] ; xpos + paddw m4, m2 + movd [rsp+284], xm1 +.w8_loop0: + mov r2d, r9d + mova [rsp+288], m0 + mov r5, r11 + mova [rsp+320], m4 + pshufd m6, m0, q2020 + psraw xm0, 6 + pxor xm1, xm1 + psubw xm8, xm1, xm0 ; base_y + pand m6, m11 ; frac_y + punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 + psllw m6, 9 + punpcklwd xm8, xm1 ; base_y 0 1 4 5 +.w8_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2*2] + movu xm1, [rsp+r2*2+2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vinserti128 m0, [rsp+r3*2], 1 + vinserti128 m1, [rsp+r3*2+2], 1 + pand m2, m11, m4 + psubw m1, m0 + psllw m2, 9 + pmulhrsw m1, 
m2 + paddw m0, m1 + cmp r3d, 64 + jge .w8_toponly + mova m1, m5 + vpgatherdq m3, [r5+xm9*2], m5 + mova m5, m1 + vpgatherdq m2, [r5+xm8*2], m1 + pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 + pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 + punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m2, m3 + psubw m2, m1 + pmulhrsw m2, m6 + paddw m1, m2 + vpermq m1, m1, q3120 + psraw m2, m4, 15 ; base_x < topleft + vpblendvb m0, m1, m2 +.w8_toponly: + paddw m4, m5 ; xpos += dx + add r5, r8 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r2d, (63-8)<<6 + jge .w8_loop +.w8_leftonly_loop: + mova m0, m5 + vpgatherdq m4, [r5+xm9*2], m5 + mova m5, m0 + vpgatherdq m3, [r5+xm8*2], m0 + add r5, r8 + pshufb m2, m4, m7 + pshufb m1, m3, m7 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + psubw m1, m0 + pmulhrsw m1, m6 + paddw m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_leftonly_loop +.w8_end: + sub r10d, 1<<8 + jl .w8_ret + vpbroadcastd m0, [rsp+284] + add r7, 16 + paddw m0, [rsp+288] ; base_y += 8*dy + add r9d, 8<<6 + vpbroadcastd m4, [pw_512] + movzx hd, r10b + paddw m4, [rsp+320] ; base_x += 8*64 + mov dstq, r7 + jmp .w8_loop0 +.w8_ret: + RET +.w16: + movd xm0, [tlq+32] + lea r10d, [hq+(1<<8)] + movd [rsp+160], xm0 + test angled, 0x400 + jnz .w8_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + popcnt r3d, r3d + vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] + movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + punpcklwd xm2, xm1, xm1 + vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + punpckhwd m3, m0, m0 + pmullw m4, m0 + vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + paddw m1, m3 + vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g + paddw m2, m3 + vpbroadcastd m3, r6m ; max_width + pmullw m1, m5 + pmullw m2, m6 + packssdw m3, m3 + paddw m1, m4 + paddw m1, m2 + psubw m3, [base+pw_1to16] + pxor m4, m4 + psrlw m1, 3 + pminsw m3, m11 + pavgw m1, m4 + vpblendvb m1, m0, m3 + movu [rsp+130], m1 +.w16_no_filter_above: + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + cmp r3d, 3 + jne .w16_filter_left_s12 + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m7, [base+pw_16] + cmp hd, 4 + jne .filter_left_s3 + movq xm0, [tlq-8] ; 0 1 2 3 + movq xm1, [tlq-6] ; 1 2 3 4 + vpbroadcastd xm5, r7m ; max_height + movq xm4, [base+pw_16to1+24] ; 4to1 + pshuflw xm2, xm0, q2100 ; 0 0 1 2 + pshuflw xm3, xm1, q3321 ; 2 3 4 4 + paddw xm1, xm0 + paddw xm1, xm2 + pshuflw xm2, xm0, q1000 ; 0 0 0 1 + paddw xm3, xm6 + packssdw xm5, xm5 + pavgw xm2, xm3 + psubw xm5, xm4 + paddw xm1, xm2 + pminsw xm5, xm11 + psrlw xm1, 2 + vpblendvb xm1, xm0, xm5 + movq [rsp+120], xm1 + jmp .w8_main +.w32: + mova m2, [tlq+32] + movd xm0, [tlq+64] + lea r10d, [hq+(3<<8)] + mova [rsp+160], m2 + movd [rsp+192], xm0 + test angled, 0x400 + jnz .w8_main + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m0, r6m ; max_width + vpbroadcastd m7, [base+pw_16] + mov r3d, 32 + packssdw m0, m0 + psubw m0, [base+pw_1to16] + pminsw m8, m0, m11 + psubw m9, m8, m7 +.w32_filter_above: + movu m0, [tlq+2] + punpcklwd xm4, xm1, xm1 + paddw m2, m6, [tlq+6] + paddw m1, m0 + vpblendd m4, [tlq-2], 0xfe ; 0 
0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m1, [tlq+4] + movu m3, [tlq+r3+2] + paddw m5, m6, [tlq+r3-2] + pavgw m2, m4 + punpckhwd m4, m3, m3 + paddw m1, m2 + vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + pavgw m2, m5 + paddw m5, m3, [tlq+r3] + paddw m4, m5 + psrlw m1, 2 + paddw m2, m4 + vpblendvb m1, m0, m8 + psrlw m2, 2 + vpblendvb m2, m3, m9 + movu [rsp+130], m1 + movu [rsp+r3+130], m2 +.filter_left_s3: + cmp hd, 16 + jl .filter_left_s3_h8 ; h8 +.filter_left_s3b: + mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i + vpbroadcastd m5, r7m ; max_height + paddw m1, m0, m2 + punpckhwd m2, m2 + mov r3d, hd + vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + packssdw m5, m5 + not r3 + psubw m5, [base+pw_16to1] + paddw m2, m6 + pminsw m8, m11, m5 + je .filter_left_s3_end ; h16 + paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m2 + psrlw m1, 2 + vpblendvb m3, m1, m0, m8 + mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j + psubw m8, m7 + mova [rsp+96], m3 + jnp .filter_left_s3_end ; h32 + mova m5, [tlq-96] + paddw m1, [tlq-66] + pavgw m2, [tlq-68] + paddw m1, m2 + paddw m4, m5, [tlq-94] + paddw m2, m6, [tlq-92] + psrlw m1, 2 + paddw m4, [tlq- 98] + pavgw m2, [tlq-100] + vpblendvb m3, m1, m0, m8 + mova m0, [tlq-128] + psubw m8, m7 + paddw m4, m2 + paddw m1, m0, [tlq-126] + paddw m2, m6, [tlq-124] + psrlw m4, 2 + mova [rsp+64], m3 + vpblendvb m4, m5, m8 + psubw m8, m7 + mova [rsp+32], m4 +.filter_left_s3_end: + punpcklwd xm3, xm0, xm0 + vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m4 + pavgw m2, m3 + paddw m1, m2 + psrlw m1, 2 + vpblendvb m1, m0, m8 + mova [rsp+r3*2+130], m1 + jmp .w8_main +.filter_left_s3_h8: + mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 + movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 + pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 + vpbroadcastd xm5, r7m ; max_height + paddw xm1, xm0, xm3 + pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 + paddw xm1, xm2 + vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 + paddw xm3, xm6 + packssdw xm5, xm5 + pavgw xm2, xm3 + psubw xm5, [base+pw_16to1+16] ; 8to1 + paddw xm1, xm2 + pminsw xm5, xm11 + psrlw xm1, 2 + vpblendvb xm1, xm0, xm5 + mova [rsp+112], xm1 + jmp .w8_main +.w64: + mova m2, [tlq+ 32] + mova m3, [tlq+ 64] + mova m4, [tlq+ 96] + movd xm0, [tlq+128] + lea r10d, [hq+(7<<8)] + mova [rsp+160], m2 + mova [rsp+192], m3 + mova [rsp+224], m4 + movd [rsp+256], xm0 + test angled, 0x400 + jnz .w8_main + vpbroadcastd m6, [base+pw_3] + movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h + paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h + movu m4, [tlq+66] + paddw m3, m6, [tlq+62] + paddw m7, m4, [tlq+64] + pavgw m3, [tlq+70] + paddw m7, [tlq+68] + paddw m2, m5 + vpbroadcastd m5, r6m ; max_width + mov r3d, 96 + packssdw m5, m5 + paddw m3, m7 + psubw m5, [base+pw_1to16] + psrlw m2, 2 + vpbroadcastd m7, [base+pw_16] + psrlw m3, 2 + pminsw m8, m11, m5 + psubw m9, m8, m7 + vpblendvb m2, m0, m9 + psubw m9, m7 + vpblendvb m3, m4, m9 + psubw m9, m7 
+ movu [rsp+162], m2 + movu [rsp+194], m3 + jmp .w32_filter_above + +cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z3_16bpc_avx2_table] + tzcnt hd, hm + movifnidn angled, anglem + lea r7, [dr_intra_derivative+45*2-1] + sub tlq, 2 + movsxd hq, [r6+hq*4] + sub angled, 180 + add hq, r6 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + movzx dyd, word [r7+dyq] + vpbroadcastd m5, [pw_62] + mov org_wd, wd + jmp hq +.h4: + ALLOC_STACK -64, 7 + lea r7, [strideq*3] + cmp angleb, 40 + jae .h4_no_upsample + lea r4d, [angleq-1024] + sar r4d, 7 + add r4d, wd + jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) + mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + pshufd xm3, xm1, q0000 + paddw xm1, xm2 + paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastw xm4, r8m ; pixel_max + add dyd, dyd + psubw xm0, xm1, xm0 + mova [rsp+ 0], xm3 + movd xm3, dyd + psraw xm0, 3 + neg dyd + paddw xm1, xm0 + pxor xm0, xm0 + lea r2d, [dyq+(16<<6)+63] ; ypos + pmaxsw xm1, xm0 + pavgw xm1, xm0 + vpbroadcastw m3, xm3 + pminsw xm1, xm4 + punpckhwd xm0, xm1, xm2 + punpcklwd xm1, xm2 + paddw m2, m3, m3 + mova [rsp+32], xm0 + punpcklwd m3, m2 + mova [rsp+16], xm1 + paddw m4, m2, m2 + paddw m2, m3 + vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 +.h4_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 + movu xm1, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 + movu xm2, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 + vinserti128 m1, [rsp+r2*2], 1 + lea r2d, [r4+dyq] + shr r4d, 6 + vinserti128 m2, [rsp+r4*2], 1 + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m2, 16 + pblendw m1, m2, 0xaa + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m4 + paddw m1, m0 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + add dstq, 8 + sub wd, 4 + jg .h4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; h4/h8/h16 +%define base r4-z_filter_t0 + lea r4, [z_filter_t0] + movd xm0, maxbased + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + pand m0, m1 + mova xm1, [r4+angleq*8] + pcmpgtb m0, m1 + pmovmskb r5d, m0 + ret +.h4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea maxbased, [wq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d + mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 + movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + pmullw xm2, xm0 + pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm1, xm0, xm3 + movd [rsp+12], xm0 + pmullw xm1, xm4 + cmp r5d, 3 + jne .h4_filter_3tap + pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 + vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + movzx r4d, word [tlq-14] + movzx r2d, word [tlq-12] + inc maxbased + paddw xm1, xm2 + paddw xm0, xm3 + sub r2d, r4d + paddw xm2, xm0, xm0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+14], r2w +.h4_filter_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + lea tlq, [rsp+30] + psrlw xm1, 3 + cmp wd, 8 + sbb 
maxbased, -1 + pavgw xm0, xm1 + mova [rsp+16], xm0 +.h4_main: + movd xm3, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + lea r4d, [maxbaseq+3*64] + neg dyq + movd xm2, r4d + sub tlq, 8 + lea r4, [dyq+63] ; ypos + punpcklwd m1, m1 + paddw m0, m3, m3 + vpbroadcastw m2, xm2 + punpcklwd m3, m0 + paddw m4, m0, m0 + paddw m0, m3 + psubw m2, m1 + vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 + or maxbased, 63 + paddw m3, m2 +.h4_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base2 + vinserti128 m1, [tlq+r4*2], 1 + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + pand m2, m5, m3 + palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; ypos < max_base_y + paddw m3, m4 + paddw m1, m0 + vpblendvb m1, m6, m1, m2 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + sub wd, 4 + jz .h4_end + add dstq, 8 + cmp r4d, maxbased + jg .h4_loop +.h4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + movq [dstq+strideq*2], xm6 + movq [dstq+r7 ], xm6 + add dstq, 8 + sub wd, 4 + jg .h4_end_loop +.h4_end: + RET +.h8: + lea r4d, [angleq+216] + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 8 + mov r4b, wb + lea r7, [strideq*3] + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e + movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d + cmp wd, 8 + je .h8_upsample_w8 + pshufhw xm3, xm2, q1000 + vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d +.h8_upsample_w8: + paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastw m4, r8m ; pixel_max + add dyd, dyd + psubw m0, m1, m0 + movd xm6, dyd + psraw m0, 3 + neg dyd + paddw m1, m0 + pxor m0, m0 + pmaxsw m1, m0 + lea r4d, [dyq+(16<<6)+63] ; ypos + pavgw m1, m0 + vpbroadcastw m6, xm6 + pminsw m1, m4 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + vextracti128 [rsp+48], m0, 1 + vextracti128 [rsp+32], m1, 1 + paddw m7, m6, m6 + mova [rsp+16], xm0 + mova [rsp+ 0], xm1 + punpcklwd m6, m7 ; ypos0 ypos1 +.h8_upsample_loop: + lea r2d, [r4+dyq] + shr r4d, 6 ; base0 + movu m1, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base1 + movu m2, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base2 + movu m3, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base3 + movu m4, [rsp+r2*2] + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 + pslld m2, 16 + pblendw m1, m2, 0xaa + psrld m2, m3, 16 + pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m4, 16 + pblendw m3, m4, 0xaa + pand m4, m5, m6 + paddw m6, m7 + psllw m4, 9 + psubw m1, m0 + pmulhrsw m1, m4 + pand m4, m5, m6 + psllw m4, 9 + psubw m3, m2 + pmulhrsw m3, m4 + paddw m6, m7 + lea r2, [dstq+strideq*4] + paddw m1, m0 + paddw m3, m2 + punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + movhps [r2 +strideq*0], xm0 + movq [r2 
+strideq*1], xm0 + movhps [r2 +strideq*2], xm1 + movq [r2 +r7 ], xm1 + movhps [dstq+strideq*0], xm2 + movq [dstq+strideq*1], xm2 + movhps [dstq+strideq*2], xm3 + movq [dstq+r7 ], xm3 + add dstq, 8 + sub wd, 4 + jg .h8_upsample_loop + RET +.h8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(w+7, 15) + jmp .h8_main +.h8_no_upsample: + lea maxbased, [wq+7] + test angled, 0x400 + jnz .h8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h8_main + popcnt r5d, r5d + mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m0 + cmp wd, 8 + jl .h8_filter_w4 + punpcklwd xm0, xm0 + vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movd [rsp+28], xm0 + paddw m1, m3 + mov r4d, 16 + pmullw m1, m4 + cmovg maxbased, r4d + cmp r5d, 3 + jne .h8_filter_3tap + punpckhwd m3, m3 + vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + movzx r4d, word [tlq-30] + movzx r2d, word [tlq-28] + inc maxbased + paddw m1, m2 + paddw m0, m3 + sub r2d, r4d + paddw m2, m0, m0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+30], r2w + jmp .h8_filter_3tap +.h8_filter_w4: + pshufhw xm1, xm0, q2100 + vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e + paddw m1, m3 + pmullw m1, m4 +.h8_filter_3tap: + pxor m0, m0 + paddw m1, m2 + lea tlq, [rsp+62] + psrlw m1, 3 + pavgw m0, m1 + mova [rsp+32], m0 +.h8_main: + movd xm4, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+7*64] + neg dyq + movd xm2, r4d + sub tlq, 16 + lea r4, [dyq+63] + paddw m6, m4, m4 + vpbroadcastw m2, xm2 + vpblendd m4, m6, 0xf0 ; ypos0 ypos1 + psubw m2, m1 + or maxbased, 63 + paddw m4, m2 +.h8_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu xm0, [tlq+r4*2+2] + movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + vinserti128 m0, [tlq+r5*2+2], 1 + vinserti128 m1, [tlq+r5*2], 1 + lea r5, [r4+dyq] + sar r4, 6 ; base2 + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + psraw m3, m4, 15 + paddw m4, m6 + paddw m0, m1 + movu xm1, [tlq+r4*2+2] + movu xm2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m7, m0, m3 + vinserti128 m1, [tlq+r5*2+2], 1 + vinserti128 m2, [tlq+r5*2], 1 + pand m3, m5, m4 + psllw m3, 9 + psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + lea r5, [dstq+strideq*4] + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 + vextracti128 xm3, m2, 1 + punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c5 b7 d7 b6 d6 b5 d5 b4 d4 + punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm3, m0, 1 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 + punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h8_end + add dstq, 8 + cmp r4d, maxbased + jg .h8_loop + lea r6, [strideq*5] + lea r2, [strideq+r7*2] ; stride*7 + test wd, 4 + jz .h8_end_loop + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + movq [dstq+strideq*2], xm7 + movq [dstq+r7 ], xm7 + movq 
[dstq+strideq*4], xm7 + movq [dstq+r6 ], xm7 + movq [dstq+r7*2 ], xm7 + movq [dstq+r2 ], xm7 + add dstq, 8 + sub wd, 4 + jz .h8_end +.h8_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + mova [dstq+strideq*2], xm7 + mova [dstq+r7 ], xm7 + mova [dstq+strideq*4], xm7 + mova [dstq+r6 ], xm7 + mova [dstq+r7*2 ], xm7 + mova [dstq+r2 ], xm7 + add dstq, 16 + sub wd, 8 + jg .h8_end_loop +.h8_end: + RET +.h16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(w+15, 31) + jmp .h16_main +ALIGN function_align +.h16: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 10 + lea maxbased, [wq+15] + lea r7, [strideq*3] + test angled, 0x400 + jnz .h16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pmullw m1, m7 + paddw m1, m2 + cmp wd, 8 + jg .h16_filter_w16 + mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 + pmullw xm6, xm3 + jl .h16_filter_w4 + pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 +.h16_filter_w8_5tap: + punpckhwd m0, m0 + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw xm4, xm4 + paddw m0, m0 + paddw xm6, xm4 + paddw m1, m0 +.h16_filter_w8_3tap: + paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 + pmullw xm3, xm7 + pxor m0, m0 + paddw xm3, xm6 + psrlw xm3, 3 + pavgw xm3, xm0 + mova [rsp+48], xm3 + jmp .h16_filter_end +.h16_filter_w4: + pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 + jmp .h16_filter_w8_5tap +.h16_filter_w16: + mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m6, m3 + punpcklwd xm3, xm3 + vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + mov r4d, 32 + cmp wd, 16 + cmovg maxbased, r4d + movd [rsp+28], xm3 + pmullw m4, m7 + cmp r5d, 3 + jne .h16_filter_w16_3tap + punpckhwd m0, m0 + vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movzx r4d, word [tlq-62] + movzx r2d, word [tlq-60] + or maxbased, 1 + paddw m3, m3 + sub r2d, r4d + paddw m0, m0 + lea r2d, [r2+r4*8+4] + paddw m4, m3 + shr r2d, 3 + paddw m1, m0 + mov [rsp+30], r2w +.h16_filter_w16_3tap: + pxor m0, m0 + paddw m4, m6 + psrlw m4, 3 + pavgw m4, m0 + mova [rsp+32], m4 +.h16_filter_end: + psrlw m1, 3 + lea tlq, [rsp+94] + pavgw m1, m0 + mova [rsp+64], m1 +.h16_main: + movd xm8, dyd + neg maxbaseq + vpbroadcastw m9, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m8, xm8 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm7, r4d + sub tlq, 32 + lea r4, [dyq+63] + vpbroadcastw m7, xm7 + or maxbased, 63 + psubw m7, [z_base_inc] +.h16_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu m0, [tlq+r4*2+2] + movu m2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu m1, [tlq+r5*2+2] + movu m3, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base3 + pand m6, m5, m7 + psllw m6, 9 + psubw m2, m0 + pmulhrsw m2, m6 + psraw m6, 
m7, 15 + paddw m7, m8 + paddw m0, m2 + movu m2, [tlq+r4*2+2] + movu m4, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m9, m0, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m3, m1 + pmulhrsw m3, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m1, m3 + vpblendvb m1, m9, m1, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m2 + pmulhrsw m4, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m2, m4 + movu m3, [tlq+r5*2+2] + movu m4, [tlq+r5*2] + vpblendvb m2, m9, m2, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m3 + pmulhrsw m4, m6 + psraw m6, m7, 15 + paddw m7, m8 + lea r5, [dstq+strideq*4] + paddw m3, m4 + vpblendvb m3, m9, m3, m6 + punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 + punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 + vextracti128 xm6, m3, 1 + punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 + punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 + vextracti128 xm2, m4, 1 + movhps [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + vextracti128 xm6, m1, 1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + vextracti128 xm2, m0, 1 + movhps [r5 +strideq*0], xm6 + movq [r5 +strideq*1], xm6 + movhps [r5 +strideq*2], xm2 + movq [r5 +r7 ], xm2 + lea r5, [dstq+strideq*8] + movhps [r5 +strideq*0], xm3 + movq [r5 +strideq*1], xm3 + movhps [r5 +strideq*2], xm4 + movq [r5 +r7 ], xm4 + lea r5, [r5+strideq*4] + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h16_end + add dstq, 8 + cmp r4d, maxbased + jg .h16_loop + mov hd, 4 +.h16_end_loop0: + mov r6d, wd + mov r2, dstq + test wb, 4 + jz .h16_end_loop + movq [dstq+strideq*0], xm9 + movq [dstq+strideq*1], xm9 + movq [dstq+strideq*2], xm9 + movq [dstq+r7 ], xm9 + and r6d, 120 + jz .h16_end_w4 + add dstq, 8 +.h16_end_loop: + mova [dstq+strideq*0], xm9 + mova [dstq+strideq*1], xm9 + mova [dstq+strideq*2], xm9 + mova [dstq+r7 ], xm9 + add dstq, 16 + sub r6d, 8 + jg .h16_end_loop +.h16_end_w4: + lea dstq, [r2+strideq*4] + dec hd + jg .h16_end_loop0 +.h16_end: + RET +.h32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 9 + lea maxbased, [wq+31] + and maxbased, 31 + or maxbased, 32 ; imin(w+31, 63) + test angled, 0x400 + jnz .h32_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, [rsp+128] + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r4], m0 +.h32_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h32_filter_loop + jl .h32_filter_h8 + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 
0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq-62] + movzx r2d, word [tlq-60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r4-32], m0 + mov [r4-36], r5w + mov [r4-34], r2w + lea tlq, [rsp+158] + mov r4d, 65 + cmp wd, 64 + cmove maxbased, r4d + jmp .h32_main +.h32_filter_h8: + mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 + vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 + lea tlq, [rsp+158] + pavgw xm2, xm3 + paddw xm0, xm1 + paddw xm0, xm2 + psrlw xm0, 2 + mova [r4-16], xm0 +.h32_main: + movd xm6, dyd + neg maxbaseq + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m6, xm6 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm4, r4d + vpbroadcastd m8, [pw_m1024] + lea r4, [dyq+63] + vpbroadcastw m4, xm4 + or maxbased, 63 + psubw m4, [z_base_inc] +.h32_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + pcmpgtw m2, m8, m4 + paddw m0, m1 + vpblendvb m0, m7, m0, m2 + movu m2, [tlq+r5*2-32] + movu m1, [tlq+r5*2-30] + add r4, dyq + sub rsp, 64 + psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + mova [rsp+32*0], m0 + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + mova [rsp+32*1], m1 + dec wd + jz .h32_transpose + cmp r4d, maxbased + jg .h32_loop +.h32_end_loop: + sub rsp, 64 + mova [rsp+32*0], m7 + mova [rsp+32*1], m7 + dec wd + jg .h32_end_loop +.h32_transpose: + lea r3, [strideq*3] + lea r4, [strideq*5] + mov r8, dstq + lea r5, [strideq+r3*2] +.h32_transpose_loop0: + lea r6, [rsp+32] + lea r2, [r8+org_wq*2-16] +.h32_transpose_loop: + mova m0, [r6+64*7] + mova m1, [r6+64*6] + mova m2, [r6+64*5] + mova m3, [r6+64*4] + mova m4, [r6+64*3] + mova m5, [r6+64*2] + mova m6, [r6+64*1] + mova m7, [r6+64*0] + punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 + punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 + punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 + punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 + punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 + lea dstq, [r2+strideq*8] + sub r6, 32 + punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 + punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 + punpckhqdq m5, m7, m1 ; 8 0 + vextracti128 [r2 +strideq*0], m5, 1 + punpcklqdq m7, m1 ; 9 1 + mova [dstq+strideq*0], xm5 + punpckhqdq m1, m8, m3 ; 10 2 + vextracti128 [r2 +strideq*1], m7, 1 + punpcklqdq m8, m3 ; 11 3 + mova [dstq+strideq*1], xm7 + punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 + vextracti128 [r2 +strideq*2], m1, 1 + punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 + mova [dstq+strideq*2], xm1 + punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 + vextracti128 [r2 +r3 ], m8, 1 + punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 + mova [dstq+r3 ], xm8 + punpckhqdq m6, m3, m2 ; 12 4 + vextracti128 [r2 +strideq*4], m6, 1 + punpcklqdq m3, m2 ; 13 5 + mova [dstq+strideq*4], xm6 + punpckhqdq m2, m0, m4 ; 14 6 + vextracti128 [r2 +r4 ], m3, 1 + punpcklqdq m0, m4 ; 15 7 + mova [dstq+r4 ], xm3 + vextracti128 [r2 +r3*2 ], m2, 1 + mova [dstq+r3*2 ], xm2 + vextracti128 [r2 +r5 ], m0, 1 + mova [dstq+r5 ], xm0 + lea r2, [dstq+strideq*8] + cmp r6, rsp + jae .h32_transpose_loop + add rsp, 
64*8 + sub org_wd, 8 + jg .h32_transpose_loop0 +.h32_end: + RET +.h64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [wq+63] + test angled, 0x400 + jnz .h64_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, [rsp+224] + paddw m0, m1 + lea r5d, [wq+32] + psrlw m0, 2 + mova [r4], m0 +.h64_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h64_filter_loop + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + lea tlq, [rsp+254] + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + psrlw m0, 2 + mova [r4-32], m0 +.h64_main: + neg maxbaseq + movd xm4, dyd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + vpbroadcastd m7, [pw_m1024] + movd xm3, r4d + lea r4, [dyq+63] + paddw m8, m7, m7 + vpbroadcastw m3, xm3 + or maxbased, 63 + paddw m9, m8, m7 + psubw m3, [z_base_inc] +.h64_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-128] + movu m0, [tlq+r5*2-126] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + sub rsp, 128 + paddw m0, m1 + pcmpgtw m1, m9, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*0], m0 + movu m1, [tlq+r5*2-96] + movu m0, [tlq+r5*2-94] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*1], m0 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*2], m0 + movu m1, [tlq+r5*2-32] + movu m0, [tlq+r5*2-30] + psubw m1, m0 + pmulhrsw m1, m2 + add r4, dyq + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [rsp+32*3], m0 + dec wd + jz .h64_transpose + cmp r4d, maxbased + jg .h64_loop +.h64_end_loop: + sub rsp, 128 + mova [rsp+32*0], m6 + mova [rsp+32*1], m6 + mova [rsp+32*2], m6 + mova [rsp+32*3], m6 + dec wd + jg .h64_end_loop +.h64_transpose: + lea r2, [strideq*3] + lea r3, [strideq*5] + mov r5, dstq + lea r4, [strideq+r2*2] +.h64_transpose_loop0: + lea r6, [rsp+112] + lea dstq, [r5+org_wq*2-32] +.h64_transpose_loop: + mova xm0, [r6+128*15] + vinserti128 m0, [r6+128* 7], 1 + mova xm1, [r6+128*14] + vinserti128 m1, [r6+128* 6], 1 + mova xm2, [r6+128*13] + vinserti128 m2, [r6+128* 5], 1 + mova xm3, [r6+128*12] + vinserti128 m3, [r6+128* 4], 1 + mova xm4, [r6+128*11] + vinserti128 m4, [r6+128* 3], 1 + mova xm5, [r6+128*10] + vinserti128 m5, [r6+128* 2], 1 + mova xm6, [r6+128* 9] + vinserti128 m6, [r6+128* 1], 1 + mova xm7, [r6+128* 8] + vinserti128 m7, [r6+128* 0], 1 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + sub r6, 16 + punpckhdq m7, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpckhqdq m5, m7, m1 + 
punpcklqdq m7, m1 + punpckhqdq m1, m8, m3 + punpcklqdq m8, m3 + punpckhdq m3, m0, m2 + mova [dstq+strideq*0], m5 + punpckldq m0, m2 + mova [dstq+strideq*1], m7 + punpckhdq m2, m4, m6 + mova [dstq+strideq*2], m1 + punpckldq m4, m6 + mova [dstq+r2 ], m8 + punpckhqdq m6, m3, m2 + mova [dstq+strideq*4], m6 + punpcklqdq m3, m2 + mova [dstq+r3 ], m3 + punpckhqdq m2, m0, m4 + mova [dstq+r2*2 ], m2 + punpcklqdq m0, m4 + mova [dstq+r4 ], m0 + lea dstq, [dstq+strideq*8] + cmp r6, rsp + jae .h64_transpose_loop + add rsp, 128*16 + sub org_wd, 16 + jg .h64_transpose_loop0 +.h64_end: + RET + +%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax +%ifnum %4 + pshufb xm%2, xm%4 +%else + pshufb xm%2, %4 +%endif + vinserti128 m%2, xm%2, 1 + pshufd m%1, m%2, q0000 + pmaddwd m%1, m2 + pshufd m%3, m%2, q1111 + pmaddwd m%3, m3 + paddd m%1, m1 + paddd m%1, m%3 + pshufd m%3, m%2, q2222 + pmaddwd m%3, m4 + paddd m%1, m%3 + pshufd m%3, m%2, q3333 + pmaddwd m%3, m5 + paddd m%1, m%3 + psrad m%1, 4 + packusdw m%1, m%1 + pminsw m%1, m%5 +%endmacro + +%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax + pshufb m%2, m%6 + vpermq m%4, m%2, q3232 + vinserti128 m%2, xm%2, 1 + pshufd m%1, m%2, q0000 + pshufd m%3, m%4, q0000 + pmaddwd m%1, m2 + pmaddwd m%3, m2 + paddd m%1, m1 + paddd m%3, m1 + pshufd m%5, m%2, q1111 + pmaddwd m%5, m3 + paddd m%1, m%5 + pshufd m%5, m%4, q1111 + pmaddwd m%5, m3 + paddd m%3, m%5 + pshufd m%5, m%2, q2222 + pmaddwd m%5, m4 + paddd m%1, m%5 + pshufd m%5, m%4, q2222 + pmaddwd m%5, m4 + paddd m%3, m%5 + pshufd m%5, m%2, q3333 + pmaddwd m%5, m5 + paddd m%1, m%5 + pshufd m%5, m%4, q3333 + pmaddwd m%5, m5 + paddd m%3, m%5 + psrad m%1, 4 + psrad m%3, 4 + packusdw m%1, m%3 + pminsw m%1, m%7 +%endmacro + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. One redundant +; block is calculated for w8 and w16, two for w32. 
+; w4 w8 w16 w32 +; 1 1 2 1 2 3 5 1 2 3 5 b c d f +; 2 2 3 2 4 5 7 2 4 5 7 c e f h +; 3 3 4 4 6 7 9 4 6 7 9 e g h j +; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ +; 5 8 8 i + +cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter +%assign org_stack_offset stack_offset +%define base r6-ipred_filter_16bpc_avx2_table + lea r6, [filter_intra_taps] + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + add filterq, r6 + lea r6, [ipred_filter_16bpc_avx2_table] + vbroadcasti128 m0, [tlq-6] + movsxd wq, [r6+wq*4] + vpbroadcastd m1, [base+pd_8] + pmovsxbw m2, [filterq+16*0] + pmovsxbw m3, [filterq+16*1] + pmovsxbw m4, [filterq+16*2] + pmovsxbw m5, [filterq+16*3] + add wq, r6 + mov hd, hm + jmp wq +.w4: + WIN64_SPILL_XMM 10 + mova xm8, [base+filter_shuf2] + vpbroadcastw m9, r8m ; bitdepth_max + lea r7, [6+hq*2] + sub tlq, r7 + jmp .w4_loop_start +.w4_loop: + pinsrq xm0, [tlq+hq*2], 0 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER_1BLK 6, 0, 7, 8, 9 + vextracti128 xm0, m6, 1 + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm0 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vbroadcasti128 m14, [base+filter_shuf3] + vpbroadcastw m15, r8m ; bitdepth_max + FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15 + vpermq m6, m10, q1302 ; ____ ____ | ____ 4321 + pslldq m8, m0, 4 + psrldq m7, m6, 2 + psrldq m0, m6, 10 + punpcklwd m7, m0 + vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 + vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 + vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 + lea r7, [16+hq*2] + sub tlq, r7 + jmp .w8_loop_start +.w8_loop: + vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 + vpermq m6, m9, q2031 + psrldq m0, m6, 2 + psrldq m6, 10 + punpcklwd m6, m0 + vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 + vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 + mova m10, m9 +.w8_loop_start: + vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 + call .main + vpblendd m10, m9, 0xCC + mova [dstq+strideq*0], xm10 + vextracti128 [dstq+strideq*1], m10, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + %assign stack_offset stack_offset - stack_size_padded + ALLOC_STACK 32, 16 + vpbroadcastw m15, r8m ; bitdepth_max + sub hd, 2 + TAIL_CALL .w16_main, 0 +.w16_main: + mova xm10, [base+filter_shuf2] + FILTER_1BLK 13, 0, 6, 10, 15 + vpermq m12, m13, q3120 + mova xm14, [base+filter_shuf3] + vinserti128 m14, [base+filter_shuf1], 1 + vpbroadcastq m0, [tlq+10] + vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ + psrldq m6, m12, 8 + vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 + punpcklwd m6, m12 + vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 + vpblendd m13, m12, 0xCC + vpermq m12, m12, q2031 ; 6___ 5___ + psrldq xm6, xm12, 2 + psrldq xm8, xm12, 12 + vpblendd xm6, xm8, 0x01 + pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ + FILTER_1BLK 11, 6, 8, 10, 15 + vpermq m11, m11, q3120 + pshufd m9, m11, q1032 + movu m8, [tlq+6] ; __43 210_ | ____ ____ + pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ + pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ + vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 + lea r7, [20+hq*2] + sub tlq, r7 + vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 + jmp .w16_loop_start +.w16_loop: + vpermq m13, m13, q3322 + vpermq m11, m9, q2020 + vpermq m9, m9, q1302 + vpermq m6, m12, 
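The FILTER_1BLK and FILTER_2BLK macros above compute each output of a 4x2 block as a 7-tap dot product over the block's top-left, four top and two left neighbours; a scalar sketch of that arithmetic (taps come from filter_intra_taps for the selected mode, the rounding matches the pd_8 bias plus psrad 4):

    #include <stdint.h>

    /* p[] holds { topleft, t0, t1, t2, t3, l0, l1 } and taps[] the signed
     * 8-bit coefficients for one output position; the sum is rounded by
     * (sum + 8) >> 4 and clamped to the bitdepth (packusdw + pminsw). */
    static inline uint16_t filter_intra_px(const int8_t taps[7],
                                           const uint16_t p[7], int bitdepth_max)
    {
        int sum = 8;
        for (int i = 0; i < 7; i++)
            sum += taps[i] * p[i];
        sum >>= 4;
        if (sum < 0) sum = 0;
        if (sum > bitdepth_max) sum = bitdepth_max;
        return (uint16_t)sum;
    }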
q0123 + psrldq m7, 4 + vpblendd m13, m10, 0xCC + vpblendd m9, m7, 0x40 + mova m0, [rsp+8] + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 +.w16_loop_start: + mova m13, m12 + vpblendd m0, [tlq+hq*2], 0x0C + psrldq m7, m12, 8 + punpcklwd m7, m12 + vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 + vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 + vpermq m12, m10, q2031 + mova [rsp+8], m0 + psrldq m8, m11, 8 + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 10 + psrldq xm0, xm13, 2 + punpcklwd m8, m11 + punpcklwd xm7, xm6 + vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 + vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 + vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 + call .main + vpermq m8, m11, q3120 + vpblendd m6, m8, m9, 0xCC + mova [dstq+strideq*0+16], xm6 + vextracti128 [dstq+strideq*1+16], m6, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + vpermq m8, m9, q3120 + vextracti128 xm0, m8, 1 ; 4321 ____ + pshufd xm11, xm11, q1032 + vpblendd xm0, xm11, 0x02 ; 4321 0___ + psrldq xm6, xm8, 2 + psrldq xm7, xm8, 12 + pblendw xm0, xm6, 0x4 ; 4321 05__ + pblendw xm0, xm7, 0x2 ; 4321 056_ + FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 + vpermq m12, m13, q1302 + vpblendd m12, m10, 0xCC + vpblendd m9, m6, 0xCC + mova [dstq+strideq*0+ 0], xm12 + mova [dstq+strideq*0+16], xm9 + vextracti128 [dstq+strideq*1+ 0], m12, 1 + vextracti128 [dstq+strideq*1+16], m9, 1 + ret +ALIGN function_align +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK 64, 16 + vpbroadcastw m15, r8m ; bitdepth_max + sub hd, 2 + lea r3, [dstq+32] + lea r5d, [hd*2+20] + call .w16_main + mov dstq, r3 + lea tlq, [tlq+r5+32] + sub r5d, 20 + shr r5d, 1 + sub r5d, 2 + lea r4, [dstq+strideq*2-2] +DEFINE_ARGS dst, stride, tl, stride3, left, h + lea stride3q, [strideq*3] + movu m8, [tlq-6] ; 4321 0___ + mova xm10, [base+filter_shuf2] + pinsrw xm0, xm8, [dstq+strideq*0-2], 2 + pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_ + pinsrw xm9, [leftq+strideq*0], 5 + pinsrw xm9, [leftq+strideq*1], 4 + FILTER_1BLK 13, 0, 6, 10, 15 + vpermq m12, m13, q3120 + mova xm14, [base+filter_shuf3] + vinserti128 m14, [base+filter_shuf1], 1 + psrldq m6, m12, 8 + punpcklwd m7, m6, m12 + vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321 + vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321 + vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321 + vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321 + FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 + vpblendd m13, m12, 0xCC + pinsrw xm9, [leftq+strideq*2], 3 + pinsrw xm9, [leftq+stride3q ], 2 + lea leftq, [leftq+strideq*4] + pinsrw xm9, [leftq+strideq*0], 1 + pinsrw xm9, [leftq+strideq*1], 0 + movq [rsp+32], xm9 + mov r7d, 1 + pslldq m8, m9, 4 + vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____ + vpermq m12, m12, q2031 ; 6___ 5___ + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 12 + vpblendd xm6, xm7, 0x01 ; ____ _56_ + pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ + FILTER_1BLK 11, 6, 7, 10, 15 + vpermq m11, m11, q3120 + pshufd m9, m11, q1032 + vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____ + pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ + pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ + vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 + vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 + jmp .w32_loop_start +.w32_loop_last: + mova m0, [rsp+0] + jmp .w32_loop +.w32_loop_left: + mova m0, [rsp+0] + vpblendd m0, [rsp+32+r7*4-12], 0x0C + dec r7d + jg .w32_loop + cmp hd, 2 + je .w32_loop + pinsrw xm6, [rsp+32], 6 + pinsrw xm6, 
[leftq+strideq*2], 5 + pinsrw xm6, [leftq+stride3q ], 4 + lea leftq, [leftq+strideq*4] + pinsrw xm6, [leftq+strideq*0], 3 + pinsrw xm6, [leftq+strideq*1], 2 + pinsrw xm6, [leftq+strideq*2], 1 + pinsrw xm6, [leftq+stride3q ], 0 + lea leftq, [leftq+strideq*4] + movu [rsp+36], xm6 + pinsrw xm6, [leftq+strideq*0], 1 + pinsrw xm6, [leftq+strideq*1], 0 + movd [rsp+32], xm6 + mov r7d, 4 +.w32_loop: + vpermq m13, m13, q3322 + vpermq m11, m9, q2020 + vpermq m9, m9, q1302 + vpermq m6, m12, q0123 + psrldq m7, 4 + vpblendd m13, m10, 0xCC + vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 +.w32_loop_start: + mova m13, m12 + psrldq m7, m12, 8 + punpcklwd m7, m12 + vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 + vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 + vpermq m12, m10, q2031 + mova [rsp+0], m0 + psrldq m8, m11, 8 + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 10 + psrldq xm0, xm13, 2 + punpcklwd m8, m11 + punpcklwd xm7, xm6 + vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 + vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 + vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 + call .main + vpermq m8, m11, q3120 + vpblendd m6, m8, m9, 0xCC + mova [dstq+strideq*0+16], xm6 + vextracti128 [dstq+strideq*1+16], m6, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop_left + jz .w32_loop_last + vpermq m8, m9, q3120 + vextracti128 xm0, m8, 1 ; 4321 ____ + pshufd xm11, xm11, q1032 + vpblendd xm0, xm11, 0x02 ; 4321 0___ + psrldq xm6, xm8, 2 + psrldq xm7, xm8, 12 + pblendw xm0, xm6, 0x4 ; 4321 05__ + pblendw xm0, xm7, 0x2 ; 4321 056_ + FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 + vpermq m12, m13, q1302 + vpblendd m12, m10, 0xCC + vpblendd m9, m6, 0xCC + mova [dstq+strideq*0+ 0], xm12 + mova [dstq+strideq*0+16], xm9 + vextracti128 [dstq+strideq*1+ 0], m12, 1 + vextracti128 [dstq+strideq*1+16], m9, 1 + RET +.main: + FILTER_2BLK 9, 8, 6, 7, 0, 14, 15 + ret + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + add tlq, 2 + movd xm4, wd + pxor m6, m6 + vpbroadcastw m7, r7m + pavgw xm4, xm6 + tzcnt wd, wd + movd xm5, wd + movu m0, [tlq] + lea t0, [ipred_cfl_left_16bpc_avx2_table] + movsxd r6, [t0+wq*4] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + sub tlq, hq + movd xm4, hd + sub tlq, hq + pxor m6, m6 + vpbroadcastw m7, r7m + pavgw xm4, xm6 + tzcnt r6d, hd + movd xm5, r6d + movu m0, [tlq] + lea t0, [ipred_cfl_left_16bpc_avx2_table] + movsxd r6, [t0+r6*4] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table + tzcnt wd, wd + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + paddw m0, [tlq+32] +.h16: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h8: + psrldq xm1, xm0, 8 + paddw xm0, xm1 +.h4: + punpcklwd xm0, xm6 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + paddd xm0, xm4 + psrld xm0, xm5 + vpbroadcastw m0, xm0 + jmp wq + +cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd 
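The IPRED_CFL macro above splits the chroma-from-luma scaling into sign and magnitude so one pmulhrsw (against abs(alpha) << 9, set up in the .s* blocks below) can do the rounded multiply; the scalar equivalent is roughly:

    #include <stdint.h>
    #include <stdlib.h>

    /* dc is the splatted base value, ac the mean-subtracted luma sample and
     * alpha the signed CfL scale; diff = alpha * ac is rounded to 1/64 units
     * with its sign reapplied, then added to dc and clamped to the bitdepth
     * (the pmaxsw/pminsw in the callers). */
    static inline uint16_t cfl_px(int dc, int ac, int alpha, int bitdepth_max)
    {
        const int diff = alpha * ac;
        const int m = (abs(diff) + 32) >> 6;
        int px = dc + (diff < 0 ? -m : m);
        if (px < 0) px = 0;
        if (px > bitdepth_max) px = bitdepth_max;
        return (uint16_t)px;
    }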
xm4, t0d + tzcnt t0d, t0d + movd xm5, t0d + lea t0, [ipred_cfl_16bpc_avx2_table] + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + psrlw xm4, 1 + pxor m6, m6 + vpbroadcastw m7, r7m + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movq xm0, [tlq-8] + jmp wq +.w4: + movq xm1, [tlq+2] + paddw m0, m4 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrld m1, m0, 16 + paddw m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + lea r2d, [hq*2] + mov r6d, 0xAAAB6667 + shrx r6d, r6d, r2d + punpckhwd xm1, xm0, xm6 + punpcklwd xm0, xm6 + paddd xm0, xm1 + movd xm1, r6d + psrld xm0, 2 + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w4_end: + vpbroadcastw m0, xm0 +.s4: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + IPRED_CFL 4 + pmaxsw m4, m6 + pminsw m4, m7 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm5 + movhps [dstq+strideq*1], xm4 + movhps [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + mova xm0, [tlq-16] + jmp wq +.w8: + vextracti128 xm1, m0, 1 + paddw xm0, [tlq+2] + paddw xm0, xm4 + paddw xm0, xm1 + psrld xm1, xm0, 16 + paddw xm0, xm1 + pblendw xm0, xm6, 0xAA + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w8_end: + vpbroadcastw m0, xm0 +.s8: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, m7 + mova [dstq+strideq*0], xm4 + mova [dstq+strideq*2], xm5 + vextracti128 [dstq+strideq*1], m4, 1 + vextracti128 [dstq+r6 ], m5, 1 + lea dstq, [dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova m0, [tlq-32] + jmp wq +.w16: + paddw m0, [tlq+2] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpckhwd xm1, xm0, xm6 + punpcklwd xm0, xm6 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, m7 + mova [dstq+strideq*0], m4 + mova [dstq+strideq*1], m5 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-32] + jmp wq +.w32: + paddw m0, [tlq+ 2] + paddw m0, [tlq+34] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm6 + punpckhwd xm0, xm6 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x6667AAAB + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w32_end: + vpbroadcastw m0, xm0 +.s32: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, 
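The 0xAAAB and 0x6667 constants in the rounding paths above (.w4_mul and friends, and again in the SSSE3 file further down) are fixed-point reciprocals: once the power-of-two factor of the divisor has been shifted out, the leftover divide by 3 or by 5 is an unsigned multiply-high plus one extra shift. A small illustration with hypothetical helper names:

    /* 0xAAAB ~ 2^17 / 3 and 0x6667 ~ 2^17 / 5; pmulhuw keeps the top 16 bits
     * of the 16x16 product (>> 16) and the following psrlw 1 completes the
     * >> 17.  This is exact for the limited ranges the DC/CfL sums take at
     * that point, not a general-purpose division. */
    static inline unsigned div3_approx(unsigned x) { return (x * 0xAAABu) >> 17; }
    static inline unsigned div5_approx(unsigned x) { return (x * 0x6667u) >> 17; }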
m7 + mova [dstq+32*0], m4 + mova [dstq+32*1], m5 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + mov r6d, r7m + shr r6d, 11 + lea t0, [ipred_cfl_splat_16bpc_avx2_table] + tzcnt wd, wd + movifnidn hd, hm + movsxd wq, [t0+wq*4] + vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] + pxor m6, m6 + vpbroadcastw m7, r7m + add wq, t0 + movifnidn acq, acmp + jmp wq + +cglobal ipred_cfl_ac_420_16bpc, 4, 9, 6, ac, ypx, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pw_2] + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 +DEFINE_ARGS ac, ypx, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + mova xm0, [ypxq+strideq*2] + mova xm1, [ypxq+stride3q ] + vinserti128 m0, [ypxq+strideq*0], 1 + vinserti128 m1, [ypxq+strideq*1], 1 + pmaddwd m0, m2 + pmaddwd m1, m2 + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd m4, m0 + packssdw xm1, xm0 + mova [acq], xm1 + lea ypxq, [ypxq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m1, m1, q1111 + pslld xm0, 2 +.w4_hpad_loop: + mova [acq], m1 + paddd m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg +.w8: + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + pmaddwd m0, m2, [ypxq+strideq*0] + pmaddwd m1, m2, [ypxq+strideq*1] + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd m4, m0 + packssdw xm1, xm0, xm1 + mova [acq], xm1 + lea ypxq, [ypxq+strideq*2] + add acq, 16 + dec hd + jg .w8_loop + jmp .w8_hpad +.w8_wpad: + pmaddwd xm0, xm2, [ypxq+strideq*0] + pmaddwd xm3, xm2, [ypxq+strideq*1] + paddd xm0, xm3 + pshufd xm3, xm0, q3333 + packssdw xm1, xm0, xm3 + paddd xm0, xm3 + paddd xm4, xm0 + mova [acq], xm1 + lea ypxq, [ypxq+strideq*2] + add acq, 16 + dec hd + jg .w8_wpad +.w8_hpad: + test hpadd, hpadd + jz .calc_avg + vinserti128 m1, xm1, 1 + paddd m0, m0 +.w8_hpad_loop: + paddd m4, m0 + mova [acq], m1 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmaddwd m0, m2, [ypxq+strideq*0+ 0] + pmaddwd m1, m2, [ypxq+strideq*1+ 0] + pmaddwd m3, m2, [ypxq+strideq*0+32] + pmaddwd m5, m2, [ypxq+strideq*1+32] + paddd m0, m1 + paddd m3, m5 + packssdw m1, m0, m3 + paddd m0, m3 + vpermq m1, m1, q3120 + paddd m4, m0 + mova [acq], m1 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + jmp .w16_hpad +.w16_wpad: +DEFINE_ARGS ac, ypx, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_420_16bpc_avx2_table] + mov wpadd, wpadd + movsxd wpadq, [iptrq+wpadq*4+4] + add iptrq, wpadq + jmp iptrq +.w16_wpad_pad3: + vpbroadcastd m3, [ypxq+strideq*0+12] + vpbroadcastd m5, [ypxq+strideq*1+12] + vinserti128 m0, m3, [ypxq+strideq*0], 0 + vinserti128 m1, m5, [ypxq+strideq*1], 0 + jmp .w16_wpad_end +.w16_wpad_pad2: + mova m0, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + vpbroadcastd m3, [ypxq+strideq*0+28] + vpbroadcastd m5, [ypxq+strideq*1+28] + jmp .w16_wpad_end +.w16_wpad_pad1: + mova m0, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + vpbroadcastd m3, [ypxq+strideq*0+44] + vpbroadcastd m5, [ypxq+strideq*1+44] + vinserti128 m3, [ypxq+strideq*0+32], 0 + vinserti128 m5, [ypxq+strideq*1+32], 0 +.w16_wpad_end: + pmaddwd m0, m2 + pmaddwd m1, m2 + pmaddwd m3, m2 + pmaddwd m5, m2 + paddd m0, m1 + paddd m3, m5 + packssdw m1, m0, m3 + paddd m0, m3 + vpermq 
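ipred_cfl_ac_420, which starts above and continues below, folds the 4:2:0 downsample, the running sum for the average, and the final mean subtraction into one pass; roughly, in scalar form (wpad/hpad edge padding omitted, and stride given in pixels here rather than the byte stride the asm uses):

    #include <stdint.h>

    /* Each AC sample is twice the 2x2 luma sum (the pmaddwd by pw_2 plus the
     * row add), the totals feed a rounded average, and the average is then
     * subtracted from every sample (.calc_avg / .sub_loop just below). */
    static void cfl_ac_420_sketch(int16_t *ac, const uint16_t *ypx,
                                  int stride, int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                const int v = (ypx[x * 2] + ypx[x * 2 + 1] +
                               ypx[x * 2 + stride] + ypx[x * 2 + 1 + stride]) * 2;
                ac[y * w + x] = (int16_t)v;
                sum += v;
            }
            ypx += stride * 2;
        }
        /* w * h is a power of two; the asm shifts by its tzcnt instead */
        const int avg = (sum + ((w * h) >> 1)) / (w * h);
        for (int i = 0; i < w * h; i++)
            ac[i] -= (int16_t)avg;
    }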
m1, m1, q3120 + paddd m4, m0 + mova [acq], m1 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + dec hd + jz .w16_hpad + jmp iptrq +.w16_hpad: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m1 + paddd m4, m0 + add acq, 32 + dec hpadd + jg .w16_hpad_loop +.calc_avg: + vextracti128 xm0, m4, 1 + tzcnt r1d, szd + movd xm3, szd + paddd xm0, xm4 + movd xm2, r1d + punpckhqdq xm1, xm0, xm0 + psrld xm3, 1 + paddd xm0, xm1 + pshuflw xm1, xm0, q1032 + paddd xm0, xm3 + paddd xm0, xm1 + psrld xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal ipred_cfl_ac_422_16bpc, 4, 9, 6, ac, ypx, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pw_4] + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 +DEFINE_ARGS ac, ypx, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + mova xm0, [ypxq+strideq*0] + mova xm1, [ypxq+strideq*1] + vinserti128 m0, [ypxq+strideq*2], 1 + vinserti128 m1, [ypxq+stride3q ], 1 + pmaddwd m0, m2 + pmaddwd m1, m2 + paddd m4, m0 + packssdw m0, m1 + paddd m4, m1 + mova [acq], m0 + lea ypxq, [ypxq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q3333 + vextracti128 xm1, m1, 1 + pslld xm1, 2 +.w4_hpad_loop: + mova [acq], m0 + paddd m4, m1 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg +.w8: + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + pmaddwd m0, m2, [ypxq+strideq*0] + pmaddwd m1, m2, [ypxq+strideq*1] + paddd m4, m0 + packssdw m0, m1 + paddd m4, m1 + vpermq m0, m0, q3120 + mova [acq], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_loop + jmp .w8_hpad +.w8_wpad: + vpbroadcastd m0, [ypxq+strideq*0+12] + vpbroadcastd m1, [ypxq+strideq*1+12] + vinserti128 m0, [ypxq+strideq*0+ 0], 0 + vinserti128 m1, [ypxq+strideq*1+ 0], 0 + pmaddwd m0, m2 + pmaddwd m1, m2 + paddd m4, m0 + packssdw m0, m1 + paddd m4, m1 + vpermq m0, m0, q3120 + mova [acq], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad +.w8_hpad: + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q3232 + paddd m1, m1 +.w8_hpad_loop: + mova [acq], m0 + paddd m4, m1 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmaddwd m3, m2, [ypxq+strideq*0+ 0] + pmaddwd m0, m2, [ypxq+strideq*0+32] + pmaddwd m1, m2, [ypxq+strideq*1+ 0] + pmaddwd m5, m2, [ypxq+strideq*1+32] + paddd m4, m3 + packssdw m3, m0 + paddd m4, m0 + packssdw m0, m1, m5 + paddd m1, m5 + paddd m4, m1 + vpermq m3, m3, q3120 + vpermq m0, m0, q3120 + mova [acq+ 0], m3 + mova [acq+32], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + jmp .w16_hpad +.w16_wpad: +DEFINE_ARGS ac, ypx, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_422_16bpc_avx2_table] + mov wpadd, wpadd + movsxd wpadq, [iptrq+wpadq*4+4] + add iptrq, wpadq + jmp iptrq +.w16_wpad_pad3: + vpbroadcastd m0, [ypxq+strideq*0+12] + vpbroadcastd m3, [ypxq+strideq*1+12] + vinserti128 m5, m0, [ypxq+strideq*0], 0 + vinserti128 m1, m3, [ypxq+strideq*1], 0 + jmp .w16_wpad_end +.w16_wpad_pad2: + mova m5, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + vpbroadcastd m0, [ypxq+strideq*0+28] + vpbroadcastd m3, [ypxq+strideq*1+28] + jmp .w16_wpad_end +.w16_wpad_pad1: + mova m5, [ypxq+strideq*0+ 0] + mova m1, 
[ypxq+strideq*1+ 0] + vpbroadcastd m0, [ypxq+strideq*0+44] + vpbroadcastd m3, [ypxq+strideq*1+44] + vinserti128 m0, [ypxq+strideq*0+32], 0 + vinserti128 m3, [ypxq+strideq*1+32], 0 +.w16_wpad_end: + pmaddwd m5, m2 + pmaddwd m1, m2 + pmaddwd m0, m2 + pmaddwd m3, m2 + paddd m4, m5 + packssdw m5, m0 + paddd m4, m0 + packssdw m0, m1, m3 + paddd m1, m3 + paddd m4, m1 + vpermq m5, m5, q3120 + vpermq m0, m0, q3120 + mova [acq+ 0], m5 + mova [acq+32], m0 + lea ypxq, [ypxq+strideq*2] + add acq, 64 + sub hd, 2 + jz .w16_hpad + jmp iptrq +.w16_hpad: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + paddd m4, m1 + add acq, 32 + dec hpadd + jg .w16_hpad_loop +.calc_avg: + vextracti128 xm0, m4, 1 + tzcnt r1d, szd + movd xm2, r1d + paddd xm0, xm4 + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrld xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrld xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h + vbroadcasti128 m3, [palq] + lea r2, [pal_pred_16bpc_avx2_table] + tzcnt wd, wm + vbroadcasti128 m4, [pal_pred_shuf] + movifnidn hd, hm + movsxd wq, [r2+wq*4] + pshufb m3, m4 + punpckhqdq m4, m3, m3 + add wq, r2 +DEFINE_ARGS dst, stride, stride3, idx, w, h + lea stride3q, [strideq*3] + jmp wq +.w4: + mova xm2, [idxq] + add idxq, 16 + pshufb xm1, xm3, xm2 + pshufb xm2, xm4, xm2 + punpcklbw xm0, xm1, xm2 + punpckhbw xm1, xm2 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+strideq*1], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + movu m2, [idxq] ; only 16-byte alignment + add idxq, 32 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*2], m0, 1 + vextracti128 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+32], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+32], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+ 0], m0 + mova [dstq+32], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+64], m0 + mova [dstq+96], m1 + add dstq, strideq + dec hd + jg .w64 + RET + +%endif diff -Nru dav1d-0.7.1/src/x86/ipred16_sse.asm dav1d-0.9.1/src/x86/ipred16_sse.asm --- dav1d-0.7.1/src/x86/ipred16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/ipred16_sse.asm 2021-07-28 21:38:28.897852200 +0000 @@ -0,0 +1,1931 @@ +; Copyright © 
2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 +pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + +pb_0_1: times 4 db 0, 1 +pb_2_3: times 4 db 2, 3 +pw_1: times 4 dw 1 +pw_2: times 4 dw 2 +pw_4: times 4 dw 4 +pw_512: times 4 dw 512 +pw_2048: times 4 dw 2048 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) +%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) +%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) + +JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ + s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 +JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 +JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 + +cextern smooth_weights_1d_16bpc +cextern smooth_weights_2d_16bpc +cextern filter_intra_taps + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +INIT_XMM ssse3 +cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h + LEA r5, ipred_dc_left_16bpc_ssse3_table + movd m4, wm + tzcnt wd, wm + add tlq, 2 + movifnidn hd, hm + pxor m3, m3 + pavgw m4, m3 + movd m5, wd + movu m0, [tlq] + movsxd r6, [r5+wq*4] + add r6, r5 + add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_left_16bpc_ssse3_table + mov hd, hm + movd m4, hm + tzcnt r6d, 
hd + sub tlq, hq + tzcnt wd, wm + pxor m3, m3 + sub tlq, hq + pavgw m4, m3 + movd m5, r6d + movu m0, [tlq] + movsxd r6, [r5+r6*4] + add r6, r5 + add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m2, [tlq+112] + movu m1, [tlq+ 96] + paddw m0, m2 + movu m2, [tlq+ 80] + paddw m1, m2 + movu m2, [tlq+ 64] + paddw m0, m2 + paddw m0, m1 +.h32: + movu m1, [tlq+ 48] + movu m2, [tlq+ 32] + paddw m1, m2 + paddw m0, m1 +.h16: + movu m1, [tlq+ 16] + paddw m0, m1 +.h8: + movhlps m1, m0 + paddw m0, m1 +.h4: + punpcklwd m0, m3 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + lea stride3q, [strideq*3] + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd m4, r5d + tzcnt r5d, r5d + movd m5, r5d + LEA r5, ipred_dc_16bpc_ssse3_table + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pxor m3, m3 + psrlw m4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m1, m0 + punpckhwd m0, m3 + punpcklwd m1, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 + jmp .w4_end +.w4_mul: + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + psrld m0, 2 + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 +.s4: + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 32 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +.h16: + mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + test hd, 8|32 + cmovz r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16c: + mova m1, m0 +.s16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*2+16*0], m0 + mova [dstq+strideq*2+16*1], m1 + mova [dstq+stride3q +16*0], m0 + mova [dstq+stride3q +16*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu m2, [tlq+34] + paddw m0, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + 
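The ipred_dc path here (dc_top/dc_left are the single-edge special cases, dc_128 a constant) boils down to one rounded average before splatting it across the block; a scalar sketch of the two-edge case (the real code reads the left neighbours from below tlq in reverse, which is why the .h* labels index backwards):

    #include <stdint.h>

    /* Sum the w top and h left neighbours, add (w + h) / 2 as rounding bias
     * (the psrlw m4, 1 above), then divide by w + h: a plain shift for square
     * blocks, shift plus the 0xAAAB / 0x6667 multiply-high for rectangular
     * ones. */
    static unsigned dc_value(const uint16_t *top, const uint16_t *left, int w, int h)
    {
        unsigned sum = (unsigned)(w + h) >> 1;
        for (int x = 0; x < w; x++) sum += top[x];
        for (int y = 0; y < h; y++) sum += left[y];
        return sum / (w + h);
    }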
pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 8 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32c: + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s32: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*0+16*2], m2 + mova [dstq+strideq*0+16*3], m3 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*3], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s32 + RET +.h64: + mova m0, [tlq-128] + mova m1, [tlq-112] + paddw m0, [tlq- 96] + paddw m1, [tlq- 80] + paddw m0, [tlq- 64] + paddw m1, [tlq- 48] + paddw m0, [tlq- 32] + paddw m1, [tlq- 16] + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 2] + movu m2, [tlq+ 18] + paddw m1, m2 + movu m2, [tlq+ 34] + paddw m0, m2 + movu m2, [tlq+ 50] + paddw m1, m2 + movu m2, [tlq+ 66] + paddw m0, m2 + movu m2, [tlq+ 82] + paddw m1, m2 + movu m2, [tlq+ 98] + paddw m0, m2 + movu m2, [tlq+114] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 64 + je .w64_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w64_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .s64 + RET + +cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + mov r6d, r8m + LEA r5, ipred_dc_128_16bpc_ssse3_table + tzcnt wd, wm + shr r6d, 11 + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_16bpc_ssse3_table + movifnidn hd, hm + movu m0, [tlq+ 2] + movu m1, [tlq+ 18] + movu m2, [tlq+ 34] + movu m3, [tlq+ 50] + cmp wd, 64 + je .w64 + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w64: + WIN64_SPILL_XMM 8 + movu m4, [tlq+ 66] + movu m5, [tlq+ 82] + movu m6, [tlq+ 98] + movu m7, [tlq+114] +.w64_loop: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + mova [dstq+16*4], m4 + mova [dstq+16*5], m5 + mova [dstq+16*6], m6 + mova [dstq+16*7], m7 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 +%define base r5-ipred_h_16bpc_ssse3_table + tzcnt wd, wm + LEA r5, ipred_h_16bpc_ssse3_table + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m2, [base+pb_0_1] + movddup m3, [base+pb_2_3] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + sub tlq, 8 + movq m3, [tlq] + pshuflw m0, m3, q3333 + pshuflw m1, m3, q2222 + pshuflw m2, m3, q1111 + pshuflw m3, m3, q0000 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m1 + movq [dstq+strideq*2], m2 + movq [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + sub tlq, 8 + movq m3, [tlq] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova 
[dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16 + RET +.w32: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*0+16*3], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m1 + mova [dstq+strideq*1+16*3], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + sub tlq, 2 + movd m0, [tlq] + pshufb m0, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .w64 + RET + +cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left +%define base r5-ipred_paeth_16bpc_ssse3_table + movifnidn hd, hm + pshuflw m4, [tlq], q0000 + mov leftq, tlq + add hd, hd + punpcklqdq m4, m4 ; topleft + sub leftq, hq + and wd, ~7 + jnz .w8 + movddup m5, [tlq+2] ; top + psubw m6, m5, m4 + pabsw m7, m6 +.w4_loop: + movd m1, [leftq+hq-4] + punpcklwd m1, m1 + punpckldq m1, m1 ; left +%macro PAETH 0 + paddw m0, m6, m1 + psubw m2, m4, m0 ; tldiff + psubw m0, m5 ; tdiff + pabsw m2, m2 + pabsw m0, m0 + pminsw m2, m0 + pcmpeqw m0, m2 + pand m3, m5, m0 + pandn m0, m4 + por m0, m3 + pcmpgtw m3, m7, m2 + pand m0, m3 + pandn m3, m1 + por m0, m3 +%endmacro + PAETH + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %define r7d hm + %assign regs_used 7 +%elif WIN64 + movaps r4m, m8 + PUSH r7 + %assign regs_used 8 +%endif +%if ARCH_X86_64 + movddup m8, [pb_0_1] +%endif + lea tlq, [tlq+wq*2+2] + neg wq + mov r7d, hd +.w8_loop0: + movu m5, [tlq+wq*2] + mov r6, dstq + add dstq, 16 + psubw m6, m5, m4 + pabsw m7, m6 +.w8_loop: + movd m1, [leftq+hq-2] +%if ARCH_X86_64 + pshufb m1, m8 +%else + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 +%endif + PAETH + mova [r6], m0 + add r6, strideq + sub hd, 1*2 + jg .w8_loop + mov hd, r7d + add wq, 8 + jl .w8_loop0 +%if WIN64 + movaps m8, r4m +%endif + RET + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 4 +%endif + +cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov hd, hm + lea weightsq, [weightsq+hq*4] + neg hq + movd m5, [tlq+hq*2] ; bottom + pshuflw m5, m5, q0000 + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [tlq+2] ; top + lea r3, [strideq*3] + psubw m4, m5 ; top - bottom +.w4_loop: + movq m1, [weightsq+hq*2] + punpcklwd m1, m1 + pshufd m0, m1, q1100 + punpckhdq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + mov hm, hq + %define hq hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0, hq + movu m4, [tlq+2] + add tlq, 16 + mov r6, dstq + add dstq, 16 + psubw m4, m5 +.w8_loop: + movq m3, [weightsq+t0*2] + punpcklwd m3, m3 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + 
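The PAETH macro above is a branchless SIMD form of the usual selector; in scalar terms:

    #include <stdint.h>
    #include <stdlib.h>

    /* base = left + top - topleft; return whichever of the three neighbours is
     * closest to base, preferring left, then top, on ties, which is the order
     * the pcmpeqw/pcmpgtw masks in the macro encode. */
    static inline uint16_t paeth_px(int left, int top, int topleft)
    {
        const int base = left + top - topleft;
        const int d_left = abs(base - left);
        const int d_top  = abs(base - top);
        const int d_tl   = abs(base - topleft);
        if (d_left <= d_top && d_left <= d_tl)
            return (uint16_t)left;
        return (uint16_t)(d_top <= d_tl ? top : topleft);
    }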
pshufd m3, m3, q3333 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, [r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + add t0, 4 + jl .w8_loop + sub wd, 8 + jg .w8_loop0 + RET + +cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov wd, wm + movifnidn hd, hm + movd m5, [tlq+wq*2] ; right + sub tlq, 8 + add hd, hd + pshuflw m5, m5, q0000 + sub tlq, hq + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [weightsq+4*2] + lea r3, [strideq*3] +.w4_loop: + movq m1, [tlq+hq] ; left + punpcklwd m1, m1 + psubw m1, m5 ; left - right + pshufd m0, m1, q3322 + punpckldq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movhps [dstq+strideq*2], m1 + movq [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w4_loop + RET +.w8: + lea weightsq, [weightsq+wq*4] + neg wq +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + %define hd hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0d, hd + mova m4, [weightsq+wq*2] + mov r6, dstq + add dstq, 16 +.w8_loop: + movq m3, [tlq+t0*(1+ARCH_X86_32)] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, [r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + sub t0d, 4*(1+ARCH_X86_64) + jg .w8_loop + add wq, 8 + jl .w8_loop0 + RET + +%if ARCH_X86_64 +DECLARE_REG_TMP 10 +%else +DECLARE_REG_TMP 3 +%endif + +cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ + h_weights, v_weights, top + LEA h_weightsq, smooth_weights_2d_16bpc + mov wd, wm + mov hd, hm + movd m7, [tlq+wq*2] ; right + lea v_weightsq, [h_weightsq+hq*8] + neg hq + movd m6, [tlq+hq*2] ; bottom + pshuflw m7, m7, q0000 + pshuflw m6, m6, q0000 + cmp wd, 4 + jne .w8 + movq m4, [tlq+2] ; top + mova m5, [h_weightsq+4*4] + punpcklwd m4, m6 ; top, bottom + pxor m6, m6 +.w4_loop: + movq m1, [v_weightsq+hq*4] + sub tlq, 4 + movd m3, [tlq] ; left + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pmaddwd m0, m4 + punpcklwd m3, m7 ; left, right + pmaddwd m1, m4 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + lea h_weightsq, [h_weightsq+wq*4] + mov t0, tlq + mov r1m, tlq + mov r2m, hq + %define m8 [h_weightsq+16*0] + %define m9 [h_weightsq+16*1] +%else +%if WIN64 + movaps r4m, m8 + movaps r6m, m9 + PUSH r7 + PUSH r8 +%endif + PUSH r9 + PUSH r10 + %assign regs_used 11 + lea h_weightsq, [h_weightsq+wq*8] + lea topq, [tlq+wq*2] + neg wq + mov r8, tlq + mov r9, hq +%endif + punpcklqdq m6, m6 +.w8_loop0: +%if ARCH_X86_32 + movu m5, [t0+2] + add t0, 16 + mov r0m, t0 +%else + movu m5, [topq+wq*2+2] + mova m8, [h_weightsq+wq*4+16*0] + mova m9, [h_weightsq+wq*4+16*1] +%endif + mov t0, dstq + add dstq, 16 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 +.w8_loop: + movd m1, [v_weightsq+hq*4] + sub tlq, 2 + movd m3, [tlq] ; left + pshufd m1, m1, q0000 + pmaddwd m0, m4, m1 + pshuflw m3, m3, q0000 + pmaddwd m1, 
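The smooth_v/smooth_h kernels above keep only (top - bottom) or (left - right) plus a one-dimensional weight table, so a single pmulhrsw performs the whole blend; in scalar terms (assuming, as the table name suggests, that smooth_weights_1d_16bpc holds the spec's sm_weights pre-scaled for pmulhrsw):

    #include <stdint.h>

    /* Vertical case: interpolate between the top-row pixel and the bottom-left
     * pixel with a per-row weight out of 256.  The asm folds this into
     * bottom + (((top - bottom) * w + 128) >> 8), which is what pmulhrsw
     * against a weight scaled by 128 computes. */
    static inline uint16_t smooth_v_px(int top, int bottom, int weight /* 0..256 */)
    {
        return (uint16_t)((weight * top + (256 - weight) * bottom + 128) >> 8);
    }

The horizontal and two-dimensional variants are the same blend applied along the other axis, or along both with pmaddwd, as in the ipred_smooth_16bpc code that follows.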
m5 + punpcklwd m3, m7 ; left, right + pmaddwd m2, m8, m3 + pmaddwd m3, m9 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pxor m1, m1 + pavgw m0, m1 + mova [t0], m0 + add t0, strideq + inc hq + jl .w8_loop +%if ARCH_X86_32 + mov t0, r0m + mov tlq, r1m + add h_weightsq, 16*2 + mov hq, r2m + sub dword wm, 8 + jg .w8_loop0 +%else + mov tlq, r8 + mov hq, r9 + add wq, 8 + jl .w8_loop0 +%endif +%if WIN64 + movaps m8, r4m + movaps m9, r6m +%endif + RET + +%if ARCH_X86_64 +cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter +%else +cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%endif +%define base r6-$$ + movifnidn hd, hm + movd m6, r8m ; bitdepth_max +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + LEA r6, $$ + shl filterd, 6 + movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 + mova m1, [base+filter_intra_taps+filterq+16*0] + mova m2, [base+filter_intra_taps+filterq+16*1] + mova m3, [base+filter_intra_taps+filterq+16*2] + mova m4, [base+filter_intra_taps+filterq+16*3] + pxor m5, m5 +%if ARCH_X86_64 + punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper + punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid + punpcklbw m10, m5, m2 ; having to perform sign-extension. + punpckhbw m11, m5, m2 + punpcklbw m12, m5, m3 + punpckhbw m13, m5, m3 + punpcklbw m14, m5, m4 + punpckhbw m15, m5, m4 +%else + punpcklbw m7, m5, m1 + mova m8, m7 + punpckhbw m7, m5, m1 + mova m9, m7 + punpcklbw m7, m5, m2 + mova m10, m7 + punpckhbw m7, m5, m2 + mova m11, m7 + punpcklbw m7, m5, m3 + mova m12, m7 + punpckhbw m7, m5, m3 + mova m13, m7 + punpcklbw m7, m5, m4 + mova m14, m7 + punpckhbw m7, m5, m4 + mova m15, m7 +%endif + mova m7, [base+filter_shuf] + add hd, hd + mov r5, dstq + pshuflw m6, m6, q0000 + mov r6, tlq + punpcklqdq m6, m6 + sub tlq, hq +.left_loop: + pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ + pshufd m1, m0, q0000 + pmaddwd m2, m8, m1 + pmaddwd m1, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + paddd m2, m3 + paddd m1, m4 + pshufd m4, m0, q2222 + pmaddwd m3, m12, m4 + pmaddwd m4, m13 + paddd m2, m3 + paddd m1, m4 + pshufd m3, m0, q3333 + pmaddwd m0, m14, m3 + pmaddwd m3, m15 + paddd m0, m2 + paddd m1, m3 + psrad m0, 11 ; x >> 3 + psrad m1, 11 + packssdw m0, m1 + pmaxsw m0, m5 + pavgw m0, m5 ; (x + 8) >> 4 + pminsw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movlps m0, [tlq+hq-10] + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .left_loop + sub wd, 4 + jz .end + sub tld, r6d ; -h*2 + sub r6, r5 ; tl-dst +.right_loop0: + add r5, 8 + mov hd, tld + movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ + mov dstq, r5 +.right_loop: + pshufd m2, m0, q0000 + pmaddwd m1, m8, m2 + pmaddwd m2, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + pinsrw m0, [dstq+strideq*0-2], 5 + paddd m1, m3 + paddd m2, m4 + pshufd m0, m0, q2222 + movddup m4, [dstq+strideq*1-8] + pmaddwd m3, m12, m0 + pmaddwd m0, m13 + paddd m1, m3 + paddd m0, m2 + pshuflw m2, m4, q3333 + punpcklwd m2, m5 + pmaddwd m3, m14, m2 + pmaddwd m2, m15 + paddd m1, m3 + paddd m0, m2 + psrad m1, 11 + psrad m0, 11 + packssdw m0, m1 + pmaxsw m0, m5 + pavgw m0, m5 + pminsw m0, m6 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + palignr m0, m4, 14 + lea dstq, [dstq+strideq*2] + add hd, 2*2 + jl 
.right_loop + sub wd, 4 + jg .right_loop0 +.end: + RET + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac + LEA t0, ipred_cfl_left_16bpc_ssse3_table + movd m4, wd + tzcnt wd, wd + movifnidn hd, hm + add tlq, 2 + movsxd r6, [t0+wq*4] + movd m5, wd + jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) + +cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + LEA t0, ipred_cfl_left_16bpc_ssse3_table + tzcnt wd, wm + lea r6d, [hq*2] + movd m4, hd + sub tlq, r6 + tzcnt r6d, hd + movd m5, r6d + movsxd r6, [t0+r6*4] +.start: + movd m7, r7m + movu m0, [tlq] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table + movsxd wq, [t0+wq*4] + pxor m6, m6 + pshuflw m7, m7, q0000 + pcmpeqw m3, m3 + add wq, t0 + movifnidn acq, acmp + pavgw m4, m6 + punpcklqdq m7, m7 + jmp r6 +.h32: + movu m1, [tlq+48] + movu m2, [tlq+32] + paddw m0, m1 + paddw m0, m2 +.h16: + movu m1, [tlq+16] + paddw m0, m1 +.h8: + pshufd m1, m0, q1032 + paddw m0, m1 +.h4: + pmaddwd m0, m3 + psubd m4, m0 + pshuflw m0, m4, q1032 + paddd m0, m4 + psrld m0, m5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +%macro IPRED_CFL 2 ; dst, src + pabsw m%1, m%2 + pmulhrsw m%1, m2 + psignw m%2, m1 + psignw m%1, m%2 + paddw m%1, m0 + pmaxsw m%1, m6 + pminsw m%1, m7 +%endmacro + +cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd m4, t0d + tzcnt t0d, t0d + movd m5, t0d + LEA t0, ipred_cfl_16bpc_ssse3_table + tzcnt wd, wd + movd m7, r7m + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + psrlw m4, 1 + pxor m6, m6 + pshuflw m7, m7, q0000 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + pcmpeqw m3, m3 + punpcklqdq m7, m7 + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + cmp hd, 4 + jg .w4_mul + psrld m0, 3 + jmp .w4_end +.w4_mul: + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 16 + cmove r6d, r2d + movd m1, r6d + psrld m0, 2 + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s4: + movd m1, alpham + lea r6, [strideq*3] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + movq [dstq+strideq*0], m3 + movhps [dstq+strideq*1], m3 + movq [dstq+strideq*2], m4 + movhps [dstq+r6 ], m4 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4_loop + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+strideq*0], m3 + mova [dstq+strideq*1], m4 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s8_loop + RET +.h16: + mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, 
m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + add dstq, strideq + dec hd + jg .s16_loop + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu m2, [tlq+34] + paddw m1, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 8 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + mova m4, [acq+16*2] + mova m5, [acq+16*3] + add acq, 16*4 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*2], m3 + mova [dstq+16*3], m4 + add dstq, strideq + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac + tzcnt wd, wm + LEA t0, ipred_cfl_splat_16bpc_ssse3_table + mov r6d, r7m + movifnidn hd, hm + shr r6d, 11 + movd m7, r7m + movsxd wq, [t0+wq*4] + movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] + pshuflw m7, m7, q0000 + pxor m6, m6 + add wq, t0 + movifnidn acq, acmp + punpcklqdq m7, m7 + jmp wq + +cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + paddw m5, m5 +%else + movddup m5, [pw_2] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + pmaddwd m2, m5, [ypxq+strideq*2] + pmaddwd m3, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m0, m1 + paddd m2, m3 + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .dc + punpckhqdq m0, m0 + pslld m2, 2 +.w4_hpad: + mova [acq+16*0], m0 + paddd m4, m2 + mova [acq+16*1], m0 + add acq, 16*2 + sub hpadd, 4 + jg .w4_hpad + jmp .dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*1+16*0] + pmaddwd m1, m5, [ypxq+strideq*0+16*1] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m0, m2 + paddd m1, m3 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz .dc + pslld m2, 2 + mova m1, m0 + jmp .hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + paddd m0, m1 + pshufd m1, m0, q3333 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_wpad1 + jmp 
.w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, q3333 + mova m2, m1 + jmp .w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m6, m5, [ypxq+strideq*1+16*0] + paddd m0, m6 + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+strideq*0+16*1] + pmaddwd m6, m5, [ypxq+strideq*1+16*1] + paddd m3, m6 + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+strideq*0+16*2] + pmaddwd m6, m5, [ypxq+strideq*1+16*2] + paddd m1, m6 + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+strideq*0+16*3] + pmaddwd m6, m5, [ypxq+strideq*1+16*3] + paddd m2, m6 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + paddd m6, m0, m3 + packssdw m0, m3 + paddd m6, m1 + mova [acq+16*0], m0 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz .dc + paddd m2, m2 +.hpad: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m0 + mova [acq+16*3], m1 + add acq, 16*4 + sub hpadd, 4 + jg .hpad +.dc: + sub r5, acq ; -w*h*2 + pshufd m2, m4, q1032 + tzcnt r1d, r5d + paddd m2, m4 + sub r1d, 2 + pshufd m4, m2, q2301 + movd m0, r1d + paddd m2, m4 + psrld m2, m0 + pxor m0, m0 + pavgw m2, m0 + packssdw m2, m2 +.dc_loop: + mova m0, [acq+r5+16*0] + mova m1, [acq+r5+16*1] + psubw m0, m2 + psubw m1, m2 + mova [acq+r5+16*0], m0 + mova [acq+r5+16*1], m1 + add r5, 16*2 + jl .dc_loop + RET + +cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + psllw m5, 2 +%else + movddup m5, [pw_4] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m3, m5, [ypxq+strideq*1] + pmaddwd m1, m5, [ypxq+strideq*2] + pmaddwd m2, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m4, m0 + packssdw m0, m3 + paddd m3, m1 + packssdw m1, m2 + paddd m4, m2 + paddd m4, m3 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + pslld m2, 3 + mova [acq+16*0], m1 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m1 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*0+16*1] + pmaddwd m1, m5, [ypxq+strideq*1+16*0] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq+16*0], m0 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + pshufd m2, m0, q3333 + pshufd m3, m1, q3333 + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 
2 + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, q3333 + mova m2, m1 + jmp .w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+16*0] + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+16*1] + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+16*2] + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+16*3] +.w16_wpad_end: + add ypxq, strideq + paddd m6, m0, m3 + packssdw m0, m3 + mova [acq+16*0], m0 + paddd m6, m1 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad + +cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h +%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table + LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table + tzcnt wd, wm + movifnidn hpadd, hpadm + pxor m4, m4 + movsxd wq, [r6+wq*4] + movddup m5, [base+pw_1] + add wq, r6 + mov hd, hm + shl hpadd, 2 + sub hd, hpadd + jmp wq +.w4: + lea r3, [strideq*3] + mov r5, acq +.w4_loop: + movq m0, [ypxq+strideq*0] + movhps m0, [ypxq+strideq*1] + movq m1, [ypxq+strideq*2] + movhps m1, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + mova [acq+16*0], m1 + pslld m2, 2 + mova [acq+16*1], m1 + punpckhqdq m2, m2 + mova [acq+16*2], m1 + paddd m4, m2 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: + mov r5, acq +.w8_loop: + mova m0, [ypxq+strideq*0] + mova m1, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w16_wpad2: + pshufhw m3, m2, q3333 + pshufhw m1, m0, q3333 + punpckhqdq m3, m3 + punpckhqdq m1, m1 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + mov r5, acq +.w16_loop: + mova m2, [ypxq+strideq*0+16*0] + mova m0, [ypxq+strideq*1+16*0] + psllw m2, 3 + psllw m0, 3 + test wpadd, wpadd + jnz .w16_wpad2 + mova m3, [ypxq+strideq*0+16*1] + mova m1, [ypxq+strideq*1+16*1] + psllw m3, 3 + psllw m1, 3 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + mova [acq+16*0], m2 + pmaddwd m2, m5 + mova [acq+16*1], m3 + pmaddwd m3, m5 + paddd m4, m2 + pmaddwd m2, m5, m0 + mova [acq+16*2], m0 + paddd m4, m3 + pmaddwd m3, m5, m1 + mova [acq+16*3], m1 + add acq, 16*4 + paddd m2, m3 + paddd m4, m2 + sub hd, 2 + jg .w16_loop + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w32_wpad6: + pshufhw m1, m0, q3333 + punpckhqdq m1, m1 + mova m2, m1 + mova m3, m1 + jmp .w32_wpad_end +.w32_wpad4: + pshufhw m2, m1, q3333 + punpckhqdq m2, m2 + mova m3, m2 + jmp .w32_wpad_end +.w32_wpad2: + pshufhw m3, m2, q3333 + punpckhqdq m3, 
m3 + jmp .w32_wpad_end +.w32: + movifnidn wpadd, wpadm + mov r5, acq + WIN64_SPILL_XMM 8 +.w32_loop: + mova m0, [ypxq+16*0] + psllw m0, 3 + cmp wpadd, 4 + jg .w32_wpad6 + mova m1, [ypxq+16*1] + psllw m1, 3 + je .w32_wpad4 + mova m2, [ypxq+16*2] + psllw m2, 3 + jnp .w32_wpad2 + mova m3, [ypxq+16*3] + psllw m3, 3 +.w32_wpad_end: + add ypxq, strideq + pmaddwd m6, m5, m0 + mova [acq+16*0], m0 + pmaddwd m7, m5, m1 + mova [acq+16*1], m1 + paddd m6, m7 + pmaddwd m7, m5, m2 + mova [acq+16*2], m2 + paddd m6, m7 + pmaddwd m7, m5, m3 + mova [acq+16*3], m3 + add acq, 16*4 + paddd m6, m7 + paddd m4, m6 + dec hd + jg .w32_loop +%if WIN64 + mova m5, m6 + WIN64_RESTORE_XMM + SWAP 5, 6 +%endif + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w32_hpad_loop: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m6 + mova [acq+16*2], m2 + mova [acq+16*3], m3 + add acq, 16*4 + dec hpadd + jg .w32_hpad_loop + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + +cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h +%define base r2-pal_pred_16bpc_ssse3_table +%if ARCH_X86_32 + %define hd r2d +%endif + mova m3, [palq] + LEA r2, pal_pred_16bpc_ssse3_table + tzcnt wd, wm + pshufb m3, [base+pal_pred_shuf] + movsxd wq, [r2+wq*4] + pshufd m4, m3, q1032 + add wq, r2 + movifnidn hd, hm + jmp wq +.w4: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w4 + RET +.w8: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + add idxq, 16*2 + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m0 + add dstq, strideq + dec hd + jg .w32 + RET +.w64: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova m1, [idxq+16*2] + mova [dstq+16*2], m2 + pshufb m2, m3, m1 + mova [dstq+16*3], m0 + pshufb m0, m4, m1 + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + mova m0, [idxq+16*3] + add idxq, 16*4 + mova [dstq+16*4], m1 + pshufb m1, m3, m0 + mova [dstq+16*5], m2 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + add dstq, strideq + dec hd + jg .w64 + RET diff -Nru dav1d-0.7.1/src/x86/ipred.asm dav1d-0.9.1/src/x86/ipred.asm --- dav1d-0.7.1/src/x86/ipred.asm 2020-06-21 11:48:55.020126300 +0000 +++ dav1d-0.9.1/src/x86/ipred.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,5386 +0,0 @@ -; Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. 
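The chroma-from-luma kernels above (ipred_cfl_16bpc with the IPRED_CFL macro, plus the ipred_cfl_ac_420/422/444 helpers whose pw_2/pw_4/psllw-3 scaling puts all three chroma layouts on the same scale) split the work into building an `ac` buffer and then applying alpha. Below is a minimal scalar sketch of the application step, assuming `ac` already holds the helpers' output (scaled luma with the block average subtracted); the function name, signature and pixel-unit stride are illustrative, not dav1d's actual C API.

```c
#include <stddef.h>
#include <stdint.h>

static inline int clip_px(const int v, const int mx) {
    return v < 0 ? 0 : v > mx ? mx : v;
}

/* stride is in pixels here for simplicity; dav1d strides are in bytes */
static void cfl_pred_sketch(uint16_t *dst, const ptrdiff_t stride,
                            const int16_t *ac, const int alpha,
                            const int dc, const int w, const int h,
                            const int bitdepth_max)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int diff = alpha * ac[x];
            /* (|diff| + 32) >> 6 with the sign of diff restored, which is
             * what the pabsw/pmulhrsw/psignw sequence in IPRED_CFL computes */
            const int adj = diff < 0 ? -((32 - diff) >> 6) : (diff + 32) >> 6;
            dst[x] = (uint16_t)clip_px(dc + adj, bitdepth_max);
        }
        dst += stride;
        ac  += w;
    }
}
```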
-; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 64 - -%macro SMOOTH_WEIGHT_TABLE 1-* - %rep %0 - db %1-128, 127-%1 - %rotate 1 - %endrep -%endmacro - -; sm_weights[], but modified to precalculate x and 256-x with offsets to -; enable efficient use of pmaddubsw (which requires signed values) -smooth_weights: SMOOTH_WEIGHT_TABLE \ - 0, 0, 255, 128, 255, 149, 85, 64, \ - 255, 197, 146, 105, 73, 50, 37, 32, \ - 255, 225, 196, 170, 145, 123, 102, 84, \ - 68, 54, 43, 33, 26, 20, 17, 16, \ - 255, 240, 225, 210, 196, 182, 169, 157, \ - 145, 133, 122, 111, 101, 92, 83, 74, \ - 66, 59, 52, 45, 39, 34, 29, 25, \ - 21, 17, 14, 12, 10, 9, 8, 8, \ - 255, 248, 240, 233, 225, 218, 210, 203, \ - 196, 189, 182, 176, 169, 163, 156, 150, \ - 144, 138, 133, 127, 121, 116, 111, 106, \ - 101, 96, 91, 86, 82, 77, 73, 69, \ - 65, 61, 57, 54, 50, 47, 44, 41, \ - 38, 35, 32, 29, 27, 25, 22, 20, \ - 18, 16, 15, 13, 12, 10, 9, 8, \ - 7, 6, 6, 5, 5, 4, 4, 4 - -pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 - db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 -pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 -pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 -z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 - db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 -z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 - db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 - db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 -z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 - db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 - db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line -pb_128: times 4 db 128 ; those are just placed here for alignment. 
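The SMOOTH_WEIGHT_TABLE comment above notes that the weights are stored as the pair (w-128, 127-w) so that pmaddubsw, which needs one signed 8-bit operand, can be used; the SMOOTH macro further down spells out the identity being relied on. A quick standalone check of that identity (plain C, no dav1d code involved):

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
    for (int w = 0; w < 256; w++) {
        const int8_t w_lo = (int8_t)(w - 128);   /* fits in a signed byte */
        const int8_t w_hi = (int8_t)(127 - w);   /* fits in a signed byte */
        for (int a = 0; a < 256; a++)
            for (int b = 0; b < 256; b++) {
                /* pmaddubsw produces w_lo*a + w_hi*b in one step; the
                 * 128*a + 129*b remainder is a precomputed constant */
                const int split = w_lo * a + w_hi * b + 128 * a + 129 * b;
                assert(split == w * a + (256 - w) * b);
            }
    }
    return 0;
}
```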
-pb_36_m4: times 2 db 36, -4 -z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 -z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 -z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 -z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 -z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 -z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 -z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 -z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 -z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 -z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 -z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 -z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 - dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 -z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 - dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 -z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 -z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 - db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 -; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 -filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 - db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 -filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 -filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 -pb_127_m127: times 2 db 127, -127 -ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 - db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 -ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 - db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 -pw_64: times 2 dw 64 - -cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 - times 9 db 7, -1 -cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 - db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ; w=8, w_pad=1 as well as second half of previous one -cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 - times 5 db 6, 7 - ; w=16,w_pad=2 - db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - times 8 db 14, 15 - ; w=16,w_pad=3 - db 0, 1, 2, 3, 4, 5 - times 13 db 6, 7 -pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -%define pb_0to15 cfl_ac_w16_pad_shuffle -%define pb_1 (ipred_h_shuf+12) -%define pb_2 (ipred_h_shuf+20) -%define pb_3 (ipred_h_shuf+ 4) -%define pb_4 (ipred_h_shuf+24) -%define pb_5 (ipred_h_shuf+ 8) -%define pb_7 (ipred_h_shuf+ 0) -%define pb_8 (z_upsample2 +12) -%define pb_12 (z2_y_shuf_h4+20) -%define pb_14 (z2_y_shuf_h4+ 4) -%define pb_15 (z_filter_s +32) -%define pb_27 (z2_y_shuf_h4+ 8) -%define pb_31 (z2_y_shuf_h4+12) -%define pb_32 (z2_y_shuf_h4+16) -%define pb_90 (z2_y_shuf_h4+ 0) -%define pw_1 (z2_y_shuf_h4+24) -%define pw_8 (z_filter_k +32) - -pw_62: times 2 dw 62 -pw_128: times 2 dw 128 -pw_255: times 2 dw 255 -pw_512: times 2 dw 512 - -%macro JMP_TABLE 3-* - %xdefine %1_%2_table (%%table - 2*4) - %xdefine %%base mangle(private_prefix %+ _%1_%2) - %%table: - %rep %0 - 2 - dd %%base %+ .%3 - (%%table - 2*4) - %rotate 1 - %endrep -%endmacro - -%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) -%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) - -JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 -JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64 
-JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 -JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 -JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 -JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ - s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 -JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 -JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 -JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 -JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 -JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 -JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ - s4-8*4, s8-8*4, s16-8*4, s32-8*4 -JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 -JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 -JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 -JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 -JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 - -cextern dr_intra_derivative -cextern filter_intra_taps - -SECTION .text - -INIT_YMM avx2 -cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h - lea r5, [ipred_dc_left_avx2_table] - tzcnt wd, wm - inc tlq - movu m0, [tlq] - movifnidn hd, hm - mov r6d, 0x8000 - shrx r6d, r6d, wd - movd xm3, r6d - movsxd r6, [r5+wq*4] - pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, r5 - add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table - movsxd wq, [r5+wq*4] - add wq, r5 - jmp r6 - -cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 - mov hd, hm ; zero upper half - tzcnt r6d, hd - sub tlq, hq - tzcnt wd, wm - movu m0, [tlq] - mov r5d, 0x8000 - shrx r5d, r5d, r6d - movd xm3, r5d - lea r5, [ipred_dc_left_avx2_table] - movsxd r6, [r5+r6*4] - pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, r5 - add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table - movsxd wq, [r5+wq*4] - add wq, r5 - jmp r6 -.h64: - movu m1, [tlq+32] ; unaligned when jumping here from dc_top - pmaddubsw m1, m2 - paddw m0, m1 -.h32: - vextracti128 xm1, m0, 1 - paddw xm0, xm1 -.h16: - punpckhqdq xm1, xm0, xm0 - paddw xm0, xm1 -.h8: - psrlq xm1, xm0, 32 - paddw xm0, xm1 -.h4: - pmaddwd xm0, xm2 - pmulhrsw xm0, xm3 - lea stride3q, [strideq*3] - vpbroadcastb m0, xm0 - mova m1, m0 - jmp wq - -cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 - movifnidn hd, hm - movifnidn wd, wm - tzcnt r6d, hd - lea r5d, [wq+hq] - movd xm4, r5d - tzcnt r5d, r5d - movd xm5, r5d - lea r5, [ipred_dc_avx2_table] - tzcnt wd, wd - movsxd r6, [r5+r6*4] - movsxd wq, [r5+wq*4+5*4] - pcmpeqd m3, m3 - psrlw xm4, 1 - add r6, r5 - add wq, r5 - lea stride3q, [strideq*3] - jmp r6 -.h4: - movd xm0, [tlq-4] - pmaddubsw xm0, xm3 - jmp wq -.w4: - movd xm1, [tlq+1] - pmaddubsw xm1, xm3 - psubw xm0, xm4 - paddw xm0, xm1 - pmaddwd xm0, xm3 - cmp hd, 4 - jg .w4_mul - psrlw xm0, 3 - jmp .w4_end -.w4_mul: - punpckhqdq xm1, xm0, xm0 - lea r2d, [hq*2] - mov r6d, 0x55563334 - paddw xm0, xm1 - shrx r6d, r6d, r2d - psrlq xm1, xm0, 32 - paddw xm0, xm1 - movd xm1, r6d - psrlw xm0, 2 - pmulhuw xm0, xm1 -.w4_end: - vpbroadcastb xm0, xm0 -.s4: - movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xm0 - movd [dstq+strideq*2], xm0 - movd [dstq+stride3q ], xm0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s4 - RET -ALIGN function_align -.h8: - movq xm0, [tlq-8] - pmaddubsw xm0, xm3 - jmp wq -.w8: - movq xm1, [tlq+1] - vextracti128 xm2, m0, 1 - pmaddubsw xm1, xm3 - psubw xm0, xm4 - paddw xm0, xm2 - punpckhqdq xm2, xm0, xm0 - paddw xm0, xm2 - paddw xm0, xm1 - psrlq xm1, xm0, 32 - paddw xm0, xm1 - pmaddwd xm0, xm3 - psrlw xm0, xm5 - cmp hd, 8 - je .w8_end 
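The DC paths divide a pixel sum by w+h. When w != h that count is 3 or 5 times a power of two, so after shifting out the power of two the code multiplies by a 16-bit reciprocal instead of dividing: 0x5556/0x3334 with pmulhuw (a >>16) in this removed 8-bit file, sometimes packed into one constant such as 0x55563334 and selected with shrx, and 0xAAAB/0x6667 followed by an extra psrlw 1 (a >>17) in the 16 bpc code earlier. A small check one can run to see over which input range each constant matches exact division, which is one way to convince oneself the approximation is safe for the sums that can actually occur:

```c
#include <stdio.h>

/* returns the first x where (x*mul)>>shift stops matching x/div,
 * or 0x10000 if the two agree for every 16-bit input */
static unsigned first_mismatch(const unsigned mul, const unsigned shift,
                               const unsigned div)
{
    for (unsigned x = 0; x <= 0xffff; x++)
        if (((x * mul) >> shift) != x / div)
            return x;
    return 0x10000;
}

int main(void) {
    printf("(x*0x5556)>>16 vs x/3: exact below %u\n", first_mismatch(0x5556, 16, 3));
    printf("(x*0x3334)>>16 vs x/5: exact below %u\n", first_mismatch(0x3334, 16, 5));
    printf("(x*0xAAAB)>>17 vs x/3: exact below %u\n", first_mismatch(0xAAAB, 17, 3));
    printf("(x*0x6667)>>17 vs x/5: exact below %u\n", first_mismatch(0x6667, 17, 5));
    return 0;
}
```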
- mov r6d, 0x5556 - mov r2d, 0x3334 - cmp hd, 32 - cmove r6d, r2d - movd xm1, r6d - pmulhuw xm0, xm1 -.w8_end: - vpbroadcastb xm0, xm0 -.s8: - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm0 - movq [dstq+strideq*2], xm0 - movq [dstq+stride3q ], xm0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s8 - RET -ALIGN function_align -.h16: - mova xm0, [tlq-16] - pmaddubsw xm0, xm3 - jmp wq -.w16: - movu xm1, [tlq+1] - vextracti128 xm2, m0, 1 - pmaddubsw xm1, xm3 - psubw xm0, xm4 - paddw xm0, xm2 - paddw xm0, xm1 - punpckhqdq xm1, xm0, xm0 - paddw xm0, xm1 - psrlq xm1, xm0, 32 - paddw xm0, xm1 - pmaddwd xm0, xm3 - psrlw xm0, xm5 - cmp hd, 16 - je .w16_end - mov r6d, 0x5556 - mov r2d, 0x3334 - test hb, 8|32 - cmovz r6d, r2d - movd xm1, r6d - pmulhuw xm0, xm1 -.w16_end: - vpbroadcastb xm0, xm0 -.s16: - mova [dstq+strideq*0], xm0 - mova [dstq+strideq*1], xm0 - mova [dstq+strideq*2], xm0 - mova [dstq+stride3q ], xm0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s16 - RET -ALIGN function_align -.h32: - mova m0, [tlq-32] - pmaddubsw m0, m3 - jmp wq -.w32: - movu m1, [tlq+1] - pmaddubsw m1, m3 - paddw m0, m1 - vextracti128 xm1, m0, 1 - psubw xm0, xm4 - paddw xm0, xm1 - punpckhqdq xm1, xm0, xm0 - paddw xm0, xm1 - psrlq xm1, xm0, 32 - paddw xm0, xm1 - pmaddwd xm0, xm3 - psrlw xm0, xm5 - cmp hd, 32 - je .w32_end - lea r2d, [hq*2] - mov r6d, 0x33345556 - shrx r6d, r6d, r2d - movd xm1, r6d - pmulhuw xm0, xm1 -.w32_end: - vpbroadcastb m0, xm0 -.s32: - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s32 - RET -ALIGN function_align -.h64: - mova m0, [tlq-64] - mova m1, [tlq-32] - pmaddubsw m0, m3 - pmaddubsw m1, m3 - paddw m0, m1 - jmp wq -.w64: - movu m1, [tlq+ 1] - movu m2, [tlq+33] - pmaddubsw m1, m3 - pmaddubsw m2, m3 - paddw m0, m1 - paddw m0, m2 - vextracti128 xm1, m0, 1 - psubw xm0, xm4 - paddw xm0, xm1 - punpckhqdq xm1, xm0, xm0 - paddw xm0, xm1 - psrlq xm1, xm0, 32 - paddw xm0, xm1 - pmaddwd xm0, xm3 - psrlw xm0, xm5 - cmp hd, 64 - je .w64_end - mov r6d, 0x33345556 - shrx r6d, r6d, hd - movd xm1, r6d - pmulhuw xm0, xm1 -.w64_end: - vpbroadcastb m0, xm0 - mova m1, m0 -.s64: - mova [dstq+strideq*0+32*0], m0 - mova [dstq+strideq*0+32*1], m1 - mova [dstq+strideq*1+32*0], m0 - mova [dstq+strideq*1+32*1], m1 - mova [dstq+strideq*2+32*0], m0 - mova [dstq+strideq*2+32*1], m1 - mova [dstq+stride3q +32*0], m0 - mova [dstq+stride3q +32*1], m1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s64 - RET - -cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 - lea r5, [ipred_dc_splat_avx2_table] - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, [r5+wq*4] - vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] - mova m1, m0 - add wq, r5 - lea stride3q, [strideq*3] - jmp wq - -cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 - lea r5, [ipred_dc_splat_avx2_table] - tzcnt wd, wm - movu m0, [tlq+ 1] - movu m1, [tlq+33] - movifnidn hd, hm - movsxd wq, [r5+wq*4] - add wq, r5 - lea stride3q, [strideq*3] - jmp wq - -%macro IPRED_H 2 ; w, store_type - vpbroadcastb m0, [tlq-1] - vpbroadcastb m1, [tlq-2] - vpbroadcastb m2, [tlq-3] - sub tlq, 4 - vpbroadcastb m3, [tlq+0] - mov%2 [dstq+strideq*0], m0 - mov%2 [dstq+strideq*1], m1 - mov%2 [dstq+strideq*2], m2 - mov%2 [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w%1 - RET -ALIGN function_align -%endmacro - -INIT_XMM avx2 -cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3 - lea r5, [ipred_h_avx2_table] - tzcnt wd, wm - 
movifnidn hd, hm - movsxd wq, [r5+wq*4] - add wq, r5 - lea stride3q, [strideq*3] - jmp wq -.w4: - IPRED_H 4, d -.w8: - IPRED_H 8, q -.w16: - IPRED_H 16, a -INIT_YMM avx2 -.w32: - IPRED_H 32, a -.w64: - vpbroadcastb m0, [tlq-1] - vpbroadcastb m1, [tlq-2] - vpbroadcastb m2, [tlq-3] - sub tlq, 4 - vpbroadcastb m3, [tlq+0] - mova [dstq+strideq*0+32*0], m0 - mova [dstq+strideq*0+32*1], m0 - mova [dstq+strideq*1+32*0], m1 - mova [dstq+strideq*1+32*1], m1 - mova [dstq+strideq*2+32*0], m2 - mova [dstq+strideq*2+32*1], m2 - mova [dstq+stride3q +32*0], m3 - mova [dstq+stride3q +32*1], m3 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w64 - RET - -%macro PAETH 2 ; top, ldiff - pavgb m1, m%1, m3 ; Calculating tldiff normally requires - pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it - pand m0, m4 ; in 8-bit with some tricks which avoids - psubusb m2, m5, m1 ; having to unpack everything to 16-bit. - psubb m1, m0 - psubusb m1, m5 - por m1, m2 - paddusb m1, m1 - por m1, m0 ; min(tldiff, 255) - psubusb m2, m5, m3 - psubusb m0, m3, m5 - por m2, m0 ; tdiff - pminub m2, m%2 - pcmpeqb m0, m%2, m2 ; ldiff <= tdiff - vpblendvb m0, m%1, m3, m0 - pminub m1, m2 - pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff - vpblendvb m0, m5, m0, m1 -%endmacro - -cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h -%define base r5-ipred_paeth_avx2_table - lea r5, [ipred_paeth_avx2_table] - tzcnt wd, wm - vpbroadcastb m5, [tlq] ; topleft - movifnidn hd, hm - movsxd wq, [r5+wq*4] - vpbroadcastd m4, [base+pb_1] - add wq, r5 - jmp wq -.w4: - vpbroadcastd m6, [tlq+1] ; top - mova m8, [base+ipred_h_shuf] - lea r3, [strideq*3] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 ; ldiff -.w4_loop: - sub tlq, 8 - vpbroadcastq m3, [tlq] - pshufb m3, m8 ; left - PAETH 6, 7 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xm1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r3 ], xm1, 2 - cmp hd, 4 - je .ret - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xm1, 1 - pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+r3 ], xm1, 3 - lea dstq, [dstq+strideq*4] - sub hd, 8 - jg .w4_loop -.ret: - RET -ALIGN function_align -.w8: - vpbroadcastq m6, [tlq+1] - mova m8, [base+ipred_h_shuf] - lea r3, [strideq*3] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 -.w8_loop: - sub tlq, 4 - vpbroadcastd m3, [tlq] - pshufb m3, m8 - PAETH 6, 7 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w8_loop - RET -ALIGN function_align -.w16: - vbroadcasti128 m6, [tlq+1] - mova xm8, xm4 ; lower half = 1, upper half = 0 - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 -.w16_loop: - sub tlq, 2 - vpbroadcastd m3, [tlq] - pshufb m3, m8 - PAETH 6, 7 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w16_loop - RET -ALIGN function_align -.w32: - movu m6, [tlq+1] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 -.w32_loop: - dec tlq - vpbroadcastb m3, [tlq] - PAETH 6, 7 - mova [dstq], m0 - add dstq, strideq - dec hd - jg .w32_loop - RET -ALIGN function_align -.w64: - movu m6, [tlq+ 1] - movu m7, [tlq+33] -%if WIN64 - movaps r4m, xmm9 -%endif - psubusb m8, m5, m6 - psubusb m0, m6, m5 - psubusb m9, m5, m7 - psubusb m1, m7, m5 - por m8, m0 - por m9, m1 -.w64_loop: - dec tlq - vpbroadcastb m3, [tlq] - PAETH 6, 8 - mova [dstq+32*0], m0 - PAETH 7, 9 - mova [dstq+32*1], m0 - add 
dstq, strideq - dec hd - jg .w64_loop -%if WIN64 - movaps xmm9, r4m -%endif - RET - -%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] - ; w * a = (w - 128) * a + 128 * a - ; (256 - w) * b = (127 - w) * b + 129 * b - pmaddubsw m0, m%3, m%1 - pmaddubsw m1, m%4, m%2 - paddw m0, m%5 - paddw m1, m%6 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 -%endmacro - -cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights -%define base r6-ipred_smooth_v_avx2_table - lea r6, [ipred_smooth_v_avx2_table] - tzcnt wd, wm - mov hd, hm - movsxd wq, [r6+wq*4] - vpbroadcastd m0, [base+pb_127_m127] - vpbroadcastd m1, [base+pw_128] - lea weightsq, [base+smooth_weights+hq*4] - neg hq - vpbroadcastb m5, [tlq+hq] ; bottom - add wq, r6 - jmp wq -.w4: - vpbroadcastd m2, [tlq+1] - punpcklbw m2, m5 ; top, bottom - mova m5, [base+ipred_v_shuf] - lea r3, [strideq*3] - punpckldq m4, m5, m5 - punpckhdq m5, m5 - pmaddubsw m3, m2, m0 - paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok - paddw m3, m1 ; 128 * top + 129 * bottom + 128 -.w4_loop: - vbroadcasti128 m1, [weightsq+hq*2] - pshufb m0, m1, m4 - pshufb m1, m5 - SMOOTH 0, 1, 2, 2, 3, 3 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xm1 - pextrd [dstq+strideq*2], xm0, 1 - pextrd [dstq+r3 ], xm1, 1 - cmp hd, -4 - je .ret - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm1, 2 - pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+r3 ], xm1, 3 - lea dstq, [dstq+strideq*4] - add hq, 8 - jl .w4_loop -.ret: - RET -ALIGN function_align -.w8: - vpbroadcastq m2, [tlq+1] - punpcklbw m2, m5 - mova m5, [base+ipred_v_shuf] - lea r3, [strideq*3] - pshufd m4, m5, q0000 - pshufd m5, m5, q1111 - pmaddubsw m3, m2, m0 - paddw m1, m2 - paddw m3, m1 -.w8_loop: - vpbroadcastq m1, [weightsq+hq*2] - pshufb m0, m1, m4 - pshufb m1, m5 - SMOOTH 0, 1, 2, 2, 3, 3 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - add hq, 4 - jl .w8_loop - RET -ALIGN function_align -.w16: - WIN64_SPILL_XMM 7 - vbroadcasti128 m3, [tlq+1] - mova m6, [base+ipred_v_shuf] - punpcklbw m2, m3, m5 - punpckhbw m3, m5 - pmaddubsw m4, m2, m0 - pmaddubsw m5, m3, m0 - paddw m0, m1, m2 - paddw m1, m3 - paddw m4, m0 - paddw m5, m1 -.w16_loop: - vpbroadcastd m1, [weightsq+hq*2] - pshufb m1, m6 - SMOOTH 1, 1, 2, 3, 4, 5 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - add hq, 2 - jl .w16_loop - RET -ALIGN function_align -.w32: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 6 - movu m3, [tlq+1] - punpcklbw m2, m3, m5 - punpckhbw m3, m5 - pmaddubsw m4, m2, m0 - pmaddubsw m5, m3, m0 - paddw m0, m1, m2 - paddw m1, m3 - paddw m4, m0 - paddw m5, m1 -.w32_loop: - vpbroadcastw m1, [weightsq+hq*2] - SMOOTH 1, 1, 2, 3, 4, 5 - mova [dstq], m0 - add dstq, strideq - inc hq - jl .w32_loop - RET -ALIGN function_align -.w64: - WIN64_SPILL_XMM 11 - movu m4, [tlq+ 1] - movu m8, [tlq+33] - punpcklbw m3, m4, m5 - punpckhbw m4, m5 - punpcklbw m7, m8, m5 - punpckhbw m8, m5 - pmaddubsw m5, m3, m0 - pmaddubsw m6, m4, m0 - pmaddubsw m9, m7, m0 - pmaddubsw m10, m8, m0 - paddw m2, m1, m3 - paddw m5, m2 - paddw m2, m1, m4 - paddw m6, m2 - paddw m0, m1, m7 - paddw m9, m0 - paddw m1, m8 - paddw m10, m1 -.w64_loop: - vpbroadcastw m2, [weightsq+hq*2] - SMOOTH 2, 2, 3, 4, 5, 6 - mova [dstq+32*0], m0 - SMOOTH 2, 2, 7, 8, 9, 10 - mova [dstq+32*1], m0 - add dstq, strideq - inc hq - jl .w64_loop 
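Both PAETH macros (the 16 bpc SSSE3 one near the top of the added file and the 8-bit AVX2 one above, which stays in 8 bits to avoid unpacking to 16-bit intermediates) implement the same selection rule. A scalar sketch of that rule, with illustrative names:

```c
#include <stdlib.h>

static int paeth_sketch(const int left, const int top, const int topleft) {
    const int base   = left + top - topleft;
    const int ldiff  = abs(base - left);     /* == |top  - topleft| */
    const int tdiff  = abs(base - top);      /* == |left - topleft| */
    const int tldiff = abs(base - topleft);
    /* prefer left, then top, then topleft -- the same precedence the
     * mask/blend sequences above resolve to */
    if (ldiff <= tdiff && ldiff <= tldiff) return left;
    return tdiff <= tldiff ? top : topleft;
}
```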
- RET - -%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used - %assign stack_offset 0 - %assign stack_size_padded 0 - %assign regs_used %2 - %xdefine rstk rsp - SETUP_STACK_POINTER %1 - %if regs_used != %2 && WIN64 - PUSH r%2 - %endif - ALLOC_STACK %1, %3 -%endmacro - -cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h -%define base r6-ipred_smooth_h_avx2_table - lea r6, [ipred_smooth_h_avx2_table] - mov wd, wm - vpbroadcastb m3, [tlq+wq] ; right - tzcnt wd, wd - mov hd, hm - movsxd wq, [r6+wq*4] - vpbroadcastd m4, [base+pb_127_m127] - vpbroadcastd m5, [base+pw_128] - add wq, r6 - jmp wq -.w4: - WIN64_SPILL_XMM 8 - vpbroadcastq m6, [base+smooth_weights+4*2] - mova m7, [base+ipred_h_shuf] - sub tlq, 8 - sub tlq, hq - lea r3, [strideq*3] -.w4_loop: - vpbroadcastq m2, [tlq+hq] - pshufb m2, m7 - punpcklbw m1, m2, m3 ; left, right - punpckhbw m2, m3 - pmaddubsw m0, m1, m4 ; 127 * left - 127 * right - paddw m0, m1 ; 128 * left + 129 * right - pmaddubsw m1, m6 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m1, m2 - pmaddubsw m2, m6 - paddw m2, m5 - paddw m1, m2 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xm1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r3 ], xm1, 2 - cmp hd, 4 - je .ret - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xm1, 1 - pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+r3 ], xm1, 3 - lea dstq, [dstq+strideq*4] - sub hd, 8 - jg .w4_loop -.ret: - RET -ALIGN function_align -.w8: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 8 - vbroadcasti128 m6, [base+smooth_weights+8*2] - mova m7, [base+ipred_h_shuf] - sub tlq, 4 - lea r3, [strideq*3] - sub tlq, hq -.w8_loop: - vpbroadcastd m2, [tlq+hq] - pshufb m2, m7 - punpcklbw m1, m2, m3 - punpckhbw m2, m3 - pmaddubsw m0, m1, m4 - paddw m0, m1 - pmaddubsw m1, m6 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m1, m2 - pmaddubsw m2, m6 - paddw m2, m5 - paddw m1, m2 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w8_loop - RET -ALIGN function_align -.w16: - SETUP_STACK_FRAME 32*4, 7, 8 - lea r3, [rsp+64*2-4] - call .prep ; only worthwhile for for w16 and above - sub tlq, 2 - vpbroadcastd xm6, [base+pb_1] - mova xm7, [base+ipred_v_shuf+16] - vinserti128 m7, [base+ipred_v_shuf+ 0], 1 - vbroadcasti128 m4, [base+smooth_weights+16*2] - vbroadcasti128 m5, [base+smooth_weights+16*3] -.w16_loop: - vpbroadcastd m1, [tlq+hq] - vpbroadcastd m2, [r3+hq*2] - pshufb m1, m6 - punpcklbw m1, m3 - pshufb m2, m7 - SMOOTH 4, 5, 1, 1, 2, 2 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w16_loop - RET -ALIGN function_align -.w32: - SETUP_STACK_FRAME 32*4, 7, 6 - lea r3, [rsp+64*2-2] - call .prep - dec tlq - mova xm4, [base+smooth_weights+16*4] - vinserti128 m4, [base+smooth_weights+16*6], 1 - mova xm5, [base+smooth_weights+16*5] - vinserti128 m5, [base+smooth_weights+16*7], 1 -.w32_loop: - vpbroadcastb m1, [tlq+hq] - punpcklbw m1, m3 - vpbroadcastw m2, [r3+hq*2] - SMOOTH 4, 5, 1, 1, 2, 2 - mova [dstq], m0 - add dstq, strideq - dec hd - jg .w32_loop - RET -ALIGN function_align -.w64: - SETUP_STACK_FRAME 32*4, 7, 9 - lea r3, [rsp+64*2-2] - call .prep - add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table - dec tlq - 
mova xm5, [r6-16*7] - vinserti128 m5, [r6-16*5], 1 - mova xm6, [r6-16*6] - vinserti128 m6, [r6-16*4], 1 - mova xm7, [r6-16*3] - vinserti128 m7, [r6-16*1], 1 - mova xm8, [r6-16*2] - vinserti128 m8, [r6-16*0], 1 -.w64_loop: - vpbroadcastb m2, [tlq+hq] - punpcklbw m2, m3 - vpbroadcastw m4, [r3+hq*2] - SMOOTH 5, 6, 2, 2, 4, 4 - mova [dstq+32*0], m0 - SMOOTH 7, 8, 2, 2, 4, 4 - mova [dstq+32*1], m0 - add dstq, strideq - dec hd - jg .w64_loop - RET -ALIGN function_align -.prep: - vpermq m2, [tlq-32*1], q3120 - punpckhbw m1, m2, m3 - punpcklbw m2, m3 - pmaddubsw m0, m1, m4 ; 127 * left - 127 * right - paddw m1, m5 ; 1 * left + 256 * right + 128 - paddw m0, m1 ; 128 * left + 129 * right + 128 - pmaddubsw m1, m2, m4 - paddw m2, m5 - paddw m1, m2 - vpermq m2, [tlq-32*2], q3120 - mova [rsp+gprsize+32*3], m0 - mova [rsp+gprsize+32*2], m1 - punpckhbw m1, m2, m3 - punpcklbw m2, m3 - pmaddubsw m0, m1, m4 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m2, m5 - paddw m1, m2 - mova [rsp+gprsize+32*1], m0 - mova [rsp+gprsize+32*0], m1 - sub r3, hq - sub tlq, hq - sub r3, hq - ret - -%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] - pmaddubsw m0, m%3, m%1 - pmaddubsw m1, m%4, m%2 -%ifnum %5 - paddw m0, m%5 -%else - paddw m0, %5 -%endif -%ifnum %6 - paddw m1, m%6 -%else - paddw m1, %6 -%endif - pavgw m0, m2 - pavgw m1, m3 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 -%endmacro - -cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights -%define base r6-ipred_smooth_avx2_table - lea r6, [ipred_smooth_avx2_table] - mov wd, wm - vpbroadcastb m4, [tlq+wq] ; right - tzcnt wd, wd - mov hd, hm - mov r5, tlq - sub r5, hq - movsxd wq, [r6+wq*4] - vpbroadcastd m5, [base+pb_127_m127] - vpbroadcastb m0, [r5] ; bottom - vpbroadcastd m3, [base+pw_255] - add wq, r6 - lea v_weightsq, [base+smooth_weights+hq*2] - jmp wq -.w4: - WIN64_SPILL_XMM 12 - mova m10, [base+ipred_h_shuf] - vpbroadcastq m11, [base+smooth_weights+4*2] - mova m7, [base+ipred_v_shuf] - vpbroadcastd m8, [tlq+1] - sub tlq, 8 - lea r3, [strideq*3] - sub tlq, hq - punpcklbw m8, m0 ; top, bottom - pshufd m6, m7, q2200 - pshufd m7, m7, q3311 - pmaddubsw m9, m8, m5 - paddw m3, m8 ; 1 * top + 255 * bottom + 255 - paddw m9, m3 ; 128 * top + 129 * bottom + 255 -.w4_loop: - vpbroadcastq m1, [tlq+hq] - pshufb m1, m10 - punpcklbw m0, m1, m4 ; left, right - punpckhbw m1, m4 - pmaddubsw m2, m0, m5 ; 127 * left - 127 * right - pmaddubsw m3, m1, m5 - paddw m2, m0 ; 128 * left + 129 * right - paddw m3, m1 - pmaddubsw m0, m11 - pmaddubsw m1, m11 - paddw m2, m0 - paddw m3, m1 - vbroadcasti128 m1, [v_weightsq] - add v_weightsq, 16 - pshufb m0, m1, m6 - pshufb m1, m7 - SMOOTH_2D_END 0, 1, 8, 8, 9, 9 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xm1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r3 ], xm1, 2 - cmp hd, 4 - je .ret - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xm1, 1 - pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+r3 ], xm1, 3 - lea dstq, [dstq+strideq*4] - sub hd, 8 - jg .w4_loop -.ret: - RET -ALIGN function_align -.w8: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 12 - mova m10, [base+ipred_h_shuf] - vbroadcasti128 m11, [base+smooth_weights+8*2] - mova m7, [base+ipred_v_shuf] - vpbroadcastq m8, [tlq+1] - sub tlq, 4 - lea r3, [strideq*3] - sub tlq, hq - punpcklbw m8, m0 - pshufd m6, m7, q0000 - pshufd m7, m7, q1111 - pmaddubsw m9, m8, m5 - paddw m3, m8 - paddw m9, m3 -.w8_loop: - vpbroadcastd m1, [tlq+hq] - pshufb m1, m10 - 
punpcklbw m0, m1, m4 - punpckhbw m1, m4 - pmaddubsw m2, m0, m5 - pmaddubsw m3, m1, m5 - paddw m2, m0 - paddw m3, m1 - pmaddubsw m0, m11 - pmaddubsw m1, m11 - paddw m2, m0 - paddw m3, m1 - vpbroadcastq m1, [v_weightsq] - add v_weightsq, 8 - pshufb m0, m1, m6 - pshufb m1, m7 - SMOOTH_2D_END 0, 1, 8, 8, 9, 9 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+r3 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w8_loop - RET -ALIGN function_align -.w16: - SETUP_STACK_FRAME 32*4, 7, 14 - vbroadcasti128 m11, [tlq+1] - lea r3, [rsp+64*2-4] - punpcklbw m10, m11, m0 ; top, bottom - punpckhbw m11, m0 - call .prep_v - sub tlq, 2 - pmaddubsw m12, m10, m5 - pmaddubsw m13, m11, m5 - vpbroadcastd xm5, [base+pb_1] - mova m9, [base+ipred_v_shuf] - vbroadcasti128 m6, [base+smooth_weights+16*2] - vbroadcasti128 m7, [base+smooth_weights+16*3] - vpermq m8, m9, q1032 - paddw m0, m10, m3 - paddw m3, m11 - paddw m12, m0 - paddw m13, m3 -.w16_loop: - vpbroadcastd m3, [tlq+hq] - vpbroadcastd m0, [r3+hq*2] - vpbroadcastd m1, [v_weightsq] - add v_weightsq, 4 - pshufb m3, m5 - punpcklbw m3, m4 ; left, right - pmaddubsw m2, m3, m6 - pmaddubsw m3, m7 - pshufb m0, m8 - pshufb m1, m9 - paddw m2, m0 - paddw m3, m0 - SMOOTH_2D_END 1, 1, 10, 11, 12, 13 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w16_loop - RET -ALIGN function_align -.w32: - SETUP_STACK_FRAME 32*4, 7, 11 - movu m8, [tlq+1] - lea r3, [rsp+64*2-2] - punpcklbw m7, m8, m0 - punpckhbw m8, m0 - call .prep_v - dec tlq - pmaddubsw m9, m7, m5 - pmaddubsw m10, m8, m5 - mova xm5, [base+smooth_weights+16*4] - vinserti128 m5, [base+smooth_weights+16*6], 1 - mova xm6, [base+smooth_weights+16*5] - vinserti128 m6, [base+smooth_weights+16*7], 1 - paddw m0, m7, m3 - paddw m3, m8 - paddw m9, m0 - paddw m10, m3 -.w32_loop: - vpbroadcastb m3, [tlq+hq] - punpcklbw m3, m4 - vpbroadcastw m0, [r3+hq*2] - vpbroadcastw m1, [v_weightsq] - add v_weightsq, 2 - pmaddubsw m2, m3, m5 - pmaddubsw m3, m6 - paddw m2, m0 - paddw m3, m0 - SMOOTH_2D_END 1, 1, 7, 8, 9, 10 - mova [dstq], m0 - add dstq, strideq - dec hd - jg .w32_loop - RET -ALIGN function_align -.w64: - SETUP_STACK_FRAME 32*8, 7, 16 - movu m13, [tlq+1 ] - movu m15, [tlq+33] - add r6, smooth_weights+16*15-ipred_smooth_avx2_table - lea r3, [rsp+64*2-2] - punpcklbw m12, m13, m0 - punpckhbw m13, m0 - punpcklbw m14, m15, m0 - punpckhbw m15, m0 - call .prep_v - dec tlq - pmaddubsw m0, m12, m5 - pmaddubsw m1, m13, m5 - pmaddubsw m2, m14, m5 - pmaddubsw m5, m15, m5 - mova xm8, [r6-16*7] - vinserti128 m8, [r6-16*5], 1 - mova xm9, [r6-16*6] - vinserti128 m9, [r6-16*4], 1 - mova xm10, [r6-16*3] - vinserti128 m10, [r6-16*1], 1 - mova xm11, [r6-16*2] - vinserti128 m11, [r6-16*0], 1 - lea r6, [rsp+32*4] - paddw m0, m3 - paddw m1, m3 - paddw m2, m3 - paddw m3, m5 - paddw m0, m12 - paddw m1, m13 - paddw m2, m14 - paddw m3, m15 - mova [r6+32*0], m0 - mova [r6+32*1], m1 - mova [r6+32*2], m2 - mova [r6+32*3], m3 -.w64_loop: - vpbroadcastb m5, [tlq+hq] - punpcklbw m5, m4 - vpbroadcastw m6, [r3+hq*2] - vpbroadcastw m7, [v_weightsq] - add v_weightsq, 2 - pmaddubsw m2, m5, m8 - pmaddubsw m3, m5, m9 - paddw m2, m6 - paddw m3, m6 - SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] - mova [dstq+32*0], m0 - pmaddubsw m2, m5, m10 - pmaddubsw m3, m5, m11 - paddw m2, m6 - paddw m3, m6 - SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] - mova [dstq+32*1], m0 - add dstq, strideq - dec hd - jg .w64_loop - RET 
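ipred_smooth above (and ipred_smooth_16bpc earlier in the added file) combine the vertical and horizontal smooth filters in one pass; the asm rearranges the fixed-point math as described in the SMOOTH macro comments. A scalar sketch of the plain form of the blend, assuming `top`/`left` hold the decoded edges and the two weight pointers index the smooth-weights table at the entries for the block width and height (names are illustrative):

```c
#include <stddef.h>
#include <stdint.h>

/* top[] is the row above, left[] the column to the left (top to bottom) */
static void smooth_sketch(uint16_t *dst, const ptrdiff_t stride,
                          const uint16_t *top, const uint16_t *left,
                          const uint8_t *weights_hor,
                          const uint8_t *weights_ver,
                          const int w, const int h)
{
    const int right = top[w - 1], bottom = left[h - 1];
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int pred = weights_ver[y]         * top[x]  +
                             (256 - weights_ver[y]) * bottom  +
                             weights_hor[x]         * left[y] +
                             (256 - weights_hor[x]) * right;
            dst[x] = (uint16_t)((pred + 256) >> 9);
        }
        dst += stride;
    }
}
```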
-ALIGN function_align -.prep_v: - vpermq m2, [tlq-32*1], q3120 - punpckhbw m1, m2, m4 - punpcklbw m2, m4 - pmaddubsw m0, m1, m5 ; 127 * left - 127 * right - paddw m0, m1 ; 128 * left + 129 * right - pmaddubsw m1, m2, m5 - paddw m1, m2 - vpermq m2, [tlq-32*2], q3120 - mova [rsp+gprsize+32*3], m0 - mova [rsp+gprsize+32*2], m1 - punpckhbw m1, m2, m4 - punpcklbw m2, m4 - pmaddubsw m0, m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m5 - paddw m1, m2 - mova [rsp+gprsize+32*1], m0 - mova [rsp+gprsize+32*0], m1 - sub r3, hq - sub tlq, hq - sub r3, hq - ret - -cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase - %assign org_stack_offset stack_offset - lea r6, [ipred_z1_avx2_table] - tzcnt wd, wm - movifnidn angled, anglem - movifnidn hd, hm - lea r7, [dr_intra_derivative] - inc tlq - movsxd wq, [r6+wq*4] - add wq, r6 - mov dxd, angled - and dxd, 0x7e - add angled, 165 ; ~90 - movzx dxd, word [r7+dxq] - xor angled, 0x4ff ; d = 90 - angle - vpbroadcastd m3, [pw_512] - vpbroadcastd m4, [pw_62] - vpbroadcastd m5, [pw_64] - jmp wq -.w4: - cmp angleb, 40 - jae .w4_no_upsample - lea r3d, [angleq-1024] - sar r3d, 7 - add r3d, hd - jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) - ALLOC_STACK -32, 8 - mova xm1, [tlq-1] - pshufb xm0, xm1, [z_upsample1] - pshufb xm1, [z_upsample2] - vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse - add dxd, dxd ; pw_512 (which is already in m3) - pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 - pextrd [rsp+16], xm1, 3 ; top[max_base_x] - pmaddubsw xm1, xm2 - movd xm7, dxd - mov r3d, dxd ; xpos - vpbroadcastw m7, xm7 - paddw xm1, xm0 - movq xm0, [tlq] - pmulhrsw xm1, xm3 - pslldq m6, m7, 8 - paddw xm2, xm7, xm7 - lea r2, [strideq*3] - paddw m6, m7 - packuswb xm1, xm1 - paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 - punpcklbw xm0, xm1 - psllw m7, 2 - mova [rsp], xm0 -.w4_upsample_loop: - lea r5d, [r3+dxq] - shr r3d, 6 ; base0 - vpbroadcastq m1, [rsp+r3] - lea r3d, [r5+dxq] - shr r5d, 6 ; base1 - vpbroadcastq m2, [rsp+r5] - lea r5d, [r3+dxq] - shr r3d, 6 ; base2 - movq xm0, [rsp+r3] - lea r3d, [r5+dxq] - shr r5d, 6 ; base3 - movhps xm0, [rsp+r5] - vpblendd m1, m2, 0xc0 - pand m2, m4, m6 ; frac - vpblendd m0, m1, 0xf0 - psubw m1, m5, m2 ; 64-frac - psllw m2, 8 - por m1, m2 ; 64-frac, frac - pmaddubsw m0, m1 - paddw m6, m7 ; xpos += dx - pmulhrsw m0, m3 - packuswb m0, m0 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*2], xm0 - pextrd [dstq+r2 ], xm0, 1 - movd [dstq+strideq*0], xm1 - pextrd [dstq+strideq*1], xm1, 1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4_upsample_loop - RET -ALIGN function_align -.filter_strength: ; w4/w8/w16 - ; The C version uses a lot of branches, but we can do all the comparisons - ; in parallel and use popcnt to get the final filter strength value. 
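The .w4 upsample path above doubles the edge before prediction: original samples stay at even positions and odd positions get a 4-tap interpolation (pb_36_m4 is that kernel scaled by 4 so pw_512 can double as the rounding constant, as the inline comment notes). A rough scalar sketch with simplified boundary handling; in the asm, edge[-1] is the top-left sample and the z_upsample2 shuffle clamps reads past the end:

```c
#include <stdint.h>

static void upsample_edge_sketch(uint8_t *out, const uint8_t *edge, const int n) {
    for (int i = 0; i < n; i++) {
        const int p0 = edge[i - 1];                  /* edge[-1] assumed valid */
        const int p1 = edge[i];
        const int p2 = edge[i + 1 < n ? i + 1 : n - 1];
        const int p3 = edge[i + 2 < n ? i + 2 : n - 1];
        const int f  = (9 * (p1 + p2) - (p0 + p3) + 8) >> 4;
        out[2 * i + 0] = edge[i];
        out[2 * i + 1] = (uint8_t)(f < 0 ? 0 : f > 255 ? 255 : f);
    }
}
```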
-%define base r3-z_filter_t0 - lea r3, [z_filter_t0] - movd xm0, maxbased - movd xm2, angled - shr angled, 8 ; is_sm << 1 - vpbroadcastb m0, xm0 - vpbroadcastb m2, xm2 - pcmpeqb m1, m0, [base+z_filter_wh] - pand m1, m2 - mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases - pcmpgtb m1, m2 - pmovmskb r5d, m1 - ret -.w4_no_upsample: - %assign stack_offset org_stack_offset - ALLOC_STACK -16, 11 - mov maxbased, 7 - test angled, 0x400 ; !enable_intra_edge_filter - jnz .w4_main - lea maxbased, [hq+3] - call .filter_strength - mov maxbased, 7 - test r5d, r5d - jz .w4_main ; filter_strength == 0 - popcnt r5d, r5d - vpbroadcastd m7, [base+pb_8] - vbroadcasti128 m2, [tlq-1] - pminub m1, m7, [base+z_filter_s] - vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] - pminub m7, [base+z_filter_s+8] - vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] - vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] - pshufb m0, m2, m1 - shufps m1, m7, q2121 - pmaddubsw m0, m8 - pshufb m1, m2, m1 - pmaddubsw m1, m9 - pshufb m2, m7 - pmaddubsw m2, m10 - paddw m0, m1 - paddw m0, m2 - pmulhrsw m0, m3 - mov r3d, 9 - mov tlq, rsp - cmp hd, 4 - cmovne maxbased, r3d - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - mova [tlq], xm0 -.w4_main: - movd xm6, dxd - vpbroadcastq m0, [z_base_inc] ; base_inc << 6 - vpbroadcastb m7, [tlq+maxbaseq] - shl maxbased, 6 - vpbroadcastw m6, xm6 - mov r3d, dxd ; xpos - movd xm9, maxbased - vpbroadcastw m9, xm9 - vbroadcasti128 m8, [z1_shuf_w4] - psrlw m7, 8 ; top[max_base_x] - paddw m10, m6, m6 - psubw m9, m0 ; max_base_x - vpblendd m6, m10, 0xcc - mova xm0, xm10 - paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 - paddw m10, m10 -.w4_loop: - lea r5d, [r3+dxq] - shr r3d, 6 ; base0 - vpbroadcastq m1, [tlq+r3] - lea r3d, [r5+dxq] - shr r5d, 6 ; base1 - vpbroadcastq m2, [tlq+r5] - lea r5d, [r3+dxq] - shr r3d, 6 ; base2 - movq xm0, [tlq+r3] - lea r3d, [r5+dxq] - shr r5d, 6 ; base3 - movhps xm0, [tlq+r5] - vpblendd m1, m2, 0xc0 - pand m2, m4, m6 ; frac - vpblendd m0, m1, 0xf0 - psubw m1, m5, m2 ; 64-frac - psllw m2, 8 - pshufb m0, m8 - por m1, m2 ; 64-frac, frac - pmaddubsw m0, m1 - pcmpgtw m1, m9, m6 ; base < max_base_x - pmulhrsw m0, m3 - paddw m6, m10 ; xpos += dx - lea r5, [dstq+strideq*2] - vpblendvb m0, m7, m0, m1 - packuswb m0, m0 - vextracti128 xm1, m0, 1 - movd [r5 +strideq*0], xm0 - pextrd [r5 +strideq*1], xm0, 1 - movd [dstq+strideq*0], xm1 - pextrd [dstq+strideq*1], xm1, 1 - sub hd, 4 - jz .w4_end - lea dstq, [dstq+strideq*4] - cmp r3d, maxbased - jb .w4_loop - packuswb xm7, xm7 - lea r6, [strideq*3] -.w4_end_loop: - movd [dstq+strideq*0], xm7 - movd [dstq+strideq*1], xm7 - movd [dstq+strideq*2], xm7 - movd [dstq+r6 ], xm7 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4_end_loop -.w4_end: - RET -ALIGN function_align -.w8: - lea r3d, [angleq+216] - mov r3b, hb - cmp r3d, 8 - ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 - %assign stack_offset org_stack_offset - ALLOC_STACK -32, 8 - movu xm2, [z_filter_s+6] - mova xm0, [tlq-1] - movd xm6, hd - vinserti128 m0, [tlq+7], 1 - vpbroadcastb xm6, xm6 - vbroadcasti128 m1, [z_upsample1] - pminub xm6, xm2 - vpbroadcastd m7, [pb_36_m4] - vinserti128 m2, xm6, 1 - add dxd, dxd - pshufb m1, m0, m1 - pshufb m2, m0, m2 - movd xm6, dxd - pmaddubsw m1, m7 - pmaddubsw m2, m7 - vpbroadcastw m6, xm6 - mov r3d, dxd - psrldq m0, 1 - lea r2, [strideq*3] - paddw m7, m6, m6 - paddw m1, m2 - vpblendd m6, m7, 0xf0 - pmulhrsw m1, m3 - pslldq m2, m7, 8 - paddw m7, m7 - paddw m6, m2 - packuswb m1, m1 - punpcklbw m0, m1 - mova [rsp], m0 
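The .w4_main/.w4_loop code above (and the wider variants that follow) walk the prepared top edge in 26.6 fixed point: dx comes from dr_intra_derivative, the integer part of the position selects a pair of edge samples, the fractional part (masked with pw_62) blends them, and anything at or past max_base_x is clamped to the last edge sample. A scalar sketch with illustrative names:

```c
#include <stddef.h>
#include <stdint.h>

static void z1_sketch(uint8_t *dst, const ptrdiff_t stride,
                      const uint8_t *top, const int w, const int h,
                      const int dx, const int max_base_x)
{
    int xpos = dx;                        /* 26.6 fixed-point x position */
    for (int y = 0; y < h; y++, xpos += dx) {
        const int frac = xpos & 0x3e;     /* the pw_62 mask */
        int base = xpos >> 6;
        for (int x = 0; x < w; x++, base++)
            dst[x] = base < max_base_x
                ? (uint8_t)((top[base] * (64 - frac) +
                             top[base + 1] * frac + 32) >> 6)
                : top[max_base_x];
        dst += stride;
    }
}
```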
-.w8_upsample_loop: - lea r5d, [r3+dxq] - shr r3d, 6 ; base0 - movu xm0, [rsp+r3] - lea r3d, [r5+dxq] - shr r5d, 6 ; base1 - vinserti128 m0, [rsp+r5], 1 - lea r5d, [r3+dxq] - shr r3d, 6 ; base2 - pand m1, m4, m6 - psubw m2, m5, m1 - psllw m1, 8 - por m2, m1 - punpcklqdq m1, m2, m2 ; frac0 frac1 - pmaddubsw m0, m1 - movu xm1, [rsp+r3] - lea r3d, [r5+dxq] - shr r5d, 6 ; base3 - vinserti128 m1, [rsp+r5], 1 - punpckhqdq m2, m2 ; frac2 frac3 - pmaddubsw m1, m2 - pmulhrsw m0, m3 - paddw m6, m7 - pmulhrsw m1, m3 - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movhps [dstq+strideq*2], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+r2 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w8_upsample_loop - RET -.w8_no_intra_edge_filter: - and maxbased, 7 - or maxbased, 8 ; imin(h+7, 15) - jmp .w8_main -.w8_no_upsample: - %assign stack_offset org_stack_offset - ALLOC_STACK -32, 10 - lea maxbased, [hq+7] - test angled, 0x400 - jnz .w8_no_intra_edge_filter - call .filter_strength - test r5d, r5d - jz .w8_main ; filter_strength == 0 - popcnt r5d, r5d - movu xm2, [tlq] - pminub xm1, xm0, [base+z_filter_s+14] - vinserti128 m2, [tlq-1], 1 - vinserti128 m1, [base+z_filter_s+ 0], 1 - vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] - pminub xm0, [base+z_filter_s+22] - vinserti128 m0, [base+z_filter_s+ 8], 1 - pshufb m6, m2, m1 - pmaddubsw m6, m7 - vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] - movzx r3d, byte [tlq+15] - shufps m1, m0, q2121 - pshufb m1, m2, m1 - pmaddubsw m1, m7 - paddw m1, m6 - sub r5d, 3 - jnz .w8_3tap - ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, - ; which also results in an awkward edge case where out[w*2] is - ; slightly different from out[max_base_x] when h > w. - vpbroadcastd m7, [z_filter_k+4*8] - movzx r2d, byte [tlq+14] - pshufb m2, m0 - pmaddubsw m2, m7 - sub r2d, r3d - lea r2d, [r2+r3*8+4] - shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 - mov [rsp+16], r2b - paddw m1, m2 -.w8_3tap: - pmulhrsw m1, m3 - sar r5d, 1 - mov tlq, rsp - add r5d, 17 ; w*2 + (filter_strength == 3) - cmp hd, 16 - cmovns maxbased, r5d - mov [tlq+r5], r3b - vextracti128 xm0, m1, 1 - packuswb xm0, xm1 - mova [tlq], xm0 -.w8_main: - movd xm2, dxd - vbroadcasti128 m0, [z_base_inc] - vpbroadcastw m2, xm2 - vpbroadcastb m7, [tlq+maxbaseq] - shl maxbased, 6 - movd xm9, maxbased - vbroadcasti128 m8, [z_filter_s+2] - vpbroadcastw m9, xm9 - psrlw m7, 8 - psubw m9, m0 - mov r3d, dxd - paddw m6, m2, m2 - vpblendd m2, m6, 0xf0 -.w8_loop: - lea r5d, [r3+dxq] - shr r3d, 6 - pand m0, m4, m2 - psubw m1, m5, m0 - psllw m0, 8 - por m1, m0 - movu xm0, [tlq+r3] - lea r3d, [r5+dxq] - shr r5d, 6 ; base1 - vinserti128 m0, [tlq+r5], 1 - pshufb m0, m8 - pmaddubsw m0, m1 - pcmpgtw m1, m9, m2 - paddw m2, m6 - pmulhrsw m0, m3 - vpblendvb m0, m7, m0, m1 - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - movq [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xm0 - sub hd, 2 - jz .w8_end - lea dstq, [dstq+strideq*2] - cmp r3d, maxbased - jb .w8_loop - packuswb xm7, xm7 -.w8_end_loop: - movq [dstq+strideq*0], xm7 - movq [dstq+strideq*1], xm7 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w8_end_loop -.w8_end: - RET -.w16_no_intra_edge_filter: - and maxbased, 15 - or maxbased, 16 ; imin(h+15, 31) - jmp .w16_main -ALIGN function_align -.w16: - %assign stack_offset org_stack_offset - ALLOC_STACK -64, 12 - lea maxbased, [hq+15] - test angled, 0x400 - jnz .w16_no_intra_edge_filter - call .filter_strength - test r5d, r5d - jz .w16_main ; filter_strength == 0 - popcnt r5d, r5d - vpbroadcastd m1, 
[base+pb_12] - vbroadcasti128 m6, [base+z_filter_s+8] - vinserti128 m2, m6, [base+z_filter_s], 0 - vinserti128 m6, [base+z_filter_s+16], 1 - mova xm10, [tlq-1] - vinserti128 m10, [tlq+3], 1 - vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] - vbroadcasti128 m7, [base+z_filter_s+14] - vinserti128 m8, m7, [base+z_filter_s+6], 0 - vinserti128 m7, [base+z_filter_s+22], 1 - psubw m0, m1 - movu xm11, [tlq+12] - vinserti128 m11, [tlq+16], 1 - pminub m8, m0 - pminub m7, m0 - pshufb m0, m10, m2 - shufps m2, m6, q2121 - pmaddubsw m0, m9 - pshufb m1, m11, m8 - shufps m8, m7, q2121 - pmaddubsw m1, m9 - vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] - movzx r3d, byte [tlq+31] - pshufb m2, m10, m2 - pmaddubsw m2, m9 - pshufb m8, m11, m8 - pmaddubsw m8, m9 - paddw m0, m2 - paddw m1, m8 - sub r5d, 3 - jnz .w16_3tap - vpbroadcastd m9, [z_filter_k+4*8] - movzx r2d, byte [tlq+30] - pshufb m10, m6 - pmaddubsw m10, m9 - pshufb m11, m7 - pmaddubsw m11, m9 - sub r2d, r3d - lea r2d, [r2+r3*8+4] - shr r2d, 3 - mov [rsp+32], r2b - paddw m0, m10 - paddw m1, m11 -.w16_3tap: - pmulhrsw m0, m3 - pmulhrsw m1, m3 - sar r5d, 1 - mov tlq, rsp - add r5d, 33 - cmp hd, 32 - cmovns maxbased, r5d - mov [tlq+r5], r3b - packuswb m0, m1 - vpermq m0, m0, q3120 - mova [tlq], m0 -.w16_main: - movd xm6, dxd - vbroadcasti128 m0, [z_base_inc] - vpbroadcastb m7, [tlq+maxbaseq] - shl maxbased, 6 - vpbroadcastw m6, xm6 - movd xm9, maxbased - vbroadcasti128 m8, [z_filter_s+2] - vpbroadcastw m9, xm9 - mov r3d, dxd - psubw m9, m0 - paddw m11, m6, m6 - psubw m10, m9, m3 ; 64*8 - vpblendd m6, m11, 0xf0 -.w16_loop: - lea r5d, [r3+dxq] - shr r3d, 6 ; base0 - pand m1, m4, m6 - psubw m2, m5, m1 - psllw m1, 8 - por m2, m1 - movu xm0, [tlq+r3+0] - movu xm1, [tlq+r3+8] - lea r3d, [r5+dxq] - shr r5d, 6 ; base1 - vinserti128 m0, [tlq+r5+0], 1 - vinserti128 m1, [tlq+r5+8], 1 - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - pcmpgtw m1, m9, m6 - pcmpgtw m2, m10, m6 - packsswb m1, m2 - paddw m6, m11 - vpblendvb m0, m7, m0, m1 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - sub hd, 2 - jz .w16_end - lea dstq, [dstq+strideq*2] - cmp r3d, maxbased - jb .w16_loop -.w16_end_loop: - mova [dstq+strideq*0], xm7 - mova [dstq+strideq*1], xm7 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w16_end_loop -.w16_end: - RET -ALIGN function_align -.w32: - %assign stack_offset org_stack_offset - ALLOC_STACK -96, 15 - lea r3d, [hq+31] - mov maxbased, 63 - cmp hd, 32 - cmovs maxbased, r3d - test angled, 0x400 ; !enable_intra_edge_filter - jnz .w32_main - vbroadcasti128 m0, [pb_0to15] - sub r3d, 29 ; h+2 - movu xm13, [tlq+29] ; 32-39 - movd xm1, r3d - movu xm14, [tlq+37] ; 40-47 - sub r3d, 8 ; h-6 - vinserti128 m14, [tlq+51], 1 ; 56-63 - vpbroadcastb xm1, xm1 - mova xm11, [tlq- 1] ; 0- 7 - vinserti128 m11, [tlq+13], 1 ; 16-23 - movd xm2, r3d - movu xm12, [tlq+ 5] ; 8-15 - vinserti128 m12, [tlq+19], 1 ; 24-31 - pminub xm1, xm0 ; clip 32x8 - mova m7, [z_filter_s+0] - pshufb xm13, xm1 - vpbroadcastd m1, [pb_12] - vpbroadcastb xm2, xm2 - vinserti128 m13, [tlq+43], 1 ; 48-55 - vinserti128 m8, m7, [z_filter_s+4], 1 - vpblendd m2, m1, 0xf0 - vinserti128 m7, [z_filter_s+12], 0 - pminub m2, m0 ; clip 32x16 and 32x(32|64) - vpbroadcastd m9, [z_filter_k+4*2+12*0] - pshufb m14, m2 - pshufb m0, m11, m8 - shufps m8, m7, q1021 - pmaddubsw m0, m9 - pshufb m2, m12, m8 - pmaddubsw m2, m9 - pshufb m1, m13, m8 - pmaddubsw m1, m9 - pshufb m6, m14, m8 - pmaddubsw m6, m9 - vpbroadcastd m9, 
[z_filter_k+4*2+12*1] - pshufb m10, m11, m8 - shufps m8, m7, q2121 - pmaddubsw m10, m9 - paddw m0, m10 - pshufb m10, m12, m8 - pmaddubsw m10, m9 - paddw m2, m10 - pshufb m10, m13, m8 - pmaddubsw m10, m9 - paddw m1, m10 - pshufb m10, m14, m8 - pmaddubsw m10, m9 - paddw m6, m10 - vpbroadcastd m9, [z_filter_k+4*2+12*2] - pshufb m11, m8 - pmaddubsw m11, m9 - pshufb m12, m7 - pmaddubsw m12, m9 - movzx r3d, byte [tlq+63] - movzx r2d, byte [tlq+62] - paddw m0, m11 - paddw m2, m12 - pshufb m13, m7 - pmaddubsw m13, m9 - pshufb m14, m7 - pmaddubsw m14, m9 - paddw m1, m13 - paddw m6, m14 - sub r2d, r3d - lea r2d, [r2+r3*8+4] ; edge case for 32x64 - pmulhrsw m0, m3 - pmulhrsw m2, m3 - pmulhrsw m1, m3 - pmulhrsw m6, m3 - shr r2d, 3 - mov [rsp+64], r2b - mov tlq, rsp - mov [tlq+65], r3b - mov r3d, 65 - cmp hd, 64 - cmove maxbased, r3d - packuswb m0, m2 - packuswb m1, m6 - mova [tlq+ 0], m0 - mova [tlq+32], m1 -.w32_main: - movd xm6, dxd - vpbroadcastb m7, [tlq+maxbaseq] - shl maxbased, 6 - vpbroadcastw m6, xm6 - movd xm9, maxbased - vbroadcasti128 m8, [z_filter_s+2] - vpbroadcastw m9, xm9 - mov r5d, dxd - psubw m9, [z_base_inc] - mova m11, m6 - psubw m10, m9, m3 ; 64*8 -.w32_loop: - mov r3d, r5d - shr r3d, 6 - pand m1, m4, m6 - psubw m2, m5, m1 - psllw m1, 8 - por m2, m1 - movu m0, [tlq+r3+0] - movu m1, [tlq+r3+8] - add r5d, dxd - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - pcmpgtw m1, m9, m6 - pcmpgtw m2, m10, m6 - packsswb m1, m2 - paddw m6, m11 - vpblendvb m0, m7, m0, m1 - mova [dstq], m0 - dec hd - jz .w32_end - add dstq, strideq - cmp r5d, maxbased - jb .w32_loop - test hb, 1 - jz .w32_end_loop - mova [dstq], m7 - add dstq, strideq - dec hd - jz .w32_end -.w32_end_loop: - mova [dstq+strideq*0], m7 - mova [dstq+strideq*1], m7 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w32_end_loop -.w32_end: - RET -ALIGN function_align -.w64: - %assign stack_offset org_stack_offset - ALLOC_STACK -128, 16 - lea maxbased, [hq+63] - test angled, 0x400 ; !enable_intra_edge_filter - jnz .w64_main - mova xm11, [tlq- 1] ; 0- 7 - vinserti128 m11, [tlq+13], 1 ; 16-23 - movu xm12, [tlq+ 5] ; 8-15 - vinserti128 m12, [tlq+19], 1 ; 24-31 - mova m7, [z_filter_s+0] - vinserti128 m8, m7, [z_filter_s+4], 1 - vinserti128 m7, [z_filter_s+12], 0 - vpbroadcastd m9, [z_filter_k+4*2+12*0] - movu xm13, [tlq+29] ; 32-39 - vinserti128 m13, [tlq+43], 1 ; 48-55 - movu xm14, [tlq+37] ; 40-47 - vinserti128 m14, [tlq+51], 1 ; 56-63 - pshufb m0, m11, m8 - shufps m8, m7, q1021 - pmaddubsw m0, m9 - pshufb m2, m12, m8 - pmaddubsw m2, m9 - pshufb m1, m13, m8 - pmaddubsw m1, m9 - pshufb m6, m14, m8 - pmaddubsw m6, m9 - vpbroadcastd m9, [z_filter_k+4*2+12*1] - pshufb m10, m11, m8 - shufps m15, m8, m7, q2121 - pmaddubsw m10, m9 - paddw m0, m10 - pshufb m10, m12, m15 - pmaddubsw m10, m9 - paddw m2, m10 - pshufb m10, m13, m15 - pmaddubsw m10, m9 - paddw m1, m10 - pshufb m10, m14, m15 - pmaddubsw m10, m9 - paddw m6, m10 - vpbroadcastd m10, [z_filter_k+4*2+12*2] - pshufb m11, m15 - pmaddubsw m11, m10 - pshufb m12, m7 - pmaddubsw m12, m10 - pshufb m13, m7 - pmaddubsw m13, m10 - pshufb m14, m7 - pmaddubsw m14, m10 - paddw m0, m11 - paddw m2, m12 - paddw m1, m13 - paddw m6, m14 - movu xm11, [tlq+ 61] ; 64- 71 - vinserti128 m11, [tlq+ 75], 1 ; 80- 87 - movu xm12, [tlq+ 69] ; 72- 79 - vinserti128 m12, [tlq+ 83], 1 ; 88- 95 - movu xm13, [tlq+ 93] ; 96-103 - vinserti128 m13, [tlq+107], 1 ; 112-119 - movu xm14, [tlq+101] ; 104-111 - vinserti128 m14, [tlq+115], 1 ; 120-127 - pmulhrsw 
m0, m3 - pmulhrsw m2, m3 - pmulhrsw m1, m3 - pmulhrsw m6, m3 - lea r3d, [hq-20] - mov tlq, rsp - packuswb m0, m2 - packuswb m1, m6 - vpbroadcastd xm2, [pb_14] - vbroadcasti128 m6, [pb_0to15] - mova [tlq+32*0], m0 - mova [tlq+32*1], m1 - movd xm0, r3d - vpbroadcastd m1, [pb_12] - vpbroadcastb m0, xm0 - paddb m0, m2 - pminub m0, m6 ; clip 64x16 and 64x32 - pshufb m12, m0 - pminub m1, m6 ; clip 64x64 - pshufb m14, m1 - pshufb m0, m11, m7 - pmaddubsw m0, m10 - pshufb m2, m12, m7 - pmaddubsw m2, m10 - pshufb m1, m13, m7 - pmaddubsw m1, m10 - pshufb m6, m14, m7 - pmaddubsw m6, m10 - pshufb m7, m11, m15 - pmaddubsw m7, m9 - pshufb m10, m12, m15 - pmaddubsw m10, m9 - paddw m0, m7 - pshufb m7, m13, m15 - pmaddubsw m7, m9 - paddw m2, m10 - pshufb m10, m14, m15 - pmaddubsw m10, m9 - paddw m1, m7 - paddw m6, m10 - vpbroadcastd m9, [z_filter_k+4*2+12*0] - pshufb m11, m8 - pmaddubsw m11, m9 - pshufb m12, m8 - pmaddubsw m12, m9 - pshufb m13, m8 - pmaddubsw m13, m9 - pshufb m14, m8 - pmaddubsw m14, m9 - paddw m0, m11 - paddw m2, m12 - paddw m1, m13 - paddw m6, m14 - pmulhrsw m0, m3 - pmulhrsw m2, m3 - pmulhrsw m1, m3 - pmulhrsw m6, m3 - packuswb m0, m2 - packuswb m1, m6 - mova [tlq+32*2], m0 - mova [tlq+32*3], m1 -.w64_main: - movd xm12, dxd - vpbroadcastb m7, [tlq+maxbaseq] - lea r3d, [dxq-64] - shl maxbased, 6 - vpbroadcastw m12, xm12 - sub r3d, maxbased - vbroadcasti128 m8, [z_filter_s+2] - movd xm6, r3d - mov r5d, dxd - mova m10, [pb_1to32] - vpbroadcastd m11, [pb_32] - vpbroadcastw m6, xm6 -.w64_loop: - mov r3d, r5d - shr r3d, 6 - movu m0, [tlq+r3+ 0] - movu m1, [tlq+r3+ 8] - pand m2, m4, m6 - psubw m9, m5, m2 - psllw m2, 8 - por m9, m2 - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m9 - pmaddubsw m1, m9 - psraw m2, m6, 6 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packsswb m2, m2 - paddb m2, m10 - packuswb m0, m1 - vpblendvb m0, m7, m0, m2 - mova [dstq+ 0], m0 - movu m0, [tlq+r3+32] - movu m1, [tlq+r3+40] - add r5d, dxd - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m9 - pmaddubsw m1, m9 - paddb m2, m11 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - paddw m6, m12 - packuswb m0, m1 - vpblendvb m0, m7, m0, m2 - mova [dstq+32], m0 - dec hd - jz .w64_end - add dstq, strideq - cmp r5d, maxbased - jb .w64_loop -.w64_end_loop: - mova [dstq+ 0], m7 - mova [dstq+32], m7 - add dstq, strideq - dec hd - jg .w64_end_loop -.w64_end: - RET - -cglobal ipred_z2, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy -%define base r9-z_filter_t0 - lea r9, [ipred_z2_avx2_table] - tzcnt wd, wm - movifnidn angled, anglem - movifnidn hd, hm - lea dxq, [dr_intra_derivative-90] - movsxd wq, [r9+wq*4] - movzx dyd, angleb - xor angled, 0x400 - mov r8, dxq - sub dxq, dyq - add wq, r9 - add r9, z_filter_t0-ipred_z2_avx2_table - mova m2, [tlq-64] - mova m0, [tlq-32] - mova m1, [tlq] - and dyd, ~1 - and dxq, ~1 - movzx dyd, word [r8+dyq] ; angle - 90 - movzx dxd, word [dxq+270] ; 180 - angle - vpbroadcastd m13, [base+pw_512] - vpbroadcastd m14, [base+pw_62] - vpbroadcastd m15, [base+pw_64] - mova [rsp+ 0], m2 - mova [rsp+32], m0 - mova [rsp+64], m1 - neg dxd - neg dyd - jmp wq -.w4: - vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 - vbroadcasti128 m10, [base+z1_shuf_w4] - vbroadcasti128 m11, [base+z2_shuf_h4] - lea r2d, [dxq+(65<<6)] ; xpos - movd xm5, dyd - mov r8d, (63-4)<<6 - mov dyq, -4 - pshuflw xm5, xm5, q0000 - pmullw xm5, [base+z2_ymul] - test angled, 0x400 - jnz .w4_main ; !enable_intra_edge_filter - lea r3d, [hq+2] - add angled, 1022 - shl r3d, 6 - test r3d, angled - jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && 
h == 8) - vpbroadcastd xm3, [base+pb_4] - call .upsample_above - sub angled, 1075 ; angle - 53 - lea r3d, [hq+3] - xor angled, 0x7f ; 180 - angle - call .filter_strength - jmp .w4_filter_left -ALIGN function_align -.filter_strength: - movd xm8, r3d - mov r3d, angled - movd xm7, angled - vpbroadcastb m8, xm8 - shr r3d, 8 ; is_sm << 1 - vpbroadcastb m7, xm7 - pcmpeqb m8, [base+z_filter_wh] - mova xm9, [r9+r3*8] - pand m0, m8, m7 - pcmpgtb m0, m9 - pmovmskb r3d, m0 - ret -ALIGN function_align -.upsample_above: ; w4/w8 - pshufb xm2, xm1, [base+z_upsample1-2] - pminub xm3, [base+z_filter_s+4] - vpbroadcastd xm4, [base+pb_36_m4] - vbroadcasti128 m10, [base+pb_0to15] - pshufb xm3, xm1, xm3 - pmaddubsw xm2, xm4 - pmaddubsw xm3, xm4 - lea r2d, [r2+dxq+(1<<6)] - add dxd, dxd - paddw xm2, xm3 - pmulhrsw xm2, xm13 - sub r8d, 3<<6 - paddw m6, m6 - packuswb xm2, xm2 - punpcklbw xm1, xm2 - mova [rsp+gprsize+64], xm1 - ret -ALIGN function_align -.upsample_left: ; h4/h8 - mov r3d, hd - and r3d, 4 - movd xm2, [rsp+gprsize+64] - movddup xm0, [rsp+gprsize+56] - movd xm1, r3d - palignr xm2, xm0, 1 - vpbroadcastb xm1, xm1 - pshufb xm2, [base+z_filter_s+18] - vpbroadcastd xm3, [base+pb_36_m4] - pmaxub xm1, [base+z_upsample1-2] - pshufb xm1, xm0, xm1 - pmaddubsw xm2, xm3 - pmaddubsw xm1, xm3 - paddw xm5, xm5 - add dyq, dyq - paddw xm1, xm2 - pmulhrsw xm1, xm13 - vbroadcasti128 m11, [base+z2_upsample] - paddw xm5, xm15 - packuswb xm1, xm1 - punpcklbw xm0, xm1 - mova [rsp+gprsize+48], xm0 - ret -.w4_no_upsample_above: - lea r3d, [hq+3] - sub angled, 1112 ; angle - 90 - call .filter_strength - test r3d, r3d - jz .w4_no_filter_above - popcnt r3d, r3d - vpbroadcastd xm2, [base+pb_4] - pminub xm2, [base+z_filter_s] - vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] - vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] - pshufb xm3, xm1, xm2 ; 00 01 12 23 - pshufd xm2, xm2, q0321 - pmaddubsw xm0, xm3, xm0 - pshufb xm2, xm1, xm2 ; 12 23 34 44 - pmaddubsw xm2, xm4 - vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] - punpckhqdq xm3, xm3 ; 34 44 44 44 - pmaddubsw xm3, xm4 - movd xm4, r6m ; max_width - pminsw xm4, xm15 - vpbroadcastb xm4, xm4 - paddw xm0, xm2 - paddw xm0, xm3 - pmulhrsw xm0, xm13 - psubb xm4, [base+pb_1to32] - psrlq xm1, 8 - packuswb xm0, xm0 - vpblendvb xm0, xm1, xm4 - movd [rsp+65], xm0 -.w4_no_filter_above: - lea r3d, [hq+2] - add angled, 973 ; angle + 883 - shl r3d, 6 - test r3d, angled - jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) - vpbroadcastd xm0, [base+pb_90] - psubb xm0, xm7 ; 180 - angle - pand xm0, xm8 ; reuse from previous filter_strength call - pcmpgtb xm0, xm9 - pmovmskb r3d, xm0 -.w4_filter_left: - test r3d, r3d - jz .w4_main - popcnt r3d, r3d - mov r5d, 10 - cmp hd, 16 - movu xm2, [rsp+49] - vinserti128 m2, [rsp+43], 1 - cmovs r5d, hd - xor r5d, 15 ; h == 16 ? 
5 : 15 - h - movd xm0, r5d - vbroadcasti128 m1, [base+z_filter_s+12] - vbroadcasti128 m4, [base+z_filter_s+16] - vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab - vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd - vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef - vpbroadcastb m0, xm0 - pmaxub m0, m3 - vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] - pshufb m0, m2, m0 - pmaddubsw m0, m3 - vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] - pshufb m1, m2, m1 - pmaddubsw m1, m3 - vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] - pshufb m2, m4 - pmaddubsw m2, m3 - movd xm4, r7m ; max_height - pminsw xm4, xm15 - vpbroadcastb xm4, xm4 - psubb xm4, [base+pb_16to1] - paddw m1, m0 - paddw m1, m2 - pmulhrsw m1, m13 - vextracti128 xm0, m1, 1 - packuswb xm0, xm1 - vpblendvb xm0, [rsp+48], xm4 - mova [rsp+48], xm0 - jmp .w4_main -.w4_upsample_left: - call .upsample_left -.w4_main: - movd xm0, dxd - mova m12, [base+z2_y_shuf_h4] - lea r5, [rsp+56] ; left-7 - vpbroadcastw m0, xm0 - lea r9, [strideq*3] - psraw xm1, xm5, 6 - pand xm5, xm14 ; frac_y - pxor xm2, xm2 - paddw m7, m0, m0 - psubw xm4, xm2, xm1 ; base_y - vpblendd m0, m7, 0xcc - mova xm1, xm7 - punpcklwd xm4, xm2 - paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 - psubw xm1, xm15, xm5 ; 64-frac_y - psllw xm5, 8 - paddw m7, m7 - paddw m6, m0 - por xm5, xm1 ; 64-frac_y, frac_y - vpbroadcastq m5, xm5 -.w4_loop: - lea r3d, [r2+dxq] - shr r2d, 6 ; base_x0 - vpbroadcastq m1, [rsp+r2] - lea r2d, [r3+dxq] - shr r3d, 6 ; base_x1 - vpbroadcastq m2, [rsp+r3] - lea r3d, [r2+dxq] - shr r2d, 6 ; base_x2 - movq xm0, [rsp+r2] - lea r2d, [r3+dxq] - shr r3d, 6 ; base_x3 - movhps xm0, [rsp+r3] - vpblendd m1, m2, 0xc0 - pand m2, m14, m6 ; frac_x - vpblendd m0, m1, 0xf0 - psubw m1, m15, m2 ; 64-frac_x - psllw m2, 8 - pshufb m0, m10 - por m1, m2 ; 64-frac_x, frac_x - pmaddubsw m0, m1 - cmp r3d, 64 - jge .w4_toponly - mova m1, m7 ; arbitrary negative value - vpgatherdq m3, [r5+xm4], m1 - pshufb m1, m3, m11 - vpermd m1, m12, m1 - pmaddubsw m1, m5 - psraw m2, m6, 15 ; base_x < topleft - vpblendvb m0, m1, m2 -.w4_toponly: - pmulhrsw m0, m13 - paddw m6, m7 ; xpos += dx - add r5, dyq - packuswb m0, m0 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*2], xm0 - pextrd [dstq+r9 ], xm0, 1 - movd [dstq+strideq*0], xm1 - pextrd [dstq+strideq*1], xm1, 1 - sub hd, 4 - jz .w4_end - lea dstq, [dstq+strideq*4] - cmp r2d, r8d - jge .w4_loop -.w4_leftonly_loop: - mova m1, m7 - vpgatherdq m2, [r5+xm4], m1 - add r5, dyq - pshufb m0, m2, m11 - vpermd m0, m12, m0 - pmaddubsw m0, m5 - pmulhrsw m0, m13 - packuswb m0, m0 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*2], xm0 - pextrd [dstq+r9 ], xm0, 1 - movd [dstq+strideq*0], xm1 - pextrd [dstq+strideq*1], xm1, 1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4_leftonly_loop -.w4_end: - RET -.w8: - vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 - movd xm5, dyd - vbroadcasti128 m10, [base+z_filter_s+2] - vbroadcasti128 m11, [base+z2_shuf_h4] - lea r2d, [dxq+(65<<6)] ; xpos - vpbroadcastw xm5, xm5 - mov r8d, (63-8)<<6 - mov dyq, -4 - pmullw xm5, [base+z2_ymul] - test angled, 0x400 - jnz .w8_main - lea r3d, [angleq+126] - mov r3b, hb - cmp r3d, 8 - ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm - vpbroadcastd xm3, [base+pb_8] - movhps [rsp+80], xm1 - call .upsample_above - sub angled, 53 ; angle - 53 - lea r3d, [hq+7] - xor angled, 0x7f ; 180 - angle - call .filter_strength - jmp .w8_filter_left -.w8_no_upsample_above: - lea r3d, [hq+7] - 
sub angled, 90 ; angle - 90 - call .filter_strength - test r3d, r3d - jz .w8_no_filter_above - popcnt r3d, r3d - vpbroadcastd xm3, [base+pb_8] - pminub xm3, [base+z_filter_s+8] - vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] - vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] - pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 - pmaddubsw xm0, xm2, xm0 - pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 - shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 - pmaddubsw xm2, xm4 - vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] - pmaddubsw xm3, xm4 - movd xm4, r6m ; max_width - pminuw xm4, xm15 - vpbroadcastb xm4, xm4 - paddw xm0, xm2 - paddw xm0, xm3 - pmulhrsw xm0, xm13 - psubb xm4, [base+pb_1to32] - psrldq xm1, 1 - packuswb xm0, xm0 - vpblendvb xm0, xm1, xm4 - movq [rsp+65], xm0 -.w8_no_filter_above: - lea r3d, [angleq-51] - mov r3b, hb - cmp r3d, 8 - jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm - vpbroadcastd m0, [base+pb_90] - psubb m0, m7 - pand m0, m8 - pcmpgtb m0, m9 - pmovmskb r3d, m0 -.w8_filter_left: - test r3d, r3d - jz .w8_main - popcnt r3d, r3d - vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] - vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] - vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] - cmp hd, 32 - jne .w8_filter_left_h16 - movu xm2, [rsp+27] - vinserti128 m2, [rsp+35], 1 - vpbroadcastd xm0, [base+pb_5] - vbroadcasti128 m3, [base+z_filter_s+ 8] - vbroadcasti128 m1, [base+z_filter_s+12] - vbroadcasti128 m4, [base+z_filter_s+16] - pmaxub m3, m0 - pshufb m3, m2, m3 - pmaddubsw m3, m7 - pshufb m1, m2, m1 - pmaddubsw m1, m8 - pshufb m2, m4 - pmaddubsw m2, m9 - paddw m3, m1 - paddw m3, m2 - pmulhrsw m3, m13 - jmp .w8_filter_left_top16 -.w8_filter_left_h16: - mov r5d, 10 - cmp hd, 16 - cmovs r5d, hd - xor r5d, 15 ; h == 16 ? 
5 : 15 - h - movd xm0, r5d - vpbroadcastb m0, xm0 -.w8_filter_left_top16: - vbroadcasti128 m1, [base+z_filter_s+12] - vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab - vbroadcasti128 m4, [base+z_filter_s+16] - vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd - vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef - pmaxub m0, m2 - movu xm2, [rsp+49] - vinserti128 m2, [rsp+43], 1 - pshufb m0, m2, m0 - pmaddubsw m0, m7 - movd xm7, r7m ; max_height - pshufb m1, m2, m1 - pmaddubsw m1, m8 - pshufb m2, m4 - pmaddubsw m2, m9 - pminsw xm7, xm15 - paddw m1, m0 - vpbroadcastb m7, xm7 - paddw m1, m2 - pmulhrsw m1, m13 - psubb m7, [base+pb_32to1] - packuswb m3, m1 - vpermq m3, m3, q1320 - vpblendvb m3, [rsp+32], m7 - mova [rsp+32], m3 - jmp .w8_main -.w8_upsample_left: - call .upsample_left -.w8_main: - movd xm3, dxd - lea r5, [rsp+56] ; left-7 - pshufd xm1, xm5, q3120 - pand xm5, xm14 - vpbroadcastw m3, xm3 - pxor xm0, xm0 - psubw xm2, xm15, xm5 - psraw xm1, 6 - lea r9, [strideq*3] - paddw m7, m3, m3 - psubw xm9, xm0, xm1 ; base_y - psllw xm5, 8 - punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 - vpblendd m3, m7, 0xf0 ; xpos0 xpos1 - por xm5, xm2 ; 64-frac_y, frac_y - punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 - paddw m6, m3 - vinserti128 m12, m5, xm5, 1 -.w8_loop: - lea r3d, [r2+dxq] - shr r2d, 6 ; base_x0 - movu xm0, [rsp+r2] - lea r2d, [r3+dxq] - shr r3d, 6 ; base_x1 - vinserti128 m0, [rsp+r3], 1 - lea r3d, [r2+dxq] - shr r2d, 6 ; base_x2 - movu xm1, [rsp+r2] - lea r2d, [r3+dxq] - shr r3d, 6 ; base_x3 - vinserti128 m1, [rsp+r3], 1 - pand m2, m14, m6 - paddsw m4, m6, m7 - psubw m5, m15, m2 - psllw m2, 8 - pshufb m0, m10 - por m2, m5 - pmaddubsw m0, m2 - pand m2, m14, m4 - psubw m5, m15, m2 - psllw m2, 8 - pshufb m1, m10 - por m2, m5 - pmaddubsw m1, m2 - cmp r3d, 64 - jge .w8_toponly - mova m5, m7 - vpgatherdq m3, [r5+xm9], m7 - mova m7, m5 - vpgatherdq m2, [r5+xm8], m5 - pshufb m3, m11 - pshufb m2, m11 - punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 - punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 - vpermq m5, m5, q3120 ; y0 y1 - vpermq m2, m2, q3120 ; y2 y3 - pmaddubsw m5, m12 - pmaddubsw m2, m12 - psraw m6, 15 ; base_x < topleft - vpblendvb m0, m5, m6 - psraw m3, m4, 15 - vpblendvb m1, m2, m3 -.w8_toponly: - pmulhrsw m0, m13 - pmulhrsw m1, m13 - paddw m6, m4, m7 ; xpos += dx - add r5, dyq - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movhps [dstq+strideq*2], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+r9 ], xm1 - sub hd, 4 - jz .w8_end - lea dstq, [dstq+strideq*4] - cmp r2d, r8d - jge .w8_loop -.w8_leftonly_loop: - mova m0, m7 - vpgatherdq m5, [r5+xm9], m7 - mova m7, m0 - vpgatherdq m3, [r5+xm8], m0 - add r5, dyq - pshufb m2, m5, m11 - pshufb m1, m3, m11 - punpckldq m0, m1, m2 - punpckhdq m1, m2 - vpermq m0, m0, q3120 - vpermq m1, m1, q3120 - pmaddubsw m0, m12 - pmaddubsw m1, m12 - pmulhrsw m0, m13 - pmulhrsw m1, m13 - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movhps [dstq+strideq*2], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+r9 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w8_leftonly_loop -.w8_end: - RET -.w16: - mov r8d, hd - test angled, 0x400 - jnz .w16_main - lea r3d, [hq+15] - sub angled, 90 - call .filter_strength - test r3d, r3d - jz .w16_no_filter_above - popcnt r3d, r3d - vbroadcasti128 m6, [tlq+1] - mova xm2, [base+z_filter_s] - vinserti128 m2, [base+z_filter_s+14], 1 ; 
00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de - movu xm3, [base+z_filter_s+8] - vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff - vpblendd m1, m6, 0xf0 - vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] - vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] - vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] - pshufb m2, m1, m2 - pshufb m1, m3 - pmaddubsw m0, m2, m0 - shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff - pmaddubsw m2, m4 - pmaddubsw m1, m5 - movd xm4, r6m ; max_width - pminsw xm4, xm15 - vpbroadcastb xm4, xm4 - paddw m0, m2 - paddw m0, m1 - pmulhrsw m0, m13 - psubb xm4, [base+pb_1to32] - vextracti128 xm2, m0, 1 - packuswb xm0, xm2 - vpblendvb xm0, xm6, xm4 - movu [rsp+65], xm0 -.w16_no_filter_above: - vpbroadcastd m0, [base+pb_90] - psubb m0, m7 - pand m0, m8 - pcmpgtb m0, m9 - pmovmskb r3d, m0 - test r3d, r3d - jz .w16_main - popcnt r3d, r3d - vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] - vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] - vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] -.w16_filter_left: - movd xm6, r7m ; max_height - pminsw xm6, xm15 - vpbroadcastb m6, xm6 - cmp hd, 32 - jl .w16_filter_left_h16 - vpbroadcastd xm0, [base+pb_5] - vbroadcasti128 m10, [base+z_filter_s+ 8] - vbroadcasti128 m11, [base+z_filter_s+12] - vbroadcasti128 m12, [base+z_filter_s+16] - je .w16_filter_left_h32 - movu m3, [tlq-69] - movu m5, [tlq-61] - pmaxub m1, m10, m0 - pshufb m1, m3, m1 - pmaddubsw m1, m7 - pshufb m2, m3, m11 - pmaddubsw m2, m8 - pshufb m3, m12 - pmaddubsw m3, m9 - paddw m1, m2 - pshufb m2, m5, m10 - pmaddubsw m2, m7 - pshufb m4, m5, m11 - pmaddubsw m4, m8 - pshufb m5, m12 - pmaddubsw m5, m9 - paddw m1, m3 - vpbroadcastd m3, [base+pb_32] - paddb m3, [base+pb_32to1] - paddw m2, m4 - paddw m2, m5 - pmulhrsw m1, m13 - pmulhrsw m2, m13 - psubb m3, m6, m3 - packuswb m1, m2 - vpblendvb m1, [tlq-64], m3 - mova [rsp], m1 - jmp .w16_filter_left_top32 -.w16_filter_left_h32: - pmaxub m10, m0 -.w16_filter_left_top32: - movu xm2, [tlq-37] - vinserti128 m2, [tlq-29], 1 - pshufb m3, m2, m10 - pshufb m1, m2, m11 - pshufb m2, m12 - pmaddubsw m3, m7 - pmaddubsw m1, m8 - pmaddubsw m2, m9 - paddw m3, m1 - paddw m3, m2 - pmulhrsw m3, m13 - jmp .w16_filter_left_top16 -.w16_filter_left_h16: - mov r5d, 10 - cmp hd, 16 - cmovs r5d, hd - xor r5d, 15 ; h == 16 ? 
5 : 15 - h - movd xm0, r5d - vpbroadcastb m0, xm0 -.w16_filter_left_top16: - movu xm2, [tlq-15] - vinserti128 m2, [tlq-21], 1 - vbroadcasti128 m1, [base+z_filter_s+12] - vbroadcasti128 m4, [base+z_filter_s+16] - vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab - vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd - vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef - pmaxub m0, m5 - pshufb m0, m2, m0 - pmaddubsw m0, m7 - pshufb m1, m2, m1 - pmaddubsw m1, m8 - pshufb m2, m4 - pmaddubsw m2, m9 - psubb m6, [base+pb_32to1] - paddw m1, m0 - paddw m1, m2 - pmulhrsw m1, m13 - packuswb m3, m1 - vpermq m3, m3, q1320 - vpblendvb m3, [tlq-32], m6 - mova [rsp+32], m3 -.w16_main: - movd xm1, dyd - vbroadcasti128 m10, [base+z_filter_s+2] - movd xm7, dxd - vbroadcasti128 m11, [base+z2_shuf_h2] - vpbroadcastw m1, xm1 - vpbroadcastw m7, xm7 - mov r7, dstq - pmullw m0, m1, [base+z2_ymul] - psllw xm1, 4 - paddw m6, m7, [base+z2_base_inc] - lea r9d, [dxq+(65<<6)] ; xpos - movd [rsp+156], xm1 -.w16_loop0: - mov r2d, r9d - mova [rsp+160], m0 - lea r5, [rsp+60] ; left-3 - mova [rsp+192], m6 - pxor m1, m1 - psraw m2, m0, 6 - pand m0, m14 - psubw m9, m1, m2 ; base_y - psubw m12, m15, m0 - punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 - psllw m0, 8 - punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 - por m12, m0 ; 64-frac_y, frac_y -.w16_loop: - lea r3d, [r2+dxq] - shr r2d, 6 ; base_x0 - movu xm0, [rsp+r2] - vinserti128 m0, [rsp+r2+8], 1 - lea r2d, [r3+dxq] - shr r3d, 6 ; base_x1 - movu xm1, [rsp+r3] - vinserti128 m1, [rsp+r3+8], 1 - pand m2, m14, m6 - paddsw m5, m6, m7 - psubw m3, m15, m2 - psllw m2, 8 - pshufb m0, m10 - por m2, m3 - pmaddubsw m0, m2 - pand m2, m14, m5 - psubw m3, m15, m2 - psllw m2, 8 - pshufb m1, m10 - por m2, m3 - pmaddubsw m1, m2 - cmp r3d, 64 - jge .w16_toponly - punpckhwd m2, m5, m5 ; mask out unnecessary loads - vpgatherdd m4, [r5+m9], m2 - punpcklwd m2, m5, m5 - vpgatherdd m3, [r5+m8], m2 - pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 - pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 - punpcklqdq m2, m3, m4 ; y0 - punpckhqdq m3, m4 ; y1 - pmaddubsw m2, m12 - pmaddubsw m3, m12 - psraw m6, 15 ; base_x < topleft - vpblendvb m0, m2, m6 - psraw m6, m5, 15 - vpblendvb m1, m3, m6 -.w16_toponly: - pmulhrsw m0, m13 - pmulhrsw m1, m13 - paddw m6, m5, m7 ; xpos += dx - sub r5, 2 - packuswb m0, m1 - vpermq m0, m0, q3120 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - sub hd, 2 - jz .w16_end - lea dstq, [dstq+strideq*2] - cmp r2d, (63-16)<<6 - jge .w16_loop -.w16_leftonly_loop: - mova m0, m7 - vpgatherdd m4, [r5+m9], m7 - mova m7, m0 - vpgatherdd m3, [r5+m8], m0 - sub r5, 2 - pshufb m2, m4, m11 - pshufb m1, m3, m11 - punpcklqdq m0, m1, m2 - punpckhqdq m1, m2 - pmaddubsw m0, m12 - pmaddubsw m1, m12 - pmulhrsw m0, m13 - pmulhrsw m1, m13 - packuswb m0, m1 - vpermq m0, m0, q3120 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w16_leftonly_loop -.w16_end: - sub r8d, 1<<8 - jl .w16_ret - vpbroadcastd m0, [rsp+156] - paddw m0, [rsp+160] ; base_y += 16*dy - paddw m6, m13, [rsp+192] - add r7, 16 - add r9d, 16<<6 - movzx hd, r8b - mov dstq, r7 - paddw m6, m13 ; base_x += 16*64 - jmp .w16_loop0 -.w16_ret: - RET -.w32: - mova m2, [tlq+32] - lea r8d, [hq+(1<<8)] - mova [rsp+96], m2 - test angled, 0x400 - jnz .w16_main - vpbroadcastd m7, [base+z_filter_k+4*2+12*0] - vpbroadcastd 
m8, [base+z_filter_k+4*2+12*1] - vpbroadcastd m9, [base+z_filter_k+4*2+12*2] - mova xm5, [base+z_filter_s] - vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc - vinserti128 m1, [tlq+11], 1 - movu xm6, [base+z_filter_s+12] - vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff - movu xm3, [tlq+ 6] - vinserti128 m3, [tlq+17], 1 - movd xm0, r6m ; max_width - pminsw xm0, xm15 - vpbroadcastb m10, xm0 -.w32_filter_above: - pshufb m0, m1, m5 - shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de - pmaddubsw m0, m7 - pshufb m2, m1, m4 - shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff - pmaddubsw m2, m8 - pshufb m1, m5 - pmaddubsw m1, m9 - paddw m0, m2 - paddw m0, m1 - pshufb m1, m3, m4 - pmaddubsw m1, m7 - pshufb m2, m3, m5 - pmaddubsw m2, m8 - pshufb m3, m6 - pmaddubsw m3, m9 - paddw m1, m2 - paddw m1, m3 - pmulhrsw m0, m13 - pmulhrsw m1, m13 - psubb m10, [base+pb_1to32] - packuswb m0, m1 - vpblendvb m0, [tlq+1], m10 - movu [rsp+65], m0 - jmp .w16_filter_left -.w64: - mova m2, [tlq+32] - mov r3d, [tlq+64] - lea r8d, [hq+(3<<8)] - mova [rsp+ 96], m2 - mov [rsp+128], r3d - test angled, 0x400 - jnz .w16_main - vpbroadcastd m7, [base+z_filter_k+4*2+12*0] - vpbroadcastd m8, [base+z_filter_k+4*2+12*1] - vpbroadcastd m9, [base+z_filter_k+4*2+12*2] - movu xm6, [base+z_filter_s+ 4] - vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc - movu xm3, [tlq+30] - vinserti128 m3, [tlq+43], 1 - movu xm5, [base+z_filter_s+16] - vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff - pshufb m0, m3, m6 - shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de - pmaddubsw m0, m7 - pshufb m2, m3, m4 - shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff - pmaddubsw m2, m8 - pshufb m3, m6 - pmaddubsw m3, m9 - paddw m0, m2 - paddw m0, m3 - movu xm2, [tlq+36] - vinserti128 m2, [tlq+49], 1 - pshufb m4, m2, m4 - pmaddubsw m4, m7 - pshufb m3, m2, m6 - pmaddubsw m3, m8 - pshufb m2, m5 - pmaddubsw m2, m9 - movd xm5, r6m ; max_width - pminsw xm5, xm15 - vpbroadcastb m10, xm5 - paddw m3, m4 - paddw m2, m3 - vpbroadcastd m3, [base+pb_32] - pmulhrsw m0, m13 - pmulhrsw m2, m13 - mova xm5, [base+z_filter_s] - vinserti128 m5, [base+z_filter_s+6], 1 - psubb m3, m10, m3 - psubb m3, [base+pb_1to32] - vinserti128 m1, [tlq+13], 1 - packuswb m0, m2 - vpblendvb m0, [tlq+33], m3 - movu xm3, [tlq+ 6] - vinserti128 m3, [tlq+19], 1 - movu [rsp+97], m0 - jmp .w32_filter_above - -cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase - %assign org_stack_offset stack_offset - lea r6, [ipred_z3_avx2_table] - tzcnt hd, hm - movifnidn angled, anglem - lea r7, [dr_intra_derivative+45*2-1] - dec tlq - movsxd hq, [r6+hq*4] - sub angled, 180 - add hq, r6 - mov dyd, angled - neg dyd - xor angled, 0x400 - or dyq, ~0x7e - movzx dyd, word [r7+dyq] - vpbroadcastd m3, [pw_512] - vpbroadcastd m4, [pw_62] - vpbroadcastd m5, [pw_64] - mov org_wd, wd - jmp hq -.h4: - lea r7, [strideq*3] - cmp angleb, 40 - jae .h4_no_upsample - lea r4d, [angleq-1024] - sar r4d, 7 - add r4d, wd - jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) - ALLOC_STACK -32, 9 - movu xm8, [tlq-7] - pshufb xm0, xm8, [z_upsample1-4] - vpbroadcastb xm2, xm8 - pshufb xm1, xm8, [z_filter_s+2] - mova [rsp+16], xm2 ; top[max_base_y] - vpbroadcastd xm2, [pb_36_m4] - add dyd, dyd - pmaddubsw xm0, xm2 - pmaddubsw xm1, xm2 - 
movd xm7, dyd - mov r2d, dyd - vpbroadcastw m7, xm7 - paddw xm1, xm0 - pmulhrsw xm1, xm3 - pslldq m6, m7, 8 - paddw xm2, xm7, xm7 - paddw m6, m7 - packuswb xm1, xm1 - paddw m6, m2 - punpcklbw xm1, xm8 - mova xm8, [z_transpose4] - psllw m7, 2 - pshufb xm1, [pb_15to0] - mova [rsp], xm1 -.h4_upsample_loop: - lea r4d, [r2+dyq] - shr r2d, 6 - vpbroadcastq m1, [rsp+r2] - lea r2d, [r4+dyq] - shr r4d, 6 - vpbroadcastq m2, [rsp+r4] - lea r4d, [r2+dyq] - shr r2d, 6 - movq xm0, [rsp+r2] - lea r2d, [r4+dyq] - shr r4d, 6 - movhps xm0, [rsp+r4] - vpblendd m1, m2, 0xc0 - pand m2, m4, m6 - vpblendd m0, m1, 0xf0 - psubw m1, m5, m2 - psllw m2, 8 - por m1, m2 - pmaddubsw m0, m1 - paddw m6, m7 - pmulhrsw m0, m3 - vextracti128 xm1, m0, 1 - packuswb xm1, xm0 - pshufb xm1, xm8 - movd [dstq+strideq*0], xm1 - pextrd [dstq+strideq*1], xm1, 1 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+r7 ], xm1, 3 - add dstq, 4 - sub wd, 4 - jg .h4_upsample_loop - RET -ALIGN function_align -.filter_strength: ; h4/h8/h16 -%define base r4-z_filter_t0 - lea r4, [z_filter_t0] - movd xm0, maxbased - movd xm2, angled - shr angled, 8 ; is_sm << 1 - vpbroadcastb m0, xm0 - vpbroadcastb m2, xm2 - pcmpeqb m1, m0, [base+z_filter_wh] - pand m1, m2 - mova xm2, [r4+angleq*8] - pcmpgtb m1, m2 - pmovmskb r5d, m1 - ret -.h4_no_upsample: - %assign stack_offset org_stack_offset - ALLOC_STACK -16, 12 - mov maxbased, 7 - test angled, 0x400 ; !enable_intra_edge_filter - jnz .h4_main - lea maxbased, [wq+3] - call .filter_strength - mov maxbased, 7 - test r5d, r5d - jz .h4_main ; filter_strength == 0 - popcnt r5d, r5d - vpbroadcastd m7, [base+pb_7] - vbroadcasti128 m2, [tlq-14] - pmaxub m1, m7, [base+z_filter_s-4] - vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] - pmaxub m7, [base+z_filter_s+4] - vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] - vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] - pshufb m0, m2, m1 - shufps m1, m7, q2121 - pmaddubsw m0, m8 - pshufb m1, m2, m1 - pmaddubsw m1, m9 - pshufb m2, m7 - pmaddubsw m2, m10 - paddw m0, m1 - paddw m0, m2 - pmulhrsw m0, m3 - mov r4d, 9 - lea tlq, [rsp+15] - cmp wd, 4 - cmovne maxbased, r4d - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - mova [rsp], xm0 -.h4_main: - movd xm6, dyd - vpbroadcastq m0, [z_base_inc] ; base_inc << 6 - mov r4, tlq - sub tlq, 4 - neg dyq - vpbroadcastw m6, xm6 - sub r4, maxbaseq - shl maxbased, 6 - vpbroadcastb m7, [r4] - lea r4, [dyq+63] ; ypos - movd xm9, maxbased - not maxbased - vbroadcasti128 m8, [z3_shuf_w4] - add maxbased, 64 - vpbroadcastw m9, xm9 - psrlw m7, 8 ; top[max_base_y] - paddw m10, m6, m6 - psubw m9, m0 ; max_base_y - vpblendd m6, m10, 0xcc - mova xm0, xm10 - paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 - paddw m10, m10 - mova xm11, [z_transpose4] -.h4_loop: - lea r5, [r4+dyq] - sar r4, 6 ; base0 - vpbroadcastq m1, [tlq+r4] - lea r4, [r5+dyq] - sar r5, 6 ; base1 - vpbroadcastq m2, [tlq+r5] - lea r5, [r4+dyq] - sar r4, 6 ; base2 - movq xm0, [tlq+r4] - lea r4, [r5+dyq] - sar r5, 6 ; base3 - movhps xm0, [tlq+r5] - vpblendd m1, m2, 0xc0 - pand m2, m4, m6 ; frac - vpblendd m0, m1, 0xf0 - psubw m1, m5, m2 ; 64-frac - psllw m2, 8 - pshufb m0, m8 - por m1, m2 ; 64-frac, frac - pmaddubsw m0, m1 - pcmpgtw m1, m9, m6 ; base < max_base_y - pmulhrsw m0, m3 - paddw m6, m10 ; ypos += dy - vpblendvb m0, m7, m0, m1 - vextracti128 xm1, m0, 1 - packuswb xm1, xm0 - pshufb xm1, xm11 ; transpose - movd [dstq+strideq*0], xm1 - pextrd [dstq+strideq*1], xm1, 1 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+r7 ], xm1, 3 - sub wd, 4 - jz .h4_end - add dstq, 4 - cmp r4d, maxbased - jg 
.h4_loop - packuswb xm7, xm7 -.h4_end_loop: - movd [dstq+strideq*0], xm7 - movd [dstq+strideq*1], xm7 - movd [dstq+strideq*2], xm7 - movd [dstq+r7 ], xm7 - add dstq, 4 - sub wd, 4 - jg .h4_end_loop -.h4_end: - RET -ALIGN function_align -.h8: - lea r4d, [angleq+216] - mov r4b, wb - cmp r4d, 8 - ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 - %assign stack_offset org_stack_offset - ALLOC_STACK -32, 8 - and r4d, 4 - mova xm0, [tlq-15] - vinserti128 m0, [tlq- 9], 1 - movd xm1, r4d - movu xm2, [z_filter_s+2] - vinserti128 m2, [z_filter_s+6], 1 - vpbroadcastb xm1, xm1 ; w & 4 - vpbroadcastd m7, [pb_36_m4] - pmaxub xm1, [z_upsample1-4] ; clip 4x8 - vinserti128 m1, [z_upsample1], 1 - add dyd, dyd - pshufb m1, m0, m1 - pshufb m2, m0, m2 - vinserti128 m0, [tlq-7], 1 - movd xm6, dyd - pmaddubsw m1, m7 - pmaddubsw m2, m7 - vpbroadcastw m6, xm6 - mov r2d, dyd - lea r5, [strideq*3] - paddw m7, m6, m6 - paddw m1, m2 - vpblendd m6, m7, 0xf0 - pmulhrsw m1, m3 - pslldq m2, m7, 8 - paddw m7, m7 - paddw m6, m2 - vbroadcasti128 m2, [pb_15to0] - packuswb m1, m1 - punpcklbw m1, m0 - pshufb m1, m2 - vextracti128 [rsp+ 0], m1, 1 - mova [rsp+16], xm1 -.h8_upsample_loop: - lea r4d, [r2+dyq] - shr r2d, 6 ; base0 - movu xm0, [rsp+r2] - lea r2d, [r4+dyq] - shr r4d, 6 ; base1 - vinserti128 m0, [rsp+r4], 1 - lea r4d, [r2+dyq] - shr r2d, 6 ; base2 - pand m1, m4, m6 - psubw m2, m5, m1 - psllw m1, 8 - por m2, m1 - punpcklqdq m1, m2, m2 ; frac0 frac1 - pmaddubsw m0, m1 - movu xm1, [rsp+r2] - lea r2d, [r4+dyq] - shr r4d, 6 ; base3 - vinserti128 m1, [rsp+r4], 1 - punpckhqdq m2, m2 ; frac2 frac3 - pmaddubsw m1, m2 - pmulhrsw m0, m3 - paddw m6, m7 - pmulhrsw m1, m3 - lea r4, [dstq+strideq*4] - psllw m1, 8 - por m0, m1 - vextracti128 xm1, m0, 1 - punpcklbw xm2, xm0, xm1 - punpckhbw xm0, xm1 - movd [dstq+strideq*0], xm2 - pextrd [dstq+strideq*1], xm2, 1 - pextrd [dstq+strideq*2], xm2, 2 - pextrd [dstq+r5 ], xm2, 3 - movd [r4 +strideq*0], xm0 - pextrd [r4 +strideq*1], xm0, 1 - pextrd [r4 +strideq*2], xm0, 2 - pextrd [r4 +r5 ], xm0, 3 - add dstq, 4 - sub wd, 4 - jg .h8_upsample_loop - RET -.h8_no_intra_edge_filter: - and maxbased, 7 - or maxbased, 8 ; imin(w+7, 15) - jmp .h8_main -.h8_no_upsample: - %assign stack_offset org_stack_offset - ALLOC_STACK -32, 10 - lea maxbased, [wq+7] - test angled, 0x400 - jnz .h8_no_intra_edge_filter - call .filter_strength - test r5d, r5d - jz .h8_main ; filter_strength == 0 - popcnt r5d, r5d - vpbroadcastd xm6, [base+pb_15] - pcmpeqb xm1, xm1 - psubusb xm6, xm0 - psubb xm6, xm1 ; w == 4 ? 
5 : 1 - movu xm2, [tlq-16] - pmaxub xm1, xm6, [base+z_filter_s] - vinserti128 m2, [tlq-14], 1 - vinserti128 m1, [base+z_filter_s+12], 1 - vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] - pmaxub xm6, [base+z_filter_s+ 8] - vinserti128 m6, [base+z_filter_s+20], 1 - pshufb m0, m2, m1 - pmaddubsw m0, m7 - vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] - movzx r4d, byte [tlq-15] - shufps m1, m6, q2121 - pshufb m1, m2, m1 - pmaddubsw m1, m7 - paddw m0, m1 - sub r5d, 3 - jnz .h8_3tap - vpbroadcastd m7, [z_filter_k+4*8] - movzx r2d, byte [tlq-14] - pshufb m2, m6 - pmaddubsw m2, m7 - sub r2d, r4d - lea r2d, [r2+r4*8+4] - shr r2d, 3 - mov [rsp+15], r2b - paddw m0, m2 -.h8_3tap: - pmulhrsw m0, m3 - sar r5d, 1 - lea tlq, [rsp+31] - add r5d, 17 - cmp wd, 16 - cmovns maxbased, r5d - neg r5 - mov [tlq+r5], r4b - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - mova [tlq-15], xm0 -.h8_main: - movd xm2, dyd - vbroadcasti128 m0, [z_base_inc] - mov r4, tlq - sub tlq, 8 - neg dyq - vpbroadcastw m2, xm2 - sub r4, maxbaseq - shl maxbased, 6 - vpbroadcastb m7, [r4] - lea r4, [dyq+63] - movd xm9, maxbased - not maxbased - vbroadcasti128 m8, [z3_shuf] - add maxbased, 64 - vpbroadcastw m9, xm9 - psrlw m7, 8 - psubw m9, m0 - paddw m6, m2, m2 - vpblendd m2, m6, 0x0f -.h8_loop: - lea r5, [r4+dyq] - sar r4, 6 - pand m0, m4, m2 - psubw m1, m5, m0 - psllw m0, 8 - por m1, m0 - vbroadcasti128 m0, [tlq+r4] - lea r4, [r5+dyq] - sar r5, 6 - vinserti128 m0, [tlq+r5], 0 - sub rsp, 8*2 - pshufb m0, m8 - pmaddubsw m0, m1 - pcmpgtw m1, m9, m2 - paddw m2, m6 - pmulhrsw m0, m3 - vpblendvb m0, m7, m0, m1 - vextracti128 xm1, m0, 1 - psllw xm0, 8 - por xm0, xm1 ; interleave rows (partial transpose) - mova [rsp], xm0 - sub wd, 2 - jz .h8_transpose - cmp r4d, maxbased - jg .h8_loop - packuswb xm0, xm7, xm7 -.h8_end_loop: - sub rsp, 8*2 - mova [rsp], xm0 - sub wd, 2 - jg .h8_end_loop -.h8_transpose: - mova xm2, [rsp+16*1] - sub org_wd, 8 - lea r2, [strideq*3] - lea r6, [dstq+org_wq] - cmovns dstq, r6 - punpcklwd xm1, xm2, xm0 - punpckhwd xm2, xm0 - lea r6, [dstq+strideq*4] - jge .h8_w8 - add rsp, 16*2 - movd [dstq+strideq*0], xm1 - pextrd [dstq+strideq*1], xm1, 1 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+r2 ], xm1, 3 - movd [r6 +strideq*0], xm2 - pextrd [r6 +strideq*1], xm2, 1 - pextrd [r6 +strideq*2], xm2, 2 - pextrd [r6 +r2 ], xm2, 3 - jmp .h8_end -.h8_w8_loop: - mova xm0, [rsp+16*0] - mova xm2, [rsp+16*1] - punpcklwd xm1, xm2, xm0 - punpckhwd xm2, xm0 -.h8_w8: ; w8/w16/w32 - mova xm0, [rsp+16*2] - mova xm4, [rsp+16*3] - add rsp, 16*4 - punpcklwd xm3, xm4, xm0 - punpckhwd xm4, xm0 - punpckldq xm0, xm3, xm1 - punpckhdq xm3, xm1 - punpckldq xm1, xm4, xm2 - punpckhdq xm4, xm2 - movq [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xm0 - movq [dstq+strideq*2], xm3 - movhps [dstq+r2 ], xm3 - movq [r6 +strideq*0], xm1 - movhps [r6 +strideq*1], xm1 - movq [r6 +strideq*2], xm4 - movhps [r6 +r2 ], xm4 - sub dstq, 8 - sub r6, 8 - sub org_wd, 8 - jge .h8_w8_loop -.h8_end: - RET -.h16_no_intra_edge_filter: - and maxbased, 15 - or maxbased, 16 ; imin(w+15, 31) - jmp .h16_main -ALIGN function_align -.h16: - %assign stack_offset org_stack_offset - ALLOC_STACK -64, 12 - lea maxbased, [wq+15] - test angled, 0x400 - jnz .h16_no_intra_edge_filter - call .filter_strength - test r5d, r5d - jz .h16_main ; filter_strength == 0 - popcnt r5d, r5d - vpbroadcastd m11, [base+pb_27] - vpbroadcastd m1, [base+pb_1] - vbroadcasti128 m6, [base+z_filter_s+12] - vinserti128 m2, m6, [base+z_filter_s+4], 0 - vinserti128 m6, [base+z_filter_s+20], 1 - movu xm10, [tlq-18] 
- vinserti128 m10, [tlq-14], 1 - vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] - vbroadcasti128 m7, [base+z_filter_s+8] - vinserti128 m8, m7, [base+z_filter_s+0], 0 - vinserti128 m7, [base+z_filter_s+16], 1 - psubusb m11, m0 - por m1, m11 - movu xm11, [tlq-32] - vinserti128 m11, [tlq-28], 1 - pmaxub m8, m1 - pmaxub m7, m1 - pshufb m0, m10, m2 - shufps m2, m6, q2121 - pmaddubsw m0, m9 - pshufb m1, m11, m8 - shufps m8, m7, q2121 - pmaddubsw m1, m9 - vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] - movzx r4d, byte [tlq-31] - pshufb m2, m10, m2 - pmaddubsw m2, m9 - pshufb m8, m11, m8 - pmaddubsw m8, m9 - paddw m0, m2 - paddw m1, m8 - sub r5d, 3 - jnz .h16_3tap - vpbroadcastd m9, [z_filter_k+4*8] - movzx r2d, byte [tlq-30] - pshufb m10, m6 - pmaddubsw m10, m9 - pshufb m11, m7 - pmaddubsw m11, m9 - sub r2d, r4d - lea r2d, [r2+r4*8+4] - shr r2d, 3 - mov [rsp+31], r2b - paddw m0, m10 - paddw m1, m11 -.h16_3tap: - pmulhrsw m0, m3 - pmulhrsw m1, m3 - sar r5d, 1 - lea tlq, [rsp+63] - add r5d, 33 - cmp wd, 32 - cmovns maxbased, r5d - neg r5 - mov [tlq+r5], r4b - packuswb m0, m1 - vpermq m0, m0, q2031 - mova [tlq-31], m0 -.h16_main: - movd xm6, dyd - vbroadcasti128 m0, [z_base_inc] - mov r4, tlq - sub tlq, 8 - neg dyq - vpbroadcastw m6, xm6 - sub r4, maxbaseq - shl maxbased, 6 - vpbroadcastb m7, [r4] - lea r4, [dyq+63] - movd xm9, maxbased - not maxbased - vbroadcasti128 m8, [z3_shuf] - add maxbased, 64 - vpbroadcastw m9, xm9 - psubw m9, m0 - paddw m11, m6, m6 - psubw m10, m9, m3 ; 64*8 - vpblendd m6, m11, 0xf0 -.h16_loop: - lea r5, [r4+dyq] - sar r4, 6 - pand m1, m4, m6 - psubw m2, m5, m1 - psllw m1, 8 - por m2, m1 - movu xm0, [tlq+r4-0] - movu xm1, [tlq+r4-8] - lea r4, [r5+dyq] - sar r5, 6 - vinserti128 m0, [tlq+r5-0], 1 - vinserti128 m1, [tlq+r5-8], 1 - sub rsp, 32 - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - pcmpgtw m1, m9, m6 - pcmpgtw m2, m10, m6 - packsswb m1, m2 - paddw m6, m11 - vpblendvb m0, m7, m0, m1 - vpermq m0, m0, q3120 - mova [rsp], m0 - sub wd, 2 - jz .h16_transpose - cmp r4d, maxbased - jg .h16_loop - mova m0, m7 -.h16_end_loop: - sub rsp, 32 - mova [rsp], m7 - sub wd, 2 - jg .h16_end_loop -.h16_transpose: - mova m2, [rsp+32*1] - sub org_wd, 8 - lea r2, [strideq*3] - lea r6, [dstq+org_wq] - cmovns dstq, r6 - punpcklbw m1, m2, m0 - punpckhbw m2, m0 - lea r3, [strideq*5] - punpcklbw m0, m1, m2 - punpckhbw m1, m2 - lea r4, [strideq+r2*2] ; stride*7 - jge .h16_w8 - add rsp, 32*2 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r2 ], xm0, 3 - vextracti128 xm0, m0, 1 - movd [dstq+strideq*4], xm1 - pextrd [dstq+r3 ], xm1, 1 - pextrd [dstq+r2*2 ], xm1, 2 - pextrd [dstq+r4 ], xm1, 3 - lea dstq, [dstq+strideq*8] - vextracti128 xm1, m1, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r2 ], xm0, 3 - movd [dstq+strideq*4], xm1 - pextrd [dstq+r3 ], xm1, 1 - pextrd [dstq+r2*2 ], xm1, 2 - pextrd [dstq+r4 ], xm1, 3 - jmp .h16_end -.h16_w8_loop: - mova m0, [rsp+32*0] - mova m2, [rsp+32*1] - punpcklbw m1, m2, m0 - punpckhbw m2, m0 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 -.h16_w8: - mova m2, [rsp+32*2] - mova m4, [rsp+32*3] - lea r6, [dstq+strideq*8] - add rsp, 32*4 - punpcklbw m3, m4, m2 - punpckhbw m4, m2 - punpcklbw m2, m3, m4 - punpckhbw m3, m4 - punpckldq m4, m2, m0 - punpckhdq m2, m0 - punpckldq m0, m3, m1 - punpckhdq m3, m1 - movq [dstq+strideq*0], xm4 - movhps [dstq+strideq*1], xm4 - 
vextracti128 xm4, m4, 1 - movq [dstq+strideq*2], xm2 - movhps [dstq+r2 ], xm2 - vextracti128 xm2, m2, 1 - movq [dstq+strideq*4], xm0 - movhps [dstq+r3 ], xm0 - vextracti128 xm0, m0, 1 - movq [dstq+r2*2 ], xm3 - movhps [dstq+r4 ], xm3 - vextracti128 xm3, m3, 1 - movq [r6+strideq*0], xm4 - movhps [r6+strideq*1], xm4 - movq [r6+strideq*2], xm2 - movhps [r6+r2 ], xm2 - movq [r6+strideq*4], xm0 - movhps [r6+r3 ], xm0 - movq [r6+r2*2 ], xm3 - movhps [r6+r4 ], xm3 - sub dstq, 8 - sub org_wd, 8 - jge .h16_w8_loop -.h16_end: - RET -ALIGN function_align -.h32: - %assign stack_offset org_stack_offset - ALLOC_STACK -96, 15 - lea maxbased, [wq+31] - and maxbased, 31 - or maxbased, 32 ; imin(w+31, 63) - test angled, 0x400 ; !enable_intra_edge_filter - jnz .h32_main - vbroadcasti128 m0, [pb_0to15] - mov r4d, 21 - mov r5d, 3 - movu xm11, [tlq-66] ; 56-63 - vinserti128 m11, [tlq-52], 1 ; 40-47 - sub r4d, wd ; 21-w - cmovns r5d, r4d - movu xm12, [tlq-58] ; 48-55 - vinserti128 m12, [tlq-44], 1 ; 32-39 - sub r4d, 8 ; 13-w - movd xm1, r5d - movu xm13, [tlq-34] ; 24-31 - vinserti128 m13, [tlq-20], 1 ; 8-15 - movd xm2, r4d - vpbroadcastb m1, xm1 - movu xm14, [tlq-28] ; 16-23 - vinserti128 m14, [tlq-14], 1 ; 0- 7 - vpbroadcastb m2, xm2 - pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 - movu m7, [z_filter_s+4] - pshufb m11, m1 - vinserti128 m8, m7, [z_filter_s+8], 1 - vinserti128 m7, [z_filter_s+16], 0 - pmaxsb m2, m0 ; clip 8x32 - vpbroadcastd m9, [z_filter_k+4*2+12*0] - pshufb m12, m2 - pshufb m0, m11, m8 - pmaddubsw m0, m9 - pshufb m2, m12, m8 - pmaddubsw m2, m9 - pshufb m1, m13, m8 - pmaddubsw m1, m9 - shufps m8, m7, q1021 - pshufb m6, m14, m8 - pmaddubsw m6, m9 - vpbroadcastd m9, [z_filter_k+4*2+12*1] - pshufb m10, m11, m8 - pmaddubsw m10, m9 - paddw m0, m10 - pshufb m10, m12, m8 - pmaddubsw m10, m9 - paddw m2, m10 - pshufb m10, m13, m8 - pmaddubsw m10, m9 - shufps m8, m7, q2121 - paddw m1, m10 - pshufb m10, m14, m8 - pmaddubsw m10, m9 - paddw m6, m10 - vpbroadcastd m9, [z_filter_k+4*2+12*2] - pshufb m11, m8 - pmaddubsw m11, m9 - pshufb m12, m8 - pmaddubsw m12, m9 - movzx r4d, byte [tlq-63] - movzx r2d, byte [tlq-62] - paddw m0, m11 - paddw m2, m12 - pshufb m13, m8 - pmaddubsw m13, m9 - pshufb m14, m7 - pmaddubsw m14, m9 - paddw m1, m13 - paddw m6, m14 - sub r2d, r4d - lea r2d, [r2+r4*8+4] ; edge case for 64x32 - pmulhrsw m0, m3 - pmulhrsw m2, m3 - pmulhrsw m1, m3 - pmulhrsw m6, m3 - shr r2d, 3 - mov [rsp+31], r2b - lea tlq, [rsp+95] - mov [tlq-65], r4b - mov r4d, 65 - cmp wd, 64 - cmove maxbased, r4d - packuswb m0, m2 - packuswb m1, m6 - mova [tlq-63], m0 - mova [tlq-31], m1 -.h32_main: - movd xm6, dyd - mov r4, tlq - sub tlq, 8 - neg dyq - vpbroadcastw m6, xm6 - sub r4, maxbaseq - shl maxbased, 6 - vpbroadcastb m7, [r4] - lea r4, [dyq+63] - movd xm9, maxbased - not maxbased - vbroadcasti128 m8, [z3_shuf] - add maxbased, 64 - vpbroadcastw m9, xm9 - psubw m9, [z_base_inc] - mova m11, m6 - psubw m10, m9, m3 ; 64*8 -.h32_loop: - mov r5, r4 - sar r5, 6 - pand m1, m4, m6 - psubw m2, m5, m1 - psllw m1, 8 - por m2, m1 - movu xm0, [tlq+r5- 0] - vinserti128 m0, [tlq+r5-16], 1 - movu xm1, [tlq+r5- 8] - vinserti128 m1, [tlq+r5-24], 1 - sub rsp, 32 - add r4, dyq - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - pcmpgtw m1, m9, m6 - pcmpgtw m2, m10, m6 - packsswb m1, m2 - paddw m6, m11 - vpblendvb m0, m7, m0, m1 - mova [rsp], m0 - dec wd - jz .h32_transpose - cmp r4d, maxbased - jg .h32_loop -.h32_end_loop: - sub rsp, 32 - mova [rsp], m7 - dec wd - jg 
.h32_end_loop -.h32_transpose: - lea dstq, [dstq+org_wq-8] - lea r2, [strideq*3] - lea r3, [strideq*5] - lea r4, [strideq+r2*2] ; stride*7 -.h32_w8_loop: - mova m7, [rsp+32*0] - mova m6, [rsp+32*1] - mova m5, [rsp+32*2] - mova m4, [rsp+32*3] - mova m3, [rsp+32*4] - mova m2, [rsp+32*5] - mova m1, [rsp+32*6] - mova m0, [rsp+32*7] - lea r6, [dstq+strideq*8] - add rsp, 32*8 - punpcklbw m8, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m2, m3 - punpckhbw m2, m3 - punpcklbw m3, m4, m5 - punpckhbw m4, m5 - punpcklbw m5, m6, m7 - punpckhbw m6, m7 - punpcklwd m7, m8, m1 - punpckhwd m8, m1 - punpcklwd m1, m0, m2 - punpckhwd m0, m2 - punpcklwd m2, m3, m5 - punpckhwd m3, m5 - punpcklwd m5, m4, m6 - punpckhwd m4, m6 - punpckldq m6, m7, m2 - punpckhdq m7, m2 - punpckldq m2, m8, m3 - punpckhdq m8, m3 - punpckldq m3, m1, m5 - punpckhdq m1, m5 - punpckldq m5, m0, m4 - punpckhdq m0, m4 - movq [dstq+strideq*0], xm6 - movhps [dstq+strideq*1], xm6 - vextracti128 xm6, m6, 1 - movq [dstq+strideq*2], xm7 - movhps [dstq+r2 ], xm7 - vextracti128 xm7, m7, 1 - movq [dstq+strideq*4], xm2 - movhps [dstq+r3 ], xm2 - vextracti128 xm2, m2, 1 - movq [dstq+r2*2 ], xm8 - movhps [dstq+r4 ], xm8 - vextracti128 xm8, m8, 1 - movq [r6+strideq*0], xm3 - movhps [r6+strideq*1], xm3 - vextracti128 xm3, m3, 1 - movq [r6+strideq*2], xm1 - movhps [r6+r2 ], xm1 - vextracti128 xm1, m1, 1 - movq [r6+strideq*4], xm5 - movhps [r6+r3 ], xm5 - vextracti128 xm5, m5, 1 - movq [r6+r2*2 ], xm0 - movhps [r6+r4 ], xm0 - lea r6, [r6+strideq*8] - vextracti128 xm0, m0, 1 - movq [r6+strideq*0], xm6 - movhps [r6+strideq*1], xm6 - movq [r6+strideq*2], xm7 - movhps [r6+r2 ], xm7 - movq [r6+strideq*4], xm2 - movhps [r6+r3 ], xm2 - movq [r6+r2*2 ], xm8 - movhps [r6+r4 ], xm8 - lea r6, [r6+strideq*8] - movq [r6+strideq*0], xm3 - movhps [r6+strideq*1], xm3 - movq [r6+strideq*2], xm1 - movhps [r6+r2 ], xm1 - movq [r6+strideq*4], xm5 - movhps [r6+r3 ], xm5 - movq [r6+r2*2 ], xm0 - movhps [r6+r4 ], xm0 - sub dstq, 8 - sub org_wd, 8 - jg .h32_w8_loop - RET -ALIGN function_align -.h64: - %assign stack_offset org_stack_offset - ALLOC_STACK -128, 16 - lea maxbased, [wq+63] - test angled, 0x400 ; !enable_intra_edge_filter - jnz .h64_main - mov r4d, 21 - vpbroadcastb xm11, [tlq-127] - vpblendd xm11, [tlq-130], 0x0e ; 120-127 - sub r4d, wd ; 21-w - mov r5d, 3 - vinserti128 m11, [tlq-116], 1 ; 104-111 - movu m7, [z_filter_s+4] - cmp wd, 32 - cmove r4d, r5d - vinserti128 m8, m7, [z_filter_s+8], 1 - vbroadcasti128 m6, [pb_0to15] - movd xm1, r4d - vpbroadcastd m9, [z_filter_k+4*2+12*0] - movu xm12, [tlq-122] ; 112-119 - vinserti128 m12, [tlq-108], 1 ; 96-103 - vpbroadcastb m1, xm1 - movu xm13, [tlq- 98] ; 88- 95 - vinserti128 m13, [tlq- 84], 1 ; 72- 79 - movu xm14, [tlq- 90] ; 80- 87 - vinserti128 m14, [tlq- 76], 1 ; 64- 71 - vinserti128 m7, [z_filter_s+16], 0 - pshufb m0, m11, m8 - pmaddubsw m0, m9 - pshufb m2, m12, m8 - pmaddubsw m2, m9 - pmaxsb m1, m6 ; clip (16|32)x64 - pshufb m13, m1 - pshufb m1, m13, m8 - pmaddubsw m1, m9 - pshufb m6, m14, m8 - pmaddubsw m6, m9 - vpbroadcastd m9, [z_filter_k+4*2+12*1] - shufps m15, m8, m7, q1021 - pshufb m10, m11, m15 - pmaddubsw m10, m9 - paddw m0, m10 - pshufb m10, m12, m15 - pmaddubsw m10, m9 - paddw m2, m10 - pshufb m10, m13, m15 - pmaddubsw m10, m9 - paddw m1, m10 - pshufb m10, m14, m15 - pmaddubsw m10, m9 - paddw m6, m10 - vpbroadcastd m9, [z_filter_k+4*2+12*2] - shufps m10, m8, m7, q2132 - pshufb m11, m10 - pmaddubsw m11, m9 - pshufb m12, m10 - pmaddubsw m12, m9 - pshufb m13, m10 - pmaddubsw m13, m9 - pshufb m14, m10 - pmaddubsw 
m14, m9 - paddw m0, m11 - paddw m2, m12 - paddw m1, m13 - paddw m6, m14 - movu xm11, [tlq-66] ; 56-63 - vinserti128 m11, [tlq-52], 1 ; 40-47 - movu xm12, [tlq-58] ; 48-55 - vinserti128 m12, [tlq-44], 1 ; 32-39 - movu xm13, [tlq-34] ; 24-31 - vinserti128 m13, [tlq-20], 1 ; 8-15 - movu xm14, [tlq-28] ; 16-23 - vinserti128 m14, [tlq-14], 1 ; 0- 7 - pmulhrsw m0, m3 - pmulhrsw m2, m3 - pmulhrsw m1, m3 - pmulhrsw m6, m3 - lea tlq, [rsp+127] - packuswb m0, m2 - packuswb m1, m6 - mova [tlq-127], m0 - mova [tlq- 95], m1 - pshufb m0, m11, m10 - pmaddubsw m0, m9 - pshufb m2, m12, m10 - pmaddubsw m2, m9 - pshufb m1, m13, m10 - pmaddubsw m1, m9 - pshufb m6, m14, m7 - pmaddubsw m6, m9 - vpbroadcastd m9, [z_filter_k+4*2+12*1] - pshufb m7, m11, m15 - pmaddubsw m7, m9 - paddw m0, m7 - pshufb m7, m12, m15 - pmaddubsw m7, m9 - paddw m2, m7 - pshufb m7, m13, m15 - pmaddubsw m7, m9 - paddw m1, m7 - pshufb m7, m14, m10 - pmaddubsw m7, m9 - paddw m6, m7 - vpbroadcastd m9, [z_filter_k+4*2+12*0] - pshufb m11, m8 - pmaddubsw m11, m9 - pshufb m12, m8 - pmaddubsw m12, m9 - pshufb m13, m8 - pmaddubsw m13, m9 - pshufb m14, m15 - pmaddubsw m14, m9 - paddw m0, m11 - paddw m2, m12 - paddw m1, m13 - paddw m6, m14 - pmulhrsw m0, m3 - pmulhrsw m2, m3 - pmulhrsw m1, m3 - pmulhrsw m6, m3 - packuswb m0, m2 - packuswb m1, m6 - mova [tlq-63], m0 - mova [tlq-31], m1 -.h64_main: - movd xm12, dyd - neg maxbaseq - vbroadcasti128 m8, [z3_shuf] - vpbroadcastb m7, [tlq+maxbaseq] - shl maxbased, 6 - vpbroadcastw m12, xm12 - lea r5d, [dyq+maxbaseq-64] - neg dyq - or maxbased, 63 - lea r4, [dyq+63] - movd xm6, r5d - mova xm10, [pb_1to32+16] - vinserti128 m10, [pb_1to32], 1 - vpbroadcastd m11, [pb_32] - vpbroadcastw m6, xm6 -.h64_loop: - mov r5, r4 - sar r5, 6 - movu m0, [tlq+r5-24] - movu m1, [tlq+r5-32] - pand m2, m4, m6 - psubw m9, m5, m2 - psllw m2, 8 - por m9, m2 - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m9 - pmaddubsw m1, m9 - psraw m2, m6, 6 - sub rsp, 64 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packsswb m2, m2 - paddb m2, m10 - packuswb m0, m1 - vpblendvb m0, m7, m0, m2 - mova [rsp+32], m0 - movu m0, [tlq+r5-56] - movu m1, [tlq+r5-64] - add r4, dyq - pshufb m0, m8 - pshufb m1, m8 - pmaddubsw m0, m9 - pmaddubsw m1, m9 - paddb m2, m11 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - paddw m6, m12 - packuswb m0, m1 - vpblendvb m0, m7, m0, m2 - mova [rsp], m0 - dec wd - jz .h64_transpose - cmp r4d, maxbased - jg .h64_loop -.h64_end_loop: - sub rsp, 64 - mova [rsp+32], m7 - mova [rsp+ 0], m7 - dec wd - jg .h64_end_loop -.h64_transpose: - lea r2, [strideq*3] - lea r3, [strideq*5] - imul r5, strideq, -8 - lea dstq, [dstq+org_wq-16] - lea r4, [strideq+r2*2] ; stride*7 -.h64_transpose_loop0: - lea r6, [rsp+16*3] -.h64_transpose_loop: - mova xm0, [r6+64*15] - vinserti128 m0, [r6+64* 7], 1 - mova xm1, [r6+64*14] - vinserti128 m1, [r6+64* 6], 1 - mova xm2, [r6+64*13] - vinserti128 m2, [r6+64* 5], 1 - mova xm3, [r6+64*12] - vinserti128 m3, [r6+64* 4], 1 - mova xm4, [r6+64*11] - vinserti128 m4, [r6+64* 3], 1 - mova xm5, [r6+64*10] - vinserti128 m5, [r6+64* 2], 1 - mova xm6, [r6+64* 9] - vinserti128 m6, [r6+64* 1], 1 - mova xm7, [r6+64* 8] - vinserti128 m7, [r6+64* 0], 1 - sub r6, 16 - punpcklbw m8, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m2, m3 - punpckhbw m2, m3 - punpcklbw m3, m4, m5 - punpckhbw m4, m5 - punpcklbw m5, m6, m7 - punpckhbw m6, m7 - punpcklwd m7, m8, m1 - punpckhwd m8, m1 - punpcklwd m1, m0, m2 - punpckhwd m0, m2 - punpcklwd m2, m3, m5 - punpckhwd m3, m5 - punpcklwd m5, m4, m6 - punpckhwd m4, m6 - punpckldq m6, m7, m2 - punpckhdq m7, m2 
- punpckldq m2, m8, m3 - punpckhdq m8, m3 - punpckldq m3, m1, m5 - punpckhdq m1, m5 - punpckldq m5, m0, m4 - punpckhdq m0, m4 - vpermq m6, m6, q3120 - vpermq m7, m7, q3120 - vpermq m2, m2, q3120 - vpermq m8, m8, q3120 - vpermq m3, m3, q3120 - vpermq m1, m1, q3120 - vpermq m5, m5, q3120 - vpermq m0, m0, q3120 - mova [dstq+strideq*0], xm6 - vextracti128 [dstq+strideq*1], m6, 1 - mova [dstq+strideq*2], xm7 - vextracti128 [dstq+r2 ], m7, 1 - mova [dstq+strideq*4], xm2 - vextracti128 [dstq+r3 ], m2, 1 - mova [dstq+r2*2 ], xm8 - vextracti128 [dstq+r4 ], m8, 1 - sub dstq, r5 - mova [dstq+strideq*0], xm3 - vextracti128 [dstq+strideq*1], m3, 1 - mova [dstq+strideq*2], xm1 - vextracti128 [dstq+r2 ], m1, 1 - mova [dstq+strideq*4], xm5 - vextracti128 [dstq+r3 ], m5, 1 - mova [dstq+r2*2 ], xm0 - vextracti128 [dstq+r4 ], m0, 1 - sub dstq, r5 - cmp r6, rsp - jae .h64_transpose_loop - add rsp, 64*16 - lea dstq, [dstq+r5*8-16] - sub org_wd, 16 - jg .h64_transpose_loop0 -.h64_end: - RET - -%macro FILTER_XMM 4 ; dst, src, tmp, shuf -%ifnum %4 - pshufb xm%2, xm%4 -%else - pshufb xm%2, %4 -%endif - pshufd xm%1, xm%2, q0000 ; p0 p1 - pmaddubsw xm%1, xm2 - pshufd xm%3, xm%2, q1111 ; p2 p3 - pmaddubsw xm%3, xm3 - paddw xm%1, xm1 - paddw xm%1, xm%3 - pshufd xm%3, xm%2, q2222 ; p4 p5 - pmaddubsw xm%3, xm4 - paddw xm%1, xm%3 - pshufd xm%3, xm%2, q3333 ; p6 __ - pmaddubsw xm%3, xm5 - paddw xm%1, xm%3 - psraw xm%1, 4 - packuswb xm%1, xm%1 -%endmacro - -%macro FILTER_YMM 4 ; dst, src, tmp, shuf - pshufb m%2, m%4 - pshufd m%1, m%2, q0000 - pmaddubsw m%1, m2 - pshufd m%3, m%2, q1111 - pmaddubsw m%3, m3 - paddw m%1, m1 - paddw m%1, m%3 - pshufd m%3, m%2, q2222 - pmaddubsw m%3, m4 - paddw m%1, m%3 - pshufd m%3, m%2, q3333 - pmaddubsw m%3, m5 - paddw m%1, m%3 - psraw m%1, 4 - vpermq m%3, m%1, q1032 - packuswb m%1, m%3 -%endmacro - -; The ipred_filter SIMD processes 4x2 blocks in the following order which -; increases parallelism compared to doing things row by row. One redundant -; block is calculated for w8 and w16, two for w32. 
-; w4 w8 w16 w32 -; 1 1 2 1 2 3 5 1 2 3 5 b c d f -; 2 2 3 2 4 5 7 2 4 5 7 c e f h -; 3 3 4 4 6 7 9 4 6 7 9 e g h j -; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ -; 5 8 8 i - -cglobal ipred_filter, 3, 7, 0, dst, stride, tl, w, h, filter -%define base r6-ipred_filter_avx2_table - lea r6, [filter_intra_taps] - tzcnt wd, wm -%ifidn filterd, filterm - movzx filterd, filterb -%else - movzx filterd, byte filterm -%endif - shl filterd, 6 - add filterq, r6 - lea r6, [ipred_filter_avx2_table] - movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 - movsxd wq, [r6+wq*4] - vpbroadcastd m1, [base+pw_8] - vbroadcasti128 m2, [filterq+16*0] - vbroadcasti128 m3, [filterq+16*1] - vbroadcasti128 m4, [filterq+16*2] - vbroadcasti128 m5, [filterq+16*3] - add wq, r6 - mov hd, hm - jmp wq -.w4: - WIN64_SPILL_XMM 9 - mova xm8, [base+filter_shuf2] - sub tlq, 3 - sub tlq, hq - jmp .w4_loop_start -.w4_loop: - pinsrd xm0, xm6, [tlq+hq], 0 - lea dstq, [dstq+strideq*2] -.w4_loop_start: - FILTER_XMM 6, 0, 7, 8 - movd [dstq+strideq*0], xm6 - pextrd [dstq+strideq*1], xm6, 1 - sub hd, 2 - jg .w4_loop - RET -ALIGN function_align -.w8: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 10 - mova m8, [base+filter_shuf1] - FILTER_XMM 7, 0, 6, [base+filter_shuf2] - vpbroadcastd m0, [tlq+4] - vpbroadcastd m6, [tlq+5] - sub tlq, 4 - sub tlq, hq - vpbroadcastq m7, xm7 - vpblendd m7, m6, 0x20 -.w8_loop: - vpbroadcastd xm6, [tlq+hq] - palignr m6, m0, 12 - vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ - ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - mova xm6, xm7 - call .main - vpblendd xm6, xm7, 0x0c - pshufd xm6, xm6, q3120 - movq [dstq+strideq*0], xm6 - movhps [dstq+strideq*1], xm6 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w8_loop - RET -ALIGN function_align -.w16: -%if WIN64 - %assign stack_offset stack_offset - stack_size_padded - %assign xmm_regs_used 15 - %assign stack_size_padded 0x98 - SUB rsp, stack_size_padded -%endif - sub hd, 2 - TAIL_CALL .w16_main, 0 -.w16_main: -%if WIN64 - movaps [rsp+0xa8], xmm6 - movaps [rsp+0xb8], xmm7 - movaps [rsp+0x28], xmm8 - movaps [rsp+0x38], xmm9 - movaps [rsp+0x48], xmm10 - movaps [rsp+0x58], xmm11 - movaps [rsp+0x68], xmm12 - movaps [rsp+0x78], xmm13 - movaps [rsp+0x88], xmm14 -%endif - FILTER_XMM 12, 0, 7, [base+filter_shuf2] - vpbroadcastd m0, [tlq+5] - vpblendd m0, [tlq-12], 0x14 - mova m8, [base+filter_shuf1] - vpbroadcastq m7, xm12 - vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ - ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - call .main ; c0 d0 a1 b1 a1 b1 c0 d0 - movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - vinserti128 m14, m8, [base+filter_shuf3], 0 - vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 - FILTER_XMM 6, 9, 10, 14 - vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 - vpbroadcastd m9, [tlq+13] - vpbroadcastd m10, [tlq+12] - psrld m11, m8, 4 - vpblendd m6, m9, 0x20 ; top - sub tlq, 6 - sub tlq, hq -.w16_loop: - vpbroadcastd xm9, [tlq+hq] - palignr m9, m0, 12 - vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ - ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - mova xm13, xm7 - call .main ; e0 f0 c1 d1 c1 d1 e0 f0 - vpblendd m9, m12, m10, 0xf0 - vpblendd m12, m6, 0xc0 - pshufd m9, m9, q3333 - vpblendd m9, m6, 0xee - vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 - vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 - vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 - vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 - mova [dstq+strideq*0], xm9 - vextracti128 
[dstq+strideq*1], m9, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w16_loop - vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 - pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] - vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 - shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 - shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 - mova [dstq+strideq*0], xm0 - mova [dstq+strideq*1], xm6 - ret -ALIGN function_align -.w32: - sub rsp, stack_size_padded - sub hd, 2 - lea r3, [dstq+16] - lea r5d, [hq-2] - call .w16_main - add tlq, r5 - mov dstq, r3 - lea r3, [strideq-4] - lea r4, [r3+strideq*2] - movq xm0, [tlq+21] - pinsrd xm0, [dstq-4], 2 - pinsrd xm0, [dstq+r3*1], 3 - FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 - movq xm7, [dstq+r3*2] - pinsrd xm7, [dstq+r4], 2 - palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 - vpbroadcastd m0, [tlq+28] - vpbroadcastd m9, [tlq+29] - vbroadcasti128 m8, [base+filter_shuf1+16] - vpblendd m0, m9, 0x20 - vpblendd m0, m7, 0x0f - vpbroadcastq m7, xm12 - vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - call .main ; c0 d0 a1 b1 a1 b1 c0 d0 - add r3, 2 - lea r4, [r4+strideq*2] - movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 - FILTER_XMM 6, 9, 10, 14 - vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 - vpbroadcastd m9, [tlq+37] - vpbroadcastd m10, [tlq+36] - vpblendd m6, m9, 0x20 ; top -.w32_loop: - movq xm9, [dstq+r3*4] - pinsrd xm9, [dstq+r4], 2 -.w32_loop_last: - palignr m9, m0, 12 - vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - mova xm13, xm7 ; c0 d0 - call .main ; e0 f0 c1 d1 c1 d1 e0 f0 - vpblendd m9, m12, m10, 0xf0 - vpblendd m12, m6, 0xc0 - pshufd m9, m9, q3333 - vpblendd m9, m6, 0xee - vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 - vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 - vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 - vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 - mova [dstq+strideq*0], xm9 - vextracti128 [dstq+strideq*1], m9, 1 - lea dstq, [dstq+strideq*2] - sub r5d, 2 - jg .w32_loop - jz .w32_loop_last - vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 - pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 - FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] - vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 - shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 - shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 - mova [dstq+strideq*0], xm0 - mova [dstq+strideq*1], xm6 - RET -ALIGN function_align -.main: - FILTER_YMM 7, 0, 9, 8 - ret - -%if WIN64 -DECLARE_REG_TMP 5 -%else -DECLARE_REG_TMP 7 -%endif - -%macro IPRED_CFL 1 ; ac in, unpacked pixels out - psignw m3, m%1, m1 - pabsw m%1, m%1 - pmulhrsw m%1, m2 - psignw m%1, m3 - paddw m%1, m0 -%endmacro - -cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - lea t0, [ipred_cfl_left_avx2_table] - tzcnt wd, wm - inc tlq - movu m0, [tlq] - movifnidn hd, hm - mov r6d, 0x8000 - shrx r6d, r6d, wd - movd xm3, r6d - movsxd r6, [t0+wq*4] - pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, t0 - add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table - movsxd wq, [t0+wq*4] - add wq, t0 - movifnidn acq, acmp - jmp r6 - -cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - mov hd, hm ; zero upper half - tzcnt r6d, hd - sub tlq, hq - tzcnt wd, wm - movu m0, [tlq] - mov t0d, 0x8000 - shrx t0d, t0d, r6d - movd xm3, t0d - lea t0, [ipred_cfl_left_avx2_table] - movsxd r6, [t0+r6*4] 
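The IPRED_CFL macro above packs the whole chroma-from-luma reconstruction into five instructions. As a reading aid, here is a scalar sketch of what it computes per pixel; the function name and plain int types are illustrative and not dav1d's actual C code, but the arithmetic follows the psignw/pabsw/pmulhrsw sequence (pmulhrsw by |alpha| << 9 is an exact (x * |alpha| + 32) >> 6).

#include <stdlib.h>

/* Scalar sketch of one IPRED_CFL step (illustrative, not dav1d's C code):
 * dc comes from the ipred_cfl* prologue, ac from the cfl_ac buffers,
 * alpha is the signed CfL scaling factor. */
static inline int cfl_predict_px(int dc, int alpha, int ac)
{
    int scaled = alpha * ac;
    int mag    = (abs(scaled) + 32) >> 6;     /* pabsw + pmulhrsw by |alpha| << 9 */
    int diff   = scaled < 0 ? -mag : mag;     /* the two psignw instructions      */
    int px     = dc + diff;                   /* paddw with the broadcast DC      */
    return px < 0 ? 0 : px > 255 ? 255 : px;  /* packuswb saturates to 8-bit      */
}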
- pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, t0 - add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table - movsxd wq, [t0+wq*4] - add wq, t0 - movifnidn acq, acmp - jmp r6 -.h32: - vextracti128 xm1, m0, 1 - paddw xm0, xm1 -.h16: - punpckhqdq xm1, xm0, xm0 - paddw xm0, xm1 -.h8: - psrlq xm1, xm0, 32 - paddw xm0, xm1 -.h4: - pmaddwd xm0, xm2 - pmulhrsw xm0, xm3 - vpbroadcastw m0, xm0 - jmp wq - -cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - movifnidn hd, hm - movifnidn wd, wm - tzcnt r6d, hd - lea t0d, [wq+hq] - movd xm4, t0d - tzcnt t0d, t0d - movd xm5, t0d - lea t0, [ipred_cfl_avx2_table] - tzcnt wd, wd - movsxd r6, [t0+r6*4] - movsxd wq, [t0+wq*4+4*4] - pcmpeqd m3, m3 - psrlw xm4, 1 - add r6, t0 - add wq, t0 - movifnidn acq, acmp - jmp r6 -.h4: - movd xm0, [tlq-4] - pmaddubsw xm0, xm3 - jmp wq -.w4: - movd xm1, [tlq+1] - pmaddubsw xm1, xm3 - psubw xm0, xm4 - paddw xm0, xm1 - pmaddwd xm0, xm3 - cmp hd, 4 - jg .w4_mul - psrlw xm0, 3 - jmp .w4_end -.w4_mul: - punpckhqdq xm1, xm0, xm0 - lea r2d, [hq*2] - mov r6d, 0x55563334 - paddw xm0, xm1 - shrx r6d, r6d, r2d - psrlq xm1, xm0, 32 - paddw xm0, xm1 - movd xm1, r6d - psrlw xm0, 2 - pmulhuw xm0, xm1 -.w4_end: - vpbroadcastw m0, xm0 -.s4: - vpbroadcastw m1, alpham - lea r6, [strideq*3] - pabsw m2, m1 - psllw m2, 9 -.s4_loop: - mova m4, [acq] - IPRED_CFL 4 - packuswb m4, m4 - vextracti128 xm5, m4, 1 - movd [dstq+strideq*0], xm4 - pextrd [dstq+strideq*1], xm4, 1 - movd [dstq+strideq*2], xm5 - pextrd [dstq+r6 ], xm5, 1 - lea dstq, [dstq+strideq*4] - add acq, 32 - sub hd, 4 - jg .s4_loop - RET -ALIGN function_align -.h8: - movq xm0, [tlq-8] - pmaddubsw xm0, xm3 - jmp wq -.w8: - movq xm1, [tlq+1] - vextracti128 xm2, m0, 1 - pmaddubsw xm1, xm3 - psubw xm0, xm4 - paddw xm0, xm2 - punpckhqdq xm2, xm0, xm0 - paddw xm0, xm2 - paddw xm0, xm1 - psrlq xm1, xm0, 32 - paddw xm0, xm1 - pmaddwd xm0, xm3 - psrlw xm0, xm5 - cmp hd, 8 - je .w8_end - mov r6d, 0x5556 - mov r2d, 0x3334 - cmp hd, 32 - cmove r6d, r2d - movd xm1, r6d - pmulhuw xm0, xm1 -.w8_end: - vpbroadcastw m0, xm0 -.s8: - vpbroadcastw m1, alpham - lea r6, [strideq*3] - pabsw m2, m1 - psllw m2, 9 -.s8_loop: - mova m4, [acq] - mova m5, [acq+32] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - vextracti128 xm5, m4, 1 - movq [dstq+strideq*0], xm4 - movq [dstq+strideq*1], xm5 - movhps [dstq+strideq*2], xm4 - movhps [dstq+r6 ], xm5 - lea dstq, [dstq+strideq*4] - add acq, 64 - sub hd, 4 - jg .s8_loop - RET -ALIGN function_align -.h16: - mova xm0, [tlq-16] - pmaddubsw xm0, xm3 - jmp wq -.w16: - movu xm1, [tlq+1] - vextracti128 xm2, m0, 1 - pmaddubsw xm1, xm3 - psubw xm0, xm4 - paddw xm0, xm2 - paddw xm0, xm1 - punpckhqdq xm1, xm0, xm0 - paddw xm0, xm1 - psrlq xm1, xm0, 32 - paddw xm0, xm1 - pmaddwd xm0, xm3 - psrlw xm0, xm5 - cmp hd, 16 - je .w16_end - mov r6d, 0x5556 - mov r2d, 0x3334 - test hb, 8|32 - cmovz r6d, r2d - movd xm1, r6d - pmulhuw xm0, xm1 -.w16_end: - vpbroadcastw m0, xm0 -.s16: - vpbroadcastw m1, alpham - pabsw m2, m1 - psllw m2, 9 -.s16_loop: - mova m4, [acq] - mova m5, [acq+32] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - vpermq m4, m4, q3120 - mova [dstq+strideq*0], xm4 - vextracti128 [dstq+strideq*1], m4, 1 - lea dstq, [dstq+strideq*2] - add acq, 64 - sub hd, 2 - jg .s16_loop - RET -ALIGN function_align -.h32: - mova m0, [tlq-32] - pmaddubsw m0, m3 - jmp wq -.w32: - movu m1, [tlq+1] - pmaddubsw m1, m3 - paddw m0, m1 - vextracti128 xm1, m0, 1 - psubw xm0, xm4 - paddw xm0, xm1 - punpckhqdq xm1, xm0, xm0 - paddw xm0, xm1 - psrlq xm1, xm0, 32 - paddw xm0, xm1 - 
pmaddwd xm0, xm3 - psrlw xm0, xm5 - cmp hd, 32 - je .w32_end - lea r2d, [hq*2] - mov r6d, 0x33345556 - shrx r6d, r6d, r2d - movd xm1, r6d - pmulhuw xm0, xm1 -.w32_end: - vpbroadcastw m0, xm0 -.s32: - vpbroadcastw m1, alpham - pabsw m2, m1 - psllw m2, 9 -.s32_loop: - mova m4, [acq] - mova m5, [acq+32] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - vpermq m4, m4, q3120 - mova [dstq], m4 - add dstq, strideq - add acq, 64 - dec hd - jg .s32_loop - RET - -cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - lea t0, [ipred_cfl_splat_avx2_table] - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, [t0+wq*4] - vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] - add wq, t0 - movifnidn acq, acmp - jmp wq - -cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak - movifnidn hpadd, hpadm - movifnidn wd, wm - mov hd, hm - mov szd, wd - mov ac_bakq, acq - imul szd, hd - shl hpadd, 2 - sub hd, hpadd - vpbroadcastd m2, [pb_2] - pxor m4, m4 - cmp wd, 8 - jg .w16 - je .w8 - ; fall-through - - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak -.w4: - lea stride3q, [strideq*3] -.w4_loop: - movq xm0, [yq] - movq xm1, [yq+strideq] - movhps xm0, [yq+strideq*2] - movhps xm1, [yq+stride3q] - pmaddubsw xm0, xm2 - pmaddubsw xm1, xm2 - paddw xm0, xm1 - mova [acq], xm0 - paddw xm4, xm0 - lea yq, [yq+strideq*4] - add acq, 16 - sub hd, 2 - jg .w4_loop - test hpadd, hpadd - jz .calc_avg - vpermq m0, m0, q1111 -.w4_hpad_loop: - mova [acq], m0 - paddw m4, m0 - add acq, 32 - sub hpadd, 4 - jg .w4_hpad_loop - jmp .calc_avg - -.w8: - lea stride3q, [strideq*3] - test wpadd, wpadd - jnz .w8_wpad -.w8_loop: - mova xm0, [yq] - mova xm1, [yq+strideq] - vinserti128 m0, [yq+strideq*2], 1 - vinserti128 m1, [yq+stride3q], 1 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq], m0 - paddw m4, m0 - lea yq, [yq+strideq*4] - add acq, 32 - sub hd, 2 - jg .w8_loop - test hpadd, hpadd - jz .calc_avg - jmp .w8_hpad -.w8_wpad: - vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] -.w8_wpad_loop: - movq xm0, [yq] - movq xm1, [yq+strideq] - vinserti128 m0, [yq+strideq*2], 1 - vinserti128 m1, [yq+stride3q], 1 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - pshufb m0, m3 - mova [acq], m0 - paddw m4, m0 - lea yq, [yq+strideq*4] - add acq, 32 - sub hd, 2 - jg .w8_wpad_loop - test hpadd, hpadd - jz .calc_avg -.w8_hpad: - vpermq m0, m0, q3232 -.w8_hpad_loop: - mova [acq], m0 - paddw m4, m0 - add acq, 32 - sub hpadd, 2 - jg .w8_hpad_loop - jmp .calc_avg - -.w16: - test wpadd, wpadd - jnz .w16_wpad -.w16_loop: - mova m0, [yq] - mova m1, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 32 - dec hd - jg .w16_loop - test hpadd, hpadd - jz .calc_avg - jmp .w16_hpad_loop -.w16_wpad: - DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak - lea iptrq, [ipred_cfl_ac_420_avx2_table] - shl wpadd, 2 - mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ - ipred_cfl_ac_420_avx2_table+wpadq*8-32] - movsxd wpadq, [iptrq+wpadq+4] - add iptrq, wpadq - jmp iptrq -.w16_pad3: - vpbroadcastq m0, [yq] - vpbroadcastq m1, [yq+strideq] - jmp .w16_wpad_end -.w16_pad2: - vbroadcasti128 m0, [yq] - vbroadcasti128 m1, [yq+strideq] - jmp .w16_wpad_end -.w16_pad1: - mova m0, [yq] - mova m1, [yq+strideq] - ; fall-through -.w16_wpad_end: - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - pshufb m0, m3 - mova [acq], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 32 - dec hd - jz .w16_wpad_done - jmp iptrq -.w16_wpad_done: - test hpadd, 
hpadd - jz .calc_avg -.w16_hpad_loop: - mova [acq], m0 - paddw m4, m0 - add acq, 32 - dec hpadd - jg .w16_hpad_loop - ; fall-through - -.calc_avg: - vpbroadcastd m2, [pw_1] - pmaddwd m0, m4, m2 - vextracti128 xm1, m0, 1 - tzcnt r1d, szd - paddd xm0, xm1 - movd xm2, r1d - movd xm3, szd - punpckhqdq xm1, xm0, xm0 - paddd xm0, xm1 - psrad xm3, 1 - psrlq xm1, xm0, 32 - paddd xm0, xm3 - paddd xm0, xm1 - psrad xm0, xm2 - vpbroadcastw m0, xm0 -.sub_loop: - mova m1, [ac_bakq] - psubw m1, m0 - mova [ac_bakq], m1 - add ac_bakq, 32 - sub szd, 16 - jg .sub_loop - RET - -cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak - movifnidn hpadd, hpadm - movifnidn wd, wm - mov hd, hm - mov szd, wd - mov ac_bakq, acq - imul szd, hd - shl hpadd, 2 - sub hd, hpadd - vpbroadcastd m2, [pb_4] - pxor m4, m4 - pxor m5, m5 - cmp wd, 8 - jg .w16 - je .w8 - ; fall-through - - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak -.w4: - lea stride3q, [strideq*3] -.w4_loop: - movq xm1, [yq] - movhps xm1, [yq+strideq] - movq xm0, [yq+strideq*2] - movhps xm0, [yq+stride3q] - pmaddubsw xm0, xm2 - pmaddubsw xm1, xm2 - mova [acq], xm1 - mova [acq+16], xm0 - paddw xm4, xm0 - paddw xm5, xm1 - lea yq, [yq+strideq*4] - add acq, 32 - sub hd, 4 - jg .w4_loop - test hpadd, hpadd - jz .calc_avg - vpermq m0, m0, q1111 -.w4_hpad_loop: - mova [acq], m0 - paddw m4, m0 - add acq, 32 - sub hpadd, 4 - jg .w4_hpad_loop - jmp .calc_avg - -.w8: - lea stride3q, [strideq*3] - test wpadd, wpadd - jnz .w8_wpad -.w8_loop: - mova xm1, [yq] - vinserti128 m1, [yq+strideq], 1 - mova xm0, [yq+strideq*2] - vinserti128 m0, [yq+stride3q], 1 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - mova [acq], m1 - mova [acq+32], m0 - paddw m4, m0 - paddw m5, m1 - lea yq, [yq+strideq*4] - add acq, 64 - sub hd, 4 - jg .w8_loop - test hpadd, hpadd - jz .calc_avg - jmp .w8_hpad -.w8_wpad: - vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] -.w8_wpad_loop: - movq xm1, [yq] - vinserti128 m1, [yq+strideq], 1 - movq xm0, [yq+strideq*2] - vinserti128 m0, [yq+stride3q], 1 - pmaddubsw m0, m2 - pmaddubsw m1, m2 - pshufb m0, m3 - pshufb m1, m3 - mova [acq], m1 - mova [acq+32], m0 - paddw m4, m0 - paddw m5, m1 - lea yq, [yq+strideq*4] - add acq, 64 - sub hd, 4 - jg .w8_wpad_loop - test hpadd, hpadd - jz .calc_avg -.w8_hpad: - vpermq m0, m0, q3232 -.w8_hpad_loop: - mova [acq], m0 - paddw m4, m0 - add acq, 32 - sub hpadd, 2 - jg .w8_hpad_loop - jmp .calc_avg - -.w16: - test wpadd, wpadd - jnz .w16_wpad -.w16_loop: - mova m1, [yq] - mova m0, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - mova [acq], m1 - mova [acq+32], m0 - paddw m4, m0 - paddw m5, m1 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_loop - test hpadd, hpadd - jz .calc_avg - jmp .w16_hpad_loop -.w16_wpad: - DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak - lea iptrq, [ipred_cfl_ac_422_avx2_table] - shl wpadd, 2 - mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ - ipred_cfl_ac_422_avx2_table+wpadq*8-32] - movsxd wpadq, [iptrq+wpadq+4] - add iptrq, wpadq - jmp iptrq -.w16_pad3: - vpbroadcastq m1, [yq] - vpbroadcastq m0, [yq+strideq] - jmp .w16_wpad_end -.w16_pad2: - vbroadcasti128 m1, [yq] - vbroadcasti128 m0, [yq+strideq] - jmp .w16_wpad_end -.w16_pad1: - mova m1, [yq] - mova m0, [yq+strideq] - ; fall-through -.w16_wpad_end: - pmaddubsw m0, m2 - pmaddubsw m1, m2 - pshufb m0, m3 - pshufb m1, m3 - mova [acq], m1 - mova [acq+32], m0 - paddw m4, m0 - paddw m5, m1 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jz .w16_wpad_done - jmp iptrq -.w16_wpad_done: - test hpadd, 
hpadd - jz .calc_avg -.w16_hpad_loop: - mova [acq], m0 - mova [acq+32], m0 - paddw m4, m0 - paddw m5, m0 - add acq, 64 - sub hpadd, 2 - jg .w16_hpad_loop - ; fall-through - -.calc_avg: - vpbroadcastd m2, [pw_1] - pmaddwd m5, m5, m2 - pmaddwd m0, m4, m2 - paddd m0, m5 - vextracti128 xm1, m0, 1 - tzcnt r1d, szd - paddd xm0, xm1 - movd xm2, r1d - movd xm3, szd - punpckhqdq xm1, xm0, xm0 - paddd xm0, xm1 - psrad xm3, 1 - psrlq xm1, xm0, 32 - paddd xm0, xm3 - paddd xm0, xm1 - psrad xm0, xm2 - vpbroadcastw m0, xm0 -.sub_loop: - mova m1, [ac_bakq] - psubw m1, m0 - mova [ac_bakq], m1 - add ac_bakq, 32 - sub szd, 16 - jg .sub_loop - RET - -cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak - movifnidn hpadd, hpadm - movifnidn wd, wm - mov hd, hm - mov szd, wd - imul szd, hd - shl hpadd, 2 - sub hd, hpadd - pxor m4, m4 - vpbroadcastd m5, [pw_1] - tzcnt r8d, wd - lea r5, [ipred_cfl_ac_444_avx2_table] - movsxd r8, [r5+r8*4+12] - add r5, r8 - - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak - mov ac_bakq, acq - jmp r5 - -.w4: - lea stride3q, [strideq*3] - pxor xm2, xm2 -.w4_loop: - movd xm1, [yq] - movd xm0, [yq+strideq*2] - pinsrd xm1, [yq+strideq], 1 - pinsrd xm0, [yq+stride3q], 1 - punpcklbw xm1, xm2 - punpcklbw xm0, xm2 - psllw xm1, 3 - psllw xm0, 3 - mova [acq], xm1 - mova [acq+16], xm0 - paddw xm1, xm0 - paddw xm4, xm1 - lea yq, [yq+strideq*4] - add acq, 32 - sub hd, 4 - jg .w4_loop - test hpadd, hpadd - jz .calc_avg_mul - pshufd xm0, xm0, q3232 - paddw xm1, xm0, xm0 -.w4_hpad_loop: - mova [acq], xm0 - mova [acq+16], xm0 - paddw xm4, xm1 - add acq, 32 - sub hpadd, 4 - jg .w4_hpad_loop - jmp .calc_avg_mul - -.w8: - lea stride3q, [strideq*3] - pxor m2, m2 -.w8_loop: - movq xm1, [yq] - movq xm0, [yq+strideq*2] - vinserti128 m1, [yq+strideq], 1 - vinserti128 m0, [yq+stride3q], 1 - punpcklbw m1, m2 - punpcklbw m0, m2 - psllw m1, 3 - psllw m0, 3 - mova [acq], m1 - mova [acq+32], m0 - paddw m1, m0 - paddw m4, m1 - lea yq, [yq+strideq*4] - add acq, 64 - sub hd, 4 - jg .w8_loop - test hpadd, hpadd - jz .calc_avg_mul - vpermq m0, m0, q3232 - paddw m1, m0, m0 -.w8_hpad_loop: - mova [acq], m0 - mova [acq+32], m0 - paddw m4, m1 - add acq, 64 - sub hpadd, 4 - jg .w8_hpad_loop - jmp .calc_avg_mul - -.w16: - test wpadd, wpadd - jnz .w16_wpad -.w16_loop: - pmovzxbw m1, [yq] - pmovzxbw m0, [yq+strideq] - psllw m1, 3 - psllw m0, 3 - mova [acq], m1 - mova [acq+32], m0 - paddw m1, m0 - pmaddwd m1, m5 - paddd m4, m1 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_loop - test hpadd, hpadd - jz .calc_avg - jmp .w16_hpad -.w16_wpad: - mova m3, [cfl_ac_444_w16_pad1_shuffle] -.w16_wpad_loop: - vpbroadcastq m1, [yq] - vpbroadcastq m0, [yq+strideq] - pshufb m1, m3 - pshufb m0, m3 - psllw m1, 3 - psllw m0, 3 - mova [acq], m1 - mova [acq+32], m0 - paddw m1, m0 - pmaddwd m1, m5 - paddd m4, m1 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_wpad_loop - test hpadd, hpadd - jz .calc_avg -.w16_hpad: - paddw m1, m0, m0 - pmaddwd m1, m5 -.w16_hpad_loop: - mova [acq], m0 - mova [acq+32], m0 - paddd m4, m1 - add acq, 64 - sub hpadd, 2 - jg .w16_hpad_loop - jmp .calc_avg - -.w32: - test wpadd, wpadd - jnz .w32_wpad -.w32_loop: - pmovzxbw m1, [yq] - pmovzxbw m0, [yq+16] - psllw m1, 3 - psllw m0, 3 - mova [acq], m1 - mova [acq+32], m0 - paddw m2, m1, m0 - pmaddwd m2, m5 - paddd m4, m2 - add yq, strideq - add acq, 64 - dec hd - jg .w32_loop - test hpadd, hpadd - jz .calc_avg - jmp .w32_hpad_loop -.w32_wpad: - DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak - 
lea iptrq, [ipred_cfl_ac_444_avx2_table] - add wpadd, wpadd - mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] - movsxd wpadq, [iptrq+wpadq+4] - add iptrq, wpadq - jmp iptrq -.w32_pad3: - vpbroadcastq m1, [yq] - pshufb m1, m3 - vpermq m0, m1, q3232 - jmp .w32_wpad_end -.w32_pad2: - pmovzxbw m1, [yq] - pshufhw m0, m1, q3333 - vpermq m0, m0, q3333 - jmp .w32_wpad_end -.w32_pad1: - pmovzxbw m1, [yq] - vpbroadcastq m0, [yq+16] - pshufb m0, m3 - ; fall-through -.w32_wpad_end: - psllw m1, 3 - psllw m0, 3 - mova [acq], m1 - mova [acq+32], m0 - paddw m2, m1, m0 - pmaddwd m2, m5 - paddd m4, m2 - add yq, strideq - add acq, 64 - dec hd - jz .w32_wpad_done - jmp iptrq -.w32_wpad_done: - test hpadd, hpadd - jz .calc_avg -.w32_hpad_loop: - mova [acq], m1 - mova [acq+32], m0 - paddd m4, m2 - add acq, 64 - dec hpadd - jg .w32_hpad_loop - jmp .calc_avg - -.calc_avg_mul: - pmaddwd m4, m5 -.calc_avg: - vextracti128 xm1, m4, 1 - tzcnt r1d, szd - paddd xm0, xm4, xm1 - movd xm2, r1d - movd xm3, szd - punpckhqdq xm1, xm0, xm0 - paddd xm0, xm1 - psrad xm3, 1 - psrlq xm1, xm0, 32 - paddd xm0, xm3 - paddd xm0, xm1 - psrad xm0, xm2 - vpbroadcastw m0, xm0 -.sub_loop: - mova m1, [ac_bakq] - psubw m1, m0 - mova [ac_bakq], m1 - add ac_bakq, 32 - sub szd, 16 - jg .sub_loop - RET - -cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h - vbroadcasti128 m4, [palq] - lea r2, [pal_pred_avx2_table] - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, [r2+wq*4] - packuswb m4, m4 - add wq, r2 - lea r2, [strideq*3] - jmp wq -.w4: - pshufb xm0, xm4, [idxq] - add idxq, 16 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r2 ], xm0, 3 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4 - RET -ALIGN function_align -.w8: - pshufb xm0, xm4, [idxq+16*0] - pshufb xm1, xm4, [idxq+16*1] - add idxq, 16*2 - movq [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xm0 - movq [dstq+strideq*2], xm1 - movhps [dstq+r2 ], xm1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w8 - RET -ALIGN function_align -.w16: - pshufb m0, m4, [idxq+32*0] - pshufb m1, m4, [idxq+32*1] - add idxq, 32*2 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - mova [dstq+strideq*2], xm1 - vextracti128 [dstq+r2 ], m1, 1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w16 - RET -ALIGN function_align -.w32: - pshufb m0, m4, [idxq+32*0] - pshufb m1, m4, [idxq+32*1] - pshufb m2, m4, [idxq+32*2] - pshufb m3, m4, [idxq+32*3] - add idxq, 32*4 - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m1 - mova [dstq+strideq*2], m2 - mova [dstq+r2 ], m3 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w32 - RET -ALIGN function_align -.w64: - pshufb m0, m4, [idxq+32*0] - pshufb m1, m4, [idxq+32*1] - pshufb m2, m4, [idxq+32*2] - pshufb m3, m4, [idxq+32*3] - add idxq, 32*4 - mova [dstq+strideq*0+32*0], m0 - mova [dstq+strideq*0+32*1], m1 - mova [dstq+strideq*1+32*0], m2 - mova [dstq+strideq*1+32*1], m3 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w64 - RET - -%endif diff -Nru dav1d-0.7.1/src/x86/ipred_avx2.asm dav1d-0.9.1/src/x86/ipred_avx2.asm --- dav1d-0.7.1/src/x86/ipred_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/ipred_avx2.asm 2021-07-28 21:38:28.901852100 +0000 @@ -0,0 +1,5387 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +; sm_weights[], but modified to precalculate x and 256-x with offsets to +; enable efficient use of pmaddubsw (which requires signed values) +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 +pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 +z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 + db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 + db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 +z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 + db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line +pb_128: times 4 db 128 ; those are just placed here for alignment. 
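A note on the smooth_weights layout above: pmaddubsw multiplies unsigned pixels with signed bytes, so the weights w and 256-w (which can reach 256) cannot be stored directly. The table therefore keeps the signed pair (w-128, 127-w), and the SMOOTH macros further down add back a per-block term that depends only on the pixels. A minimal check of that identity, with a hypothetical helper name:

#include <assert.h>

/* The encoding used by SMOOTH_WEIGHT_TABLE: for any weight w and 8-bit
 * pixels a (top/left) and b (bottom/right),
 *   (w - 128)*a + (127 - w)*b      <- what pmaddubsw produces
 * + (128*a + 129*b)                <- pixel-only term, precomputed per block
 * =  w*a + (256 - w)*b             <- the weighted average actually wanted */
static void check_smooth_weight_encoding(int w, int a, int b)
{
    int madd = (w - 128) * a + (127 - w) * b;
    int bias = 128 * a + 129 * b;
    assert(madd + bias == w * a + (256 - w) * b);
}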
+pb_36_m4: times 2 db 36, -4 +z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 +z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 +z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 +z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 +z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 +z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 + dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 +z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 + dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 +z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 + db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 +; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 +filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 + db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 +filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 +filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 +pb_127_m127: times 2 db 127, -127 +ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 + db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 +ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 + db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 +pw_64: times 2 dw 64 + +cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 + times 9 db 7, -1 +cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ; w=8, w_pad=1 as well as second half of previous one +cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 + times 5 db 6, 7 + ; w=16,w_pad=2 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + times 8 db 14, 15 + ; w=16,w_pad=3 + db 0, 1, 2, 3, 4, 5 + times 13 db 6, 7 +pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +%define pb_0to15 cfl_ac_w16_pad_shuffle +%define pb_1 (ipred_h_shuf+12) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+ 4) +%define pb_4 (ipred_h_shuf+24) +%define pb_5 (ipred_h_shuf+ 8) +%define pb_7 (ipred_h_shuf+ 0) +%define pb_8 (z_upsample2 +12) +%define pb_12 (z2_y_shuf_h4+20) +%define pb_14 (z2_y_shuf_h4+ 4) +%define pb_15 (z_filter_s +32) +%define pb_27 (z2_y_shuf_h4+ 8) +%define pb_31 (z2_y_shuf_h4+12) +%define pb_32 (z2_y_shuf_h4+16) +%define pb_90 (z2_y_shuf_h4+ 0) +%define pw_1 (z2_y_shuf_h4+24) +%define pw_8 (z_filter_k +32) + +pw_62: times 2 dw 62 +pw_128: times 2 dw 128 +pw_255: times 2 dw 255 +pw_512: times 2 dw 512 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) +%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) + +JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, 
w64 +JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 +JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 +JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 +JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 +JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 + +cextern dr_intra_derivative +cextern filter_intra_taps + +SECTION .text + +INIT_YMM avx2 +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h + lea r5, [ipred_dc_left_avx2_table] + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + mov r6d, 0x8000 + shrx r6d, r6d, wd + movd xm3, r6d + movsxd r6, [r5+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + mov r5d, 0x8000 + shrx r5d, r5d, r6d + movd xm3, r5d + lea r5, [ipred_dc_left_avx2_table] + movsxd r6, [r5+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m1, [tlq+32] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h32: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddw xm0, xm1 +.h4: + pmaddwd xm0, xm2 + pmulhrsw xm0, xm3 + lea stride3q, [strideq*3] + vpbroadcastb m0, xm0 + mova m1, m0 + jmp wq + +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm4, r5d + tzcnt r5d, r5d + movd xm5, r5d + lea r5, [ipred_dc_avx2_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pcmpeqd m3, m3 + psrlw xm4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd xm0, [tlq-4] + pmaddubsw xm0, xm3 + jmp wq +.w4: + movd xm1, [tlq+1] + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm1 + pmaddwd xm0, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddw xm0, xm1 + shrx r6d, r6d, r2d + psrlq xm1, xm0, 32 + paddw xm0, xm1 + movd xm1, r6d + psrlw xm0, 2 + pmulhuw xm0, xm1 +.w4_end: + vpbroadcastb xm0, xm0 +.s4: + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm0 + movd [dstq+strideq*2], xm0 + movd [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + movq xm0, [tlq-8] + pmaddubsw xm0, xm3 + jmp wq +.w8: + movq xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + punpckhqdq xm2, xm0, xm0 + paddw xm0, xm2 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp 
hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w8_end: + vpbroadcastb xm0, xm0 +.s8: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova xm0, [tlq-16] + pmaddubsw xm0, xm3 + jmp wq +.w16: + movu xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w16_end: + vpbroadcastb xm0, xm0 +.s16: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w32_end: + vpbroadcastb m0, xm0 +.s32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-64] + mova m1, [tlq-32] + pmaddubsw m0, m3 + pmaddubsw m1, m3 + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 1] + movu m2, [tlq+33] + pmaddubsw m1, m3 + pmaddubsw m2, m3 + paddw m0, m1 + paddw m0, m2 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 64 + je .w64_end + mov r6d, 0x33345556 + shrx r6d, r6d, hd + movd xm1, r6d + pmulhuw xm0, xm1 +.w64_end: + vpbroadcastb m0, xm0 + mova m1, m0 +.s64: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m0 + mova [dstq+stride3q +32*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s64 + RET + +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] + mova m1, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_avx2_table] + tzcnt wd, wm + movu m0, [tlq+ 1] + movu m1, [tlq+33] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +%macro IPRED_H 2 ; w, store_type + vpbroadcastb m0, [tlq-1] + vpbroadcastb m1, [tlq-2] + vpbroadcastb m2, [tlq-3] + sub tlq, 4 + vpbroadcastb m3, [tlq+0] + mov%2 [dstq+strideq*0], m0 + mov%2 [dstq+strideq*1], m1 + mov%2 [dstq+strideq*2], m2 + mov%2 [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +ALIGN function_align +%endmacro + +INIT_XMM avx2 +cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 + lea r5, 
[ipred_h_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + IPRED_H 4, d +.w8: + IPRED_H 8, q +.w16: + IPRED_H 16, a +INIT_YMM avx2 +.w32: + IPRED_H 32, a +.w64: + vpbroadcastb m0, [tlq-1] + vpbroadcastb m1, [tlq-2] + vpbroadcastb m2, [tlq-3] + sub tlq, 4 + vpbroadcastb m3, [tlq+0] + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m2 + mova [dstq+strideq*2+32*1], m2 + mova [dstq+stride3q +32*0], m3 + mova [dstq+stride3q +32*1], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w64 + RET + +%macro PAETH 2 ; top, ldiff + pavgb m1, m%1, m3 ; Calculating tldiff normally requires + pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it + pand m0, m4 ; in 8-bit with some tricks which avoids + psubusb m2, m5, m1 ; having to unpack everything to 16-bit. + psubb m1, m0 + psubusb m1, m5 + por m1, m2 + paddusb m1, m1 + por m1, m0 ; min(tldiff, 255) + psubusb m2, m5, m3 + psubusb m0, m3, m5 + por m2, m0 ; tdiff + pminub m2, m%2 + pcmpeqb m0, m%2, m2 ; ldiff <= tdiff + vpblendvb m0, m%1, m3, m0 + pminub m1, m2 + pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff + vpblendvb m0, m5, m0, m1 +%endmacro + +cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h +%define base r5-ipred_paeth_avx2_table + lea r5, [ipred_paeth_avx2_table] + tzcnt wd, wm + vpbroadcastb m5, [tlq] ; topleft + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m4, [base+pb_1] + add wq, r5 + jmp wq +.w4: + vpbroadcastd m6, [tlq+1] ; top + mova m8, [base+ipred_h_shuf] + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + sub tlq, 8 + vpbroadcastq m3, [tlq] + pshufb m3, m8 ; left + PAETH 6, 7 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + vpbroadcastq m6, [tlq+1] + mova m8, [base+ipred_h_shuf] + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + sub tlq, 4 + vpbroadcastd m3, [tlq] + pshufb m3, m8 + PAETH 6, 7 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m6, [tlq+1] + mova xm8, xm4 ; lower half = 1, upper half = 0 + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + sub tlq, 2 + vpbroadcastd m3, [tlq] + pshufb m3, m8 + PAETH 6, 7 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w32_loop: + dec tlq + vpbroadcastb m3, [tlq] + PAETH 6, 7 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m6, [tlq+ 1] + movu m7, [tlq+33] +%if WIN64 + movaps r4m, xmm9 +%endif + psubusb m8, m5, m6 + psubusb m0, m6, m5 + psubusb m9, m5, m7 + psubusb m1, m7, m5 + por m8, m0 + por m9, m1 +.w64_loop: + dec tlq + vpbroadcastb m3, [tlq] + PAETH 6, 8 + mova [dstq+32*0], m0 
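For reference, this is the scalar selection logic that the branchless PAETH macro above reproduces: predict left + top - topleft and pick whichever neighbour is closest. The sketch below is illustrative rather than dav1d's C reference; its ldiff/tdiff/tldiff names match the macro's comments, and tldiff is the value that needs more than 8 bits, which the pavgb/pxor trick in the macro works around.

#include <stdlib.h>

/* Scalar Paeth predictor mirrored by the PAETH macro (illustrative sketch). */
static inline int paeth_px(int left, int top, int topleft)
{
    int ldiff  = abs(top  - topleft);            /* |pred - left|    */
    int tdiff  = abs(left - topleft);            /* |pred - top|     */
    int tldiff = abs(left + top - 2 * topleft);  /* |pred - topleft| */
    if (ldiff <= tdiff && ldiff <= tldiff)
        return left;
    return tdiff <= tldiff ? top : topleft;
}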
+ PAETH 7, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop +%if WIN64 + movaps xmm9, r4m +%endif + RET + +%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] + ; w * a = (w - 128) * a + 128 * a + ; (256 - w) * b = (127 - w) * b + 129 * b + pmaddubsw m0, m%3, m%1 + pmaddubsw m1, m%4, m%2 + paddw m0, m%5 + paddw m1, m%6 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 +%endmacro + +cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights +%define base r6-ipred_smooth_v_avx2_table + lea r6, [ipred_smooth_v_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m0, [base+pb_127_m127] + vpbroadcastd m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + vpbroadcastb m5, [tlq+hq] ; bottom + add wq, r6 + jmp wq +.w4: + vpbroadcastd m2, [tlq+1] + punpcklbw m2, m5 ; top, bottom + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + punpckldq m4, m5, m5 + punpckhdq m5, m5 + pmaddubsw m3, m2, m0 + paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; 128 * top + 129 * bottom + 128 +.w4_loop: + vbroadcasti128 m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 1 + pextrd [dstq+r3 ], xm1, 1 + cmp hd, -4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm1, 2 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + add hq, 8 + jl .w4_loop +.ret: + RET +ALIGN function_align +.w8: + vpbroadcastq m2, [tlq+1] + punpcklbw m2, m5 + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 +.w8_loop: + vpbroadcastq m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +ALIGN function_align +.w16: + WIN64_SPILL_XMM 7 + vbroadcasti128 m3, [tlq+1] + mova m6, [base+ipred_v_shuf] + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w16_loop: + vpbroadcastd m1, [weightsq+hq*2] + pshufb m1, m6 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w16_loop + RET +ALIGN function_align +.w32: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 6 + movu m3, [tlq+1] + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w32_loop: + vpbroadcastw m1, [weightsq+hq*2] + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m0 + add dstq, strideq + inc hq + jl .w32_loop + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 11 + movu m4, [tlq+ 1] + movu m8, [tlq+33] + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m7, m8, m5 + punpckhbw m8, m5 + pmaddubsw m5, m3, m0 + pmaddubsw m6, m4, m0 + pmaddubsw m9, m7, m0 + pmaddubsw m10, m8, m0 + paddw m2, m1, m3 + paddw m5, m2 + paddw m2, m1, m4 + paddw m6, m2 + paddw m0, m1, m7 + paddw m9, m0 + paddw m1, m8 + paddw m10, m1 +.w64_loop: + vpbroadcastw m2, [weightsq+hq*2] + SMOOTH 2, 2, 3, 4, 5, 6 + mova [dstq+32*0], m0 + SMOOTH 2, 2, 7, 8, 9, 10 + mova [dstq+32*1], 
m0 + add dstq, strideq + inc hq + jl .w64_loop + RET + +%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used + %assign stack_offset 0 + %assign stack_size_padded 0 + %assign regs_used %2 + %xdefine rstk rsp + SETUP_STACK_POINTER %1 + %if regs_used != %2 && WIN64 + PUSH r%2 + %endif + ALLOC_STACK %1, %3 +%endmacro + +cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h +%define base r6-ipred_smooth_h_avx2_table + lea r6, [ipred_smooth_h_avx2_table] + mov wd, wm + vpbroadcastb m3, [tlq+wq] ; right + tzcnt wd, wd + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m4, [base+pb_127_m127] + vpbroadcastd m5, [base+pw_128] + add wq, r6 + jmp wq +.w4: + WIN64_SPILL_XMM 8 + vpbroadcastq m6, [base+smooth_weights+4*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 8 + sub tlq, hq + lea r3, [strideq*3] +.w4_loop: + vpbroadcastq m2, [tlq+hq] + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 + vbroadcasti128 m6, [base+smooth_weights+8*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq +.w8_loop: + vpbroadcastd m2, [tlq+hq] + pshufb m2, m7 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 + paddw m0, m1 + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +ALIGN function_align +.w16: + SETUP_STACK_FRAME 32*4, 7, 8 + lea r3, [rsp+64*2-4] + call .prep ; only worthwhile for for w16 and above + sub tlq, 2 + vpbroadcastd xm6, [base+pb_1] + mova xm7, [base+ipred_v_shuf+16] + vinserti128 m7, [base+ipred_v_shuf+ 0], 1 + vbroadcasti128 m4, [base+smooth_weights+16*2] + vbroadcasti128 m5, [base+smooth_weights+16*3] +.w16_loop: + vpbroadcastd m1, [tlq+hq] + vpbroadcastd m2, [r3+hq*2] + pshufb m1, m6 + punpcklbw m1, m3 + pshufb m2, m7 + SMOOTH 4, 5, 1, 1, 2, 2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + SETUP_STACK_FRAME 32*4, 7, 6 + lea r3, [rsp+64*2-2] + call .prep + dec tlq + mova xm4, [base+smooth_weights+16*4] + vinserti128 m4, [base+smooth_weights+16*6], 1 + mova xm5, [base+smooth_weights+16*5] + vinserti128 m5, [base+smooth_weights+16*7], 1 +.w32_loop: + vpbroadcastb m1, [tlq+hq] + punpcklbw m1, m3 + vpbroadcastw m2, [r3+hq*2] + SMOOTH 4, 5, 1, 1, 2, 2 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + SETUP_STACK_FRAME 32*4, 7, 9 + lea r3, [rsp+64*2-2] + call .prep + add r6, 
smooth_weights+16*15-ipred_smooth_h_avx2_table + dec tlq + mova xm5, [r6-16*7] + vinserti128 m5, [r6-16*5], 1 + mova xm6, [r6-16*6] + vinserti128 m6, [r6-16*4], 1 + mova xm7, [r6-16*3] + vinserti128 m7, [r6-16*1], 1 + mova xm8, [r6-16*2] + vinserti128 m8, [r6-16*0], 1 +.w64_loop: + vpbroadcastb m2, [tlq+hq] + punpcklbw m2, m3 + vpbroadcastw m4, [r3+hq*2] + SMOOTH 5, 6, 2, 2, 4, 4 + mova [dstq+32*0], m0 + SMOOTH 7, 8, 2, 2, 4, 4 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +ALIGN function_align +.prep: + vpermq m2, [tlq-32*1], q3120 + punpckhbw m1, m2, m3 + punpcklbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m1, m5 ; 1 * left + 256 * right + 128 + paddw m0, m1 ; 128 * left + 129 * right + 128 + pmaddubsw m1, m2, m4 + paddw m2, m5 + paddw m1, m2 + vpermq m2, [tlq-32*2], q3120 + mova [rsp+gprsize+32*3], m0 + mova [rsp+gprsize+32*2], m1 + punpckhbw m1, m2, m3 + punpcklbw m2, m3 + pmaddubsw m0, m1, m4 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m2, m5 + paddw m1, m2 + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*0], m1 + sub r3, hq + sub tlq, hq + sub r3, hq + ret + +%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] + pmaddubsw m0, m%3, m%1 + pmaddubsw m1, m%4, m%2 +%ifnum %5 + paddw m0, m%5 +%else + paddw m0, %5 +%endif +%ifnum %6 + paddw m1, m%6 +%else + paddw m1, %6 +%endif + pavgw m0, m2 + pavgw m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 +%endmacro + +cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_avx2_table + lea r6, [ipred_smooth_avx2_table] + mov wd, wm + vpbroadcastb m4, [tlq+wq] ; right + tzcnt wd, wd + mov hd, hm + mov r5, tlq + sub r5, hq + movsxd wq, [r6+wq*4] + vpbroadcastd m5, [base+pb_127_m127] + vpbroadcastb m0, [r5] ; bottom + vpbroadcastd m3, [base+pw_255] + add wq, r6 + lea v_weightsq, [base+smooth_weights+hq*2] + jmp wq +.w4: + WIN64_SPILL_XMM 12 + mova m10, [base+ipred_h_shuf] + vpbroadcastq m11, [base+smooth_weights+4*2] + mova m7, [base+ipred_v_shuf] + vpbroadcastd m8, [tlq+1] + sub tlq, 8 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m8, m0 ; top, bottom + pshufd m6, m7, q2200 + pshufd m7, m7, q3311 + pmaddubsw m9, m8, m5 + paddw m3, m8 ; 1 * top + 255 * bottom + 255 + paddw m9, m3 ; 128 * top + 129 * bottom + 255 +.w4_loop: + vpbroadcastq m1, [tlq+hq] + pshufb m1, m10 + punpcklbw m0, m1, m4 ; left, right + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 ; 127 * left - 127 * right + pmaddubsw m3, m1, m5 + paddw m2, m0 ; 128 * left + 129 * right + paddw m3, m1 + pmaddubsw m0, m11 + pmaddubsw m1, m11 + paddw m2, m0 + paddw m3, m1 + vbroadcasti128 m1, [v_weightsq] + add v_weightsq, 16 + pshufb m0, m1, m6 + pshufb m1, m7 + SMOOTH_2D_END 0, 1, 8, 8, 9, 9 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + mova m10, [base+ipred_h_shuf] + vbroadcasti128 m11, [base+smooth_weights+8*2] + mova m7, [base+ipred_v_shuf] + vpbroadcastq m8, [tlq+1] + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m8, m0 + pshufd m6, m7, q0000 + pshufd m7, m7, q1111 + pmaddubsw m9, m8, m5 + paddw m3, m8 + paddw m9, m3 
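The ipred_smooth kernels above combine a vertical and a horizontal weighted pass. In scalar terms this is the per-pixel result that SMOOTH_2D_END produces via its pavgw + psrlw 8 tail together with the pw_255 bias added in the prologue; the sketch is illustrative, with w_v/w_h standing for the smooth_weights entries of the current row and column.

/* Scalar form of one 2-D smooth-predicted pixel (illustrative sketch).
 * top/left are the edge pixels for the current column/row, bottom/right
 * the replicated bottom-left and top-right pixels; the pw_255 bias plus
 * pavgw plus psrlw 8 implement the (+256) >> 9 below. */
static inline int smooth_px(int top, int bottom, int left, int right,
                            int w_v, int w_h)
{
    int vert = w_v * top  + (256 - w_v) * bottom;
    int hor  = w_h * left + (256 - w_h) * right;
    return (vert + hor + 256) >> 9;   /* result always lands in 0..255 */
}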
+.w8_loop: + vpbroadcastd m1, [tlq+hq] + pshufb m1, m10 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 + pmaddubsw m3, m1, m5 + paddw m2, m0 + paddw m3, m1 + pmaddubsw m0, m11 + pmaddubsw m1, m11 + paddw m2, m0 + paddw m3, m1 + vpbroadcastq m1, [v_weightsq] + add v_weightsq, 8 + pshufb m0, m1, m6 + pshufb m1, m7 + SMOOTH_2D_END 0, 1, 8, 8, 9, 9 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +ALIGN function_align +.w16: + SETUP_STACK_FRAME 32*4, 7, 14 + vbroadcasti128 m11, [tlq+1] + lea r3, [rsp+64*2-4] + punpcklbw m10, m11, m0 ; top, bottom + punpckhbw m11, m0 + call .prep_v + sub tlq, 2 + pmaddubsw m12, m10, m5 + pmaddubsw m13, m11, m5 + vpbroadcastd xm5, [base+pb_1] + mova m9, [base+ipred_v_shuf] + vbroadcasti128 m6, [base+smooth_weights+16*2] + vbroadcasti128 m7, [base+smooth_weights+16*3] + vperm2i128 m8, m9, m9, 0x01 + paddw m0, m10, m3 + paddw m3, m11 + paddw m12, m0 + paddw m13, m3 +.w16_loop: + vpbroadcastd m3, [tlq+hq] + vpbroadcastd m0, [r3+hq*2] + vpbroadcastd m1, [v_weightsq] + add v_weightsq, 4 + pshufb m3, m5 + punpcklbw m3, m4 ; left, right + pmaddubsw m2, m3, m6 + pmaddubsw m3, m7 + pshufb m0, m8 + pshufb m1, m9 + paddw m2, m0 + paddw m3, m0 + SMOOTH_2D_END 1, 1, 10, 11, 12, 13 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + SETUP_STACK_FRAME 32*4, 7, 11 + movu m8, [tlq+1] + lea r3, [rsp+64*2-2] + punpcklbw m7, m8, m0 + punpckhbw m8, m0 + call .prep_v + dec tlq + pmaddubsw m9, m7, m5 + pmaddubsw m10, m8, m5 + mova xm5, [base+smooth_weights+16*4] + vinserti128 m5, [base+smooth_weights+16*6], 1 + mova xm6, [base+smooth_weights+16*5] + vinserti128 m6, [base+smooth_weights+16*7], 1 + paddw m0, m7, m3 + paddw m3, m8 + paddw m9, m0 + paddw m10, m3 +.w32_loop: + vpbroadcastb m3, [tlq+hq] + punpcklbw m3, m4 + vpbroadcastw m0, [r3+hq*2] + vpbroadcastw m1, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m3, m5 + pmaddubsw m3, m6 + paddw m2, m0 + paddw m3, m0 + SMOOTH_2D_END 1, 1, 7, 8, 9, 10 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + SETUP_STACK_FRAME 32*8, 7, 16 + movu m13, [tlq+1 ] + movu m15, [tlq+33] + add r6, smooth_weights+16*15-ipred_smooth_avx2_table + lea r3, [rsp+64*2-2] + punpcklbw m12, m13, m0 + punpckhbw m13, m0 + punpcklbw m14, m15, m0 + punpckhbw m15, m0 + call .prep_v + dec tlq + pmaddubsw m0, m12, m5 + pmaddubsw m1, m13, m5 + pmaddubsw m2, m14, m5 + pmaddubsw m5, m15, m5 + mova xm8, [r6-16*7] + vinserti128 m8, [r6-16*5], 1 + mova xm9, [r6-16*6] + vinserti128 m9, [r6-16*4], 1 + mova xm10, [r6-16*3] + vinserti128 m10, [r6-16*1], 1 + mova xm11, [r6-16*2] + vinserti128 m11, [r6-16*0], 1 + lea r6, [rsp+32*4] + paddw m0, m3 + paddw m1, m3 + paddw m2, m3 + paddw m3, m5 + paddw m0, m12 + paddw m1, m13 + paddw m2, m14 + paddw m3, m15 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 +.w64_loop: + vpbroadcastb m5, [tlq+hq] + punpcklbw m5, m4 + vpbroadcastw m6, [r3+hq*2] + vpbroadcastw m7, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m5, m8 + pmaddubsw m3, m5, m9 + paddw m2, m6 + paddw m3, m6 + SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] + mova [dstq+32*0], m0 + pmaddubsw m2, m5, m10 + pmaddubsw m3, m5, m11 + paddw m2, m6 + paddw m3, m6 + SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] + mova 
[dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +ALIGN function_align +.prep_v: + vpermq m2, [tlq-32*1], q3120 + punpckhbw m1, m2, m4 + punpcklbw m2, m4 + pmaddubsw m0, m1, m5 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m2, m5 + paddw m1, m2 + vpermq m2, [tlq-32*2], q3120 + mova [rsp+gprsize+32*3], m0 + mova [rsp+gprsize+32*2], m1 + punpckhbw m1, m2, m4 + punpcklbw m2, m4 + pmaddubsw m0, m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m5 + paddw m1, m2 + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*0], m1 + sub r3, hq + sub tlq, hq + sub r3, hq + ret + +cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z1_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea r7, [dr_intra_derivative] + inc tlq + movsxd wq, [r6+wq*4] + add wq, r6 + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [r7+dxq] + xor angled, 0x4ff ; d = 90 - angle + vpbroadcastd m3, [pw_512] + vpbroadcastd m4, [pw_62] + vpbroadcastd m5, [pw_64] + jmp wq +.w4: + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + ALLOC_STACK -32, 8 + mova xm1, [tlq-1] + pshufb xm0, xm1, [z_upsample1] + pshufb xm1, [z_upsample2] + vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse + add dxd, dxd ; pw_512 (which is already in m3) + pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 + pextrd [rsp+16], xm1, 3 ; top[max_base_x] + pmaddubsw xm1, xm2 + movd xm7, dxd + mov r3d, dxd ; xpos + vpbroadcastw m7, xm7 + paddw xm1, xm0 + movq xm0, [tlq] + pmulhrsw xm1, xm3 + pslldq m6, m7, 8 + paddw xm2, xm7, xm7 + lea r2, [strideq*3] + paddw m6, m7 + packuswb xm1, xm1 + paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 + punpcklbw xm0, xm1 + psllw m7, 2 + mova [rsp], xm0 +.w4_upsample_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + vpbroadcastq m1, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vpbroadcastq m2, [rsp+r5] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + movq xm0, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + movhps xm0, [rsp+r5] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 ; frac + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 ; 64-frac + psllw m2, 8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + paddw m6, m7 ; xpos += dx + pmulhrsw m0, m3 + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r2 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; w4/w8/w16 + ; The C version uses a lot of branches, but we can do all the comparisons + ; in parallel and use popcnt to get the final filter strength value. 
+%define base r3-z_filter_t0 + lea r3, [z_filter_t0] + movd xm0, maxbased + movd xm2, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m2, xm2 + pcmpeqb m1, m0, [base+z_filter_wh] + pand m1, m2 + mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases + pcmpgtb m1, m2 + pmovmskb r5d, m1 + ret +.w4_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -16, 11 + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea maxbased, [hq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m7, [base+pb_8] + vbroadcasti128 m2, [tlq-1] + pminub m1, m7, [base+z_filter_s] + vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] + pminub m7, [base+z_filter_s+8] + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] + pshufb m0, m2, m1 + shufps m1, m7, q2121 + pmaddubsw m0, m8 + pshufb m1, m2, m1 + pmaddubsw m1, m9 + pshufb m2, m7 + pmaddubsw m2, m10 + paddw m0, m1 + paddw m0, m2 + pmulhrsw m0, m3 + mov r3d, 9 + mov tlq, rsp + cmp hd, 4 + cmovne maxbased, r3d + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [tlq], xm0 +.w4_main: + movd xm6, dxd + vpbroadcastq m0, [z_base_inc] ; base_inc << 6 + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m6, xm6 + mov r3d, dxd ; xpos + movd xm9, maxbased + vpbroadcastw m9, xm9 + vbroadcasti128 m8, [z1_shuf_w4] + psrlw m7, 8 ; top[max_base_x] + paddw m10, m6, m6 + psubw m9, m0 ; max_base_x + vpblendd m6, m10, 0xcc + mova xm0, xm10 + paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 + paddw m10, m10 +.w4_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + vpbroadcastq m1, [tlq+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vpbroadcastq m2, [tlq+r5] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + movq xm0, [tlq+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + movhps xm0, [tlq+r5] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 ; frac + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 ; 64-frac + psllw m2, 8 + pshufb m0, m8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + pcmpgtw m1, m9, m6 ; base < max_base_x + pmulhrsw m0, m3 + paddw m6, m10 ; xpos += dx + lea r5, [dstq+strideq*2] + vpblendvb m0, m7, m0, m1 + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r5 +strideq*0], xm0 + pextrd [r5 +strideq*1], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r3d, maxbased + jb .w4_loop + packuswb xm7, xm7 + lea r6, [strideq*3] +.w4_end_loop: + movd [dstq+strideq*0], xm7 + movd [dstq+strideq*1], xm7 + movd [dstq+strideq*2], xm7 + movd [dstq+r6 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_end_loop +.w4_end: + RET +ALIGN function_align +.w8: + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 8 + movu xm2, [z_filter_s+6] + mova xm0, [tlq-1] + movd xm6, hd + vinserti128 m0, [tlq+7], 1 + vpbroadcastb xm6, xm6 + vbroadcasti128 m1, [z_upsample1] + pminub xm6, xm2 + vpbroadcastd m7, [pb_36_m4] + vinserti128 m2, xm6, 1 + add dxd, dxd + pshufb m1, m0, m1 + pshufb m2, m0, m2 + movd xm6, dxd + pmaddubsw m1, m7 + pmaddubsw m2, m7 + vpbroadcastw m6, xm6 + mov r3d, dxd + psrldq m0, 1 + lea r2, [strideq*3] + paddw m7, m6, m6 + paddw m1, m2 + vpblendd m6, m7, 0xf0 + pmulhrsw m1, m3 + pslldq m2, m7, 8 + paddw m7, m7 + paddw m6, m2 + packuswb m1, m1 + punpcklbw m0, m1 + mova [rsp], m0 
+.w8_upsample_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm0, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vinserti128 m0, [rsp+r5], 1 + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + punpcklqdq m1, m2, m2 ; frac0 frac1 + pmaddubsw m0, m1 + movu xm1, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + vinserti128 m1, [rsp+r5], 1 + punpckhqdq m2, m2 ; frac2 frac3 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + paddw m6, m7 + pmulhrsw m1, m3 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r2 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_upsample_loop + RET +.w8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(h+7, 15) + jmp .w8_main +.w8_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 10 + lea maxbased, [hq+7] + test angled, 0x400 + jnz .w8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w8_main ; filter_strength == 0 + popcnt r5d, r5d + movu xm2, [tlq] + pminub xm1, xm0, [base+z_filter_s+14] + vinserti128 m2, [tlq-1], 1 + vinserti128 m1, [base+z_filter_s+ 0], 1 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pminub xm0, [base+z_filter_s+22] + vinserti128 m0, [base+z_filter_s+ 8], 1 + pshufb m6, m2, m1 + pmaddubsw m6, m7 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] + movzx r3d, byte [tlq+15] + shufps m1, m0, q2121 + pshufb m1, m2, m1 + pmaddubsw m1, m7 + paddw m1, m6 + sub r5d, 3 + jnz .w8_3tap + ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, + ; which also results in an awkward edge case where out[w*2] is + ; slightly different from out[max_base_x] when h > w. + vpbroadcastd m7, [z_filter_k+4*8] + movzx r2d, byte [tlq+14] + pshufb m2, m0 + pmaddubsw m2, m7 + sub r2d, r3d + lea r2d, [r2+r3*8+4] + shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 + mov [rsp+16], r2b + paddw m1, m2 +.w8_3tap: + pmulhrsw m1, m3 + sar r5d, 1 + mov tlq, rsp + add r5d, 17 ; w*2 + (filter_strength == 3) + cmp hd, 16 + cmovns maxbased, r5d + mov [tlq+r5], r3b + vextracti128 xm0, m1, 1 + packuswb xm0, xm1 + mova [tlq], xm0 +.w8_main: + movd xm2, dxd + vbroadcasti128 m0, [z_base_inc] + vpbroadcastw m2, xm2 + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + movd xm9, maxbased + vbroadcasti128 m8, [z_filter_s+2] + vpbroadcastw m9, xm9 + psrlw m7, 8 + psubw m9, m0 + mov r3d, dxd + paddw m6, m2, m2 + vpblendd m2, m6, 0xf0 +.w8_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + pand m0, m4, m2 + psubw m1, m5, m0 + psllw m0, 8 + por m1, m0 + movu xm0, [tlq+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vinserti128 m0, [tlq+r5], 1 + pshufb m0, m8 + pmaddubsw m0, m1 + pcmpgtw m1, m9, m2 + paddw m2, m6 + pmulhrsw m0, m3 + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w8_loop + packuswb xm7, xm7 +.w8_end_loop: + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_end_loop +.w8_end: + RET +.w16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(h+15, 31) + jmp .w16_main +ALIGN function_align +.w16: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 12 + lea maxbased, [hq+15] + test angled, 0x400 + jnz .w16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w16_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m1, 
[base+pb_12] + vbroadcasti128 m6, [base+z_filter_s+8] + vinserti128 m2, m6, [base+z_filter_s], 0 + vinserti128 m6, [base+z_filter_s+16], 1 + mova xm10, [tlq-1] + vinserti128 m10, [tlq+3], 1 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] + vbroadcasti128 m7, [base+z_filter_s+14] + vinserti128 m8, m7, [base+z_filter_s+6], 0 + vinserti128 m7, [base+z_filter_s+22], 1 + psubw m0, m1 + movu xm11, [tlq+12] + vinserti128 m11, [tlq+16], 1 + pminub m8, m0 + pminub m7, m0 + pshufb m0, m10, m2 + shufps m2, m6, q2121 + pmaddubsw m0, m9 + pshufb m1, m11, m8 + shufps m8, m7, q2121 + pmaddubsw m1, m9 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + movzx r3d, byte [tlq+31] + pshufb m2, m10, m2 + pmaddubsw m2, m9 + pshufb m8, m11, m8 + pmaddubsw m8, m9 + paddw m0, m2 + paddw m1, m8 + sub r5d, 3 + jnz .w16_3tap + vpbroadcastd m9, [z_filter_k+4*8] + movzx r2d, byte [tlq+30] + pshufb m10, m6 + pmaddubsw m10, m9 + pshufb m11, m7 + pmaddubsw m11, m9 + sub r2d, r3d + lea r2d, [r2+r3*8+4] + shr r2d, 3 + mov [rsp+32], r2b + paddw m0, m10 + paddw m1, m11 +.w16_3tap: + pmulhrsw m0, m3 + pmulhrsw m1, m3 + sar r5d, 1 + mov tlq, rsp + add r5d, 33 + cmp hd, 32 + cmovns maxbased, r5d + mov [tlq+r5], r3b + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [tlq], m0 +.w16_main: + movd xm6, dxd + vbroadcasti128 m0, [z_base_inc] + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m6, xm6 + movd xm9, maxbased + vbroadcasti128 m8, [z_filter_s+2] + vpbroadcastw m9, xm9 + mov r3d, dxd + psubw m9, m0 + paddw m11, m6, m6 + psubw m10, m9, m3 ; 64*8 + vpblendd m6, m11, 0xf0 +.w16_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r3+0] + movu xm1, [tlq+r3+8] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vinserti128 m0, [tlq+r5+0], 1 + vinserti128 m1, [tlq+r5+8], 1 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w16_loop +.w16_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_end_loop +.w16_end: + RET +ALIGN function_align +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 15 + lea r3d, [hq+31] + mov maxbased, 63 + cmp hd, 32 + cmovs maxbased, r3d + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vbroadcasti128 m0, [pb_0to15] + sub r3d, 29 ; h+2 + movu xm13, [tlq+29] ; 32-39 + movd xm1, r3d + movu xm14, [tlq+37] ; 40-47 + sub r3d, 8 ; h-6 + vinserti128 m14, [tlq+51], 1 ; 56-63 + vpbroadcastb xm1, xm1 + mova xm11, [tlq- 1] ; 0- 7 + vinserti128 m11, [tlq+13], 1 ; 16-23 + movd xm2, r3d + movu xm12, [tlq+ 5] ; 8-15 + vinserti128 m12, [tlq+19], 1 ; 24-31 + pminub xm1, xm0 ; clip 32x8 + mova m7, [z_filter_s+0] + pshufb xm13, xm1 + vpbroadcastd m1, [pb_12] + vpbroadcastb xm2, xm2 + vinserti128 m13, [tlq+43], 1 ; 48-55 + vinserti128 m8, m7, [z_filter_s+4], 1 + vpblendd m2, m1, 0xf0 + vinserti128 m7, [z_filter_s+12], 0 + pminub m2, m0 ; clip 32x16 and 32x(32|64) + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m14, m2 + pshufb m0, m11, m8 + shufps m8, m7, q1021 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, 
[z_filter_k+4*2+12*1] + pshufb m10, m11, m8 + shufps m8, m7, q2121 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m8 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m8 + pmaddubsw m10, m9 + paddw m1, m10 + pshufb m10, m14, m8 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m7 + pmaddubsw m12, m9 + movzx r3d, byte [tlq+63] + movzx r2d, byte [tlq+62] + paddw m0, m11 + paddw m2, m12 + pshufb m13, m7 + pmaddubsw m13, m9 + pshufb m14, m7 + pmaddubsw m14, m9 + paddw m1, m13 + paddw m6, m14 + sub r2d, r3d + lea r2d, [r2+r3*8+4] ; edge case for 32x64 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + shr r2d, 3 + mov [rsp+64], r2b + mov tlq, rsp + mov [tlq+65], r3b + mov r3d, 65 + cmp hd, 64 + cmove maxbased, r3d + packuswb m0, m2 + packuswb m1, m6 + mova [tlq+ 0], m0 + mova [tlq+32], m1 +.w32_main: + movd xm6, dxd + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m6, xm6 + movd xm9, maxbased + vbroadcasti128 m8, [z_filter_s+2] + vpbroadcastw m9, xm9 + mov r5d, dxd + psubw m9, [z_base_inc] + mova m11, m6 + psubw m10, m9, m3 ; 64*8 +.w32_loop: + mov r3d, r5d + shr r3d, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu m0, [tlq+r3+0] + movu m1, [tlq+r3+8] + add r5d, dxd + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + mova [dstq], m0 + dec hd + jz .w32_end + add dstq, strideq + cmp r5d, maxbased + jb .w32_loop + test hb, 1 + jz .w32_end_loop + mova [dstq], m7 + add dstq, strideq + dec hd + jz .w32_end +.w32_end_loop: + mova [dstq+strideq*0], m7 + mova [dstq+strideq*1], m7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_end_loop +.w32_end: + RET +ALIGN function_align +.w64: + %assign stack_offset org_stack_offset + ALLOC_STACK -128, 16 + lea maxbased, [hq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + mova xm11, [tlq- 1] ; 0- 7 + vinserti128 m11, [tlq+13], 1 ; 16-23 + movu xm12, [tlq+ 5] ; 8-15 + vinserti128 m12, [tlq+19], 1 ; 24-31 + mova m7, [z_filter_s+0] + vinserti128 m8, m7, [z_filter_s+4], 1 + vinserti128 m7, [z_filter_s+12], 0 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + movu xm13, [tlq+29] ; 32-39 + vinserti128 m13, [tlq+43], 1 ; 48-55 + movu xm14, [tlq+37] ; 40-47 + vinserti128 m14, [tlq+51], 1 ; 56-63 + pshufb m0, m11, m8 + shufps m8, m7, q1021 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m10, m11, m8 + shufps m15, m8, m7, q2121 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m15 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m15 + pmaddubsw m10, m9 + paddw m1, m10 + pshufb m10, m14, m15 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m10, [z_filter_k+4*2+12*2] + pshufb m11, m15 + pmaddubsw m11, m10 + pshufb m12, m7 + pmaddubsw m12, m10 + pshufb m13, m7 + pmaddubsw m13, m10 + pshufb m14, m7 + pmaddubsw m14, m10 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + movu xm11, [tlq+ 61] ; 64- 71 + vinserti128 m11, [tlq+ 75], 1 ; 80- 87 + movu xm12, [tlq+ 69] ; 72- 79 + vinserti128 m12, [tlq+ 83], 1 ; 88- 95 + movu xm13, [tlq+ 93] ; 96-103 + vinserti128 m13, [tlq+107], 1 ; 112-119 + movu xm14, [tlq+101] ; 104-111 + vinserti128 m14, [tlq+115], 1 ; 120-127 + pmulhrsw 
m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + lea r3d, [hq-20] + mov tlq, rsp + packuswb m0, m2 + packuswb m1, m6 + vpbroadcastd xm2, [pb_14] + vbroadcasti128 m6, [pb_0to15] + mova [tlq+32*0], m0 + mova [tlq+32*1], m1 + movd xm0, r3d + vpbroadcastd m1, [pb_12] + vpbroadcastb m0, xm0 + paddb m0, m2 + pminub m0, m6 ; clip 64x16 and 64x32 + pshufb m12, m0 + pminub m1, m6 ; clip 64x64 + pshufb m14, m1 + pshufb m0, m11, m7 + pmaddubsw m0, m10 + pshufb m2, m12, m7 + pmaddubsw m2, m10 + pshufb m1, m13, m7 + pmaddubsw m1, m10 + pshufb m6, m14, m7 + pmaddubsw m6, m10 + pshufb m7, m11, m15 + pmaddubsw m7, m9 + pshufb m10, m12, m15 + pmaddubsw m10, m9 + paddw m0, m7 + pshufb m7, m13, m15 + pmaddubsw m7, m9 + paddw m2, m10 + pshufb m10, m14, m15 + pmaddubsw m10, m9 + paddw m1, m7 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m8 + pmaddubsw m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + packuswb m0, m2 + packuswb m1, m6 + mova [tlq+32*2], m0 + mova [tlq+32*3], m1 +.w64_main: + movd xm12, dxd + vpbroadcastb m7, [tlq+maxbaseq] + lea r3d, [dxq-64] + shl maxbased, 6 + vpbroadcastw m12, xm12 + sub r3d, maxbased + vbroadcasti128 m8, [z_filter_s+2] + movd xm6, r3d + mov r5d, dxd + mova m10, [pb_1to32] + vpbroadcastd m11, [pb_32] + vpbroadcastw m6, xm6 +.w64_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3+ 0] + movu m1, [tlq+r3+ 8] + pand m2, m4, m6 + psubw m9, m5, m2 + psllw m2, 8 + por m9, m2 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + psraw m2, m6, 6 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packsswb m2, m2 + paddb m2, m10 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [dstq+ 0], m0 + movu m0, [tlq+r3+32] + movu m1, [tlq+r3+40] + add r5d, dxd + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + paddb m2, m11 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m6, m12 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [dstq+32], m0 + dec hd + jz .w64_end + add dstq, strideq + cmp r5d, maxbased + jb .w64_loop +.w64_end_loop: + mova [dstq+ 0], m7 + mova [dstq+32], m7 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + +cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy +%define base r9-z_filter_t0 + lea r9, [ipred_z2_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea dxq, [dr_intra_derivative-90] + movsxd wq, [r9+wq*4] + movzx dyd, angleb + xor angled, 0x400 + mov r8, dxq + sub dxq, dyq + add wq, r9 + add r9, z_filter_t0-ipred_z2_avx2_table + mova m2, [tlq-64] + mova m0, [tlq-32] + mova m1, [tlq] + and dyd, ~1 + and dxq, ~1 + movzx dyd, word [r8+dyq] ; angle - 90 + movzx dxd, word [dxq+270] ; 180 - angle + vpbroadcastd m13, [base+pw_512] + vpbroadcastd m14, [base+pw_62] + vpbroadcastd m15, [base+pw_64] + mova [rsp+ 0], m2 + mova [rsp+32], m0 + mova [rsp+64], m1 + neg dxd + neg dyd + jmp wq +.w4: + vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 + vbroadcasti128 m10, [base+z1_shuf_w4] + vbroadcasti128 m11, [base+z2_shuf_h4] + lea r2d, [dxq+(65<<6)] ; xpos + movd xm5, dyd + mov r8d, (63-4)<<6 + mov dyq, -4 + pshuflw xm5, xm5, q0000 + pmullw xm5, [base+z2_ymul] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || 
(is_sm && h == 8) + vpbroadcastd xm3, [base+pb_4] + call .upsample_above + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + xor angled, 0x7f ; 180 - angle + call .filter_strength + jmp .w4_filter_left +ALIGN function_align +.filter_strength: + movd xm8, r3d + mov r3d, angled + movd xm7, angled + vpbroadcastb m8, xm8 + shr r3d, 8 ; is_sm << 1 + vpbroadcastb m7, xm7 + pcmpeqb m8, [base+z_filter_wh] + mova xm9, [r9+r3*8] + pand m0, m8, m7 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + ret +ALIGN function_align +.upsample_above: ; w4/w8 + pshufb xm2, xm1, [base+z_upsample1-2] + pminub xm3, [base+z_filter_s+4] + vpbroadcastd xm4, [base+pb_36_m4] + vbroadcasti128 m10, [base+pb_0to15] + pshufb xm3, xm1, xm3 + pmaddubsw xm2, xm4 + pmaddubsw xm3, xm4 + lea r2d, [r2+dxq+(1<<6)] + add dxd, dxd + paddw xm2, xm3 + pmulhrsw xm2, xm13 + sub r8d, 3<<6 + paddw m6, m6 + packuswb xm2, xm2 + punpcklbw xm1, xm2 + mova [rsp+gprsize+64], xm1 + ret +ALIGN function_align +.upsample_left: ; h4/h8 + mov r3d, hd + and r3d, 4 + movd xm2, [rsp+gprsize+64] + movddup xm0, [rsp+gprsize+56] + movd xm1, r3d + palignr xm2, xm0, 1 + vpbroadcastb xm1, xm1 + pshufb xm2, [base+z_filter_s+18] + vpbroadcastd xm3, [base+pb_36_m4] + pmaxub xm1, [base+z_upsample1-2] + pshufb xm1, xm0, xm1 + pmaddubsw xm2, xm3 + pmaddubsw xm1, xm3 + paddw xm5, xm5 + add dyq, dyq + paddw xm1, xm2 + pmulhrsw xm1, xm13 + vbroadcasti128 m11, [base+z2_upsample] + paddw xm5, xm15 + packuswb xm1, xm1 + punpcklbw xm0, xm1 + mova [rsp+gprsize+48], xm0 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm2, [base+pb_4] + pminub xm2, [base+z_filter_s] + vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + pshufb xm3, xm1, xm2 ; 00 01 12 23 + pshufd xm2, xm2, q0321 + pmaddubsw xm0, xm3, xm0 + pshufb xm2, xm1, xm2 ; 12 23 34 44 + pmaddubsw xm2, xm4 + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] + punpckhqdq xm3, xm3 ; 34 44 44 44 + pmaddubsw xm3, xm4 + movd xm4, r6m ; max_width + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw xm0, xm2 + paddw xm0, xm3 + pmulhrsw xm0, xm13 + psubb xm4, [base+pb_1to32] + psrlq xm1, 8 + packuswb xm0, xm0 + vpblendvb xm0, xm1, xm4 + movd [rsp+65], xm0 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm0, [base+pb_90] + psubb xm0, xm7 ; 180 - angle + pand xm0, xm8 ; reuse from previous filter_strength call + pcmpgtb xm0, xm9 + pmovmskb r3d, xm0 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + mov r5d, 10 + cmp hd, 16 + movu xm2, [rsp+49] + vinserti128 m2, [rsp+43], 1 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + vpbroadcastb m0, xm0 + pmaxub m0, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pshufb m0, m2, m0 + pmaddubsw m0, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] + pshufb m1, m2, m1 + pmaddubsw m1, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] + pshufb m2, m4 + pmaddubsw m2, m3 + movd xm4, r7m ; max_height + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + psubb xm4, [base+pb_16to1] + paddw m1, m0 + paddw m1, m2 + pmulhrsw m1, m13 + vextracti128 xm0, m1, 1 + packuswb xm0, xm1 + vpblendvb xm0, [rsp+48], xm4 + mova [rsp+48], xm0 + jmp .w4_main +.w4_upsample_left: + call .upsample_left +.w4_main: + movd xm0, dxd + mova m12, [base+z2_y_shuf_h4] + lea r5, [rsp+56] ; left-7 + vpbroadcastw m0, xm0 + lea r9, [strideq*3] + psraw xm1, xm5, 6 + pand xm5, xm14 ; frac_y + pxor xm2, xm2 + paddw m7, m0, m0 + psubw xm4, xm2, xm1 ; base_y + vpblendd m0, m7, 0xcc + mova xm1, xm7 + punpcklwd xm4, xm2 + paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 + psubw xm1, xm15, xm5 ; 64-frac_y + psllw xm5, 8 + paddw m7, m7 + paddw m6, m0 + por xm5, xm1 ; 64-frac_y, frac_y + vpbroadcastq m5, xm5 +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + vpbroadcastq m1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vpbroadcastq m2, [rsp+r3] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movq xm0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + movhps xm0, [rsp+r3] + vpblendd m1, m2, 0xc0 + pand m2, m14, m6 ; frac_x + vpblendd m0, m1, 0xf0 + psubw m1, m15, m2 ; 64-frac_x + psllw m2, 8 + pshufb m0, m10 + por m1, m2 ; 64-frac_x, frac_x + pmaddubsw m0, m1 + cmp r3d, 64 + jge .w4_toponly + mova m1, m7 ; arbitrary negative value + vpgatherdq m3, [r5+xm4], m1 + pshufb m1, m3, m11 + vpermd m1, m12, m1 + pmaddubsw m1, m5 + psraw m2, m6, 15 ; base_x < topleft + vpblendvb m0, m1, m2 +.w4_toponly: + pmulhrsw m0, m13 + paddw m6, m7 ; xpos += dx + add r5, dyq + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r9 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r2d, r8d + jge .w4_loop +.w4_leftonly_loop: + mova m1, m7 + vpgatherdq m2, [r5+xm4], m1 + add r5, dyq + pshufb m0, m2, m11 + vpermd m0, m12, m0 + pmaddubsw m0, m5 + pmulhrsw m0, m13 + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r9 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + RET +.w8: + vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 + movd xm5, dyd + vbroadcasti128 m10, [base+z_filter_s+2] + vbroadcasti128 m11, [base+z2_shuf_h4] + lea r2d, [dxq+(65<<6)] ; xpos + vpbroadcastw xm5, xm5 + mov r8d, (63-8)<<6 + mov dyq, -4 + pmullw xm5, [base+z2_ymul] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + vpbroadcastd xm3, [base+pb_8] + movhps [rsp+80], xm1 + call .upsample_above + sub angled, 53 ; angle - 53 + lea r3d, [hq+7] + xor angled, 0x7f ; 180 - angle + call .filter_strength + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + 
sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm3, [base+pb_8] + pminub xm3, [base+z_filter_s+8] + vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 + pmaddubsw xm0, xm2, xm0 + pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 + shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 + pmaddubsw xm2, xm4 + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] + pmaddubsw xm3, xm4 + movd xm4, r6m ; max_width + pminuw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw xm0, xm2 + paddw xm0, xm3 + pmulhrsw xm0, xm13 + psubb xm4, [base+pb_1to32] + psrldq xm1, 1 + packuswb xm0, xm0 + vpblendvb xm0, xm1, xm4 + movq [rsp+65], xm0 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 +.w8_filter_left: + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] + cmp hd, 32 + jne .w8_filter_left_h16 + movu xm2, [rsp+27] + vinserti128 m2, [rsp+35], 1 + vpbroadcastd xm0, [base+pb_5] + vbroadcasti128 m3, [base+z_filter_s+ 8] + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + pmaxub m3, m0 + pshufb m3, m2, m3 + pmaddubsw m3, m7 + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + paddw m3, m1 + paddw m3, m2 + pmulhrsw m3, m13 + jmp .w8_filter_left_top16 +.w8_filter_left_h16: + mov r5d, 10 + cmp hd, 16 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vpbroadcastb m0, xm0 +.w8_filter_left_top16: + vbroadcasti128 m1, [base+z_filter_s+12] + vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab + vbroadcasti128 m4, [base+z_filter_s+16] + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + pmaxub m0, m2 + movu xm2, [rsp+49] + vinserti128 m2, [rsp+43], 1 + pshufb m0, m2, m0 + pmaddubsw m0, m7 + movd xm7, r7m ; max_height + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + pminsw xm7, xm15 + paddw m1, m0 + vpbroadcastb m7, xm7 + paddw m1, m2 + pmulhrsw m1, m13 + psubb m7, [base+pb_32to1] + packuswb m3, m1 + vpermq m3, m3, q1320 + vpblendvb m3, [rsp+32], m7 + mova [rsp+32], m3 + jmp .w8_main +.w8_upsample_left: + call .upsample_left +.w8_main: + movd xm3, dxd + lea r5, [rsp+56] ; left-7 + pshufd xm1, xm5, q3120 + pand xm5, xm14 + vpbroadcastw m3, xm3 + pxor xm0, xm0 + psubw xm2, xm15, xm5 + psraw xm1, 6 + lea r9, [strideq*3] + paddw m7, m3, m3 + psubw xm9, xm0, xm1 ; base_y + psllw xm5, 8 + punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 + vpblendd m3, m7, 0xf0 ; xpos0 xpos1 + por xm5, xm2 ; 64-frac_y, frac_y + punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 + paddw m6, m3 + vinserti128 m12, m5, xm5, 1 +.w8_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vinserti128 m0, [rsp+r3], 1 + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movu xm1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + vinserti128 m1, [rsp+r3], 1 + pand m2, m14, m6 + paddsw m4, m6, m7 + psubw m5, m15, m2 + psllw m2, 8 + pshufb m0, m10 + por m2, m5 + pmaddubsw m0, m2 + pand m2, m14, m4 + psubw m5, m15, m2 + psllw m2, 8 + pshufb m1, m10 + por m2, m5 + pmaddubsw m1, m2 + cmp r3d, 64 + jge .w8_toponly + mova m5, m7 + vpgatherdq m3, [r5+xm9], m7 + mova m7, m5 + vpgatherdq m2, [r5+xm8], m5 + pshufb m3, m11 + pshufb m2, m11 + punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 + vpermq m5, m5, q3120 ; y0 y1 + vpermq m2, m2, q3120 ; y2 y3 + pmaddubsw m5, m12 + pmaddubsw m2, m12 + psraw m6, 15 ; base_x < topleft + vpblendvb m0, m5, m6 + psraw m3, m4, 15 + vpblendvb m1, m2, m3 +.w8_toponly: + pmulhrsw m0, m13 + pmulhrsw m1, m13 + paddw m6, m4, m7 ; xpos += dx + add r5, dyq + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r9 ], xm1 + sub hd, 4 + jz .w8_end + lea dstq, [dstq+strideq*4] + cmp r2d, r8d + jge .w8_loop +.w8_leftonly_loop: + mova m0, m7 + vpgatherdq m5, [r5+xm9], m7 + mova m7, m0 + vpgatherdq m3, [r5+xm8], m0 + add r5, dyq + pshufb m2, m5, m11 + pshufb m1, m3, m11 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.w16: + mov r8d, hd + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + popcnt r3d, r3d + vbroadcasti128 m6, [tlq+1] + mova xm2, [base+z_filter_s] + vinserti128 m2, [base+z_filter_s+14], 1 ; 
00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de + movu xm3, [base+z_filter_s+8] + vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff + vpblendd m1, m6, 0xf0 + vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] + pshufb m2, m1, m2 + pshufb m1, m3 + pmaddubsw m0, m2, m0 + shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff + pmaddubsw m2, m4 + pmaddubsw m1, m5 + movd xm4, r6m ; max_width + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw m0, m2 + paddw m0, m1 + pmulhrsw m0, m13 + psubb xm4, [base+pb_1to32] + vextracti128 xm2, m0, 1 + packuswb xm0, xm2 + vpblendvb xm0, xm6, xm4 + movu [rsp+65], xm0 +.w16_no_filter_above: + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + test r3d, r3d + jz .w16_main + popcnt r3d, r3d + vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] +.w16_filter_left: + movd xm6, r7m ; max_height + pminsw xm6, xm15 + vpbroadcastb m6, xm6 + cmp hd, 32 + jl .w16_filter_left_h16 + vpbroadcastd xm0, [base+pb_5] + vbroadcasti128 m10, [base+z_filter_s+ 8] + vbroadcasti128 m11, [base+z_filter_s+12] + vbroadcasti128 m12, [base+z_filter_s+16] + je .w16_filter_left_h32 + movu m3, [tlq-69] + movu m5, [tlq-61] + pmaxub m1, m10, m0 + pshufb m1, m3, m1 + pmaddubsw m1, m7 + pshufb m2, m3, m11 + pmaddubsw m2, m8 + pshufb m3, m12 + pmaddubsw m3, m9 + paddw m1, m2 + pshufb m2, m5, m10 + pmaddubsw m2, m7 + pshufb m4, m5, m11 + pmaddubsw m4, m8 + pshufb m5, m12 + pmaddubsw m5, m9 + paddw m1, m3 + vpbroadcastd m3, [base+pb_32] + paddb m3, [base+pb_32to1] + paddw m2, m4 + paddw m2, m5 + pmulhrsw m1, m13 + pmulhrsw m2, m13 + psubb m3, m6, m3 + packuswb m1, m2 + vpblendvb m1, [tlq-64], m3 + mova [rsp], m1 + jmp .w16_filter_left_top32 +.w16_filter_left_h32: + pmaxub m10, m0 +.w16_filter_left_top32: + movu xm2, [tlq-37] + vinserti128 m2, [tlq-29], 1 + pshufb m3, m2, m10 + pshufb m1, m2, m11 + pshufb m2, m12 + pmaddubsw m3, m7 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + paddw m3, m1 + paddw m3, m2 + pmulhrsw m3, m13 + jmp .w16_filter_left_top16 +.w16_filter_left_h16: + mov r5d, 10 + cmp hd, 16 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vpbroadcastb m0, xm0 +.w16_filter_left_top16: + movu xm2, [tlq-15] + vinserti128 m2, [tlq-21], 1 + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + pmaxub m0, m5 + pshufb m0, m2, m0 + pmaddubsw m0, m7 + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + psubb m6, [base+pb_32to1] + paddw m1, m0 + paddw m1, m2 + pmulhrsw m1, m13 + packuswb m3, m1 + vpermq m3, m3, q1320 + vpblendvb m3, [tlq-32], m6 + mova [rsp+32], m3 +.w16_main: + movd xm1, dyd + vbroadcasti128 m10, [base+z_filter_s+2] + movd xm7, dxd + vbroadcasti128 m11, [base+z2_shuf_h2] + vpbroadcastw m1, xm1 + vpbroadcastw m7, xm7 + mov r7, dstq + pmullw m0, m1, [base+z2_ymul] + psllw xm1, 4 + paddw m6, m7, [base+z2_base_inc] + lea r9d, [dxq+(65<<6)] ; xpos + movd [rsp+156], xm1 +.w16_loop0: + mov r2d, r9d + mova [rsp+160], m0 + lea r5, [rsp+60] ; left-3 + mova [rsp+192], m6 + pxor m1, m1 + psraw m2, m0, 6 + pand m0, m14 + psubw m9, m1, m2 ; base_y + psubw m12, m15, m0 + punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 + psllw m0, 8 + punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 + por m12, m0 ; 64-frac_y, frac_y +.w16_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2] + vinserti128 m0, [rsp+r2+8], 1 + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movu xm1, [rsp+r3] + vinserti128 m1, [rsp+r3+8], 1 + pand m2, m14, m6 + paddsw m5, m6, m7 + psubw m3, m15, m2 + psllw m2, 8 + pshufb m0, m10 + por m2, m3 + pmaddubsw m0, m2 + pand m2, m14, m5 + psubw m3, m15, m2 + psllw m2, 8 + pshufb m1, m10 + por m2, m3 + pmaddubsw m1, m2 + cmp r3d, 64 + jge .w16_toponly + punpckhwd m2, m5, m5 ; mask out unnecessary loads + vpgatherdd m4, [r5+m9], m2 + punpcklwd m2, m5, m5 + vpgatherdd m3, [r5+m8], m2 + pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 + pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 + punpcklqdq m2, m3, m4 ; y0 + punpckhqdq m3, m4 ; y1 + pmaddubsw m2, m12 + pmaddubsw m3, m12 + psraw m6, 15 ; base_x < topleft + vpblendvb m0, m2, m6 + psraw m6, m5, 15 + vpblendvb m1, m3, m6 +.w16_toponly: + pmulhrsw m0, m13 + pmulhrsw m1, m13 + paddw m6, m5, m7 ; xpos += dx + sub r5, 2 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r2d, (63-16)<<6 + jge .w16_loop +.w16_leftonly_loop: + mova m0, m7 + vpgatherdd m4, [r5+m9], m7 + mova m7, m0 + vpgatherdd m3, [r5+m8], m0 + sub r5, 2 + pshufb m2, m4, m11 + pshufb m1, m3, m11 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_leftonly_loop +.w16_end: + sub r8d, 1<<8 + jl .w16_ret + vpbroadcastd m0, [rsp+156] + paddw m0, [rsp+160] ; base_y += 16*dy + paddw m6, m13, [rsp+192] + add r7, 16 + add r9d, 16<<6 + movzx hd, r8b + mov dstq, r7 + paddw m6, m13 ; base_x += 16*64 + jmp .w16_loop0 +.w16_ret: + RET +.w32: + mova m2, [tlq+32] + lea r8d, [hq+(1<<8)] + mova [rsp+96], m2 + test angled, 0x400 + jnz .w16_main + vpbroadcastd m7, [base+z_filter_k+4*2+12*0] + vpbroadcastd 
m8, [base+z_filter_k+4*2+12*1] + vpbroadcastd m9, [base+z_filter_k+4*2+12*2] + mova xm5, [base+z_filter_s] + vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc + vinserti128 m1, [tlq+11], 1 + movu xm6, [base+z_filter_s+12] + vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff + movu xm3, [tlq+ 6] + vinserti128 m3, [tlq+17], 1 + movd xm0, r6m ; max_width + pminsw xm0, xm15 + vpbroadcastb m10, xm0 +.w32_filter_above: + pshufb m0, m1, m5 + shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de + pmaddubsw m0, m7 + pshufb m2, m1, m4 + shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff + pmaddubsw m2, m8 + pshufb m1, m5 + pmaddubsw m1, m9 + paddw m0, m2 + paddw m0, m1 + pshufb m1, m3, m4 + pmaddubsw m1, m7 + pshufb m2, m3, m5 + pmaddubsw m2, m8 + pshufb m3, m6 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + psubb m10, [base+pb_1to32] + packuswb m0, m1 + vpblendvb m0, [tlq+1], m10 + movu [rsp+65], m0 + jmp .w16_filter_left +.w64: + mova m2, [tlq+32] + mov r3d, [tlq+64] + lea r8d, [hq+(3<<8)] + mova [rsp+ 96], m2 + mov [rsp+128], r3d + test angled, 0x400 + jnz .w16_main + vpbroadcastd m7, [base+z_filter_k+4*2+12*0] + vpbroadcastd m8, [base+z_filter_k+4*2+12*1] + vpbroadcastd m9, [base+z_filter_k+4*2+12*2] + movu xm6, [base+z_filter_s+ 4] + vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc + movu xm3, [tlq+30] + vinserti128 m3, [tlq+43], 1 + movu xm5, [base+z_filter_s+16] + vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff + pshufb m0, m3, m6 + shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de + pmaddubsw m0, m7 + pshufb m2, m3, m4 + shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff + pmaddubsw m2, m8 + pshufb m3, m6 + pmaddubsw m3, m9 + paddw m0, m2 + paddw m0, m3 + movu xm2, [tlq+36] + vinserti128 m2, [tlq+49], 1 + pshufb m4, m2, m4 + pmaddubsw m4, m7 + pshufb m3, m2, m6 + pmaddubsw m3, m8 + pshufb m2, m5 + pmaddubsw m2, m9 + movd xm5, r6m ; max_width + pminsw xm5, xm15 + vpbroadcastb m10, xm5 + paddw m3, m4 + paddw m2, m3 + vpbroadcastd m3, [base+pb_32] + pmulhrsw m0, m13 + pmulhrsw m2, m13 + mova xm5, [base+z_filter_s] + vinserti128 m5, [base+z_filter_s+6], 1 + psubb m3, m10, m3 + psubb m3, [base+pb_1to32] + vinserti128 m1, [tlq+13], 1 + packuswb m0, m2 + vpblendvb m0, [tlq+33], m3 + movu xm3, [tlq+ 6] + vinserti128 m3, [tlq+19], 1 + movu [rsp+97], m0 + jmp .w32_filter_above + +cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z3_avx2_table] + tzcnt hd, hm + movifnidn angled, anglem + lea r7, [dr_intra_derivative+45*2-1] + dec tlq + movsxd hq, [r6+hq*4] + sub angled, 180 + add hq, r6 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + movzx dyd, word [r7+dyq] + vpbroadcastd m3, [pw_512] + vpbroadcastd m4, [pw_62] + vpbroadcastd m5, [pw_64] + mov org_wd, wd + jmp hq +.h4: + lea r7, [strideq*3] + cmp angleb, 40 + jae .h4_no_upsample + lea r4d, [angleq-1024] + sar r4d, 7 + add r4d, wd + jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) + ALLOC_STACK -32, 9 + movu xm8, [tlq-7] + pshufb xm0, xm8, [z_upsample1-4] + vpbroadcastb xm2, xm8 + pshufb xm1, xm8, [z_filter_s+2] + mova [rsp+16], xm2 ; top[max_base_y] + vpbroadcastd xm2, [pb_36_m4] + add dyd, dyd + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 
+ movd xm7, dyd + mov r2d, dyd + vpbroadcastw m7, xm7 + paddw xm1, xm0 + pmulhrsw xm1, xm3 + pslldq m6, m7, 8 + paddw xm2, xm7, xm7 + paddw m6, m7 + packuswb xm1, xm1 + paddw m6, m2 + punpcklbw xm1, xm8 + mova xm8, [z_transpose4] + psllw m7, 2 + pshufb xm1, [pb_15to0] + mova [rsp], xm1 +.h4_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 + vpbroadcastq m1, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 + vpbroadcastq m2, [rsp+r4] + lea r4d, [r2+dyq] + shr r2d, 6 + movq xm0, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 + movhps xm0, [rsp+r4] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 + psllw m2, 8 + por m1, m2 + pmaddubsw m0, m1 + paddw m6, m7 + pmulhrsw m0, m3 + vextracti128 xm1, m0, 1 + packuswb xm1, xm0 + pshufb xm1, xm8 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r7 ], xm1, 3 + add dstq, 4 + sub wd, 4 + jg .h4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; h4/h8/h16 +%define base r4-z_filter_t0 + lea r4, [z_filter_t0] + movd xm0, maxbased + movd xm2, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m2, xm2 + pcmpeqb m1, m0, [base+z_filter_wh] + pand m1, m2 + mova xm2, [r4+angleq*8] + pcmpgtb m1, m2 + pmovmskb r5d, m1 + ret +.h4_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -16, 12 + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea maxbased, [wq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m7, [base+pb_7] + vbroadcasti128 m2, [tlq-14] + pmaxub m1, m7, [base+z_filter_s-4] + vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] + pmaxub m7, [base+z_filter_s+4] + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] + pshufb m0, m2, m1 + shufps m1, m7, q2121 + pmaddubsw m0, m8 + pshufb m1, m2, m1 + pmaddubsw m1, m9 + pshufb m2, m7 + pmaddubsw m2, m10 + paddw m0, m1 + paddw m0, m2 + pmulhrsw m0, m3 + mov r4d, 9 + lea tlq, [rsp+15] + cmp wd, 4 + cmovne maxbased, r4d + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [rsp], xm0 +.h4_main: + movd xm6, dyd + vpbroadcastq m0, [z_base_inc] ; base_inc << 6 + mov r4, tlq + sub tlq, 4 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] ; ypos + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf_w4] + add maxbased, 64 + vpbroadcastw m9, xm9 + psrlw m7, 8 ; top[max_base_y] + paddw m10, m6, m6 + psubw m9, m0 ; max_base_y + vpblendd m6, m10, 0xcc + mova xm0, xm10 + paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 + paddw m10, m10 + mova xm11, [z_transpose4] +.h4_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + vpbroadcastq m1, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + vpbroadcastq m2, [tlq+r5] + lea r5, [r4+dyq] + sar r4, 6 ; base2 + movq xm0, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + movhps xm0, [tlq+r5] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 ; frac + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 ; 64-frac + psllw m2, 8 + pshufb m0, m8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + pcmpgtw m1, m9, m6 ; base < max_base_y + pmulhrsw m0, m3 + paddw m6, m10 ; ypos += dy + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + packuswb xm1, xm0 + pshufb xm1, xm11 ; transpose + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r7 ], xm1, 3 + sub wd, 4 + jz .h4_end + add dstq, 4 + cmp r4d, maxbased + jg 
.h4_loop + packuswb xm7, xm7 +.h4_end_loop: + movd [dstq+strideq*0], xm7 + movd [dstq+strideq*1], xm7 + movd [dstq+strideq*2], xm7 + movd [dstq+r7 ], xm7 + add dstq, 4 + sub wd, 4 + jg .h4_end_loop +.h4_end: + RET +ALIGN function_align +.h8: + lea r4d, [angleq+216] + mov r4b, wb + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 8 + and r4d, 4 + mova xm0, [tlq-15] + vinserti128 m0, [tlq- 9], 1 + movd xm1, r4d + movu xm2, [z_filter_s+2] + vinserti128 m2, [z_filter_s+6], 1 + vpbroadcastb xm1, xm1 ; w & 4 + vpbroadcastd m7, [pb_36_m4] + pmaxub xm1, [z_upsample1-4] ; clip 4x8 + vinserti128 m1, [z_upsample1], 1 + add dyd, dyd + pshufb m1, m0, m1 + pshufb m2, m0, m2 + vinserti128 m0, [tlq-7], 1 + movd xm6, dyd + pmaddubsw m1, m7 + pmaddubsw m2, m7 + vpbroadcastw m6, xm6 + mov r2d, dyd + lea r5, [strideq*3] + paddw m7, m6, m6 + paddw m1, m2 + vpblendd m6, m7, 0xf0 + pmulhrsw m1, m3 + pslldq m2, m7, 8 + paddw m7, m7 + paddw m6, m2 + vbroadcasti128 m2, [pb_15to0] + packuswb m1, m1 + punpcklbw m1, m0 + pshufb m1, m2 + vextracti128 [rsp+ 0], m1, 1 + mova [rsp+16], xm1 +.h8_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 ; base0 + movu xm0, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base1 + vinserti128 m0, [rsp+r4], 1 + lea r4d, [r2+dyq] + shr r2d, 6 ; base2 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + punpcklqdq m1, m2, m2 ; frac0 frac1 + pmaddubsw m0, m1 + movu xm1, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base3 + vinserti128 m1, [rsp+r4], 1 + punpckhqdq m2, m2 ; frac2 frac3 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + paddw m6, m7 + pmulhrsw m1, m3 + lea r4, [dstq+strideq*4] + psllw m1, 8 + por m0, m1 + vextracti128 xm1, m0, 1 + punpcklbw xm2, xm0, xm1 + punpckhbw xm0, xm1 + movd [dstq+strideq*0], xm2 + pextrd [dstq+strideq*1], xm2, 1 + pextrd [dstq+strideq*2], xm2, 2 + pextrd [dstq+r5 ], xm2, 3 + movd [r4 +strideq*0], xm0 + pextrd [r4 +strideq*1], xm0, 1 + pextrd [r4 +strideq*2], xm0, 2 + pextrd [r4 +r5 ], xm0, 3 + add dstq, 4 + sub wd, 4 + jg .h8_upsample_loop + RET +.h8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(w+7, 15) + jmp .h8_main +.h8_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 10 + lea maxbased, [wq+7] + test angled, 0x400 + jnz .h8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h8_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd xm6, [base+pb_15] + pcmpeqb xm1, xm1 + psubusb xm6, xm0 + psubb xm6, xm1 ; w == 4 ? 
5 : 1 + movu xm2, [tlq-16] + pmaxub xm1, xm6, [base+z_filter_s] + vinserti128 m2, [tlq-14], 1 + vinserti128 m1, [base+z_filter_s+12], 1 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pmaxub xm6, [base+z_filter_s+ 8] + vinserti128 m6, [base+z_filter_s+20], 1 + pshufb m0, m2, m1 + pmaddubsw m0, m7 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] + movzx r4d, byte [tlq-15] + shufps m1, m6, q2121 + pshufb m1, m2, m1 + pmaddubsw m1, m7 + paddw m0, m1 + sub r5d, 3 + jnz .h8_3tap + vpbroadcastd m7, [z_filter_k+4*8] + movzx r2d, byte [tlq-14] + pshufb m2, m6 + pmaddubsw m2, m7 + sub r2d, r4d + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+15], r2b + paddw m0, m2 +.h8_3tap: + pmulhrsw m0, m3 + sar r5d, 1 + lea tlq, [rsp+31] + add r5d, 17 + cmp wd, 16 + cmovns maxbased, r5d + neg r5 + mov [tlq+r5], r4b + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [tlq-15], xm0 +.h8_main: + movd xm2, dyd + vbroadcasti128 m0, [z_base_inc] + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m2, xm2 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psrlw m7, 8 + psubw m9, m0 + paddw m6, m2, m2 + vpblendd m2, m6, 0x0f +.h8_loop: + lea r5, [r4+dyq] + sar r4, 6 + pand m0, m4, m2 + psubw m1, m5, m0 + psllw m0, 8 + por m1, m0 + vbroadcasti128 m0, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 + vinserti128 m0, [tlq+r5], 0 + sub rsp, 8*2 + pshufb m0, m8 + pmaddubsw m0, m1 + pcmpgtw m1, m9, m2 + paddw m2, m6 + pmulhrsw m0, m3 + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + psllw xm0, 8 + por xm0, xm1 ; interleave rows (partial transpose) + mova [rsp], xm0 + sub wd, 2 + jz .h8_transpose + cmp r4d, maxbased + jg .h8_loop + packuswb xm0, xm7, xm7 +.h8_end_loop: + sub rsp, 8*2 + mova [rsp], xm0 + sub wd, 2 + jg .h8_end_loop +.h8_transpose: + mova xm2, [rsp+16*1] + sub org_wd, 8 + lea r2, [strideq*3] + lea r6, [dstq+org_wq] + cmovns dstq, r6 + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 + lea r6, [dstq+strideq*4] + jge .h8_w8 + add rsp, 16*2 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r2 ], xm1, 3 + movd [r6 +strideq*0], xm2 + pextrd [r6 +strideq*1], xm2, 1 + pextrd [r6 +strideq*2], xm2, 2 + pextrd [r6 +r2 ], xm2, 3 + jmp .h8_end +.h8_w8_loop: + mova xm0, [rsp+16*0] + mova xm2, [rsp+16*1] + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 +.h8_w8: ; w8/w16/w32 + mova xm0, [rsp+16*2] + mova xm4, [rsp+16*3] + add rsp, 16*4 + punpcklwd xm3, xm4, xm0 + punpckhwd xm4, xm0 + punpckldq xm0, xm3, xm1 + punpckhdq xm3, xm1 + punpckldq xm1, xm4, xm2 + punpckhdq xm4, xm2 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm3 + movhps [dstq+r2 ], xm3 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + movq [r6 +strideq*2], xm4 + movhps [r6 +r2 ], xm4 + sub dstq, 8 + sub r6, 8 + sub org_wd, 8 + jge .h8_w8_loop +.h8_end: + RET +.h16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(w+15, 31) + jmp .h16_main +ALIGN function_align +.h16: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 12 + lea maxbased, [wq+15] + test angled, 0x400 + jnz .h16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m11, [base+pb_27] + vpbroadcastd m1, [base+pb_1] + vbroadcasti128 m6, [base+z_filter_s+12] + vinserti128 m2, m6, [base+z_filter_s+4], 0 + vinserti128 m6, [base+z_filter_s+20], 1 + movu xm10, [tlq-18] 
+ vinserti128 m10, [tlq-14], 1 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] + vbroadcasti128 m7, [base+z_filter_s+8] + vinserti128 m8, m7, [base+z_filter_s+0], 0 + vinserti128 m7, [base+z_filter_s+16], 1 + psubusb m11, m0 + por m1, m11 + movu xm11, [tlq-32] + vinserti128 m11, [tlq-28], 1 + pmaxub m8, m1 + pmaxub m7, m1 + pshufb m0, m10, m2 + shufps m2, m6, q2121 + pmaddubsw m0, m9 + pshufb m1, m11, m8 + shufps m8, m7, q2121 + pmaddubsw m1, m9 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + movzx r4d, byte [tlq-31] + pshufb m2, m10, m2 + pmaddubsw m2, m9 + pshufb m8, m11, m8 + pmaddubsw m8, m9 + paddw m0, m2 + paddw m1, m8 + sub r5d, 3 + jnz .h16_3tap + vpbroadcastd m9, [z_filter_k+4*8] + movzx r2d, byte [tlq-30] + pshufb m10, m6 + pmaddubsw m10, m9 + pshufb m11, m7 + pmaddubsw m11, m9 + sub r2d, r4d + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+31], r2b + paddw m0, m10 + paddw m1, m11 +.h16_3tap: + pmulhrsw m0, m3 + pmulhrsw m1, m3 + sar r5d, 1 + lea tlq, [rsp+63] + add r5d, 33 + cmp wd, 32 + cmovns maxbased, r5d + neg r5 + mov [tlq+r5], r4b + packuswb m0, m1 + vpermq m0, m0, q2031 + mova [tlq-31], m0 +.h16_main: + movd xm6, dyd + vbroadcasti128 m0, [z_base_inc] + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psubw m9, m0 + paddw m11, m6, m6 + psubw m10, m9, m3 ; 64*8 + vpblendd m6, m11, 0xf0 +.h16_loop: + lea r5, [r4+dyq] + sar r4, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r4-0] + movu xm1, [tlq+r4-8] + lea r4, [r5+dyq] + sar r5, 6 + vinserti128 m0, [tlq+r5-0], 1 + vinserti128 m1, [tlq+r5-8], 1 + sub rsp, 32 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + vpermq m0, m0, q3120 + mova [rsp], m0 + sub wd, 2 + jz .h16_transpose + cmp r4d, maxbased + jg .h16_loop + mova m0, m7 +.h16_end_loop: + sub rsp, 32 + mova [rsp], m7 + sub wd, 2 + jg .h16_end_loop +.h16_transpose: + mova m2, [rsp+32*1] + sub org_wd, 8 + lea r2, [strideq*3] + lea r6, [dstq+org_wq] + cmovns dstq, r6 + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + lea r3, [strideq*5] + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + lea r4, [strideq+r2*2] ; stride*7 + jge .h16_w8 + add rsp, 32*2 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + vextracti128 xm0, m0, 1 + movd [dstq+strideq*4], xm1 + pextrd [dstq+r3 ], xm1, 1 + pextrd [dstq+r2*2 ], xm1, 2 + pextrd [dstq+r4 ], xm1, 3 + lea dstq, [dstq+strideq*8] + vextracti128 xm1, m1, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + movd [dstq+strideq*4], xm1 + pextrd [dstq+r3 ], xm1, 1 + pextrd [dstq+r2*2 ], xm1, 2 + pextrd [dstq+r4 ], xm1, 3 + jmp .h16_end +.h16_w8_loop: + mova m0, [rsp+32*0] + mova m2, [rsp+32*1] + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 +.h16_w8: + mova m2, [rsp+32*2] + mova m4, [rsp+32*3] + lea r6, [dstq+strideq*8] + add rsp, 32*4 + punpcklbw m3, m4, m2 + punpckhbw m4, m2 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + punpckldq m4, m2, m0 + punpckhdq m2, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + 
vextracti128 xm4, m4, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+r2 ], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*4], xm0 + movhps [dstq+r3 ], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+r2*2 ], xm3 + movhps [dstq+r4 ], xm3 + vextracti128 xm3, m3, 1 + movq [r6+strideq*0], xm4 + movhps [r6+strideq*1], xm4 + movq [r6+strideq*2], xm2 + movhps [r6+r2 ], xm2 + movq [r6+strideq*4], xm0 + movhps [r6+r3 ], xm0 + movq [r6+r2*2 ], xm3 + movhps [r6+r4 ], xm3 + sub dstq, 8 + sub org_wd, 8 + jge .h16_w8_loop +.h16_end: + RET +ALIGN function_align +.h32: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 15 + lea maxbased, [wq+31] + and maxbased, 31 + or maxbased, 32 ; imin(w+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h32_main + vbroadcasti128 m0, [pb_0to15] + mov r4d, 21 + mov r5d, 3 + movu xm11, [tlq-66] ; 56-63 + vinserti128 m11, [tlq-52], 1 ; 40-47 + sub r4d, wd ; 21-w + cmovns r5d, r4d + movu xm12, [tlq-58] ; 48-55 + vinserti128 m12, [tlq-44], 1 ; 32-39 + sub r4d, 8 ; 13-w + movd xm1, r5d + movu xm13, [tlq-34] ; 24-31 + vinserti128 m13, [tlq-20], 1 ; 8-15 + movd xm2, r4d + vpbroadcastb m1, xm1 + movu xm14, [tlq-28] ; 16-23 + vinserti128 m14, [tlq-14], 1 ; 0- 7 + vpbroadcastb m2, xm2 + pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 + movu m7, [z_filter_s+4] + pshufb m11, m1 + vinserti128 m8, m7, [z_filter_s+8], 1 + vinserti128 m7, [z_filter_s+16], 0 + pmaxsb m2, m0 ; clip 8x32 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m12, m2 + pshufb m0, m11, m8 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + shufps m8, m7, q1021 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m10, m11, m8 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m8 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m8 + pmaddubsw m10, m9 + shufps m8, m7, q2121 + paddw m1, m10 + pshufb m10, m14, m8 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + movzx r4d, byte [tlq-63] + movzx r2d, byte [tlq-62] + paddw m0, m11 + paddw m2, m12 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m7 + pmaddubsw m14, m9 + paddw m1, m13 + paddw m6, m14 + sub r2d, r4d + lea r2d, [r2+r4*8+4] ; edge case for 64x32 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + shr r2d, 3 + mov [rsp+31], r2b + lea tlq, [rsp+95] + mov [tlq-65], r4b + mov r4d, 65 + cmp wd, 64 + cmove maxbased, r4d + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-63], m0 + mova [tlq-31], m1 +.h32_main: + movd xm6, dyd + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psubw m9, [z_base_inc] + mova m11, m6 + psubw m10, m9, m3 ; 64*8 +.h32_loop: + mov r5, r4 + sar r5, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r5- 0] + vinserti128 m0, [tlq+r5-16], 1 + movu xm1, [tlq+r5- 8] + vinserti128 m1, [tlq+r5-24], 1 + sub rsp, 32 + add r4, dyq + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + mova [rsp], m0 + dec wd + jz .h32_transpose + cmp r4d, maxbased + jg .h32_loop +.h32_end_loop: + sub rsp, 32 + mova [rsp], m7 + dec wd + jg 
.h32_end_loop +.h32_transpose: + lea dstq, [dstq+org_wq-8] + lea r2, [strideq*3] + lea r3, [strideq*5] + lea r4, [strideq+r2*2] ; stride*7 +.h32_w8_loop: + mova m7, [rsp+32*0] + mova m6, [rsp+32*1] + mova m5, [rsp+32*2] + mova m4, [rsp+32*3] + mova m3, [rsp+32*4] + mova m2, [rsp+32*5] + mova m1, [rsp+32*6] + mova m0, [rsp+32*7] + lea r6, [dstq+strideq*8] + add rsp, 32*8 + punpcklbw m8, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklwd m7, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + punpckldq m2, m8, m3 + punpckhdq m8, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + movq [dstq+strideq*0], xm6 + movhps [dstq+strideq*1], xm6 + vextracti128 xm6, m6, 1 + movq [dstq+strideq*2], xm7 + movhps [dstq+r2 ], xm7 + vextracti128 xm7, m7, 1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r3 ], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+r2*2 ], xm8 + movhps [dstq+r4 ], xm8 + vextracti128 xm8, m8, 1 + movq [r6+strideq*0], xm3 + movhps [r6+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [r6+strideq*2], xm1 + movhps [r6+r2 ], xm1 + vextracti128 xm1, m1, 1 + movq [r6+strideq*4], xm5 + movhps [r6+r3 ], xm5 + vextracti128 xm5, m5, 1 + movq [r6+r2*2 ], xm0 + movhps [r6+r4 ], xm0 + lea r6, [r6+strideq*8] + vextracti128 xm0, m0, 1 + movq [r6+strideq*0], xm6 + movhps [r6+strideq*1], xm6 + movq [r6+strideq*2], xm7 + movhps [r6+r2 ], xm7 + movq [r6+strideq*4], xm2 + movhps [r6+r3 ], xm2 + movq [r6+r2*2 ], xm8 + movhps [r6+r4 ], xm8 + lea r6, [r6+strideq*8] + movq [r6+strideq*0], xm3 + movhps [r6+strideq*1], xm3 + movq [r6+strideq*2], xm1 + movhps [r6+r2 ], xm1 + movq [r6+strideq*4], xm5 + movhps [r6+r3 ], xm5 + movq [r6+r2*2 ], xm0 + movhps [r6+r4 ], xm0 + sub dstq, 8 + sub org_wd, 8 + jg .h32_w8_loop + RET +ALIGN function_align +.h64: + %assign stack_offset org_stack_offset + ALLOC_STACK -128, 16 + lea maxbased, [wq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h64_main + mov r4d, 21 + vpbroadcastb xm11, [tlq-127] + vpblendd xm11, [tlq-130], 0x0e ; 120-127 + sub r4d, wd ; 21-w + mov r5d, 3 + vinserti128 m11, [tlq-116], 1 ; 104-111 + movu m7, [z_filter_s+4] + cmp wd, 32 + cmove r4d, r5d + vinserti128 m8, m7, [z_filter_s+8], 1 + vbroadcasti128 m6, [pb_0to15] + movd xm1, r4d + vpbroadcastd m9, [z_filter_k+4*2+12*0] + movu xm12, [tlq-122] ; 112-119 + vinserti128 m12, [tlq-108], 1 ; 96-103 + vpbroadcastb m1, xm1 + movu xm13, [tlq- 98] ; 88- 95 + vinserti128 m13, [tlq- 84], 1 ; 72- 79 + movu xm14, [tlq- 90] ; 80- 87 + vinserti128 m14, [tlq- 76], 1 ; 64- 71 + vinserti128 m7, [z_filter_s+16], 0 + pshufb m0, m11, m8 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pmaxsb m1, m6 ; clip (16|32)x64 + pshufb m13, m1 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + shufps m15, m8, m7, q1021 + pshufb m10, m11, m15 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m15 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m15 + pmaddubsw m10, m9 + paddw m1, m10 + pshufb m10, m14, m15 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + shufps m10, m8, m7, q2132 + pshufb m11, m10 + pmaddubsw m11, m9 + pshufb m12, m10 + pmaddubsw m12, m9 + pshufb m13, m10 + pmaddubsw m13, m9 + pshufb m14, m10 + pmaddubsw 
m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + movu xm11, [tlq-66] ; 56-63 + vinserti128 m11, [tlq-52], 1 ; 40-47 + movu xm12, [tlq-58] ; 48-55 + vinserti128 m12, [tlq-44], 1 ; 32-39 + movu xm13, [tlq-34] ; 24-31 + vinserti128 m13, [tlq-20], 1 ; 8-15 + movu xm14, [tlq-28] ; 16-23 + vinserti128 m14, [tlq-14], 1 ; 0- 7 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + lea tlq, [rsp+127] + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-127], m0 + mova [tlq- 95], m1 + pshufb m0, m11, m10 + pmaddubsw m0, m9 + pshufb m2, m12, m10 + pmaddubsw m2, m9 + pshufb m1, m13, m10 + pmaddubsw m1, m9 + pshufb m6, m14, m7 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m7, m11, m15 + pmaddubsw m7, m9 + paddw m0, m7 + pshufb m7, m12, m15 + pmaddubsw m7, m9 + paddw m2, m7 + pshufb m7, m13, m15 + pmaddubsw m7, m9 + paddw m1, m7 + pshufb m7, m14, m10 + pmaddubsw m7, m9 + paddw m6, m7 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m15 + pmaddubsw m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-63], m0 + mova [tlq-31], m1 +.h64_main: + movd xm12, dyd + neg maxbaseq + vbroadcasti128 m8, [z3_shuf] + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m12, xm12 + lea r5d, [dyq+maxbaseq-64] + neg dyq + or maxbased, 63 + lea r4, [dyq+63] + movd xm6, r5d + mova xm10, [pb_1to32+16] + vinserti128 m10, [pb_1to32], 1 + vpbroadcastd m11, [pb_32] + vpbroadcastw m6, xm6 +.h64_loop: + mov r5, r4 + sar r5, 6 + movu m0, [tlq+r5-24] + movu m1, [tlq+r5-32] + pand m2, m4, m6 + psubw m9, m5, m2 + psllw m2, 8 + por m9, m2 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + psraw m2, m6, 6 + sub rsp, 64 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packsswb m2, m2 + paddb m2, m10 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [rsp+32], m0 + movu m0, [tlq+r5-56] + movu m1, [tlq+r5-64] + add r4, dyq + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + paddb m2, m11 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m6, m12 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [rsp], m0 + dec wd + jz .h64_transpose + cmp r4d, maxbased + jg .h64_loop +.h64_end_loop: + sub rsp, 64 + mova [rsp+32], m7 + mova [rsp+ 0], m7 + dec wd + jg .h64_end_loop +.h64_transpose: + lea r2, [strideq*3] + lea r3, [strideq*5] + imul r5, strideq, -8 + lea dstq, [dstq+org_wq-16] + lea r4, [strideq+r2*2] ; stride*7 +.h64_transpose_loop0: + lea r6, [rsp+16*3] +.h64_transpose_loop: + mova xm0, [r6+64*15] + vinserti128 m0, [r6+64* 7], 1 + mova xm1, [r6+64*14] + vinserti128 m1, [r6+64* 6], 1 + mova xm2, [r6+64*13] + vinserti128 m2, [r6+64* 5], 1 + mova xm3, [r6+64*12] + vinserti128 m3, [r6+64* 4], 1 + mova xm4, [r6+64*11] + vinserti128 m4, [r6+64* 3], 1 + mova xm5, [r6+64*10] + vinserti128 m5, [r6+64* 2], 1 + mova xm6, [r6+64* 9] + vinserti128 m6, [r6+64* 1], 1 + mova xm7, [r6+64* 8] + vinserti128 m7, [r6+64* 0], 1 + sub r6, 16 + punpcklbw m8, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklwd m7, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 
+ punpckldq m2, m8, m3 + punpckhdq m8, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + vpermq m6, m6, q3120 + vpermq m7, m7, q3120 + vpermq m2, m2, q3120 + vpermq m8, m8, q3120 + vpermq m3, m3, q3120 + vpermq m1, m1, q3120 + vpermq m5, m5, q3120 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm6 + vextracti128 [dstq+strideq*1], m6, 1 + mova [dstq+strideq*2], xm7 + vextracti128 [dstq+r2 ], m7, 1 + mova [dstq+strideq*4], xm2 + vextracti128 [dstq+r3 ], m2, 1 + mova [dstq+r2*2 ], xm8 + vextracti128 [dstq+r4 ], m8, 1 + sub dstq, r5 + mova [dstq+strideq*0], xm3 + vextracti128 [dstq+strideq*1], m3, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r2 ], m1, 1 + mova [dstq+strideq*4], xm5 + vextracti128 [dstq+r3 ], m5, 1 + mova [dstq+r2*2 ], xm0 + vextracti128 [dstq+r4 ], m0, 1 + sub dstq, r5 + cmp r6, rsp + jae .h64_transpose_loop + add rsp, 64*16 + lea dstq, [dstq+r5*8-16] + sub org_wd, 16 + jg .h64_transpose_loop0 +.h64_end: + RET + +%macro FILTER_XMM 4 ; dst, src, tmp, shuf +%ifnum %4 + pshufb xm%2, xm%4 +%else + pshufb xm%2, %4 +%endif + pshufd xm%1, xm%2, q0000 ; p0 p1 + pmaddubsw xm%1, xm2 + pshufd xm%3, xm%2, q1111 ; p2 p3 + pmaddubsw xm%3, xm3 + paddw xm%1, xm1 + paddw xm%1, xm%3 + pshufd xm%3, xm%2, q2222 ; p4 p5 + pmaddubsw xm%3, xm4 + paddw xm%1, xm%3 + pshufd xm%3, xm%2, q3333 ; p6 __ + pmaddubsw xm%3, xm5 + paddw xm%1, xm%3 + psraw xm%1, 4 + packuswb xm%1, xm%1 +%endmacro + +%macro FILTER_YMM 4 ; dst, src, tmp, shuf + pshufb m%2, m%4 + pshufd m%1, m%2, q0000 + pmaddubsw m%1, m2 + pshufd m%3, m%2, q1111 + pmaddubsw m%3, m3 + paddw m%1, m1 + paddw m%1, m%3 + pshufd m%3, m%2, q2222 + pmaddubsw m%3, m4 + paddw m%1, m%3 + pshufd m%3, m%2, q3333 + pmaddubsw m%3, m5 + paddw m%1, m%3 + psraw m%1, 4 + vperm2i128 m%3, m%1, m%1, 0x01 + packuswb m%1, m%3 +%endmacro + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. One redundant +; block is calculated for w8 and w16, two for w32. 
+; w4 w8 w16 w32 +; 1 1 2 1 2 3 5 1 2 3 5 b c d f +; 2 2 3 2 4 5 7 2 4 5 7 c e f h +; 3 3 4 4 6 7 9 4 6 7 9 e g h j +; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ +; 5 8 8 i + +cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter +%define base r6-ipred_filter_avx2_table + lea r6, [filter_intra_taps] + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + add filterq, r6 + lea r6, [ipred_filter_avx2_table] + movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 + movsxd wq, [r6+wq*4] + vpbroadcastd m1, [base+pw_8] + vbroadcasti128 m2, [filterq+16*0] + vbroadcasti128 m3, [filterq+16*1] + vbroadcasti128 m4, [filterq+16*2] + vbroadcasti128 m5, [filterq+16*3] + add wq, r6 + mov hd, hm + jmp wq +.w4: + WIN64_SPILL_XMM 9 + mova xm8, [base+filter_shuf2] + sub tlq, 3 + sub tlq, hq + jmp .w4_loop_start +.w4_loop: + pinsrd xm0, xm6, [tlq+hq], 0 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER_XMM 6, 0, 7, 8 + movd [dstq+strideq*0], xm6 + pextrd [dstq+strideq*1], xm6, 1 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 10 + mova m8, [base+filter_shuf1] + FILTER_XMM 7, 0, 6, [base+filter_shuf2] + vpbroadcastd m0, [tlq+4] + vpbroadcastd m6, [tlq+5] + sub tlq, 4 + sub tlq, hq + vpbroadcastq m7, xm7 + vpblendd m7, m6, 0x20 +.w8_loop: + vpbroadcastd xm6, [tlq+hq] + palignr m6, m0, 12 + vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm6, xm7 + call .main + vpblendd xm6, xm7, 0x0c + pshufd xm6, xm6, q3120 + movq [dstq+strideq*0], xm6 + movhps [dstq+strideq*1], xm6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + %assign xmm_regs_used 15 + %assign stack_size_padded 0x98 + SUB rsp, stack_size_padded +%endif + sub hd, 2 + TAIL_CALL .w16_main, 0 +.w16_main: +%if WIN64 + movaps [rsp+0xa8], xmm6 + movaps [rsp+0xb8], xmm7 + movaps [rsp+0x28], xmm8 + movaps [rsp+0x38], xmm9 + movaps [rsp+0x48], xmm10 + movaps [rsp+0x58], xmm11 + movaps [rsp+0x68], xmm12 + movaps [rsp+0x78], xmm13 + movaps [rsp+0x88], xmm14 +%endif + FILTER_XMM 12, 0, 7, [base+filter_shuf2] + vpbroadcastd m0, [tlq+5] + vpblendd m0, [tlq-12], 0x14 + mova m8, [base+filter_shuf1] + vpbroadcastq m7, xm12 + vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + call .main ; c0 d0 a1 b1 a1 b1 c0 d0 + movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + vinserti128 m14, m8, [base+filter_shuf3], 0 + vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 + FILTER_XMM 6, 9, 10, 14 + vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 + vpbroadcastd m9, [tlq+13] + vpbroadcastd m10, [tlq+12] + psrld m11, m8, 4 + vpblendd m6, m9, 0x20 ; top + sub tlq, 6 + sub tlq, hq +.w16_loop: + vpbroadcastd xm9, [tlq+hq] + palignr m9, m0, 12 + vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm13, xm7 + call .main ; e0 f0 c1 d1 c1 d1 e0 f0 + vpblendd m9, m12, m10, 0xf0 + vpblendd m12, m6, 0xc0 + pshufd m9, m9, q3333 + vpblendd m9, m6, 0xee + vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 + vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 + vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 + vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 + mova [dstq+strideq*0], xm9 + vextracti128 
[dstq+strideq*1], m9, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 + pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] + vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 + shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 + shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm6 + ret +ALIGN function_align +.w32: + sub rsp, stack_size_padded + sub hd, 2 + lea r3, [dstq+16] + lea r5d, [hq-2] + call .w16_main + add tlq, r5 + mov dstq, r3 + lea r3, [strideq-4] + lea r4, [r3+strideq*2] + movq xm0, [tlq+21] + pinsrd xm0, [dstq-4], 2 + pinsrd xm0, [dstq+r3*1], 3 + FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 + movq xm7, [dstq+r3*2] + pinsrd xm7, [dstq+r4], 2 + palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 + vpbroadcastd m0, [tlq+28] + vpbroadcastd m9, [tlq+29] + vbroadcasti128 m8, [base+filter_shuf1+16] + vpblendd m0, m9, 0x20 + vpblendd m0, m7, 0x0f + vpbroadcastq m7, xm12 + vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + call .main ; c0 d0 a1 b1 a1 b1 c0 d0 + add r3, 2 + lea r4, [r4+strideq*2] + movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 + FILTER_XMM 6, 9, 10, 14 + vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 + vpbroadcastd m9, [tlq+37] + vpbroadcastd m10, [tlq+36] + vpblendd m6, m9, 0x20 ; top +.w32_loop: + movq xm9, [dstq+r3*4] + pinsrd xm9, [dstq+r4], 2 +.w32_loop_last: + palignr m9, m0, 12 + vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm13, xm7 ; c0 d0 + call .main ; e0 f0 c1 d1 c1 d1 e0 f0 + vpblendd m9, m12, m10, 0xf0 + vpblendd m12, m6, 0xc0 + pshufd m9, m9, q3333 + vpblendd m9, m6, 0xee + vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 + vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 + vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 + vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 + mova [dstq+strideq*0], xm9 + vextracti128 [dstq+strideq*1], m9, 1 + lea dstq, [dstq+strideq*2] + sub r5d, 2 + jg .w32_loop + jz .w32_loop_last + vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 + pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] + vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 + shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 + shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm6 + RET +ALIGN function_align +.main: + FILTER_YMM 7, 0, 9, 8 + ret + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + lea t0, [ipred_cfl_left_avx2_table] + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + mov r6d, 0x8000 + shrx r6d, r6d, wd + movd xm3, r6d + movsxd r6, [t0+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + mov t0d, 0x8000 + shrx t0d, t0d, r6d + movd xm3, t0d + lea t0, [ipred_cfl_left_avx2_table] + movsxd r6, 
[t0+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddw xm0, xm1 +.h4: + pmaddwd xm0, xm2 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + jmp wq + +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd xm4, t0d + tzcnt t0d, t0d + movd xm5, t0d + lea t0, [ipred_cfl_avx2_table] + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + pcmpeqd m3, m3 + psrlw xm4, 1 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movd xm0, [tlq-4] + pmaddubsw xm0, xm3 + jmp wq +.w4: + movd xm1, [tlq+1] + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm1 + pmaddwd xm0, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddw xm0, xm1 + shrx r6d, r6d, r2d + psrlq xm1, xm0, 32 + paddw xm0, xm1 + movd xm1, r6d + psrlw xm0, 2 + pmulhuw xm0, xm1 +.w4_end: + vpbroadcastw m0, xm0 +.s4: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + IPRED_CFL 4 + packuswb m4, m4 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + movd [dstq+strideq*2], xm5 + pextrd [dstq+r6 ], xm5, 1 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + movq xm0, [tlq-8] + pmaddubsw xm0, xm3 + jmp wq +.w8: + movq xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + punpckhqdq xm2, xm0, xm0 + paddw xm0, xm2 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w8_end: + vpbroadcastw m0, xm0 +.s8: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm4 + movhps [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova xm0, [tlq-16] + pmaddubsw xm0, xm3 + jmp wq +.w16: + movu xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vpermq m4, m4, q3120 + mova [dstq+strideq*0], xm4 + vextracti128 [dstq+strideq*1], m4, 1 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw 
xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w32_end: + vpbroadcastw m0, xm0 +.s32: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vpermq m4, m4, q3120 + mova [dstq], m4 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + lea t0, [ipred_cfl_splat_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [t0+wq*4] + vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] + add wq, t0 + movifnidn acq, acmp + jmp wq + +cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pb_2] + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq xm0, [yq] + movq xm1, [yq+strideq] + movhps xm0, [yq+strideq*2] + movhps xm1, [yq+stride3q] + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + paddw xm0, xm1 + mova [acq], xm0 + paddw xm4, xm0 + lea yq, [yq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q1111 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova xm0, [yq] + mova xm1, [yq+strideq] + vinserti128 m0, [yq+strideq*2], 1 + vinserti128 m1, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg + jmp .w8_hpad +.w8_wpad: + vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] +.w8_wpad_loop: + movq xm0, [yq] + movq xm1, [yq+strideq] + vinserti128 m0, [yq+strideq*2], 1 + vinserti128 m1, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufb m0, m3 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w8_hpad: + vpermq m0, m0, q3232 +.w8_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad_loop +.w16_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_420_avx2_table] + shl wpadd, 2 + mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ + ipred_cfl_ac_420_avx2_table+wpadq*8-32] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w16_pad3: + vpbroadcastq m0, [yq] + vpbroadcastq m1, [yq+strideq] + jmp .w16_wpad_end +.w16_pad2: + vbroadcasti128 m0, [yq] + vbroadcasti128 m1, [yq+strideq] + jmp .w16_wpad_end +.w16_pad1: + mova m0, [yq] + mova m1, [yq+strideq] + ; fall-through +.w16_wpad_end: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufb m0, m3 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jz .w16_wpad_done + jmp iptrq 
+.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + dec hpadd + jg .w16_hpad_loop + ; fall-through + +.calc_avg: + vpbroadcastd m2, [pw_1] + pmaddwd m0, m4, m2 + vextracti128 xm1, m0, 1 + tzcnt r1d, szd + paddd xm0, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pb_4] + pxor m4, m4 + pxor m5, m5 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq xm1, [yq] + movhps xm1, [yq+strideq] + movq xm0, [yq+strideq*2] + movhps xm0, [yq+stride3q] + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + mova [acq], xm1 + mova [acq+16], xm0 + paddw xm4, xm0 + paddw xm5, xm1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q1111 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova xm1, [yq] + vinserti128 m1, [yq+strideq], 1 + mova xm0, [yq+strideq*2] + vinserti128 m0, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg + jmp .w8_hpad +.w8_wpad: + vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] +.w8_wpad_loop: + movq xm1, [yq] + vinserti128 m1, [yq+strideq], 1 + movq xm0, [yq+strideq*2] + vinserti128 m0, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pshufb m0, m3 + pshufb m1, m3 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w8_hpad: + vpermq m0, m0, q3232 +.w8_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m1, [yq] + mova m0, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad_loop +.w16_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_422_avx2_table] + shl wpadd, 2 + mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ + ipred_cfl_ac_422_avx2_table+wpadq*8-32] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w16_pad3: + vpbroadcastq m1, [yq] + vpbroadcastq m0, [yq+strideq] + jmp .w16_wpad_end +.w16_pad2: + vbroadcasti128 m1, [yq] + vbroadcasti128 m0, [yq+strideq] + jmp .w16_wpad_end +.w16_pad1: + mova m1, [yq] + mova m0, [yq+strideq] + ; fall-through +.w16_wpad_end: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pshufb m0, m3 + pshufb m1, m3 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jz .w16_wpad_done + jmp 
iptrq +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + ; fall-through + +.calc_avg: + vpbroadcastd m2, [pw_1] + pmaddwd m5, m5, m2 + pmaddwd m0, m4, m2 + paddd m0, m5 + vextracti128 xm1, m0, 1 + tzcnt r1d, szd + paddd xm0, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + vpbroadcastd m5, [pw_1] + tzcnt r8d, wd + lea r5, [ipred_cfl_ac_444_avx2_table] + movsxd r8, [r5+r8*4+12] + add r5, r8 + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak + mov ac_bakq, acq + jmp r5 + +.w4: + lea stride3q, [strideq*3] + pxor xm2, xm2 +.w4_loop: + movd xm1, [yq] + movd xm0, [yq+strideq*2] + pinsrd xm1, [yq+strideq], 1 + pinsrd xm0, [yq+stride3q], 1 + punpcklbw xm1, xm2 + punpcklbw xm0, xm2 + psllw xm1, 3 + psllw xm0, 3 + mova [acq], xm1 + mova [acq+16], xm0 + paddw xm1, xm0 + paddw xm4, xm1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_mul + pshufd xm0, xm0, q3232 + paddw xm1, xm0, xm0 +.w4_hpad_loop: + mova [acq], xm0 + mova [acq+16], xm0 + paddw xm4, xm1 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg_mul + +.w8: + lea stride3q, [strideq*3] + pxor m2, m2 +.w8_loop: + movq xm1, [yq] + movq xm0, [yq+strideq*2] + vinserti128 m1, [yq+strideq], 1 + vinserti128 m0, [yq+stride3q], 1 + punpcklbw m1, m2 + punpcklbw m0, m2 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + paddw m4, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_mul + vpermq m0, m0, q3232 + paddw m1, m0, m0 +.w8_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddw m4, m1 + add acq, 64 + sub hpadd, 4 + jg .w8_hpad_loop + jmp .calc_avg_mul + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+strideq] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad +.w16_wpad: + mova m3, [cfl_ac_444_w16_pad1_shuffle] +.w16_wpad_loop: + vpbroadcastq m1, [yq] + vpbroadcastq m0, [yq+strideq] + pshufb m1, m3 + pshufb m0, m3 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w16_hpad: + paddw m1, m0, m0 + pmaddwd m1, m5 +.w16_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddd m4, m1 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + jmp .calc_avg + +.w32: + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+16] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd m4, m2 + add yq, strideq + add acq, 64 + dec hd + jg .w32_loop + test hpadd, hpadd + jz .calc_avg + jmp .w32_hpad_loop +.w32_wpad: + DEFINE_ARGS ac, y, 
stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_444_avx2_table] + add wpadd, wpadd + mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w32_pad3: + vpbroadcastq m1, [yq] + pshufb m1, m3 + vpermq m0, m1, q3232 + jmp .w32_wpad_end +.w32_pad2: + pmovzxbw m1, [yq] + pshufhw m0, m1, q3333 + vpermq m0, m0, q3333 + jmp .w32_wpad_end +.w32_pad1: + pmovzxbw m1, [yq] + vpbroadcastq m0, [yq+16] + pshufb m0, m3 + ; fall-through +.w32_wpad_end: + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd m4, m2 + add yq, strideq + add acq, 64 + dec hd + jz .w32_wpad_done + jmp iptrq +.w32_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w32_hpad_loop: + mova [acq], m1 + mova [acq+32], m0 + paddd m4, m2 + add acq, 64 + dec hpadd + jg .w32_hpad_loop + jmp .calc_avg + +.calc_avg_mul: + pmaddwd m4, m5 +.calc_avg: + vextracti128 xm1, m4, 1 + tzcnt r1d, szd + paddd xm0, xm4, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h + vbroadcasti128 m4, [palq] + lea r2, [pal_pred_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r2+wq*4] + packuswb m4, m4 + add wq, r2 + lea r2, [strideq*3] + jmp wq +.w4: + pshufb xm0, xm4, [idxq] + add idxq, 16 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +ALIGN function_align +.w8: + pshufb xm0, xm4, [idxq+16*0] + pshufb xm1, xm4, [idxq+16*1] + add idxq, 16*2 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +ALIGN function_align +.w16: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + add idxq, 32*2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r2 ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +ALIGN function_align +.w32: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + pshufb m2, m4, [idxq+32*2] + pshufb m3, m4, [idxq+32*3] + add idxq, 32*4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r2 ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +ALIGN function_align +.w64: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + pshufb m2, m4, [idxq+32*2] + pshufb m3, m4, [idxq+32*3] + add idxq, 32*4 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64 + RET + +%endif diff -Nru dav1d-0.7.1/src/x86/ipred_init_tmpl.c dav1d-0.9.1/src/x86/ipred_init_tmpl.c --- dav1d-0.7.1/src/x86/ipred_init_tmpl.c 2020-06-21 11:48:55.020126300 +0000 +++ dav1d-0.9.1/src/x86/ipred_init_tmpl.c 2021-07-28 21:38:28.901852100 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -28,112 +28,101 @@ #include "src/cpu.h" #include "src/ipred.h" -decl_angular_ipred_fn(dav1d_ipred_dc_avx2); -decl_angular_ipred_fn(dav1d_ipred_dc_128_avx2); -decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2); -decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2); -decl_angular_ipred_fn(dav1d_ipred_h_avx2); -decl_angular_ipred_fn(dav1d_ipred_v_avx2); -decl_angular_ipred_fn(dav1d_ipred_paeth_avx2); -decl_angular_ipred_fn(dav1d_ipred_smooth_avx2); -decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2); -decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2); -decl_angular_ipred_fn(dav1d_ipred_z1_avx2); -decl_angular_ipred_fn(dav1d_ipred_z2_avx2); -decl_angular_ipred_fn(dav1d_ipred_z3_avx2); -decl_angular_ipred_fn(dav1d_ipred_filter_avx2); - -decl_cfl_pred_fn(dav1d_ipred_cfl_avx2); -decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2); -decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2); -decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2); - -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2); -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2); -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_avx2); - -decl_pal_pred_fn(dav1d_pal_pred_avx2); - -decl_angular_ipred_fn(dav1d_ipred_dc_ssse3); -decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3); -decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3); -decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3); -decl_angular_ipred_fn(dav1d_ipred_h_ssse3); -decl_angular_ipred_fn(dav1d_ipred_v_ssse3); -decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3); -decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3); -decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3); -decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3); -decl_angular_ipred_fn(dav1d_ipred_filter_ssse3); - -decl_cfl_pred_fn(dav1d_ipred_cfl_ssse3); -decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3); -decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3); -decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3); - -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3); -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3); -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3); +#define decl_fn(type, name) \ + decl_##type##_fn(BF(dav1d_##name, ssse3)); \ + decl_##type##_fn(BF(dav1d_##name, avx2)) +#define init_fn(type0, type1, name, suffix) \ + c->type0[type1] = BF(dav1d_##name, suffix) + +#define init_angular_ipred_fn(type, name, suffix) \ + init_fn(intra_pred, type, name, suffix) +#define init_cfl_pred_fn(type, name, suffix) \ + init_fn(cfl_pred, type, name, suffix) +#define init_cfl_ac_fn(type, name, suffix) \ + init_fn(cfl_ac, type, name, suffix) + +decl_fn(angular_ipred, ipred_dc); +decl_fn(angular_ipred, ipred_dc_128); +decl_fn(angular_ipred, ipred_dc_top); +decl_fn(angular_ipred, ipred_dc_left); +decl_fn(angular_ipred, ipred_h); +decl_fn(angular_ipred, ipred_v); +decl_fn(angular_ipred, ipred_paeth); +decl_fn(angular_ipred, ipred_smooth); +decl_fn(angular_ipred, ipred_smooth_h); +decl_fn(angular_ipred, ipred_smooth_v); +decl_fn(angular_ipred, ipred_z1); +decl_fn(angular_ipred, ipred_z2); +decl_fn(angular_ipred, ipred_z3); +decl_fn(angular_ipred, ipred_filter); + +decl_fn(cfl_pred, ipred_cfl); +decl_fn(cfl_pred, ipred_cfl_128); +decl_fn(cfl_pred, ipred_cfl_top); +decl_fn(cfl_pred, ipred_cfl_left); + +decl_fn(cfl_ac, ipred_cfl_ac_420); +decl_fn(cfl_ac, ipred_cfl_ac_422); +decl_fn(cfl_ac, ipred_cfl_ac_444); -decl_pal_pred_fn(dav1d_pal_pred_ssse3); +decl_fn(pal_pred, pal_pred); COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; -#if BITDEPTH == 8 - c->intra_pred[DC_PRED] = dav1d_ipred_dc_ssse3; - 
c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_ssse3; - c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_ssse3; - c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3; - c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3; - c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3; - c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_ssse3; - c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3; - c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3; - c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3; - c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_ssse3; - - c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_ssse3; - c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_ssse3; - c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_ssse3; - c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3; - - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3; - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3; - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3; + init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3); + init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3); + init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3); + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3); + + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3); - c->pal_pred = dav1d_pal_pred_ssse3; -#endif + c->pal_pred = BF(dav1d_pal_pred, ssse3); +#if ARCH_X86_64 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if BITDEPTH == 8 && ARCH_X86_64 - c->intra_pred[DC_PRED] = dav1d_ipred_dc_avx2; - c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_avx2; - c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_avx2; - c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_avx2; - c->intra_pred[HOR_PRED] = dav1d_ipred_h_avx2; - c->intra_pred[VERT_PRED] = dav1d_ipred_v_avx2; - c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_avx2; - c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_avx2; - c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2; - c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2; - c->intra_pred[Z1_PRED] = dav1d_ipred_z1_avx2; - c->intra_pred[Z2_PRED] = dav1d_ipred_z2_avx2; - c->intra_pred[Z3_PRED] = dav1d_ipred_z3_avx2; - c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_avx2; - - c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_avx2; - c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_avx2; - c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_avx2; - c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2; - - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2; - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_avx2; - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_avx2; + init_angular_ipred_fn(DC_PRED, ipred_dc, avx2); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2); + 
init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx2); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx2); + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2); + init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2); + init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2); + init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2); - c->pal_pred = dav1d_pal_pred_avx2; + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2); +#if BITDEPTH == 8 + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2); +#endif + c->pal_pred = BF(dav1d_pal_pred, avx2); #endif } diff -Nru dav1d-0.7.1/src/x86/ipred_sse.asm dav1d-0.9.1/src/x86/ipred_sse.asm --- dav1d-0.7.1/src/x86/ipred_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/ipred_sse.asm 2021-07-28 21:38:28.901852100 +0000 @@ -0,0 +1,3109 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +; sm_weights[], but modified to precalculate x and 256-x with offsets to +; enable efficient use of pmaddubsw (which requires signed values) +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +ipred_v_shuf : db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 +ipred_h_shuf : db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 +ipred_paeth_shuf : db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 +filter_shuf1 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 +filter_shuf2 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 + +pw_8 : times 8 dw 8 +pb_3 : times 16 db 3 +pb_128 : times 8 db 128 +pw_128 : times 4 dw 128 +pw_255 : times 4 dw 255 +pb_2 : times 8 db 2 +pb_4 : times 8 db 4 +pb_127_m127 : times 4 db 127, -127 +pd_32768 : times 1 dd 32768 + + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) +%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4) + +JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 +JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 + +cextern filter_intra_taps + + +SECTION .text + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8 + pshuflw m1, m0, %3 ; extend 8 byte for 2 pos + punpcklqdq m1, m1 + mova [dstq + %2], m1 +%if %1 > 16 + mova [dstq + 16 + %2], m1 +%endif +%if %1 > 32 + mova [dstq + 32 + %2], m1 + mova [dstq + 48 + %2], m1 +%endif +%endmacro + +%macro IPRED_H 1 ; width + sub tlq, 4 + movd m0, [tlq] ; get 4 bytes of topleft data + punpcklbw m0, m0 ; extend 2 byte +%if %1 == 4 + pshuflw m1, m0, q2233 + movd [dstq+strideq*0], m1 + psrlq m1, 32 + movd [dstq+strideq*1], m1 + pshuflw m0, m0, q0011 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + 
movd [dstq+stride3q ], m0 + +%elif %1 == 8 + punpcklwd m0, m0 + punpckhdq m1, m0, m0 + punpckldq m0, m0 + movq [dstq+strideq*1], m1 + movhps [dstq+strideq*0], m1 + movq [dstq+stride3q ], m0 + movhps [dstq+strideq*2], m0 +%else + IPRED_SET %1, 0, q3333 + IPRED_SET %1, strideq, q2222 + IPRED_SET %1, strideq*2, q1111 + IPRED_SET %1, stride3q, q0000 +%endif + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +%endmacro + +INIT_XMM ssse3 +cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 + LEA r5, ipred_h_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + IPRED_H 4 +.w8: + IPRED_H 8 +.w16: + IPRED_H 16 +.w32: + IPRED_H 32 +.w64: + IPRED_H 64 + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_ssse3_table + tzcnt wd, wm + movu m0, [tlq+ 1] + movu m1, [tlq+17] + movu m2, [tlq+33] + movu m3, [tlq+49] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd m4, r5d + tzcnt r5d, r5d + movd m5, r5d + LEA r5, ipred_dc_ssse3_table + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+20] + pcmpeqd m3, m3 + psrlw m4, 1 ; dc = (width + height) >> 1; + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd m0, [tlq-4] + pmaddubsw m0, m3 + jmp wq +.w4: + movd m1, [tlq+1] + pmaddubsw m1, m3 + psubw m0, m4 + paddw m0, m1 + pmaddwd m0, m3 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 ; dc >>= ctz(width + height); + jmp .w4_end +.w4_mul: + punpckhqdq m1, m0, m0 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrlw m0, 2 + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8 + cmovz r6d, r2d + movd m5, r6d + pmulhuw m0, m5 +.w4_end: + pxor m1, m1 + pshufb m0, m1 +.s4: + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + movq m0, [tlq-8] + pmaddubsw m0, m3 + jmp wq +.w8: + movq m1, [tlq+1] + pmaddubsw m1, m3 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + paddw m0, m1 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w8_end: + pxor m1, m1 + pshufb m0, m1 +.s8: + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova m0, [tlq-16] + pmaddubsw m0, m3 + jmp wq +.w16: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 
+ paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w16_end: + pxor m1, m1 + pshufb m0, m1 +.s16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + mova m2, [tlq-16] + pmaddubsw m2, m3 + paddw m0, m2 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + movu m2, [tlq+17] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 64|16 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w32_end: + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 +.s32: + mova [dstq], m0 + mova [dstq+16], m1 + mova [dstq+strideq], m0 + mova [dstq+strideq+16], m1 + mova [dstq+strideq*2], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q], m0 + mova [dstq+stride3q+16], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-64] + mova m1, [tlq-48] + pmaddubsw m0, m3 + pmaddubsw m1, m3 + paddw m0, m1 + mova m1, [tlq-32] + pmaddubsw m1, m3 + paddw m0, m1 + mova m1, [tlq-16] + pmaddubsw m1, m3 + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 1] + movu m2, [tlq+17] + pmaddubsw m1, m3 + pmaddubsw m2, m3 + paddw m1, m2 + movu m2, [tlq+33] + pmaddubsw m2, m3 + paddw m1, m2 + movu m2, [tlq+49] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 64 + je .w64_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w64_end: + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s64: + mova [dstq], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + mova [dstq+strideq], m0 + mova [dstq+strideq+16], m1 + mova [dstq+strideq+32], m2 + mova [dstq+strideq+48], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s64 + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_left_ssse3_table + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] + movd m2, r6d + psrld m3, m2 + movsxd r6, [r5+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m1, [tlq+48] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 + movu m1, [tlq+32] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h32: + movu m1, [tlq+16] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h16: + pshufd m1, m0, q3232 ; psrlq m1, m0, 16 + paddw m0, m1 +.h8: + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 +.h4: + 
pmaddwd m0, m2 + pmulhrsw m0, m3 + lea stride3q, [strideq*3] + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] + mova m1, m0 + mova m2, m0 + mova m3, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h + LEA r5, ipred_dc_left_ssse3_table + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] + movd m2, wd + psrld m3, m2 + movsxd r6, [r5+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] + ; w * a = (w - 128) * a + 128 * a + ; (256 - w) * b = (127 - w) * b + 129 * b + ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] + pmaddubsw m6, m%3, m%1 + pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b + paddw m6, m%5 + paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] + psrlw m6, 8 + psrlw m0, 8 + packuswb m6, m0 +%endmacro + +cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights +%define base r6-ipred_smooth_v_ssse3_table + LEA r6, ipred_smooth_v_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + movd m5, [tlq+hq] + pxor m2, m2 + pshufb m5, m2 + add wq, r6 + jmp wq +.w4: + movd m2, [tlq+1] + punpckldq m2, m2 + punpcklbw m2, m5 ; top, bottom + lea r3, [strideq*3] + mova m4, [base+ipred_v_shuf] + mova m5, m4 + punpckldq m4, m4 + punpckhdq m5, m5 + pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom + paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 +.w4_loop: + movu m1, [weightsq+hq*2] + pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + movd [dstq+strideq*0], m6 + pshuflw m1, m6, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m6, m6 + movd [dstq+strideq*2], m6 + psrlq m6, 32 + movd [dstq+r3 ], m6 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop + RET +ALIGN function_align +.w8: + movq m2, [tlq+1] + punpcklbw m2, m5 + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + pshufd m4, m5, q0000 
+ pshufd m5, m5, q1111 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 ; m3 is output for loop +.w8_loop: + movq m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + movq [dstq+strideq*0], m6 + movhps [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w8_loop + RET +ALIGN function_align +.w16: + movu m3, [tlq+1] + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 ; m4 and m5 is output for loop +.w16_loop: + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add dstq, strideq + add hq, 1 + jl .w16_loop + RET +ALIGN function_align +.w32: +%if WIN64 + movaps [rsp+24], xmm7 + %define xmm_regs_used 8 +%endif + mova m7, m5 +.w32_loop_init: + mov r3d, 2 +.w32_loop: + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + movu m3, [tlq+1] + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add tlq, 16 + add dstq, 16 + dec r3d + jg .w32_loop + lea dstq, [dstq-32+strideq] + sub tlq, 32 + add hq, 1 + jl .w32_loop_init + RET +ALIGN function_align +.w64: +%if WIN64 + movaps [rsp+24], xmm7 + %define xmm_regs_used 8 +%endif + mova m7, m5 +.w64_loop_init: + mov r3d, 4 +.w64_loop: + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + movu m3, [tlq+1] + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add tlq, 16 + add dstq, 16 + dec r3d + jg .w64_loop + lea dstq, [dstq-64+strideq] + sub tlq, 64 + add hq, 1 + jl .w64_loop_init + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h +%define base r6-ipred_smooth_h_ssse3_table + LEA r6, ipred_smooth_h_ssse3_table + mov wd, wm + movd m3, [tlq+wq] + pxor m1, m1 + pshufb m3, m1 ; right + tzcnt wd, wd + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+pb_127_m127] + movddup m5, [base+pw_128] + add wq, r6 + jmp wq +.w4: + movddup m6, [base+smooth_weights+4*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + sub tlq, hq + lea r3, [strideq*3] +.w4_loop: + movd m2, [tlq+hq] ; left + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r3 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + mova m6, [base+smooth_weights+8*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + sub tlq, hq + punpckldq 
m7, m7 +.w8_loop: + movd m2, [tlq+hq] ; left + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + mova m6, [base+smooth_weights+16*2] + mova m7, [base+smooth_weights+16*3] + sub tlq, 1 + sub tlq, hq +.w16_loop: + pxor m1, m1 + movd m2, [tlq+hq] ; left + pshufb m2, m1 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + lea dstq, [dstq+strideq] + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + sub tlq, 1 + sub tlq, hq + pxor m6, m6 +.w32_loop_init: + mov r5, 2 + lea r3, [base+smooth_weights+16*4] +.w32_loop: + mova m7, [r3] + add r3, 16 + movd m2, [tlq+hq] ; left + pshufb m2, m6 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m7 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + mova m7, [r3] + add r3, 16 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 16 + dec r5 + jg .w32_loop + lea dstq, [dstq-32+strideq] + sub hd, 1 + jg .w32_loop_init + RET +ALIGN function_align +.w64: + sub tlq, 1 + sub tlq, hq + pxor m6, m6 +.w64_loop_init: + mov r5, 4 + lea r3, [base+smooth_weights+16*8] +.w64_loop: + mova m7, [r3] + add r3, 16 + movd m2, [tlq+hq] ; left + pshufb m2, m6 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m7 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + mova m7, [r3] + add r3, 16 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 16 + dec r5 + jg .w64_loop + lea dstq, [dstq-64+strideq] + sub hd, 1 + jg .w64_loop_init + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 + pmaddubsw m6, m%3, m%1 + mova m0, m6 + pmaddubsw m6, m%4, m%2 + mova m1, m6 +%ifnum %5 + paddw m0, m%5 +%else + paddw m0, %5 +%endif +%ifnum %6 + paddw m1, m%6 +%else + paddw m1, %6 +%endif +%ifnum %7 +%else + mova m3, %7 +%endif + pavgw m0, m2 + pavgw m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 +%endmacro + +%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] + mova m1, [rsp+16*%1] ; top + punpckhbw m6, m1, m0 ; top, bottom + punpcklbw m1, m0 ; top, bottom + pmaddubsw m2, m1, m5 + mova [rsp+16*%2], m1 + paddw m1, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m1 ; 128 * 
top + 129 * bottom + 255 + mova [rsp+16*%3], m2 + pmaddubsw m2, m6, m5 + mova [rsp+16*%4], m6 + paddw m6, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m6 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*%5], m2 + movd m1, [tlq+hq] ; left + pshufb m1, [base+pb_3] ; topleft[-(1 + y)] + punpcklbw m1, m4 ; left, right + pmaddubsw m2, m1, m5 ; 127 * left - 127 * right + paddw m2, m1 ; 128 * left + 129 * right + mova m3, m2 + pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; + pmaddubsw m1, %7 + paddw m2, m3, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + mova m7, [rsp+16*%9] + pshufb m1, m7 + mova [rsp+16*%8], m3 + mova m4, [rsp+16*%2] + mova m5, [rsp+16*%3] + mova m3, [rsp+16*%4] + mova m7, [rsp+16*%5] + SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] + mova [dstq], m0 + movddup m3, [base+pw_255] ; recovery + mova m0, [rsp+16*%10] ; recovery + mova m4, [rsp+16*%11] ; recovery + mova m5, [rsp+16*%12] ; recovery +%endmacro + +cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_ssse3_table + mov wd, wm + mov hd, hm + LEA r6, ipred_smooth_ssse3_table + movd m4, [tlq+wq] ; right + pxor m2, m2 + pshufb m4, m2 + tzcnt wd, wd + mov r5, tlq + sub r5, hq + movsxd wq, [r6+wq*4] + movddup m5, [base+pb_127_m127] + movd m0, [r5] + pshufb m0, m2 ; bottom + movddup m3, [base+pw_255] + add wq, r6 + lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] + jmp wq +.w4: + mova m7, [base+ipred_v_shuf] + movd m1, [tlq+1] ; left + pshufd m1, m1, q0000 + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m1, m0 ; top, bottom + pshufd m6, m7, q1100 + pshufd m7, m7, q3322 + pmaddubsw m2, m1, m5 + paddw m3, m1 ; 1 * top + 255 * bottom + 255 + paddw m2, m3 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; + punpcklqdq m1, m1 + mova [rsp+16*2], m1 + mova [rsp+16*3], m4 + mova [rsp+16*4], m6 + mova [rsp+16*5], m5 +.w4_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+ipred_h_shuf] + punpcklbw m0, m1, m4 ; left, right + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 ; 127 * left - 127 * right + pmaddubsw m3, m1, m5 + paddw m2, m0 ; 128 * left + 129 * right + paddw m3, m1 + mova m4, [rsp+16*2] + pmaddubsw m0, m4 + pmaddubsw m1, m4 + paddw m2, m0 + paddw m3, m1 + movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 8 + pshufb m0, m1, m6 + pshufb m1, m7 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 + mova m4, [rsp+16*3] + mova m6, [rsp+16*4] + mova m5, [rsp+16*5] + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r3 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + mova m7, [base+ipred_v_shuf] + movq m1, [tlq+1] ; left + punpcklqdq m1, m1 + sub tlq, 4 + sub tlq, hq + punpcklbw m1, m0 + pshufd m6, m7, q0000 + pshufd m7, m7, q1111 + pmaddubsw m2, m1, m5 + paddw m3, m1 + paddw m2, m3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; + mova [rsp+16*2], m1 + mova [rsp+16*3], m4 + mova [rsp+16*4], m6 + mova [rsp+16*5], m5 +.w8_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+ipred_h_shuf] + pshufd m1, m1, q1100 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 + pmaddubsw m3, m1, m5 + paddw 
m2, m0 + paddw m3, m1 + mova m4, [rsp+16*2] + pmaddubsw m0, m4 + pmaddubsw m1, m4 + paddw m2, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 4 + pshufb m0, m1, m6 + pshufb m1, m7 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 + mova m4, [rsp+16*3] + mova m6, [rsp+16*4] + mova m5, [rsp+16*5] + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + mova m7, [base+ipred_v_shuf] + movu m1, [tlq+1] ; left + sub tlq, 4 + sub tlq, hq + punpckhbw m6, m1, m0 ; top, bottom + punpcklbw m1, m0 ; top, bottom + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + pmaddubsw m2, m6, m5 + mova [rsp+16*5], m6 + paddw m6, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m6 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*6], m2 + pmaddubsw m2, m1, m5 + paddw m3, m1 ; 1 * top + 255 * bottom + 255 + mova [rsp+16*0], m1 + paddw m2, m3 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*1], m2 + mova [rsp+16*3], m4 + mova [rsp+16*4], m5 +.w16_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+pb_3] ; topleft[-(1 + y)] + punpcklbw m1, m4 ; left, right + pmaddubsw m2, m1, m5 ; 127 * left - 127 * right + paddw m2, m1 ; 128 * left + 129 * right + mova m0, m1 + mova m3, m2 + pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; + pmaddubsw m1, [base+smooth_weights+16*3] + paddw m2, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 2 + mova m7, [rsp+16*2] + pshufb m1, m7 + mova [rsp+16*7], m3 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + mova m3, [rsp+16*5] + mova m7, [rsp+16*6] + SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] + mova m4, [rsp+16*3] + mova m5, [rsp+16*4] + mova [dstq], m0 + lea dstq, [dstq+strideq] + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m1, [tlq+1] ; top topleft[1 + x] + movu m2, [tlq+17] ; top + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + sub tlq, 4 + sub tlq, hq + mova m7, [base+ipred_v_shuf] + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + mova [rsp+16*3], m0 + mova [rsp+16*4], m4 + mova [rsp+16*5], m5 +.w32_loop: + SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 + lea dstq, [dstq-16+strideq] + add v_weightsq, 2 + sub hd, 1 + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m1, [tlq+1] ; top topleft[1 + x] + movu m2, [tlq+17] ; top + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + movu m1, [tlq+33] ; top + movu m2, [tlq+49] ; top + mova [rsp+16*11], m1 + mova [rsp+16*12], m2 + sub tlq, 4 + sub tlq, hq + mova m7, [base+ipred_v_shuf] + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + mova [rsp+16*3], m0 + mova [rsp+16*4], m4 + mova [rsp+16*5], m5 +.w64_loop: + SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 + lea dstq, [dstq-48+strideq] + add v_weightsq, 2 + sub hd, 1 + jg .w64_loop + RET + 
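The SMOOTH* kernels above lean on the weight split spelled out in the SMOOTH macro comments: pmaddubsw multiplies unsigned pixels by signed bytes, so the 0..255 weight w is stored as the byte pair (w - 128, 127 - w) (see the SMOOTH_WEIGHT_TABLE definition in the removed file further down) and the constant remainder 128*a + 129*b + 128 is folded in once per row or column. Below is a minimal scalar model of that identity; smooth_blend is an illustrative name, not part of dav1d, and only demonstrates why the split form matches the straightforward (w*a + (256-w)*b + 128) >> 8 blend.

/* Scalar model of the SMOOTH weight split (illustrative, not dav1d code). */
#include <assert.h>
#include <stdint.h>

static uint8_t smooth_blend(uint8_t a, uint8_t b, int w) /* w in [0, 256) */
{
    /* straightforward form */
    const int direct = (w * a + (256 - w) * b + 128) >> 8;

    /* form used by the asm:
     *   part1 = (w-128)*a + (127-w)*b   <- pmaddubsw(pixels, weight table)
     *   part2 = 128*a + 129*b + 128     <- precomputed outside the loop
     * and part1 + part2 == w*a + (256-w)*b + 128                        */
    const int part1 = (w - 128) * a + (127 - w) * b;
    const int part2 = 128 * a + 129 * b + 128;
    const int split = (part1 + part2) >> 8;

    assert(direct == split);
    return (uint8_t)split;
}

int main(void)
{
    /* exhaustive over weights, sampled over pixel pairs */
    for (int w = 0; w < 256; w++)
        for (int a = 0; a < 256; a += 5)
            for (int b = 0; b < 256; b += 5)
                smooth_blend((uint8_t)a, (uint8_t)b, w);
    return 0;
}

Keeping both pmaddubsw operands inside the signed 8-bit range is what allows the whole blend to stay in 16-bit lanes until the final packuswb.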
+;--------------------------------------------------------------------------------------- +;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, +; const uint8_t *idx, const int w, const int h); +;--------------------------------------------------------------------------------------- +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h + mova m4, [palq] + LEA r2, pal_pred_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r2+wq*4] + packuswb m4, m4 + add wq, r2 + lea r2, [strideq*3] + jmp wq +.w4: + pshufb m0, m4, [idxq] + add idxq, 16 + movd [dstq ], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq ], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r2 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +ALIGN function_align +.w8: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + add idxq, 32 + movq [dstq ], m0 + movhps [dstq+strideq ], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r2 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +ALIGN function_align +.w16: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + pshufb m2, m4, [idxq+32] + pshufb m3, m4, [idxq+48] + add idxq, 64 + mova [dstq ], m0 + mova [dstq+strideq ], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r2 ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +ALIGN function_align +.w32: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + pshufb m2, m4, [idxq+32] + pshufb m3, m4, [idxq+48] + add idxq, 64 + mova [dstq ], m0 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +ALIGN function_align +.w64: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + pshufb m2, m4, [idxq+32] + pshufb m3, m4, [idxq+48] + add idxq, 64 + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, strideq + sub hd, 1 + jg .w64 + RET + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + movifnidn wd, wm + movifnidn hd, hm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd m4, t0d + tzcnt t0d, t0d + movd m5, t0d + LEA t0, ipred_cfl_ssse3_table + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+16] + pcmpeqd m3, m3 + psrlw m4, 1 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movd m0, [tlq-4] + pmaddubsw m0, m3 + jmp wq +.w4: + movd m1, [tlq+1] + pmaddubsw m1, m3 + psubw m0, m4 + paddw m0, m1 + pmaddwd m0, m3 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 ; dc >>= ctz(width + height); + jmp .w4_end +.w4_mul: + punpckhqdq m1, m0, m0 + paddw m0, m1 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + psrlw m0, 2 + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8 + cmovz r6d, r2d + movd m5, r6d + pmulhuw m0, m5 +.w4_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s4: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + mova m5, [acq+16] + 
IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + movd [dstq+strideq*0], m4 + pshuflw m4, m4, q1032 + movd [dstq+strideq*1], m4 + punpckhqdq m4, m4 + movd [dstq+strideq*2], m4 + psrlq m4, 32 + movd [dstq+r6 ], m4 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + movq m0, [tlq-8] + pmaddubsw m0, m3 + jmp wq +.w8: + movq m1, [tlq+1] + pmaddubsw m1, m3 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + paddw m0, m1 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+16] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + movq [dstq ], m4 + movhps [dstq+strideq ], m4 + mova m4, [acq+32] + mova m5, [acq+48] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + movq [dstq+strideq*2], m4 + movhps [dstq+r6 ], m4 + lea dstq, [dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova m0, [tlq-16] + pmaddubsw m0, m3 + jmp wq +.w16: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+16] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq], m4 + mova m4, [acq+32] + mova m5, [acq+48] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + mova m2, [tlq-16] + pmaddubsw m2, m3 + paddw m0, m2 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + movu m2, [tlq+17] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 64|16 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+16] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq], m4 + mova m4, [acq+32] + mova m5, [acq+48] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq+16], m4 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, 
[tlq] + mov t0d, 0x8000 + movd m3, t0d + movd m2, r6d + psrld m3, m2 + LEA t0, ipred_cfl_left_ssse3_table + movsxd r6, [t0+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + movu m1, [tlq+16] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h16: + pshufd m1, m0, q3232 ; psrlq m1, m0, 16 + paddw m0, m1 +.h8: + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 +.h4: + pmaddwd m0, m2 + pmulhrsw m0, m3 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + LEA t0, ipred_cfl_left_ssse3_table + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + mov r6d, 0x8000 + movd m3, r6d + movd m2, wd + psrld m3, m2 + movsxd r6, [t0+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + tzcnt wd, wm + movifnidn hd, hm + LEA r6, ipred_cfl_splat_ssse3_table + movsxd wq, [r6+wq*4] + movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] + add wq, r6 + movifnidn acq, acmp + jmp wq + +%macro RELOAD_ACQ_32 1 + mov acq, ac_bakq ; restore acq +%endmacro + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak +DECLARE_REG_TMP 7 + movddup m2, [pb_2] +%else +cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h +DECLARE_REG_TMP 4 +%define ac_bakq acmp + mov t0d, 0x02020202 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m5, t0d + movifnidn hpadd, hpadm +%if ARCH_X86_64 + mov ac_bakq, acq +%endif + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq m0, [yq] + movq m1, [yq+strideq] + movhps m0, [yq+strideq*2] + movhps m1, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4_8 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop + jmp .calc_avg_4_8 +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + mova m0, [yq+strideq*2] + mova m1, [yq+stride3q] + 
pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_4_8 + jmp .w8_hpad +.w8_wpad: ; wpadd=1 + movddup m0, [yq] + movddup m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufhw m0, m0, q3333 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 16 + sub hd, 1 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_4_8 +.w8_hpad: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 1 + jg .w8_hpad + jmp .calc_avg_4_8 +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + mova m6, [yq+16] + mova m1, [yq+strideq+16] + pmaddubsw m6, m2 + pmaddubsw m1, m2 + paddw m6, m1 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + test hpadd, hpadd + jz .calc_avg16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movddup m0, [yq] + movddup m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufhw m0, m0, q3333 + mova [acq], m0 + paddw m4, m0 + mova m6, m0 + punpckhqdq m6, m0, m0 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + pshufhw m6, m0, q3333 + punpckhqdq m6, m6 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + movddup m6, [yq+16] + movddup m1, [yq+strideq+16] + pmaddubsw m6, m2 + pmaddubsw m1, m2 + paddw m6, m1 + pshufhw m6, m6, q3333 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg16 +.w16_hpad_loop: + mova [acq], m0 + paddw m4, m0 + mova [acq+16], m6 + paddw m4, m6 + add acq, 32 + dec hpadd + jg .w16_hpad_loop + jmp .calc_avg16 + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif +.calc_avg_4_8: + psrlw m2, 9 + pmaddwd m4, m2 + jmp .calc_avg +.calc_avg16: + psrld m0, m4, 16 + pslld m4, 16 + psrld m4, 16 + paddd m4, m0 +.calc_avg: + movd szd, m5 + psrad m5, 1 + tzcnt r1d, szd + paddd m4, m5 + movd m1, r1d + pshufd m0, m4, q2301 + paddd m0, m4 + pshufd m4, m0, q1032 + paddd m0, m4 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq +.sub_loop: + mova m1, [acq] + psubw m1, m0 ; ac[x] -= sum; + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg .sub_loop + RET + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak + movddup m2, [pb_4] +%else +cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h + mov t0d, 0x04040404 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m6, t0d + movifnidn hpadd, hpadm +%if ARCH_X86_64 + mov ac_bakq, acq +%endif + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + pxor m5, m5 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, 
stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq m1, [yq] + movhps m1, [yq+strideq] + movq m0, [yq+strideq*2] + movhps m0, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop + jmp .calc_avg_4 +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova m1, [yq] + mova m0, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m0 + paddw m5, m1 + mova m1, [yq+strideq*2] + mova m0, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w8_hpad +.w8_wpad: + movddup m1, [yq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + movddup m0, [yq+strideq] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_8_16 +.w8_hpad: + mova [acq], m0 + paddw m4, m0 + mova [acq+16], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad + jmp .calc_avg_8_16 +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m1, [yq] + mova m0, [yq+16] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m0 + paddw m5, m1 + mova m1, [yq+strideq] + mova m0, [yq+strideq+16] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m0 + paddw m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movddup m1, [yq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movddup m1, [yq+strideq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq+32], m1 + paddw m4, m1 + punpckhqdq m0, m1, m1 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + mova m1, [yq] + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + pshufhw m1, m1, q3333 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + mova m1, [yq+strideq] + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + mova m0, m1 + pshufhw m0, m0, q3333 + punpckhqdq m0, m0 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m1, [yq] + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + movddup m0, [yq+16] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m5, m0 + mova m1, [yq+strideq] + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + movddup m0, [yq+strideq+16] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg_8_16 +.w16_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m1 + paddw m5, m0 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m1 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + jmp .calc_avg_8_16 + +%if 
ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif +.calc_avg_4: + psrlw m2, 10 + pmaddwd m5, m2 + pmaddwd m0, m4, m2 + jmp .calc_avg +.calc_avg_8_16: + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, m4 + psrld m0, 16 + pslld m4, 16 + psrld m4, 16 + paddd m0, m4 +.calc_avg: + paddd m5, m0 + movd szd, m6 + psrad m6, 1 + tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); + paddd m5, m6 + movd m1, r1d + pshufd m0, m5, q2301 + paddd m0, m5 + pshufd m5, m0, q1032 + paddd m0, m5 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq ; ac = ac_orig +.sub_loop: + mova m1, [acq] + psubw m1, m0 + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg .sub_loop + RET + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak + movddup m2, [pb_4] +%else +cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h +%define ac_bakq [rsp+16*4] + mov t0d, 0x04040404 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + movifnidn hpadd, hpadm + movd m0, hpadd + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m6, t0d + movd hpadd, m0 + mov ac_bakq, acq + shl hpadd, 2 + sub hd, hpadd + pxor m5, m5 + pxor m4, m4 + cmp wd, 16 + jg .w32 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movd m1, [yq] + movd m3, [yq+strideq] + punpckldq m1, m3 + punpcklbw m1, m1 + movd m0, [yq+strideq*2] + movd m3, [yq+stride3q] + punpckldq m0, m3 + punpcklbw m0, m0 + pmaddubsw m1, m2 + pmaddubsw m0, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m5, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop +.calc_avg_4: + psrlw m2, 10 + pmaddwd m5, m2 + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + movq m1, [yq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + movq m0, [yq+strideq] + punpcklbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + movq m1, [yq+strideq*2] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + movq m0, [yq+stride3q] + punpcklbw m0, m0 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w8_hpad +.w8_wpad: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + movd m0, [yq+strideq] + punpcklbw m0, m0 + punpcklqdq m0, m0 + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_8_16 +.w8_hpad: + mova [acq], m0 + paddw m5, m0 + mova [acq+16], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad + jmp .calc_avg_8_16 + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + mova m0, [yq+strideq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + 
mova [acq+32], m1 + paddw m4, m1 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movd m1, [yq+strideq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhqdq m0, m1, m1 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + movq m1, [yq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + pshufhw m1, m1, q3333 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movq m1, [yq+strideq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + mova m0, m1 + pshufhw m0, m0, q3333 + punpckhqdq m0, m0 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + mova m0, [yq+strideq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg_8_16 +.w16_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m1 + paddw m5, m0 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m1 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop +.calc_avg_8_16: + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, m4 + psrld m0, 16 + pslld m4, 16 + psrld m4, 16 + paddd m0, m4 + paddd m5, m0 + jmp .calc_avg + +.w32: + pxor m0, m0 + mova [rsp ], m0 + mova [rsp+16], m0 + mova [rsp+32], m0 + mova [rsp+48], m0 + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m4, [yq+16] + mova m3, m4 + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + punpckhbw m4, m4 + pmaddubsw m4, m2 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_loop + test hpadd, hpadd + jz .calc_avg_32 + jmp .w32_hpad_loop +.w32_wpad: + cmp wpadd, 2 + jl .w32_pad1 + je .w32_pad2 + cmp wpadd, 4 + jl .w32_pad3 + je .w32_pad4 + cmp wpadd, 6 + jl .w32_pad5 + je .w32_pad6 +.w32_pad7: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + mova m0, m1 + punpckhqdq m0, m0 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 
+ sub hd, 1 + jg .w32_pad7 + jmp .w32_wpad_done +.w32_pad6: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + pshufhw m0, m1, q3333 + punpckhqdq m0, m0 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad6 + jmp .w32_wpad_done +.w32_pad5: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + mova m5, [rsp] + paddw m5, m1 + mova [rsp ], m5 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + punpckhqdq m3, m3 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad5 + jmp .w32_wpad_done +.w32_pad4: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + pshufhw m3, m3, q3333 + punpckhqdq m3, m3 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad4 + jmp .w32_wpad_done +.w32_pad3: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + movd m3, [yq+16] + punpcklbw m3, m3 + punpcklqdq m3, m3 + pshufhw m3, m3, q3333 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + punpckhqdq m4, m4 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad3 + jmp .w32_wpad_done +.w32_pad2: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, [yq+16] + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + pshufhw m4, m3, q3333 + punpckhqdq m4, m4 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad2 + jmp .w32_wpad_done +.w32_pad1: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m4, [yq+16] + mova m3, m4 + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + punpckhbw m4, m4 + punpcklqdq m4, m4 + pshufhw m4, m4, q3333 + pmaddubsw m4, m2 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad1 +.w32_wpad_done: + test hpadd, hpadd + jz .calc_avg_32 +.w32_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m1, [rsp] + mova [rsp ], m5 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova 
[acq+32], m3 + mova [acq+48], m4 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + add acq, 64 + sub hpadd, 1 + jg .w32_hpad_loop + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif + +.calc_avg_32: + mova m5, [rsp] + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, [rsp+16] + mova m3, m0 + psrld m0, 16 + pslld m3, 16 + psrld m3, 16 + paddd m0, m3 + paddd m5, m0 + mova m0, [rsp+32] + mova m3, m0 + psrld m0, 16 + pslld m3, 16 + psrld m3, 16 + paddd m0, m3 + mova m1, [rsp+48] + mova m3, m1 + psrld m1, 16 + pslld m3, 16 + psrld m3, 16 + paddd m1, m3 + paddd m1, m0 + paddd m5, m1 +.calc_avg: + movd szd, m6 + psrad m6, 1 + tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); + paddd m5, m6 + movd m1, r1d + pshufd m0, m5, q2301 + paddd m0, m5 + pshufd m5, m0, q1032 + paddd m0, m5 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq ; ac = ac_orig +.sub_loop: + mova m1, [acq] + psubw m1, m0 + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg .sub_loop + RET + +; %1 simd register that hold the mask and will hold the result +; %2 simd register that holds the "true" values +; %3 location of the "false" values (simd register/memory) +%macro BLEND 3 ; mask, true, false + pand %2, %1 + pandn %1, %3 + por %1, %2 +%endmacro + +%macro PAETH 2 ; top, ldiff + pavgb m1, m%1, m3 + pxor m0, m%1, m3 + pand m0, m4 + psubusb m2, m5, m1 + psubb m1, m0 + psubusb m1, m5 + por m1, m2 + paddusb m1, m1 + por m1, m0 ; min(tldiff, 255) + psubusb m2, m5, m3 + psubusb m0, m3, m5 + por m2, m0 ; tdiff +%ifnum %2 + pminub m2, m%2 + pcmpeqb m0, m%2, m2 ; ldiff <= tdiff +%else + mova m0, %2 + pminub m2, m0 + pcmpeqb m0, m2 +%endif + pminub m1, m2 + pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff + mova m2, m3 + BLEND m0, m2, m%1 + BLEND m1, m0, m5 +%endmacro + +cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h +%define base r5-ipred_paeth_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + pxor m0, m0 + movd m5, [tlq] + pshufb m5, m0 + LEA r5, ipred_paeth_ssse3_table + movsxd wq, [r5+wq*4] + movddup m4, [base+ipred_paeth_shuf] + add wq, r5 + jmp wq +.w4: + movd m6, [tlq+1] ; top + pshufd m6, m6, q0000 + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + sub tlq, 4 + movd m3, [tlq] + mova m1, [base+ipred_h_shuf] + pshufb m3, m1 ; left + PAETH 6, 7 + movd [dstq ], m1 + pshuflw m0, m1, q1032 + movd [dstq+strideq ], m0 + punpckhqdq m1, m1 + movd [dstq+strideq*2], m1 + psrlq m1, 32 + movd [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + movddup m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + sub tlq, 2 + movd m3, [tlq] + pshufb m3, [base+ipred_paeth_shuf] + PAETH 6, 7 + movq [dstq ], m1 + movhps [dstq+strideq], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + sub tlq, 1 + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + PAETH 6, 7 + mova [dstq], m1 + add dstq, strideq + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp ], m6 + mova [rsp+16], m7 + movu m6, [tlq+17] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+32], m6 +.w32_loop: + dec tlq + movd m3, [tlq] + pxor m1, m1 
+ pshufb m3, m1 + mova m6, [rsp] + PAETH 6, [rsp+16] + mova [dstq ], m1 + mova m6, [rsp+32] + PAETH 6, 7 + mova [dstq+16], m1 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp ], m6 + mova [rsp+16], m7 + movu m6, [tlq+17] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+32], m6 + mova [rsp+48], m7 + movu m6, [tlq+33] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+64], m6 + mova [rsp+80], m7 + movu m6, [tlq+49] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+96], m6 +.w64_loop: + dec tlq + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + mova m6, [rsp] + PAETH 6, [rsp+16] + mova [dstq ], m1 + mova m6, [rsp+32] + PAETH 6, [rsp+48] + mova [dstq+16], m1 + mova m6, [rsp+64] + PAETH 6, [rsp+80] + mova [dstq+32], m1 + mova m6, [rsp+96] + PAETH 6, 7 + mova [dstq+48], m1 + add dstq, strideq + dec hd + jg .w64_loop + RET + + +%macro FILTER 4 ;dst, src, tmp, shuf +%ifnum %4 + pshufb m%2, m%4 +%else + pshufb m%2, %4 +%endif + pshufd m%1, m%2, q0000 ;p0 p1 + pmaddubsw m%1, m2 + pshufd m%3, m%2, q1111 ;p2 p3 + pmaddubsw m%3, m3 + paddw m%1, [base+pw_8] + paddw m%1, m%3 + pshufd m%3, m%2, q2222 ;p4 p5 + pmaddubsw m%3, m4 + paddw m%1, m%3 + pshufd m%3, m%2, q3333 ;p6 __ + pmaddubsw m%3, m5 + paddw m%1, m%3 + psraw m%1, 4 + packuswb m%1, m%1 +%endmacro + +cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter +%define base r6-$$ + LEA r6, $$ + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + lea filterq, [base+filter_intra_taps+filterq] + movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 + movsxd wq, [base+ipred_filter_ssse3_table+wq*4] + mova m2, [filterq+16*0] + mova m3, [filterq+16*1] + mova m4, [filterq+16*2] + mova m5, [filterq+16*3] + lea wq, [base+ipred_filter_ssse3_table+wq] + mov hd, hm + jmp wq +.w4: + mova m1, [base+filter_shuf1] + sub tlq, 3 + sub tlq, hq + jmp .w4_loop_start +.w4_loop: + movd m0, [tlq+hq] + punpckldq m0, m6 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER 6, 0, 7, 1 + movd [dstq+strideq*0], m6 + pshuflw m6, m6, q1032 + movd [dstq+strideq*1], m6 + sub hd, 2 + jg .w4_loop + RET + +ALIGN function_align +.w8: + movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 + sub tlq, 5 + sub tlq, hq + +.w8_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER 0, 6, 1, [base+filter_shuf2] + + punpckldq m6, m7, m0 + movq [dstq+strideq*0], m6 + punpckhqdq m6, m6 + movq [dstq+strideq*1], m6 + + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET + +ALIGN function_align +.w16: + movu m6, [tlq+1] ;top row + sub tlq, 5 + sub tlq, hq + +.w16_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+4+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+8+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movd [dstq+12+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+strideq*1], m6 + + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + + lea dstq, [dstq+strideq*2] + 
sub hd, 2 + jg .w16_loop + RET + +ALIGN function_align +.w32: + movu m6, [tlq+1] ;top row + lea filterq, [tlq+17] + sub tlq, 5 + sub tlq, hq + +.w32_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+4+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+8+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movu m1, [filterq] + punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ + punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+12+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+strideq*1], m6 + + mova m6, m1 + + FILTER 7, 0, 6, [base+filter_shuf2] + punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+16+strideq*0], m7 + psrlq m7, 32 + palignr m7, m1, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+20+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+24+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movd [dstq+28+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+16+strideq*1], m6 + + mova m6, [dstq+strideq*1] + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + lea filterq, [dstq+16+strideq*1] + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET diff -Nru dav1d-0.7.1/src/x86/ipred_ssse3.asm dav1d-0.9.1/src/x86/ipred_ssse3.asm --- dav1d-0.7.1/src/x86/ipred_ssse3.asm 2020-06-21 11:48:55.020126300 +0000 +++ dav1d-0.9.1/src/x86/ipred_ssse3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,3108 +0,0 @@ -; Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
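The chroma-from-luma routines added above split the work exactly as the comments embedded in the asm indicate: the ipred_cfl_ac_4xx functions build a mean-removed AC buffer from subsampled luma ("const int log2sz = ctz(width) + ctz(height)", "sum >>= log2sz", "ac[x] -= sum"), and the IPRED_CFL macro then scales that buffer by alpha with pmulhrsw, which for abs(alpha) << 9 evaluates to (abs(alpha) * abs(ac) + 32) >> 6 before the sign is restored and the DC value is added. The following is a hedged scalar sketch of both stages for the 4:2:0 case only, with illustrative names (cfl_ac_420_c, cfl_pred_c) that are not the dav1d C reference API; the wpad/hpad edge replication handled by the asm is ignored here.

/* Hedged scalar sketch of the two CfL stages (illustrative, 4:2:0 only). */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

static void cfl_ac_420_c(int16_t *ac, const uint8_t *y, ptrdiff_t stride,
                         int w, int h) /* w, h: chroma block size, pow2 */
{
    int sum = 0;
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            /* 2x2 luma sum scaled by 2 (pmaddubsw with pb_2 + row add) */
            const int px = (y[2 * i] + y[2 * i + 1] +
                            y[2 * i + stride] + y[2 * i + 1 + stride]) << 1;
            ac[j * w + i] = (int16_t)px;
            sum += px;
        }
        y += 2 * stride;
    }
    /* remove the rounded mean from every entry */
    const int log2sz = __builtin_ctz(w) + __builtin_ctz(h);
    const int avg = (sum + (1 << (log2sz - 1))) >> log2sz;
    for (int i = 0; i < w * h; i++)
        ac[i] -= avg;
}

static void cfl_pred_c(uint8_t *dst, ptrdiff_t stride, const int16_t *ac,
                       int alpha, int dc, int w, int h)
{
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int diff   = alpha * ac[i];             /* signed product */
            const int scaled = (abs(diff) + 32) >> 6;     /* pmulhrsw path  */
            const int px     = dc + (diff < 0 ? -scaled : scaled);
            dst[i] = (uint8_t)(px < 0 ? 0 : px > 255 ? 255 : px); /* packuswb */
        }
        dst += stride;
        ac  += w;
    }
}

The 4:2:2 and 4:4:4 variants above differ only in how each AC entry is formed (pb_4 weights, with the 4:4:4 path duplicating each luma byte), so all three reach the same fixed-point scale before the mean subtraction.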
- -%include "ext/x86/x86inc.asm" - -SECTION_RODATA 16 - -%macro SMOOTH_WEIGHT_TABLE 1-* - %rep %0 - db %1-128, 127-%1 - %rotate 1 - %endrep -%endmacro - -; sm_weights[], but modified to precalculate x and 256-x with offsets to -; enable efficient use of pmaddubsw (which requires signed values) -smooth_weights: SMOOTH_WEIGHT_TABLE \ - 0, 0, 255, 128, 255, 149, 85, 64, \ - 255, 197, 146, 105, 73, 50, 37, 32, \ - 255, 225, 196, 170, 145, 123, 102, 84, \ - 68, 54, 43, 33, 26, 20, 17, 16, \ - 255, 240, 225, 210, 196, 182, 169, 157, \ - 145, 133, 122, 111, 101, 92, 83, 74, \ - 66, 59, 52, 45, 39, 34, 29, 25, \ - 21, 17, 14, 12, 10, 9, 8, 8, \ - 255, 248, 240, 233, 225, 218, 210, 203, \ - 196, 189, 182, 176, 169, 163, 156, 150, \ - 144, 138, 133, 127, 121, 116, 111, 106, \ - 101, 96, 91, 86, 82, 77, 73, 69, \ - 65, 61, 57, 54, 50, 47, 44, 41, \ - 38, 35, 32, 29, 27, 25, 22, 20, \ - 18, 16, 15, 13, 12, 10, 9, 8, \ - 7, 6, 6, 5, 5, 4, 4, 4 - -ipred_v_shuf : db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 -ipred_h_shuf : db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 -ipred_paeth_shuf : db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 -filter_shuf1 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 -filter_shuf2 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 - -pw_8 : times 8 dw 8 -pb_3 : times 16 db 3 -pb_128 : times 8 db 128 -pw_128 : times 4 dw 128 -pw_255 : times 4 dw 255 -pb_2 : times 8 db 2 -pb_4 : times 8 db 4 -pb_127_m127 : times 4 db 127, -127 -pd_32768 : times 1 dd 32768 - - -%macro JMP_TABLE 3-* - %xdefine %1_%2_table (%%table - 2*4) - %xdefine %%base mangle(private_prefix %+ _%1_%2) - %%table: - %rep %0 - 2 - dd %%base %+ .%3 - (%%table - 2*4) - %rotate 1 - %endrep -%endmacro - -%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) -%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4) - -JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 -JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ - s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 -JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 -JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 -JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 -JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 -JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 -JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 -JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ - s4-8*4, s8-8*4, s16-8*4, s32-8*4 -JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 -JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 - -cextern filter_intra_taps - - -SECTION .text - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8 - pshuflw m1, m0, %3 ; extend 8 byte for 2 pos - punpcklqdq m1, m1 - mova [dstq + %2], m1 -%if %1 > 16 - mova [dstq + 16 + %2], m1 -%endif -%if %1 > 32 - mova [dstq + 32 + %2], m1 - mova [dstq + 48 + %2], m1 -%endif -%endmacro - -%macro IPRED_H 1 ; width - sub tlq, 4 - movd m0, [tlq] ; get 4 bytes of topleft data - punpcklbw m0, m0 ; extend 2 byte -%if %1 == 4 - pshuflw m1, m0, q2233 - movd [dstq+strideq*0], m1 - psrlq m1, 32 - movd [dstq+strideq*1], m1 - pshuflw m0, m0, q0011 - movd [dstq+strideq*2], m0 - psrlq m0, 32 - movd [dstq+stride3q ], m0 - 
-%elif %1 == 8 - punpcklwd m0, m0 - punpckhdq m1, m0, m0 - punpckldq m0, m0 - movq [dstq+strideq*1], m1 - movhps [dstq+strideq*0], m1 - movq [dstq+stride3q ], m0 - movhps [dstq+strideq*2], m0 -%else - IPRED_SET %1, 0, q3333 - IPRED_SET %1, strideq, q2222 - IPRED_SET %1, strideq*2, q1111 - IPRED_SET %1, stride3q, q0000 -%endif - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w%1 - RET -%endmacro - -INIT_XMM ssse3 -cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3 - LEA r5, ipred_h_ssse3_table - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, [r5+wq*4] - add wq, r5 - lea stride3q, [strideq*3] - jmp wq -.w4: - IPRED_H 4 -.w8: - IPRED_H 8 -.w16: - IPRED_H 16 -.w32: - IPRED_H 32 -.w64: - IPRED_H 64 - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 - LEA r5, ipred_dc_splat_ssse3_table - tzcnt wd, wm - movu m0, [tlq+ 1] - movu m1, [tlq+17] - movu m2, [tlq+33] - movu m3, [tlq+49] - movifnidn hd, hm - movsxd wq, [r5+wq*4] - add wq, r5 - lea stride3q, [strideq*3] - jmp wq - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 - movifnidn hd, hm - movifnidn wd, wm - tzcnt r6d, hd - lea r5d, [wq+hq] - movd m4, r5d - tzcnt r5d, r5d - movd m5, r5d - LEA r5, ipred_dc_ssse3_table - tzcnt wd, wd - movsxd r6, [r5+r6*4] - movsxd wq, [r5+wq*4+20] - pcmpeqd m3, m3 - psrlw m4, 1 ; dc = (width + height) >> 1; - add r6, r5 - add wq, r5 - lea stride3q, [strideq*3] - jmp r6 -.h4: - movd m0, [tlq-4] - pmaddubsw m0, m3 - jmp wq -.w4: - movd m1, [tlq+1] - pmaddubsw m1, m3 - psubw m0, m4 - paddw m0, m1 - pmaddwd m0, m3 - cmp hd, 4 - jg .w4_mul - psrlw m0, 3 ; dc >>= ctz(width + height); - jmp .w4_end -.w4_mul: - punpckhqdq m1, m0, m0 - paddw m0, m1 - psrlq m1, m0, 32 - paddw m0, m1 - psrlw m0, 2 - mov r6d, 0x5556 - mov r2d, 0x3334 - test hd, 8 - cmovz r6d, r2d - movd m5, r6d - pmulhuw m0, m5 -.w4_end: - pxor m1, m1 - pshufb m0, m1 -.s4: - movd [dstq+strideq*0], m0 - movd [dstq+strideq*1], m0 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s4 - RET -ALIGN function_align -.h8: - movq m0, [tlq-8] - pmaddubsw m0, m3 - jmp wq -.w8: - movq m1, [tlq+1] - pmaddubsw m1, m3 - psubw m4, m0 - punpckhqdq m0, m0 - psubw m0, m4 - paddw m0, m1 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - pmaddwd m0, m3 - psrlw m0, m5 - cmp hd, 8 - je .w8_end - mov r6d, 0x5556 - mov r2d, 0x3334 - cmp hd, 32 - cmovz r6d, r2d - movd m1, r6d - pmulhuw m0, m1 -.w8_end: - pxor m1, m1 - pshufb m0, m1 -.s8: - movq [dstq+strideq*0], m0 - movq [dstq+strideq*1], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s8 - RET -ALIGN function_align -.h16: - mova m0, [tlq-16] - pmaddubsw m0, m3 - jmp wq -.w16: - movu m1, [tlq+1] - pmaddubsw m1, m3 - paddw m0, m1 - psubw m4, m0 - punpckhqdq m0, m0 - psubw m0, m4 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - pmaddwd m0, m3 - psrlw m0, 
m5 - cmp hd, 16 - je .w16_end - mov r6d, 0x5556 - mov r2d, 0x3334 - test hd, 8|32 - cmovz r6d, r2d - movd m1, r6d - pmulhuw m0, m1 -.w16_end: - pxor m1, m1 - pshufb m0, m1 -.s16: - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s16 - RET -ALIGN function_align -.h32: - mova m0, [tlq-32] - pmaddubsw m0, m3 - mova m2, [tlq-16] - pmaddubsw m2, m3 - paddw m0, m2 - jmp wq -.w32: - movu m1, [tlq+1] - pmaddubsw m1, m3 - movu m2, [tlq+17] - pmaddubsw m2, m3 - paddw m1, m2 - paddw m0, m1 - psubw m4, m0 - punpckhqdq m0, m0 - psubw m0, m4 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - pmaddwd m0, m3 - psrlw m0, m5 - cmp hd, 32 - je .w32_end - lea r2d, [hq*2] - mov r6d, 0x5556 - mov r2d, 0x3334 - test hd, 64|16 - cmovz r6d, r2d - movd m1, r6d - pmulhuw m0, m1 -.w32_end: - pxor m1, m1 - pshufb m0, m1 - mova m1, m0 -.s32: - mova [dstq], m0 - mova [dstq+16], m1 - mova [dstq+strideq], m0 - mova [dstq+strideq+16], m1 - mova [dstq+strideq*2], m0 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q], m0 - mova [dstq+stride3q+16], m1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .s32 - RET -ALIGN function_align -.h64: - mova m0, [tlq-64] - mova m1, [tlq-48] - pmaddubsw m0, m3 - pmaddubsw m1, m3 - paddw m0, m1 - mova m1, [tlq-32] - pmaddubsw m1, m3 - paddw m0, m1 - mova m1, [tlq-16] - pmaddubsw m1, m3 - paddw m0, m1 - jmp wq -.w64: - movu m1, [tlq+ 1] - movu m2, [tlq+17] - pmaddubsw m1, m3 - pmaddubsw m2, m3 - paddw m1, m2 - movu m2, [tlq+33] - pmaddubsw m2, m3 - paddw m1, m2 - movu m2, [tlq+49] - pmaddubsw m2, m3 - paddw m1, m2 - paddw m0, m1 - psubw m4, m0 - punpckhqdq m0, m0 - psubw m0, m4 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - pmaddwd m0, m3 - psrlw m0, m5 - cmp hd, 64 - je .w64_end - mov r6d, 0x5556 - mov r2d, 0x3334 - test hd, 32 - cmovz r6d, r2d - movd m1, r6d - pmulhuw m0, m1 -.w64_end: - pxor m1, m1 - pshufb m0, m1 - mova m1, m0 - mova m2, m0 - mova m3, m0 -.s64: - mova [dstq], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - mova [dstq+strideq], m0 - mova [dstq+strideq+16], m1 - mova [dstq+strideq+32], m2 - mova [dstq+strideq+48], m3 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .s64 - RET - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 - LEA r5, ipred_dc_left_ssse3_table - mov hd, hm ; zero upper half - tzcnt r6d, hd - sub tlq, hq - tzcnt wd, wm - movu m0, [tlq] - movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] - movd m2, r6d - psrld m3, m2 - movsxd r6, [r5+r6*4] - pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, r5 - add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table - movsxd wq, [r5+wq*4] - add wq, r5 - jmp r6 -.h64: - movu m1, [tlq+48] ; unaligned when jumping here from dc_top - pmaddubsw m1, m2 - paddw m0, m1 - movu m1, [tlq+32] ; unaligned when jumping here from dc_top - pmaddubsw m1, m2 - paddw m0, m1 -.h32: - movu m1, [tlq+16] ; unaligned when jumping here from dc_top - pmaddubsw m1, m2 - paddw m0, m1 -.h16: - pshufd m1, m0, q3232 ; psrlq m1, m0, 16 - paddw m0, m1 -.h8: - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 -.h4: - pmaddwd m0, m2 - pmulhrsw m0, m3 - lea stride3q, 
[strideq*3] - pxor m1, m1 - pshufb m0, m1 - mova m1, m0 - mova m2, m0 - mova m3, m0 - jmp wq - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 - LEA r5, ipred_dc_splat_ssse3_table - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, [r5+wq*4] - movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] - mova m1, m0 - mova m2, m0 - mova m3, m0 - add wq, r5 - lea stride3q, [strideq*3] - jmp wq - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h - LEA r5, ipred_dc_left_ssse3_table - tzcnt wd, wm - inc tlq - movu m0, [tlq] - movifnidn hd, hm - movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] - movd m2, wd - psrld m3, m2 - movsxd r6, [r5+wq*4] - pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, r5 - add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table - movsxd wq, [r5+wq*4] - add wq, r5 - jmp r6 - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] - ; w * a = (w - 128) * a + 128 * a - ; (256 - w) * b = (127 - w) * b + 129 * b - ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] - pmaddubsw m6, m%3, m%1 - pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b - paddw m6, m%5 - paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] - psrlw m6, 8 - psrlw m0, 8 - packuswb m6, m0 -%endmacro - -cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights -%define base r6-ipred_smooth_v_ssse3_table - LEA r6, ipred_smooth_v_ssse3_table - tzcnt wd, wm - mov hd, hm - movsxd wq, [r6+wq*4] - movddup m0, [base+pb_127_m127] - movddup m1, [base+pw_128] - lea weightsq, [base+smooth_weights+hq*4] - neg hq - movd m5, [tlq+hq] - pxor m2, m2 - pshufb m5, m2 - add wq, r6 - jmp wq -.w4: - movd m2, [tlq+1] - punpckldq m2, m2 - punpcklbw m2, m5 ; top, bottom - lea r3, [strideq*3] - mova m4, [base+ipred_v_shuf] - mova m5, m4 - punpckldq m4, m4 - punpckhdq m5, m5 - pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom - paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok - paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 -.w4_loop: - movu m1, [weightsq+hq*2] - pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop - pshufb m1, m5 - SMOOTH 0, 1, 2, 2, 3, 3 - movd [dstq+strideq*0], m6 - pshuflw m1, m6, q1032 - movd [dstq+strideq*1], m1 - punpckhqdq m6, m6 - movd [dstq+strideq*2], m6 - psrlq m6, 32 - movd [dstq+r3 ], m6 - lea dstq, [dstq+strideq*4] - add hq, 4 - jl .w4_loop - RET -ALIGN function_align -.w8: - movq m2, [tlq+1] - punpcklbw m2, m5 - mova m5, [base+ipred_v_shuf] - lea r3, [strideq*3] - pshufd m4, m5, q0000 - pshufd m5, m5, q1111 - pmaddubsw m3, m2, m0 - paddw m1, m2 - 
paddw m3, m1 ; m3 is output for loop -.w8_loop: - movq m1, [weightsq+hq*2] - pshufb m0, m1, m4 - pshufb m1, m5 - SMOOTH 0, 1, 2, 2, 3, 3 - movq [dstq+strideq*0], m6 - movhps [dstq+strideq*1], m6 - lea dstq, [dstq+strideq*2] - add hq, 2 - jl .w8_loop - RET -ALIGN function_align -.w16: - movu m3, [tlq+1] - punpcklbw m2, m3, m5 - punpckhbw m3, m5 - pmaddubsw m4, m2, m0 - pmaddubsw m5, m3, m0 - paddw m0, m1, m2 - paddw m1, m3 - paddw m4, m0 - paddw m5, m1 ; m4 and m5 is output for loop -.w16_loop: - movd m1, [weightsq+hq*2] - pshuflw m1, m1, q0000 - punpcklqdq m1, m1 - SMOOTH 1, 1, 2, 3, 4, 5 - mova [dstq], m6 - add dstq, strideq - add hq, 1 - jl .w16_loop - RET -ALIGN function_align -.w32: -%if WIN64 - movaps [rsp+24], xmm7 - %define xmm_regs_used 8 -%endif - mova m7, m5 -.w32_loop_init: - mov r3d, 2 -.w32_loop: - movddup m0, [base+pb_127_m127] - movddup m1, [base+pw_128] - movu m3, [tlq+1] - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - pmaddubsw m4, m2, m0 - pmaddubsw m5, m3, m0 - paddw m0, m1, m2 - paddw m1, m3 - paddw m4, m0 - paddw m5, m1 - movd m1, [weightsq+hq*2] - pshuflw m1, m1, q0000 - punpcklqdq m1, m1 - SMOOTH 1, 1, 2, 3, 4, 5 - mova [dstq], m6 - add tlq, 16 - add dstq, 16 - dec r3d - jg .w32_loop - lea dstq, [dstq-32+strideq] - sub tlq, 32 - add hq, 1 - jl .w32_loop_init - RET -ALIGN function_align -.w64: -%if WIN64 - movaps [rsp+24], xmm7 - %define xmm_regs_used 8 -%endif - mova m7, m5 -.w64_loop_init: - mov r3d, 4 -.w64_loop: - movddup m0, [base+pb_127_m127] - movddup m1, [base+pw_128] - movu m3, [tlq+1] - punpcklbw m2, m3, m7 - punpckhbw m3, m7 - pmaddubsw m4, m2, m0 - pmaddubsw m5, m3, m0 - paddw m0, m1, m2 - paddw m1, m3 - paddw m4, m0 - paddw m5, m1 - movd m1, [weightsq+hq*2] - pshuflw m1, m1, q0000 - punpcklqdq m1, m1 - SMOOTH 1, 1, 2, 3, 4, 5 - mova [dstq], m6 - add tlq, 16 - add dstq, 16 - dec r3d - jg .w64_loop - lea dstq, [dstq-64+strideq] - sub tlq, 64 - add hq, 1 - jl .w64_loop_init - RET - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h -%define base r6-ipred_smooth_h_ssse3_table - LEA r6, ipred_smooth_h_ssse3_table - mov wd, wm - movd m3, [tlq+wq] - pxor m1, m1 - pshufb m3, m1 ; right - tzcnt wd, wd - mov hd, hm - movsxd wq, [r6+wq*4] - movddup m4, [base+pb_127_m127] - movddup m5, [base+pw_128] - add wq, r6 - jmp wq -.w4: - movddup m6, [base+smooth_weights+4*2] - mova m7, [base+ipred_h_shuf] - sub tlq, 4 - sub tlq, hq - lea r3, [strideq*3] -.w4_loop: - movd m2, [tlq+hq] ; left - pshufb m2, m7 - punpcklbw m1, m2, m3 ; left, right - punpckhbw m2, m3 - pmaddubsw m0, m1, m4 ; 127 * left - 127 * right - paddw m0, m1 ; 128 * left + 129 * right - pmaddubsw m1, m6 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m1, m2 - pmaddubsw m2, m6 - paddw m2, m5 - paddw m1, m2 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 - movd [dstq+strideq*0], m0 - pshuflw m1, m0, q1032 - movd [dstq+strideq*1], m1 - punpckhqdq m0, m0 - movd [dstq+strideq*2], m0 - psrlq m0, 32 - movd [dstq+r3 ], m0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4_loop - RET -ALIGN function_align -.w8: - mova m6, [base+smooth_weights+8*2] - mova m7, [base+ipred_h_shuf] - sub tlq, 4 - sub tlq, hq - punpckldq m7, m7 -.w8_loop: - movd m2, [tlq+hq] ; left - pshufb m2, m7 - 
punpcklbw m1, m2, m3 ; left, right - punpckhbw m2, m3 - pmaddubsw m0, m1, m4 ; 127 * left - 127 * right - paddw m0, m1 ; 128 * left + 129 * right - pmaddubsw m1, m6 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m1, m2 - pmaddubsw m2, m6 - paddw m2, m5 - paddw m1, m2 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 - movq [dstq+strideq*0], m0 - movhps [dstq+strideq*1], m0 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w8_loop - RET -ALIGN function_align -.w16: - mova m6, [base+smooth_weights+16*2] - mova m7, [base+smooth_weights+16*3] - sub tlq, 1 - sub tlq, hq -.w16_loop: - pxor m1, m1 - movd m2, [tlq+hq] ; left - pshufb m2, m1 - punpcklbw m1, m2, m3 ; left, right - punpckhbw m2, m3 - pmaddubsw m0, m1, m4 ; 127 * left - 127 * right - paddw m0, m1 ; 128 * left + 129 * right - pmaddubsw m1, m6 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m1, m2 - pmaddubsw m2, m7 - paddw m2, m5 - paddw m1, m2 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 - mova [dstq], m0 - lea dstq, [dstq+strideq] - sub hd, 1 - jg .w16_loop - RET -ALIGN function_align -.w32: - sub tlq, 1 - sub tlq, hq - pxor m6, m6 -.w32_loop_init: - mov r5, 2 - lea r3, [base+smooth_weights+16*4] -.w32_loop: - mova m7, [r3] - add r3, 16 - movd m2, [tlq+hq] ; left - pshufb m2, m6 - punpcklbw m1, m2, m3 ; left, right - punpckhbw m2, m3 - pmaddubsw m0, m1, m4 ; 127 * left - 127 * right - paddw m0, m1 ; 128 * left + 129 * right - pmaddubsw m1, m7 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m1, m2 - mova m7, [r3] - add r3, 16 - pmaddubsw m2, m7 - paddw m2, m5 - paddw m1, m2 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 - mova [dstq], m0 - add dstq, 16 - dec r5 - jg .w32_loop - lea dstq, [dstq-32+strideq] - sub hd, 1 - jg .w32_loop_init - RET -ALIGN function_align -.w64: - sub tlq, 1 - sub tlq, hq - pxor m6, m6 -.w64_loop_init: - mov r5, 4 - lea r3, [base+smooth_weights+16*8] -.w64_loop: - mova m7, [r3] - add r3, 16 - movd m2, [tlq+hq] ; left - pshufb m2, m6 - punpcklbw m1, m2, m3 ; left, right - punpckhbw m2, m3 - pmaddubsw m0, m1, m4 ; 127 * left - 127 * right - paddw m0, m1 ; 128 * left + 129 * right - pmaddubsw m1, m7 - paddw m1, m5 - paddw m0, m1 - pmaddubsw m1, m2, m4 - paddw m1, m2 - mova m7, [r3] - add r3, 16 - pmaddubsw m2, m7 - paddw m2, m5 - paddw m1, m2 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 - mova [dstq], m0 - add dstq, 16 - dec r5 - jg .w64_loop - lea dstq, [dstq-64+strideq] - sub hd, 1 - jg .w64_loop_init - RET - -;--------------------------------------------------------------------------------------- -;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int a); -;--------------------------------------------------------------------------------------- -%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 - pmaddubsw m6, m%3, m%1 - mova m0, m6 - pmaddubsw m6, m%4, m%2 - mova m1, m6 -%ifnum %5 - paddw m0, m%5 -%else - paddw m0, %5 -%endif -%ifnum %6 - paddw m1, m%6 -%else - paddw m1, %6 -%endif -%ifnum %7 -%else - mova m3, %7 -%endif - pavgw m0, m2 - pavgw m1, m3 - psrlw m0, 8 - psrlw m1, 8 - packuswb m0, m1 -%endmacro - -%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] - mova m1, [rsp+16*%1] ; top - punpckhbw m6, m1, m0 ; top, bottom - punpcklbw m1, m0 ; top, bottom - pmaddubsw m2, m1, m5 - mova [rsp+16*%2], m1 - paddw m1, m3 ; 1 * top + 255 * bottom + 255 - paddw m2, m1 ; 128 * top + 129 * bottom + 255 - mova [rsp+16*%3], m2 - pmaddubsw m2, 
m6, m5 - mova [rsp+16*%4], m6 - paddw m6, m3 ; 1 * top + 255 * bottom + 255 - paddw m2, m6 ; 128 * top + 129 * bottom + 255 - mova [rsp+16*%5], m2 - movd m1, [tlq+hq] ; left - pshufb m1, [base+pb_3] ; topleft[-(1 + y)] - punpcklbw m1, m4 ; left, right - pmaddubsw m2, m1, m5 ; 127 * left - 127 * right - paddw m2, m1 ; 128 * left + 129 * right - mova m3, m2 - pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; - pmaddubsw m1, %7 - paddw m2, m3, m0 - paddw m3, m1 - movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; - mova m7, [rsp+16*%9] - pshufb m1, m7 - mova [rsp+16*%8], m3 - mova m4, [rsp+16*%2] - mova m5, [rsp+16*%3] - mova m3, [rsp+16*%4] - mova m7, [rsp+16*%5] - SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] - mova [dstq], m0 - movddup m3, [base+pw_255] ; recovery - mova m0, [rsp+16*%10] ; recovery - mova m4, [rsp+16*%11] ; recovery - mova m5, [rsp+16*%12] ; recovery -%endmacro - -cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights -%define base r6-ipred_smooth_ssse3_table - mov wd, wm - mov hd, hm - LEA r6, ipred_smooth_ssse3_table - movd m4, [tlq+wq] ; right - pxor m2, m2 - pshufb m4, m2 - tzcnt wd, wd - mov r5, tlq - sub r5, hq - movsxd wq, [r6+wq*4] - movddup m5, [base+pb_127_m127] - movd m0, [r5] - pshufb m0, m2 ; bottom - movddup m3, [base+pw_255] - add wq, r6 - lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] - jmp wq -.w4: - mova m7, [base+ipred_v_shuf] - movd m1, [tlq+1] ; left - pshufd m1, m1, q0000 - sub tlq, 4 - lea r3, [strideq*3] - sub tlq, hq - punpcklbw m1, m0 ; top, bottom - pshufd m6, m7, q1100 - pshufd m7, m7, q3322 - pmaddubsw m2, m1, m5 - paddw m3, m1 ; 1 * top + 255 * bottom + 255 - paddw m2, m3 ; 128 * top + 129 * bottom + 255 - mova [rsp+16*0], m1 - mova [rsp+16*1], m2 - movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; - punpcklqdq m1, m1 - mova [rsp+16*2], m1 - mova [rsp+16*3], m4 - mova [rsp+16*4], m6 - mova [rsp+16*5], m5 -.w4_loop: - movd m1, [tlq+hq] ; left - pshufb m1, [base+ipred_h_shuf] - punpcklbw m0, m1, m4 ; left, right - punpckhbw m1, m4 - pmaddubsw m2, m0, m5 ; 127 * left - 127 * right - pmaddubsw m3, m1, m5 - paddw m2, m0 ; 128 * left + 129 * right - paddw m3, m1 - mova m4, [rsp+16*2] - pmaddubsw m0, m4 - pmaddubsw m1, m4 - paddw m2, m0 - paddw m3, m1 - movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; - add v_weightsq, 8 - pshufb m0, m1, m6 - pshufb m1, m7 - mova m4, [rsp+16*0] - mova m5, [rsp+16*1] - SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 - mova m4, [rsp+16*3] - mova m6, [rsp+16*4] - mova m5, [rsp+16*5] - movd [dstq+strideq*0], m0 - pshuflw m1, m0, q1032 - movd [dstq+strideq*1], m1 - punpckhqdq m0, m0 - movd [dstq+strideq*2], m0 - psrlq m0, 32 - movd [dstq+r3 ], m0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4_loop - RET -ALIGN function_align -.w8: - mova m7, [base+ipred_v_shuf] - movq m1, [tlq+1] ; left - punpcklqdq m1, m1 - sub tlq, 4 - sub tlq, hq - punpcklbw m1, m0 - pshufd m6, m7, q0000 - pshufd m7, m7, q1111 - pmaddubsw m2, m1, m5 - paddw m3, m1 - paddw m2, m3 - mova [rsp+16*0], m1 - mova [rsp+16*1], m2 - mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; - mova [rsp+16*2], m1 - mova [rsp+16*3], m4 - mova [rsp+16*4], m6 - mova [rsp+16*5], m5 -.w8_loop: - movd m1, [tlq+hq] ; left - pshufb m1, [base+ipred_h_shuf] - pshufd m1, m1, q1100 - punpcklbw m0, m1, m4 - punpckhbw m1, m4 - pmaddubsw m2, m0, m5 - pmaddubsw m3, m1, m5 - paddw m2, m0 - paddw m3, m1 - mova m4, [rsp+16*2] - pmaddubsw m0, m4 - 
pmaddubsw m1, m4 - paddw m2, m0 - paddw m3, m1 - movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; - add v_weightsq, 4 - pshufb m0, m1, m6 - pshufb m1, m7 - mova m4, [rsp+16*0] - mova m5, [rsp+16*1] - SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 - mova m4, [rsp+16*3] - mova m6, [rsp+16*4] - mova m5, [rsp+16*5] - movq [dstq+strideq*0], m0 - movhps [dstq+strideq*1], m0 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w8_loop - RET -ALIGN function_align -.w16: - mova m7, [base+ipred_v_shuf] - movu m1, [tlq+1] ; left - sub tlq, 4 - sub tlq, hq - punpckhbw m6, m1, m0 ; top, bottom - punpcklbw m1, m0 ; top, bottom - pshufd m7, m7, q0000 - mova [rsp+16*2], m7 - pmaddubsw m2, m6, m5 - mova [rsp+16*5], m6 - paddw m6, m3 ; 1 * top + 255 * bottom + 255 - paddw m2, m6 ; 128 * top + 129 * bottom + 255 - mova [rsp+16*6], m2 - pmaddubsw m2, m1, m5 - paddw m3, m1 ; 1 * top + 255 * bottom + 255 - mova [rsp+16*0], m1 - paddw m2, m3 ; 128 * top + 129 * bottom + 255 - mova [rsp+16*1], m2 - mova [rsp+16*3], m4 - mova [rsp+16*4], m5 -.w16_loop: - movd m1, [tlq+hq] ; left - pshufb m1, [base+pb_3] ; topleft[-(1 + y)] - punpcklbw m1, m4 ; left, right - pmaddubsw m2, m1, m5 ; 127 * left - 127 * right - paddw m2, m1 ; 128 * left + 129 * right - mova m0, m1 - mova m3, m2 - pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; - pmaddubsw m1, [base+smooth_weights+16*3] - paddw m2, m0 - paddw m3, m1 - movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; - add v_weightsq, 2 - mova m7, [rsp+16*2] - pshufb m1, m7 - mova [rsp+16*7], m3 - mova m4, [rsp+16*0] - mova m5, [rsp+16*1] - mova m3, [rsp+16*5] - mova m7, [rsp+16*6] - SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] - mova m4, [rsp+16*3] - mova m5, [rsp+16*4] - mova [dstq], m0 - lea dstq, [dstq+strideq] - sub hd, 1 - jg .w16_loop - RET -ALIGN function_align -.w32: - movu m1, [tlq+1] ; top topleft[1 + x] - movu m2, [tlq+17] ; top - mova [rsp+16*0], m1 - mova [rsp+16*1], m2 - sub tlq, 4 - sub tlq, hq - mova m7, [base+ipred_v_shuf] - pshufd m7, m7, q0000 - mova [rsp+16*2], m7 - mova [rsp+16*3], m0 - mova [rsp+16*4], m4 - mova [rsp+16*5], m5 -.w32_loop: - SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 - add dstq, 16 - SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 - lea dstq, [dstq-16+strideq] - add v_weightsq, 2 - sub hd, 1 - jg .w32_loop - RET -ALIGN function_align -.w64: - movu m1, [tlq+1] ; top topleft[1 + x] - movu m2, [tlq+17] ; top - mova [rsp+16*0], m1 - mova [rsp+16*1], m2 - movu m1, [tlq+33] ; top - movu m2, [tlq+49] ; top - mova [rsp+16*11], m1 - mova [rsp+16*12], m2 - sub tlq, 4 - sub tlq, hq - mova m7, [base+ipred_v_shuf] - pshufd m7, m7, q0000 - mova [rsp+16*2], m7 - mova [rsp+16*3], m0 - mova [rsp+16*4], m4 - mova [rsp+16*5], m5 -.w64_loop: - SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 - add dstq, 16 - SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 - add dstq, 16 - SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 - add dstq, 16 - SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 - lea dstq, [dstq-48+strideq] - add v_weightsq, 2 - sub hd, 1 - jg .w64_loop - RET - -;--------------------------------------------------------------------------------------- -;int 
dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, -; const uint8_t *idx, const int w, const int h); -;--------------------------------------------------------------------------------------- -cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h - mova m4, [palq] - LEA r2, pal_pred_ssse3_table - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, [r2+wq*4] - packuswb m4, m4 - add wq, r2 - lea r2, [strideq*3] - jmp wq -.w4: - pshufb m0, m4, [idxq] - add idxq, 16 - movd [dstq ], m0 - pshuflw m1, m0, q1032 - movd [dstq+strideq ], m1 - punpckhqdq m0, m0 - movd [dstq+strideq*2], m0 - psrlq m0, 32 - movd [dstq+r2 ], m0 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4 - RET -ALIGN function_align -.w8: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - add idxq, 32 - movq [dstq ], m0 - movhps [dstq+strideq ], m0 - movq [dstq+strideq*2], m1 - movhps [dstq+r2 ], m1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w8 - RET -ALIGN function_align -.w16: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - pshufb m2, m4, [idxq+32] - pshufb m3, m4, [idxq+48] - add idxq, 64 - mova [dstq ], m0 - mova [dstq+strideq ], m1 - mova [dstq+strideq*2], m2 - mova [dstq+r2 ], m3 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w16 - RET -ALIGN function_align -.w32: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - pshufb m2, m4, [idxq+32] - pshufb m3, m4, [idxq+48] - add idxq, 64 - mova [dstq ], m0 - mova [dstq+16 ], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16], m3 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w32 - RET -ALIGN function_align -.w64: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - pshufb m2, m4, [idxq+32] - pshufb m3, m4, [idxq+48] - add idxq, 64 - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - add dstq, strideq - sub hd, 1 - jg .w64 - RET - -;--------------------------------------------------------------------------------------- -;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int16_t *ac, const int alpha); -;--------------------------------------------------------------------------------------- -%macro IPRED_CFL 1 ; ac in, unpacked pixels out - psignw m3, m%1, m1 - pabsw m%1, m%1 - pmulhrsw m%1, m2 - psignw m%1, m3 - paddw m%1, m0 -%endmacro - -%if UNIX64 -DECLARE_REG_TMP 7 -%else -DECLARE_REG_TMP 5 -%endif - -cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - movifnidn wd, wm - movifnidn hd, hm - tzcnt r6d, hd - lea t0d, [wq+hq] - movd m4, t0d - tzcnt t0d, t0d - movd m5, t0d - LEA t0, ipred_cfl_ssse3_table - tzcnt wd, wd - movsxd r6, [t0+r6*4] - movsxd wq, [t0+wq*4+16] - pcmpeqd m3, m3 - psrlw m4, 1 - add r6, t0 - add wq, t0 - movifnidn acq, acmp - jmp r6 -.h4: - movd m0, [tlq-4] - pmaddubsw m0, m3 - jmp wq -.w4: - movd m1, [tlq+1] - pmaddubsw m1, m3 - psubw m0, m4 - paddw m0, m1 - pmaddwd m0, m3 - cmp hd, 4 - jg .w4_mul - psrlw m0, 3 ; dc >>= ctz(width + height); - jmp .w4_end -.w4_mul: - punpckhqdq m1, m0, m0 - paddw m0, m1 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - psrlw m0, 2 - mov r6d, 0x5556 - mov r2d, 0x3334 - test hd, 8 - cmovz r6d, r2d - movd m5, r6d - pmulhuw m0, m5 -.w4_end: - pshuflw m0, m0, q0000 - punpcklqdq m0, m0 -.s4: - movd m1, alpham - pshuflw m1, m1, q0000 - punpcklqdq m1, m1 - lea r6, [strideq*3] - pabsw m2, m1 - psllw m2, 9 -.s4_loop: - mova m4, [acq] - mova m5, [acq+16] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - movd [dstq+strideq*0], m4 - pshuflw m4, m4, q1032 - movd 
[dstq+strideq*1], m4 - punpckhqdq m4, m4 - movd [dstq+strideq*2], m4 - psrlq m4, 32 - movd [dstq+r6 ], m4 - lea dstq, [dstq+strideq*4] - add acq, 32 - sub hd, 4 - jg .s4_loop - RET -ALIGN function_align -.h8: - movq m0, [tlq-8] - pmaddubsw m0, m3 - jmp wq -.w8: - movq m1, [tlq+1] - pmaddubsw m1, m3 - psubw m4, m0 - punpckhqdq m0, m0 - psubw m0, m4 - paddw m0, m1 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - pmaddwd m0, m3 - psrlw m0, m5 - cmp hd, 8 - je .w8_end - mov r6d, 0x5556 - mov r2d, 0x3334 - cmp hd, 32 - cmovz r6d, r2d - movd m1, r6d - pmulhuw m0, m1 -.w8_end: - pshuflw m0, m0, q0000 - punpcklqdq m0, m0 -.s8: - movd m1, alpham - pshuflw m1, m1, q0000 - punpcklqdq m1, m1 - lea r6, [strideq*3] - pabsw m2, m1 - psllw m2, 9 -.s8_loop: - mova m4, [acq] - mova m5, [acq+16] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - movq [dstq ], m4 - movhps [dstq+strideq ], m4 - mova m4, [acq+32] - mova m5, [acq+48] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - movq [dstq+strideq*2], m4 - movhps [dstq+r6 ], m4 - lea dstq, [dstq+strideq*4] - add acq, 64 - sub hd, 4 - jg .s8_loop - RET -ALIGN function_align -.h16: - mova m0, [tlq-16] - pmaddubsw m0, m3 - jmp wq -.w16: - movu m1, [tlq+1] - pmaddubsw m1, m3 - paddw m0, m1 - psubw m4, m0 - punpckhqdq m0, m0 - psubw m0, m4 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - pmaddwd m0, m3 - psrlw m0, m5 - cmp hd, 16 - je .w16_end - mov r6d, 0x5556 - mov r2d, 0x3334 - test hd, 8|32 - cmovz r6d, r2d - movd m1, r6d - pmulhuw m0, m1 -.w16_end: - pshuflw m0, m0, q0000 - punpcklqdq m0, m0 -.s16: - movd m1, alpham - pshuflw m1, m1, q0000 - punpcklqdq m1, m1 - pabsw m2, m1 - psllw m2, 9 -.s16_loop: - mova m4, [acq] - mova m5, [acq+16] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - mova [dstq], m4 - mova m4, [acq+32] - mova m5, [acq+48] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - mova [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - add acq, 64 - sub hd, 2 - jg .s16_loop - RET -ALIGN function_align -.h32: - mova m0, [tlq-32] - pmaddubsw m0, m3 - mova m2, [tlq-16] - pmaddubsw m2, m3 - paddw m0, m2 - jmp wq -.w32: - movu m1, [tlq+1] - pmaddubsw m1, m3 - movu m2, [tlq+17] - pmaddubsw m2, m3 - paddw m1, m2 - paddw m0, m1 - psubw m4, m0 - punpckhqdq m0, m0 - psubw m0, m4 - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 - pmaddwd m0, m3 - psrlw m0, m5 - cmp hd, 32 - je .w32_end - lea r2d, [hq*2] - mov r6d, 0x5556 - mov r2d, 0x3334 - test hd, 64|16 - cmovz r6d, r2d - movd m1, r6d - pmulhuw m0, m1 -.w32_end: - pshuflw m0, m0, q0000 - punpcklqdq m0, m0 -.s32: - movd m1, alpham - pshuflw m1, m1, q0000 - punpcklqdq m1, m1 - pabsw m2, m1 - psllw m2, 9 -.s32_loop: - mova m4, [acq] - mova m5, [acq+16] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - mova [dstq], m4 - mova m4, [acq+32] - mova m5, [acq+48] - IPRED_CFL 4 - IPRED_CFL 5 - packuswb m4, m5 - mova [dstq+16], m4 - add dstq, strideq - add acq, 64 - dec hd - jg .s32_loop - RET - -;--------------------------------------------------------------------------------------- -;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int16_t *ac, const int alpha); -;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - mov hd, hm ; zero upper half - tzcnt r6d, hd - sub tlq, hq - tzcnt wd, wm - movu m0, [tlq] - mov t0d, 0x8000 - movd m3, t0d - movd m2, r6d - psrld m3, m2 - LEA t0, ipred_cfl_left_ssse3_table - 
movsxd r6, [t0+r6*4] - pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, t0 - add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table - movsxd wq, [t0+wq*4] - add wq, t0 - movifnidn acq, acmp - jmp r6 -.h32: - movu m1, [tlq+16] ; unaligned when jumping here from dc_top - pmaddubsw m1, m2 - paddw m0, m1 -.h16: - pshufd m1, m0, q3232 ; psrlq m1, m0, 16 - paddw m0, m1 -.h8: - pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 - paddw m0, m1 -.h4: - pmaddwd m0, m2 - pmulhrsw m0, m3 - pshuflw m0, m0, q0000 - punpcklqdq m0, m0 - jmp wq - -;--------------------------------------------------------------------------------------- -;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int16_t *ac, const int alpha); -;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - LEA t0, ipred_cfl_left_ssse3_table - tzcnt wd, wm - inc tlq - movu m0, [tlq] - movifnidn hd, hm - mov r6d, 0x8000 - movd m3, r6d - movd m2, wd - psrld m3, m2 - movsxd r6, [t0+wq*4] - pcmpeqd m2, m2 - pmaddubsw m0, m2 - add r6, t0 - add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table - movsxd wq, [t0+wq*4] - add wq, t0 - movifnidn acq, acmp - jmp r6 - -;--------------------------------------------------------------------------------------- -;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, -; const int width, const int height, const int16_t *ac, const int alpha); -;--------------------------------------------------------------------------------------- -cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha - tzcnt wd, wm - movifnidn hd, hm - LEA r6, ipred_cfl_splat_ssse3_table - movsxd wq, [r6+wq*4] - movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] - add wq, r6 - movifnidn acq, acmp - jmp wq - -%macro RELOAD_ACQ_32 1 - mov acq, ac_bakq ; restore acq -%endmacro - -%if ARCH_X86_64 -cglobal ipred_cfl_ac_420, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak -DECLARE_REG_TMP 7 - movddup m2, [pb_2] -%else -cglobal ipred_cfl_ac_420, 4, 7, 7, ac, y, stride, wpad, hpad, w, h -DECLARE_REG_TMP 4 -%define ac_bakq acmp - mov t0d, 0x02020202 - movd m2, t0d - pshufd m2, m2, q0000 -%endif - movifnidn wd, wm - mov t0d, hm - mov hd, t0d - imul t0d, wd - movd m5, t0d - movifnidn hpadd, hpadm -%if ARCH_X86_64 - mov ac_bakq, acq -%endif - shl hpadd, 2 - sub hd, hpadd - pxor m4, m4 - cmp wd, 8 - jg .w16 - je .w8 - ; fall-through -%if ARCH_X86_64 - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak -%else - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h -%endif -.w4: - lea stride3q, [strideq*3] -.w4_loop: - movq m0, [yq] - movq m1, [yq+strideq] - movhps m0, [yq+strideq*2] - movhps m1, [yq+stride3q] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq], m0 - paddw m4, m0 - lea yq, [yq+strideq*4] - add acq, 16 - sub hd, 2 - jg .w4_loop - test hpadd, hpadd - jz .calc_avg_4_8 - punpckhqdq m0, m0 -.w4_hpad_loop: - mova [acq], m0 - paddw m4, m0 - add acq, 16 - sub hpadd, 2 - jg .w4_hpad_loop - jmp .calc_avg_4_8 -.w8: - lea stride3q, [strideq*3] - test wpadd, wpadd - jnz .w8_wpad -.w8_loop: - mova m0, [yq] - mova m1, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq], m0 - paddw m4, m0 - mova m0, [yq+strideq*2] - mova m1, [yq+stride3q] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq+16], m0 - paddw m4, m0 - lea yq, [yq+strideq*4] - add acq, 32 - sub hd, 
2 - jg .w8_loop - test hpadd, hpadd - jz .calc_avg_4_8 - jmp .w8_hpad -.w8_wpad: ; wpadd=1 - movddup m0, [yq] - movddup m1, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - pshufhw m0, m0, q3333 - mova [acq], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 16 - sub hd, 1 - jg .w8_wpad - test hpadd, hpadd - jz .calc_avg_4_8 -.w8_hpad: - mova [acq], m0 - paddw m4, m0 - add acq, 16 - sub hpadd, 1 - jg .w8_hpad - jmp .calc_avg_4_8 -.w16: - test wpadd, wpadd - jnz .w16_wpad -.w16_loop: - mova m0, [yq] - mova m1, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq], m0 - paddw m4, m0 - mova m6, [yq+16] - mova m1, [yq+strideq+16] - pmaddubsw m6, m2 - pmaddubsw m1, m2 - paddw m6, m1 - mova [acq+16], m6 - paddw m4, m6 - lea yq, [yq+strideq*2] - add acq, 32 - dec hd - jg .w16_loop - test hpadd, hpadd - jz .calc_avg16 - jmp .w16_hpad_loop -.w16_wpad: - cmp wpadd, 2 - jl .w16_pad1 - je .w16_pad2 -.w16_pad3: - movddup m0, [yq] - movddup m1, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - pshufhw m0, m0, q3333 - mova [acq], m0 - paddw m4, m0 - mova m6, m0 - punpckhqdq m6, m0, m0 - mova [acq+16], m6 - paddw m4, m6 - lea yq, [yq+strideq*2] - add acq, 32 - dec hd - jg .w16_pad3 - jmp .w16_wpad_done -.w16_pad2: - mova m0, [yq] - mova m1, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq], m0 - paddw m4, m0 - pshufhw m6, m0, q3333 - punpckhqdq m6, m6 - mova [acq+16], m6 - paddw m4, m6 - lea yq, [yq+strideq*2] - add acq, 32 - dec hd - jg .w16_pad2 - jmp .w16_wpad_done -.w16_pad1: - mova m0, [yq] - mova m1, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - paddw m0, m1 - mova [acq], m0 - paddw m4, m0 - movddup m6, [yq+16] - movddup m1, [yq+strideq+16] - pmaddubsw m6, m2 - pmaddubsw m1, m2 - paddw m6, m1 - pshufhw m6, m6, q3333 - mova [acq+16], m6 - paddw m4, m6 - lea yq, [yq+strideq*2] - add acq, 32 - dec hd - jg .w16_pad1 -.w16_wpad_done: - test hpadd, hpadd - jz .calc_avg16 -.w16_hpad_loop: - mova [acq], m0 - paddw m4, m0 - mova [acq+16], m6 - paddw m4, m6 - add acq, 32 - dec hpadd - jg .w16_hpad_loop - jmp .calc_avg16 - -%if ARCH_X86_64 - DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak -%else - DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h -%endif -.calc_avg_4_8: - psrlw m2, 9 - pmaddwd m4, m2 - jmp .calc_avg -.calc_avg16: - psrld m0, m4, 16 - pslld m4, 16 - psrld m4, 16 - paddd m4, m0 -.calc_avg: - movd szd, m5 - psrad m5, 1 - tzcnt r1d, szd - paddd m4, m5 - movd m1, r1d - pshufd m0, m4, q2301 - paddd m0, m4 - pshufd m4, m0, q1032 - paddd m0, m4 - psrad m0, m1 ; sum >>= log2sz; - packssdw m0, m0 - RELOAD_ACQ_32 acq -.sub_loop: - mova m1, [acq] - psubw m1, m0 ; ac[x] -= sum; - mova [acq], m1 - add acq, 16 - sub szd, 8 - jg .sub_loop - RET - -%if ARCH_X86_64 -cglobal ipred_cfl_ac_422, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak - movddup m2, [pb_4] -%else -cglobal ipred_cfl_ac_422, 4, 7, 7, ac, y, stride, wpad, hpad, w, h - mov t0d, 0x04040404 - movd m2, t0d - pshufd m2, m2, q0000 -%endif - movifnidn wd, wm - mov t0d, hm - mov hd, t0d - imul t0d, wd - movd m6, t0d - movifnidn hpadd, hpadm -%if ARCH_X86_64 - mov ac_bakq, acq -%endif - shl hpadd, 2 - sub hd, hpadd - pxor m4, m4 - pxor m5, m5 - cmp wd, 8 - jg .w16 - je .w8 - ; fall-through - -%if ARCH_X86_64 - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak -%else - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h -%endif -.w4: - lea stride3q, [strideq*3] -.w4_loop: - movq m1, [yq] - movhps m1, [yq+strideq] - movq m0, [yq+strideq*2] - movhps m0, 
[yq+stride3q] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - mova [acq], m1 - mova [acq+16], m0 - paddw m4, m0 - paddw m5, m1 - lea yq, [yq+strideq*4] - add acq, 32 - sub hd, 4 - jg .w4_loop - test hpadd, hpadd - jz .calc_avg_4 - punpckhqdq m0, m0 -.w4_hpad_loop: - mova [acq], m0 - paddw m4, m0 - add acq, 16 - sub hpadd, 2 - jg .w4_hpad_loop - jmp .calc_avg_4 -.w8: - lea stride3q, [strideq*3] - test wpadd, wpadd - jnz .w8_wpad -.w8_loop: - mova m1, [yq] - mova m0, [yq+strideq] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - mova [acq], m1 - mova [acq+16], m0 - paddw m4, m0 - paddw m5, m1 - mova m1, [yq+strideq*2] - mova m0, [yq+stride3q] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - mova [acq+32], m1 - mova [acq+48], m0 - paddw m4, m0 - paddw m5, m1 - lea yq, [yq+strideq*4] - add acq, 64 - sub hd, 4 - jg .w8_loop - test hpadd, hpadd - jz .calc_avg_8_16 - jmp .w8_hpad -.w8_wpad: - movddup m1, [yq] - pmaddubsw m1, m2 - pshufhw m1, m1, q3333 - mova [acq], m1 - paddw m5, m1 - movddup m0, [yq+strideq] - pmaddubsw m0, m2 - pshufhw m0, m0, q3333 - mova [acq+16], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 32 - sub hd, 2 - jg .w8_wpad - test hpadd, hpadd - jz .calc_avg_8_16 -.w8_hpad: - mova [acq], m0 - paddw m4, m0 - mova [acq+16], m0 - paddw m4, m0 - add acq, 32 - sub hpadd, 2 - jg .w8_hpad - jmp .calc_avg_8_16 -.w16: - test wpadd, wpadd - jnz .w16_wpad -.w16_loop: - mova m1, [yq] - mova m0, [yq+16] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - mova [acq], m1 - mova [acq+16], m0 - paddw m5, m0 - paddw m5, m1 - mova m1, [yq+strideq] - mova m0, [yq+strideq+16] - pmaddubsw m0, m2 - pmaddubsw m1, m2 - mova [acq+32], m1 - mova [acq+48], m0 - paddw m4, m0 - paddw m4, m1 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_loop - test hpadd, hpadd - jz .calc_avg_8_16 - jmp .w16_hpad_loop -.w16_wpad: - cmp wpadd, 2 - jl .w16_pad1 - je .w16_pad2 -.w16_pad3: - movddup m1, [yq] - pmaddubsw m1, m2 - pshufhw m1, m1, q3333 - mova [acq], m1 - paddw m5, m1 - punpckhqdq m1, m1 - mova [acq+16], m1 - paddw m5, m1 - movddup m1, [yq+strideq] - pmaddubsw m1, m2 - pshufhw m1, m1, q3333 - mova [acq+32], m1 - paddw m4, m1 - punpckhqdq m0, m1, m1 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_pad3 - jmp .w16_wpad_done -.w16_pad2: - mova m1, [yq] - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1 - pshufhw m1, m1, q3333 - punpckhqdq m1, m1 - mova [acq+16], m1 - paddw m5, m1 - mova m1, [yq+strideq] - pmaddubsw m1, m2 - mova [acq+32], m1 - paddw m4, m1 - mova m0, m1 - pshufhw m0, m0, q3333 - punpckhqdq m0, m0 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_pad2 - jmp .w16_wpad_done -.w16_pad1: - mova m1, [yq] - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1 - movddup m0, [yq+16] - pmaddubsw m0, m2 - pshufhw m0, m0, q3333 - mova [acq+16], m0 - paddw m5, m0 - mova m1, [yq+strideq] - pmaddubsw m1, m2 - mova [acq+32], m1 - paddw m4, m1 - movddup m0, [yq+strideq+16] - pmaddubsw m0, m2 - pshufhw m0, m0, q3333 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_pad1 -.w16_wpad_done: - test hpadd, hpadd - jz .calc_avg_8_16 -.w16_hpad_loop: - mova [acq], m1 - mova [acq+16], m0 - paddw m4, m1 - paddw m5, m0 - mova [acq+32], m1 - mova [acq+48], m0 - paddw m4, m1 - paddw m5, m0 - add acq, 64 - sub hpadd, 2 - jg .w16_hpad_loop - jmp .calc_avg_8_16 - -%if ARCH_X86_64 - DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak -%else - DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h -%endif -.calc_avg_4: - psrlw m2, 10 
- pmaddwd m5, m2 - pmaddwd m0, m4, m2 - jmp .calc_avg -.calc_avg_8_16: - mova m0, m5 - psrld m5, 16 - pslld m0, 16 - psrld m0, 16 - paddd m5, m0 - mova m0, m4 - psrld m0, 16 - pslld m4, 16 - psrld m4, 16 - paddd m0, m4 -.calc_avg: - paddd m5, m0 - movd szd, m6 - psrad m6, 1 - tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); - paddd m5, m6 - movd m1, r1d - pshufd m0, m5, q2301 - paddd m0, m5 - pshufd m5, m0, q1032 - paddd m0, m5 - psrad m0, m1 ; sum >>= log2sz; - packssdw m0, m0 - RELOAD_ACQ_32 acq ; ac = ac_orig -.sub_loop: - mova m1, [acq] - psubw m1, m0 - mova [acq], m1 - add acq, 16 - sub szd, 8 - jg .sub_loop - RET - -%if ARCH_X86_64 -cglobal ipred_cfl_ac_444, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak - movddup m2, [pb_4] -%else -cglobal ipred_cfl_ac_444, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h -%define ac_bakq [rsp+16*4] - mov t0d, 0x04040404 - movd m2, t0d - pshufd m2, m2, q0000 -%endif - movifnidn wd, wm - movifnidn hpadd, hpadm - movd m0, hpadd - mov t0d, hm - mov hd, t0d - imul t0d, wd - movd m6, t0d - movd hpadd, m0 - mov ac_bakq, acq - shl hpadd, 2 - sub hd, hpadd - pxor m5, m5 - pxor m4, m4 - cmp wd, 16 - jg .w32 - cmp wd, 8 - jg .w16 - je .w8 - ; fall-through - -%if ARCH_X86_64 - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak -%else - DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h -%endif -.w4: - lea stride3q, [strideq*3] -.w4_loop: - movd m1, [yq] - movd m3, [yq+strideq] - punpckldq m1, m3 - punpcklbw m1, m1 - movd m0, [yq+strideq*2] - movd m3, [yq+stride3q] - punpckldq m0, m3 - punpcklbw m0, m0 - pmaddubsw m1, m2 - pmaddubsw m0, m2 - mova [acq], m1 - mova [acq+16], m0 - paddw m5, m0 - paddw m5, m1 - lea yq, [yq+strideq*4] - add acq, 32 - sub hd, 4 - jg .w4_loop - test hpadd, hpadd - jz .calc_avg_4 - punpckhqdq m0, m0 -.w4_hpad_loop: - mova [acq], m0 - paddw m5, m0 - add acq, 16 - sub hpadd, 2 - jg .w4_hpad_loop -.calc_avg_4: - psrlw m2, 10 - pmaddwd m5, m2 - jmp .calc_avg - -.w8: - lea stride3q, [strideq*3] - test wpadd, wpadd - jnz .w8_wpad -.w8_loop: - movq m1, [yq] - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1 - movq m0, [yq+strideq] - punpcklbw m0, m0 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0 - movq m1, [yq+strideq*2] - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq+32], m1 - paddw m4, m1 - movq m0, [yq+stride3q] - punpcklbw m0, m0 - pmaddubsw m0, m2 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*4] - add acq, 64 - sub hd, 4 - jg .w8_loop - test hpadd, hpadd - jz .calc_avg_8_16 - jmp .w8_hpad -.w8_wpad: - movd m1, [yq] - punpcklbw m1, m1 - punpcklqdq m1, m1 - pmaddubsw m1, m2 - pshufhw m1, m1, q3333 - mova [acq], m1 - paddw m5, m1 - movd m0, [yq+strideq] - punpcklbw m0, m0 - punpcklqdq m0, m0 - pmaddubsw m0, m2 - pshufhw m0, m0, q3333 - mova [acq+16], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 32 - sub hd, 2 - jg .w8_wpad - test hpadd, hpadd - jz .calc_avg_8_16 -.w8_hpad: - mova [acq], m0 - paddw m5, m0 - mova [acq+16], m0 - paddw m4, m0 - add acq, 32 - sub hpadd, 2 - jg .w8_hpad - jmp .calc_avg_8_16 - -.w16: - test wpadd, wpadd - jnz .w16_wpad -.w16_loop: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1 - punpckhbw m0, m0 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0 - mova m0, [yq+strideq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq+32], m1 - paddw m4, m1 - punpckhbw m0, m0 - pmaddubsw m0, m2 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg 
.w16_loop - test hpadd, hpadd - jz .calc_avg_8_16 - jmp .w16_hpad_loop -.w16_wpad: - cmp wpadd, 2 - jl .w16_pad1 - je .w16_pad2 -.w16_pad3: - movd m1, [yq] - punpcklbw m1, m1 - punpcklqdq m1, m1 - pshufhw m1, m1, q3333 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1 - punpckhqdq m1, m1 - mova [acq+16], m1 - paddw m5, m1 - movd m1, [yq+strideq] - punpcklbw m1, m1 - punpcklqdq m1, m1 - pshufhw m1, m1, q3333 - pmaddubsw m1, m2 - mova [acq+32], m1 - paddw m4, m1 - punpckhqdq m0, m1, m1 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_pad3 - jmp .w16_wpad_done -.w16_pad2: - movq m1, [yq] - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1 - pshufhw m1, m1, q3333 - punpckhqdq m1, m1 - mova [acq+16], m1 - paddw m5, m1 - movq m1, [yq+strideq] - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq+32], m1 - paddw m4, m1 - mova m0, m1 - pshufhw m0, m0, q3333 - punpckhqdq m0, m0 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_pad2 - jmp .w16_wpad_done -.w16_pad1: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1 - punpckhbw m0, m0 - punpcklqdq m0, m0 - pshufhw m0, m0, q3333 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0 - mova m0, [yq+strideq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq+32], m1 - paddw m4, m1 - punpckhbw m0, m0 - punpcklqdq m0, m0 - pshufhw m0, m0, q3333 - pmaddubsw m0, m2 - mova [acq+48], m0 - paddw m4, m0 - lea yq, [yq+strideq*2] - add acq, 64 - sub hd, 2 - jg .w16_pad1 -.w16_wpad_done: - test hpadd, hpadd - jz .calc_avg_8_16 -.w16_hpad_loop: - mova [acq], m1 - mova [acq+16], m0 - paddw m4, m1 - paddw m5, m0 - mova [acq+32], m1 - mova [acq+48], m0 - paddw m4, m1 - paddw m5, m0 - add acq, 64 - sub hpadd, 2 - jg .w16_hpad_loop -.calc_avg_8_16: - mova m0, m5 - psrld m5, 16 - pslld m0, 16 - psrld m0, 16 - paddd m5, m0 - mova m0, m4 - psrld m0, 16 - pslld m4, 16 - psrld m4, 16 - paddd m0, m4 - paddd m5, m0 - jmp .calc_avg - -.w32: - pxor m0, m0 - mova [rsp ], m0 - mova [rsp+16], m0 - mova [rsp+32], m0 - mova [rsp+48], m0 - test wpadd, wpadd - jnz .w32_wpad -.w32_loop: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1, [rsp] - mova [rsp ], m5 - punpckhbw m0, m0 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova m4, [yq+16] - mova m3, m4 - punpcklbw m3, m3 - pmaddubsw m3, m2 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - punpckhbw m4, m4 - pmaddubsw m4, m2 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_loop - test hpadd, hpadd - jz .calc_avg_32 - jmp .w32_hpad_loop -.w32_wpad: - cmp wpadd, 2 - jl .w32_pad1 - je .w32_pad2 - cmp wpadd, 4 - jl .w32_pad3 - je .w32_pad4 - cmp wpadd, 6 - jl .w32_pad5 - je .w32_pad6 -.w32_pad7: - movd m1, [yq] - punpcklbw m1, m1 - punpcklqdq m1, m1 - pshufhw m1, m1, q3333 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1, [rsp] - mova [rsp ], m5 - mova m0, m1 - punpckhqdq m0, m0 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova m3, m0 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - mova m4, m3 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_pad7 - jmp .w32_wpad_done -.w32_pad6: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1, 
[rsp] - mova [rsp ], m5 - pshufhw m0, m1, q3333 - punpckhqdq m0, m0 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova m3, m0 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - mova m4, m3 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_pad6 - jmp .w32_wpad_done -.w32_pad5: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - mova m5, [rsp] - paddw m5, m1 - mova [rsp ], m5 - punpckhbw m0, m0 - punpcklqdq m0, m0 - pshufhw m0, m0, q3333 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova m3, m0 - punpckhqdq m3, m3 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - mova m4, m3 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_pad5 - jmp .w32_wpad_done -.w32_pad4: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1, [rsp] - mova [rsp ], m5 - punpckhbw m0, m0 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova m3, m0 - pshufhw m3, m3, q3333 - punpckhqdq m3, m3 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - mova m4, m3 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_pad4 - jmp .w32_wpad_done -.w32_pad3: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1, [rsp] - mova [rsp ], m5 - punpckhbw m0, m0 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - movd m3, [yq+16] - punpcklbw m3, m3 - punpcklqdq m3, m3 - pshufhw m3, m3, q3333 - pmaddubsw m3, m2 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - mova m4, m3 - punpckhqdq m4, m4 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_pad3 - jmp .w32_wpad_done -.w32_pad2: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1, [rsp] - mova [rsp ], m5 - punpckhbw m0, m0 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova m3, [yq+16] - punpcklbw m3, m3 - pmaddubsw m3, m2 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - pshufhw m4, m3, q3333 - punpckhqdq m4, m4 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_pad2 - jmp .w32_wpad_done -.w32_pad1: - mova m0, [yq] - mova m1, m0 - punpcklbw m1, m1 - pmaddubsw m1, m2 - mova [acq], m1 - paddw m5, m1, [rsp] - mova [rsp ], m5 - punpckhbw m0, m0 - pmaddubsw m0, m2 - mova [acq+16], m0 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova m4, [yq+16] - mova m3, m4 - punpcklbw m3, m3 - pmaddubsw m3, m2 - mova [acq+32], m3 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - punpckhbw m4, m4 - punpcklqdq m4, m4 - pshufhw m4, m4, q3333 - pmaddubsw m4, m2 - mova [acq+48], m4 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - lea yq, [yq+strideq] - add acq, 64 - sub hd, 1 - jg .w32_pad1 -.w32_wpad_done: - test hpadd, hpadd - jz .calc_avg_32 -.w32_hpad_loop: - mova [acq], m1 - mova [acq+16], m0 - paddw m5, m1, [rsp] - mova [rsp ], m5 - paddw m5, m0, [rsp+16] - mova [rsp+16], m5 - mova [acq+32], m3 - mova [acq+48], m4 - paddw m5, m3, [rsp+32] - mova [rsp+32], m5 - paddw m5, m4, [rsp+48] - mova [rsp+48], m5 - add acq, 64 - sub hpadd, 1 - jg 
.w32_hpad_loop - -%if ARCH_X86_64 - DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak -%else - DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h -%endif - -.calc_avg_32: - mova m5, [rsp] - mova m0, m5 - psrld m5, 16 - pslld m0, 16 - psrld m0, 16 - paddd m5, m0 - mova m0, [rsp+16] - mova m3, m0 - psrld m0, 16 - pslld m3, 16 - psrld m3, 16 - paddd m0, m3 - paddd m5, m0 - mova m0, [rsp+32] - mova m3, m0 - psrld m0, 16 - pslld m3, 16 - psrld m3, 16 - paddd m0, m3 - mova m1, [rsp+48] - mova m3, m1 - psrld m1, 16 - pslld m3, 16 - psrld m3, 16 - paddd m1, m3 - paddd m1, m0 - paddd m5, m1 -.calc_avg: - movd szd, m6 - psrad m6, 1 - tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); - paddd m5, m6 - movd m1, r1d - pshufd m0, m5, q2301 - paddd m0, m5 - pshufd m5, m0, q1032 - paddd m0, m5 - psrad m0, m1 ; sum >>= log2sz; - packssdw m0, m0 - RELOAD_ACQ_32 acq ; ac = ac_orig -.sub_loop: - mova m1, [acq] - psubw m1, m0 - mova [acq], m1 - add acq, 16 - sub szd, 8 - jg .sub_loop - RET - -; %1 simd register that hold the mask and will hold the result -; %2 simd register that holds the "true" values -; %3 location of the "false" values (simd register/memory) -%macro BLEND 3 ; mask, true, false - pand %2, %1 - pandn %1, %3 - por %1, %2 -%endmacro - -%macro PAETH 2 ; top, ldiff - pavgb m1, m%1, m3 - pxor m0, m%1, m3 - pand m0, m4 - psubusb m2, m5, m1 - psubb m1, m0 - psubusb m1, m5 - por m1, m2 - paddusb m1, m1 - por m1, m0 ; min(tldiff, 255) - psubusb m2, m5, m3 - psubusb m0, m3, m5 - por m2, m0 ; tdiff -%ifnum %2 - pminub m2, m%2 - pcmpeqb m0, m%2, m2 ; ldiff <= tdiff -%else - mova m0, %2 - pminub m2, m0 - pcmpeqb m0, m2 -%endif - pminub m1, m2 - pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff - mova m2, m3 - BLEND m0, m2, m%1 - BLEND m1, m0, m5 -%endmacro - -cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h -%define base r5-ipred_paeth_ssse3_table - tzcnt wd, wm - movifnidn hd, hm - pxor m0, m0 - movd m5, [tlq] - pshufb m5, m0 - LEA r5, ipred_paeth_ssse3_table - movsxd wq, [r5+wq*4] - movddup m4, [base+ipred_paeth_shuf] - add wq, r5 - jmp wq -.w4: - movd m6, [tlq+1] ; top - pshufd m6, m6, q0000 - lea r3, [strideq*3] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 ; ldiff -.w4_loop: - sub tlq, 4 - movd m3, [tlq] - mova m1, [base+ipred_h_shuf] - pshufb m3, m1 ; left - PAETH 6, 7 - movd [dstq ], m1 - pshuflw m0, m1, q1032 - movd [dstq+strideq ], m0 - punpckhqdq m1, m1 - movd [dstq+strideq*2], m1 - psrlq m1, 32 - movd [dstq+r3 ], m1 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w4_loop - RET -ALIGN function_align -.w8: - movddup m6, [tlq+1] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 -.w8_loop: - sub tlq, 2 - movd m3, [tlq] - pshufb m3, [base+ipred_paeth_shuf] - PAETH 6, 7 - movq [dstq ], m1 - movhps [dstq+strideq], m1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w8_loop - RET -ALIGN function_align -.w16: - movu m6, [tlq+1] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 -.w16_loop: - sub tlq, 1 - movd m3, [tlq] - pxor m1, m1 - pshufb m3, m1 - PAETH 6, 7 - mova [dstq], m1 - add dstq, strideq - sub hd, 1 - jg .w16_loop - RET -ALIGN function_align -.w32: - movu m6, [tlq+1] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 - mova [rsp ], m6 - mova [rsp+16], m7 - movu m6, [tlq+17] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 - mova [rsp+32], m6 -.w32_loop: - dec tlq - movd m3, [tlq] - pxor m1, m1 - pshufb m3, m1 - mova m6, [rsp] - PAETH 6, [rsp+16] - mova [dstq ], m1 - mova m6, [rsp+32] - PAETH 6, 7 - mova [dstq+16], m1 - add dstq, strideq - dec hd - jg 
.w32_loop - RET -ALIGN function_align -.w64: - movu m6, [tlq+1] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 - mova [rsp ], m6 - mova [rsp+16], m7 - movu m6, [tlq+17] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 - mova [rsp+32], m6 - mova [rsp+48], m7 - movu m6, [tlq+33] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 - mova [rsp+64], m6 - mova [rsp+80], m7 - movu m6, [tlq+49] - psubusb m7, m5, m6 - psubusb m0, m6, m5 - por m7, m0 - mova [rsp+96], m6 -.w64_loop: - dec tlq - movd m3, [tlq] - pxor m1, m1 - pshufb m3, m1 - mova m6, [rsp] - PAETH 6, [rsp+16] - mova [dstq ], m1 - mova m6, [rsp+32] - PAETH 6, [rsp+48] - mova [dstq+16], m1 - mova m6, [rsp+64] - PAETH 6, [rsp+80] - mova [dstq+32], m1 - mova m6, [rsp+96] - PAETH 6, 7 - mova [dstq+48], m1 - add dstq, strideq - dec hd - jg .w64_loop - RET - - -%macro FILTER 4 ;dst, src, tmp, shuf -%ifnum %4 - pshufb m%2, m%4 -%else - pshufb m%2, %4 -%endif - pshufd m%1, m%2, q0000 ;p0 p1 - pmaddubsw m%1, m2 - pshufd m%3, m%2, q1111 ;p2 p3 - pmaddubsw m%3, m3 - paddw m%1, [base+pw_8] - paddw m%1, m%3 - pshufd m%3, m%2, q2222 ;p4 p5 - pmaddubsw m%3, m4 - paddw m%1, m%3 - pshufd m%3, m%2, q3333 ;p6 __ - pmaddubsw m%3, m5 - paddw m%1, m%3 - psraw m%1, 4 - packuswb m%1, m%1 -%endmacro - -cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter -%define base r6-$$ - LEA r6, $$ - tzcnt wd, wm -%ifidn filterd, filterm - movzx filterd, filterb -%else - movzx filterd, byte filterm -%endif - shl filterd, 6 - lea filterq, [base+filter_intra_taps+filterq] - movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 - movsxd wq, [base+ipred_filter_ssse3_table+wq*4] - mova m2, [filterq+16*0] - mova m3, [filterq+16*1] - mova m4, [filterq+16*2] - mova m5, [filterq+16*3] - lea wq, [base+ipred_filter_ssse3_table+wq] - mov hd, hm - jmp wq -.w4: - mova m1, [base+filter_shuf1] - sub tlq, 3 - sub tlq, hq - jmp .w4_loop_start -.w4_loop: - movd m0, [tlq+hq] - punpckldq m0, m6 - lea dstq, [dstq+strideq*2] -.w4_loop_start: - FILTER 6, 0, 7, 1 - movd [dstq+strideq*0], m6 - pshuflw m6, m6, q1032 - movd [dstq+strideq*1], m6 - sub hd, 2 - jg .w4_loop - RET - -ALIGN function_align -.w8: - movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 - sub tlq, 5 - sub tlq, hq - -.w8_loop: - FILTER 7, 0, 1, [base+filter_shuf1] - punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - FILTER 0, 6, 1, [base+filter_shuf2] - - punpckldq m6, m7, m0 - movq [dstq+strideq*0], m6 - punpckhqdq m6, m6 - movq [dstq+strideq*1], m6 - - movd m0, [tlq+hq] ;_ 6 5 0 - punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 - - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w8_loop - RET - -ALIGN function_align -.w16: - movu m6, [tlq+1] ;top row - sub tlq, 5 - sub tlq, hq - -.w16_loop: - FILTER 7, 0, 1, [base+filter_shuf1] - punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+strideq*0], m7 - psrlq m7, 32 - palignr m7, m6, 4 - - FILTER 6, 0, 1, [base+filter_shuf2] - punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+4+strideq*0], m6 - psrlq m6, 32 - palignr m6, m7, 4 - - FILTER 7, 0, 1, [base+filter_shuf2] - punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+8+strideq*0], m7 - psrlq m7, 32 - palignr m7, m6, 4 - - FILTER 6, 0, 1, [base+filter_shuf2] - movd [dstq+12+strideq*0], m6 - psrlq m6, 32 - palignr m6, m7, 4 - mova [dstq+strideq*1], m6 - - movd m0, [tlq+hq] ;_ 6 5 0 - punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 - - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w16_loop - RET - -ALIGN function_align -.w32: - movu m6, [tlq+1] ;top row - lea filterq, [tlq+17] - sub tlq, 5 - sub tlq, hq - -.w32_loop: - FILTER 
7, 0, 1, [base+filter_shuf1] - punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+strideq*0], m7 - psrlq m7, 32 - palignr m7, m6, 4 - - FILTER 6, 0, 1, [base+filter_shuf2] - punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+4+strideq*0], m6 - psrlq m6, 32 - palignr m6, m7, 4 - - FILTER 7, 0, 1, [base+filter_shuf2] - punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+8+strideq*0], m7 - psrlq m7, 32 - palignr m7, m6, 4 - - FILTER 6, 0, 1, [base+filter_shuf2] - movu m1, [filterq] - punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ - punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+12+strideq*0], m6 - psrlq m6, 32 - palignr m6, m7, 4 - mova [dstq+strideq*1], m6 - - mova m6, m1 - - FILTER 7, 0, 6, [base+filter_shuf2] - punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+16+strideq*0], m7 - psrlq m7, 32 - palignr m7, m1, 4 - - FILTER 6, 0, 1, [base+filter_shuf2] - punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+20+strideq*0], m6 - psrlq m6, 32 - palignr m6, m7, 4 - - FILTER 7, 0, 1, [base+filter_shuf2] - punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 - movd [dstq+24+strideq*0], m7 - psrlq m7, 32 - palignr m7, m6, 4 - - FILTER 6, 0, 1, [base+filter_shuf2] - movd [dstq+28+strideq*0], m6 - psrlq m6, 32 - palignr m6, m7, 4 - mova [dstq+16+strideq*1], m6 - - mova m6, [dstq+strideq*1] - movd m0, [tlq+hq] ;_ 6 5 0 - punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 - lea filterq, [dstq+16+strideq*1] - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w32_loop - RET diff -Nru dav1d-0.7.1/src/x86/itx16_avx2.asm dav1d-0.9.1/src/x86/itx16_avx2.asm --- dav1d-0.7.1/src/x86/itx16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/itx16_avx2.asm 2021-07-28 21:38:28.901852100 +0000 @@ -0,0 +1,6368 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482 +itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 + dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 +pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 +iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 +idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 +idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +%macro COEF_PAIR 2 +pd_%1_%2: dd %1, %1, %2, %2 +%define pd_%1 (pd_%1_%2 + 4*0) +%define pd_%2 (pd_%1_%2 + 4*2) +%endmacro + +COEF_PAIR 201, 995 +COEF_PAIR 401, 1931 +COEF_PAIR 799, 3406 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 2598, 1189 +COEF_PAIR 2751, 2106 +COEF_PAIR 2896, 1567 +COEF_PAIR 2896, 3784 +COEF_PAIR 3035, 3513 +COEF_PAIR 3166, 3920 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 4017, 2276 +COEF_PAIR 4076, 3612 +COEF_PAIR 4091, 3973 + +%define pd_1321 (pd_1321_2482 + 4*0) +%define pd_2482 (pd_1321_2482 + 4*4) + +pd_m601: dd -601 +pd_m1189: dd -1189 +pd_m1380: dd -1380 +pd_m2106: dd -2106 +pd_m2598: dd -2598 +pd_m2751: dd -2751 +pd_m3344: dd -3344 +pd_3803: dd 3803 +pd_5793: dd 5793 +pd_6144: dd 6144 ; 2048 + 4096 +pd_10239: dd 10239 ; 2048 + 8192 - 1 +pd_10240: dd 10240 ; 2048 + 8192 +pd_11586: dd 11586 ; 5793 * 2 +pd_38912: dd 38912 ; 2048 + 4096 + 32768 + +pixel_max: times 2 dw 0x03ff ; 10bpc +clip_min: dd -0x20000 +clip_max: dd 0x1ffff + +idct64_mul_16bpc: +dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 +dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 +dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 +dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 + +cextern deint_shuf +cextern idct64_mul +cextern pw_1697x8 +cextern pw_1697x16 +cextern pw_1567_3784 +cextern pw_m1567_m3784 +cextern pw_m3784_1567 +cextern pw_2896_2896 +cextern pw_m2896_2896 +cextern pw_5 +cextern pw_2048 +cextern pw_4096 +cextern pw_8192 +cextern pw_16384 +cextern pw_2896x8 +cextern pd_2048 + +cextern idct_4x8_internal_8bpc_avx2.main +cextern idct_4x16_internal_8bpc_avx2.main +cextern idct_8x8_internal_8bpc_avx2.main +cextern idct_8x16_internal_8bpc_avx2.main +cextern idct_16x4_internal_8bpc_avx2.main +cextern idct_16x8_internal_8bpc_avx2.main +cextern idct_16x16_internal_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal + +cextern iadst_4x4_internal_8bpc_avx2.main +cextern iadst_4x8_internal_8bpc_avx2.main_pass2 +cextern iadst_4x16_internal_8bpc_avx2.main2 +cextern iadst_8x4_internal_8bpc_avx2.main +cextern iadst_8x8_internal_8bpc_avx2.main_pass2 +cextern iadst_8x16_internal_8bpc_avx2.main +cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x4_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x16_internal_8bpc_avx2.main +cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) 
mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro IWHT4_1D_PACKED 0 + ; m0 = in0 in2, m1 = in1 in3 + psubd m2, m0, m1 ; t2 + paddd xm0, xm1 ; t0 + vpermq m2, m2, q3322 + vpermq m0, m0, q1100 + vpermq m1, m1, q3120 + psubd m3, m0, m2 + psrad m3, 1 + psubd m3, m1 ; t1 t3 + psubd m0, m3 ; ____ out0 + paddd m2, m3 ; out3 ____ +%endmacro + +INIT_YMM avx2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax + mova xm0, [cq+16*0] + vinserti128 m0, [cq+16*2], 1 + mova xm1, [cq+16*1] + vinserti128 m1, [cq+16*3], 1 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + lea r6, [dstq+strideq*2] + psrad m0, 2 + psrad m1, 2 + IWHT4_1D_PACKED + punpckhdq m0, m3 + punpckldq m3, m2 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x33 + packssdw m0, m3 + vextracti128 xm2, m0, 1 + punpckhdq xm1, xm0, xm2 ; out2 out1 + punpckldq xm0, xm2 ; out3 out0 + movq xm2, [r6 +strideq*1] + movhps xm2, [dstq+strideq*0] + movq xm3, [r6 +strideq*0] + movhps xm3, [dstq+strideq*1] +%ifidn bdmaxd, bdmaxm + movd xm5, bdmaxd + vpbroadcastw xm5, xm5 +%else ; win64: load from stack + vpbroadcastw xm5, bdmaxm +%endif + paddsw xm0, xm2 + paddsw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movq [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm0 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %9 & 1 + vbroadcasti128 m%3, [pd_%8] +%else + vpbroadcastd m%3, [pd_%8] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %9 & 1 + vbroadcasti128 m%5, [pd_%7] +%else + vpbroadcastd m%5, [pd_%7] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 4 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%if %9 & 2 ; invert the upper half of dst1 before rounding + vbroadcasti128 m%4, [pw_2048_m2048] + psubd m%1, m%3 + psignd m%1, m%4 + paddd m%1, m%6 +%else +%ifnum %6 + paddd m%1, m%6 +%endif + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size +cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_16bpc) + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
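+    ; tx2q is preloaded with the 2nd pass entry point; each 1st pass routine
+    ; ends with a jmp tx2q. For dct_dct a zero eob falls through to the
+    ; dc-only path emitted directly after this macro, while other type pairs
+    ; optionally bias eob and jump to the 1st pass unless it follows directly.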
+ lea tx2q, [m(i%2_%4_internal_16bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + movd xm1, [pw_2896x8] + mov [cq], eobd ; 0 + add r6d, 2048 + sar r6d, 12 + movd xm0, r6d + packssdw xm0, xm0 + pmulhrsw xm0, xm1 + vpbroadcastw xm0, xm0 + mova xm1, xm0 + jmp m(iadst_4x4_internal_16bpc).end +%endif +%endmacro + +%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd + ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1 + punpckhqdq m%3, m%2, m%1 ; t3 t2 + punpcklqdq m%2, m%1 ; t0 t1 + paddd m%1, m%2, m%3 ; out0 out1 + psubd m%2, m%3 ; out3 out2 +%endmacro + +%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd + vpbroadcastd m%5, [pw_m3784_1567] + punpckhwd m%3, m%2, m%1 + psubw m%4, m%1, m%2 + paddw m%1, m%2 + vpbroadcastd m%2, [pw_1567_3784] + punpcklqdq m%1, m%4 + vpbroadcastd m%4, [pw_2896x8] + pmaddwd m%5, m%3 + pmaddwd m%3, m%2 + pmulhrsw m%1, m%4 ; t0 t1 + paddd m%5, m%6 + paddd m%3, m%6 + psrad m%5, 12 + psrad m%3, 12 + packssdw m%3, m%5 ; t3 t2 + psubsw m%2, m%1, m%3 ; out3 out2 + paddsw m%1, m%3 ; out0 out1 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m5, [pd_2048] + IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 + vbroadcasti128 m2, [idct4_shuf] + packssdw m0, m1 + pshufb m0, m2 + jmp tx2q +.pass2: + vextracti128 xm1, m0, 1 + WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 + packssdw xm5, xm5 ; pw_2048 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*1] + movhps xm3, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + RET + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vpermd m0, m4, m0 + psrld m4, 4 + pshufb m0, m4 + jmp tx2q +.pass2: + lea rax, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8bpc).main +.end: + vpbroadcastd xm4, [pw_2048] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm5, [pixel_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + mova m2, [cq+16*2] + mova m0, [pd_1321_2482] + vpbroadcastd m3, [pd_3803] + vbroadcasti128 m5, [cq+16*0] + vpbroadcastd m1, [pd_m3344] + pmulld m4, m0, m2 + pmulld m3, m2 + pmulld m0, m5 + vpbroadcastd m5, 
[pd_2048] + psubd xm2, [cq+16*3] + psubd m2, [cq+16*0] + pmulld m2, m1 ; t2 t3 + vpermq m4, m4, q1032 + paddd m4, m3 + psubd m0, m4 + paddd xm4, xm4 + paddd m4, m0 ; t0 t1 + vinserti128 m3, m2, xm4, 1 ; t2 t0 + paddd m0, m4, m5 + psubd xm4, xm2 + psubd m1, m0, m2 + vpermq m2, m2, q3232 ; t3 t3 + psubd m1, m4 + mova m4, [itx4_shuf] + paddd m0, m2 ; out0 out1 + paddd m1, m3 ; out2 out3 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_16bpc).main + psrld m1, m4, 8 + vpermd m0, m1, m0 + psrld m4, 4 + pshufb m0, m4 + jmp tx2q +.pass2: + lea rax, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8bpc).main + vpbroadcastd xm4, [pw_2048] + movq xm3, [dstq+strideq*1] + movhps xm3, [dstq+strideq*0] + lea r6, [dstq+strideq*2] + movq xm2, [r6 +strideq*1] + movhps xm2, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movq [r6 +strideq*1], xm0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2 + vpbroadcastd m1, [pd_5793] + pmulld m0, m1, [cq+32*0] + pmulld m1, [cq+32*1] + vpbroadcastd m5, [pd_2048] + mova m3, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m3, m0 + psrld m3, 4 + pshufb m0, m3 + jmp tx2q +.pass2: + vpbroadcastd m1, [pw_1697x8] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + pmulhrsw m1, m0 + paddsw m0, m1 + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm4, [pixel_max] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pxor m5, m5 + mova [cq+32*0], m5 + mova [cq+32*1], m5 + vextracti128 xm1, m0, 1 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm5 + pmaxsw xm1, xm5 + pminsw xm0, xm4 + pminsw xm1, xm4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 2048 + sar r6d, 12 +.end: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw xm0, xm0 +.end2: + vpbroadcastd xm3, [pixel_max] + pxor xm2, xm2 +.end_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddw xm1, xm0 + pmaxsw xm1, xm2 + pminsw xm1, xm3 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .end_loop + WRAP_XMM RET +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3 + vpbroadcastd m%5, [pd_2896] + pmulld m%1, m%5 + pmulld m%3, m%5 + paddd m%1, m%8 + paddd m%5, m%1, m%3 + psubd m%1, m%3 + psrad m%5, 12 ; t0 + psrad m%1, 12 ; t1 + psubd m%3, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%5, m%4 + psubd m%4, m%5, m%4 +%endmacro + +INV_TXFM_4X8_FN dct, dct 
+INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, m3, [cq+32*3] + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + packssdw m0, m2 + packssdw m1, m3 + lea rax, [deint_shuf+128] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ; 2 3 + punpckldq m0, m2 ; 0 1 + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + call m(idct_4x8_internal_8bpc).main + vpbroadcastd xm4, [pw_2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 3 2 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movhps [r6 +strideq*2], xm3 + movq [r6 +r3 ], xm3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_16bpc).main + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + jmp tx2q +.pass2: + call .pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 +.end: + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 2 3 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 6 7 + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + packssdw m0, m2 + packssdw m1, m3 + lea rax, [deint_shuf+128] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpckhdq m5, m4, m0 + punpckldq m4, m0 + vextracti128 xm2, m4, 1 ; 4 5 + vextracti128 xm3, m5, 1 ; 6 7 + pshufd xm4, xm4, q1032 ; 1 0 + pshufd xm5, xm5, q1032 ; 3 2 + jmp m(iadst_4x8_internal_8bpc).main_pass2 +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m2, [cq+16*2] + vbroadcasti128 m3, [cq+16*5] + vbroadcasti128 m1, [cq+16*7] + vpbroadcastd m6, [pd_2896] + shufpd m0, m2, 0x0c ; 0 2 + shufpd m1, m3, 0x0c ; 7 5 + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m4, [cq+16*6] + vbroadcasti128 m5, [cq+16*1] + vbroadcasti128 m3, [cq+16*3] + vpbroadcastd m7, [pd_2048] + vpbroadcastd m8, [clip_min] + vpbroadcastd m9, [clip_max] + 
shufpd m2, m4, 0x0c ; 4 6 + shufpd m3, m5, 0x0c ; 3 1 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m8}, m4, m2, m0, m1 + REPX {pminsd x, m9}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + vpblendd m4, m2, 0xcc ; t4 t7 + vpblendd m2, m5, 0xcc ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784 + vpbroadcastd m5, [pd_2896] + vbroadcasti128 m6, [pw_2048_m2048] ; + + - - + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m8}, m1, m2 + REPX {pminsd x, m9}, m1, m2 + vpblendd m3, m1, m2, 0xcc + shufpd m1, m2, 0x05 + pmulld m3, m5 + pmulld m5, m1 + psignd m0, m6 ; out0 out7 + psignd m4, m6 ; out6 out1 + paddd m3, m7 + psubd m2, m3, m5 + paddd m5, m3 + psrad m2, 12 ; out4 -out5 + psrad m5, 12 ; -out3 out2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_16bpc).main + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + jmp tx2q +.pass2: + call m(iadst_4x8_internal_16bpc).pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*1] + movhps xm4, [dstq+strideq*0] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*1] + movhps xm6, [r6 +strideq*0] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm3, xm4 ; 1 0 + paddw xm2, xm5 ; 3 2 + paddw xm1, xm6 ; 5 4 + paddw xm0, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 + REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 + movhps [dstq+strideq*0], xm3 + movq [dstq+strideq*1], xm3 + movhps [dstq+strideq*2], xm2 + movq [dstq+r3 ], xm2 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + movhps [r6 +strideq*2], xm0 + movq [r6 +r3 ], xm0 + RET + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2 + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, [cq+32*3] + vpbroadcastd m5, [pd_2048] + vpbroadcastd m4, [pd_5793] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m4, [pw_4096] + packssdw m0, m2 + packssdw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m4 + pmulhrsw m0, m4 + punpckhdq m1, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + vpbroadcastq m4, [r6 +strideq*0] + vpbroadcastq m5, [r6 +strideq*1] + movq xm3, [dstq+strideq*2] + movhps xm3, [dstq+r3 ] + vpblendd m2, m4, 0x30 + vpblendd m2, m5, 0xc0 + vpbroadcastq m4, [r6 
+strideq*2] + vpbroadcastq m5, [r6 +r3 ] + vpblendd m3, m4, 0x30 + vpblendd m3, m5, 0xc0 + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + mova m1, [cq+32*2] + mova m3, [cq+32*6] + mova m5, [cq+32*3] + mova m7, [cq+32*7] + vpbroadcastd m4, [pd_3784] + vpbroadcastd m8, [pd_1567] + vpbroadcastd m9, [pd_2048] + vpbroadcastd m6, [pd_2896] + ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l + ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h + pmulld m0, m6, [cq+32*0] + pmulld m2, m6, [cq+32*4] + pmulld m4, m6, [cq+32*1] + pmulld m6, [cq+32*5] + vpbroadcastd m8, [pd_6144] + paddd m0, m8 + paddd m4, m8 + paddd m8, m0, m2 + psubd m0, m2 + paddd m9, m4, m6 + psubd m4, m6 + REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + psubd m2, m0, m1 + paddd m1, m0 + psubd m6, m4, m5 + paddd m5, m4 + paddd m0, m8, m3 + psubd m3, m8, m3 + paddd m4, m9, m7 + psubd m7, m9, m7 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea rax, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 ; 2 3 + punpckldq m0, m4 ; 0 1 + punpckldq m4, m5, m2 ; 8 9 + punpckhdq m5, m2 ; a b + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + vextracti128 xm6, m4, 1 ; c d + vextracti128 xm7, m5, 1 ; e f + call m(idct_4x16_internal_8bpc).main + vpbroadcastd m9, [pw_2048] + vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 + vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 + vinserti128 m2, m4, xm5, 1 ; 8 9 b a + vinserti128 m3, m6, xm7, 1 ; c d f e + vpbroadcastd m8, [pixel_max] + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + vpbroadcastq m5, [dstq+strideq*2] + vpbroadcastq m6, [dstq+r6 ] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movhps [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_16bpc).main + psrad m0, m4, 13 + psrad m1, m5, 13 + psrad m2, 13 + psrad m3, 13 + psrad m4, m8, 13 + psrad m5, m9, 13 + psrad m6, 13 + psrad m7, 13 + jmp tx2q +.pass2: + 
call .pass2_main + vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+r6 ] + movhps xm4, [dstq+strideq*0] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movhps [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm5 + movq [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea rax, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + punpckldq m4, m5, m2 + punpckhdq m5, m2 + vpblendd m3, m0, m1, 0x33 + vpblendd m0, m1, 0xcc + shufpd m2, m5, m4, 0x05 + shufpd m4, m5, 0x05 + vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 + vinserti128 m0, xm3, 1 ; 0 3 2 1 + vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? + vinserti128 m2, xm4, 1 ; b 8 9 a + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m5, [pw_2896x8] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + ret +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 2] + vbroadcasti128 m1, [cq+16*15] + vbroadcasti128 m5, [cq+16*13] + vbroadcasti128 m2, [cq+16* 4] + vbroadcasti128 m6, [cq+16* 6] + vbroadcasti128 m3, [cq+16*11] + vbroadcasti128 m7, [cq+16* 9] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m1, m5, 0x0c ; 15 13 + shufpd m2, m6, 0x0c ; 4 6 + shufpd m3, m7, 0x0c ; 11 9 + vbroadcasti128 m4, [cq+16* 8] + vbroadcasti128 m6, [cq+16*10] + vbroadcasti128 m5, [cq+16* 7] + vbroadcasti128 m7, [cq+16* 5] + shufpd m4, m6, 0x0c ; 8 10 + shufpd m5, m7, 0x0c ; 7 5 + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m7, [cq+16*14] + shufpd m6, m7, 0x0c ; 12 14 + vbroadcasti128 m7, [cq+16* 3] + vbroadcasti128 m8, [cq+16* 1] + shufpd m7, m8, 0x0c ; 3 1 + vpbroadcastd m11, [pd_2048] + ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 + ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 
+ vpbroadcastd m10, [pd_2896] + vbroadcasti128 m9, [pw_2048_m2048] ; + + - - + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmulld x, m10}, m6, m5, m3, m4 + paddd m6, m11 + paddd m4, m11 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {psignd x, m9}, m1, m8, m3, m6 + pshufd m9, m9, q1032 + REPX {psignd x, m9}, m0, m7, m2, m5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_16bpc).main + psrad m0, m3, 13 + psrad m1, m2, 13 + psrad m2, m5, 13 + psrad m3, m4, 13 + psrad m4, m7, 13 + psrad m5, m6, 13 + psrad m6, m9, 13 + psrad m7, m8, 13 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_16bpc).pass2_main + vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+r6 ] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0x30 + vpblendd m4, m6, 0xc0 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm5 + movhps [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2 + vpbroadcastd m7, [pd_5793] + pmulld m0, m7, [cq+32*0] + pmulld m4, m7, [cq+32*1] + pmulld m1, m7, [cq+32*2] + pmulld m5, m7, [cq+32*3] + pmulld m2, m7, [cq+32*4] + pmulld m6, m7, [cq+32*5] + pmulld m3, m7, [cq+32*6] + pmulld m7, [cq+32*7] + vpbroadcastd m8, [pd_6144] + REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 + REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m7, [pw_1697x16] + vpbroadcastd m8, [pw_2048] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m4, [pixel_max] + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + lea r6, [strideq*5] + pxor m3, m3 + punpckhdq m5, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + punpckldq m6, m7, m1 ; 8 9 c d + punpckhdq m7, m1 ; a b e f + pmulhrsw m0, m8 + call .write_2x4x2 + pmulhrsw m0, m5, m8 + call .write_2x4x2 + pmulhrsw m0, m6, m8 + lea dstq, 
[dstq+strideq*4] + call .write_2x4x2 + pmulhrsw m0, m7, m8 + call .write_2x4x2 + RET +ALIGN function_align +.write_2x4x2: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + vpbroadcastq m2, [dstq+strideq*4] + vpblendd m1, m2, 0x30 + vpbroadcastq m2, [dstq+r6 ] + vpblendd m1, m2, 0xc0 + mova [cq+32*0], m3 + mova [cq+32*1], m3 + add cq, 32*2 + paddw m1, m0 + pmaxsw m1, m3 + pminsw m1, m4 + vextracti128 xm2, m1, 1 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r6 ], xm2 + lea dstq, [dstq+strideq*2] + ret + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 8x4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 +.end: + vpbroadcastd m4, [pixel_max] + pxor m3, m3 + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + lea r6, [dstq+strideq*2] + mova xm2, [r6 +strideq*0] + vinserti128 m2, [r6 +strideq*1], 1 + paddw m1, m0 + paddw m2, m0 + pmaxsw m1, m3 + pmaxsw m2, m3 + pminsw m1, m4 + pminsw m2, m4 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + mova [r6 +strideq*0], xm2 + vextracti128 [r6 +strideq*1], m2, 1 + RET +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vbroadcasti128 m1, [cq+16*1] + vbroadcasti128 m0, [cq+16*5] + vbroadcasti128 m2, [cq+16*3] + vbroadcasti128 m3, [cq+16*7] + vpbroadcastd m6, [pd_2896] + shufpd m1, m0, 0x0c ; 1 5 + shufpd m3, m2, 0x0c ; 7 3 + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m4, [cq+16*2] + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m5, [cq+16*6] + vpbroadcastd m7, [pd_2048] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m2, m5, 0x0c ; 4 6 + REPX {pmulld x, m6}, m1, m3, m0, m2 + REPX {paddd x, m7}, m1, m3, m0, m2 + REPX {psrad x, 12}, m1, m3, m0, m2 + call .main + psubd m3, m0, m4 ; out7 out6 + paddd m0, m4 ; out0 out1 + paddd m1, m2, m5 ; out3 out2 + psubd m2, m5 ; out4 out5 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp tx2q +.pass2: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q2031 ; out2 out3 + jmp m(iadst_8x4_internal_16bpc).end +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1 + IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7 + vpbroadcastd m8, [clip_min] + vpbroadcastd m9, [clip_max] + vpbroadcastd m6, [pd_2896] + punpcklqdq m4, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m4, m1 ; t5a t6a + paddd m4, m1 ; t4 t7 + REPX {pmaxsd x, m8}, m3, m4, m0, m2 + REPX {pminsd x, m9}, m3, m4, m0, m2 + pmulld m3, m6 + pshufd m1, m3, q1032 + paddd m3, m7 + psubd m5, m3, m1 + paddd m1, m3 + psrad m5, 12 + psrad m1, 12 + vpblendd m5, m4, 0x33 ; t4 t5 + punpckhqdq m4, m1 ; t7 t6 + ret + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: 
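+    ; 2nd pass: repack to 16 bit and run the 8bpc iadst kernel via .pass2_main,
+    ; then round by pw_2048, add to dst and clamp to pixel_max.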
+ call .pass2_main + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q3120 ; out2 out3 +.end: + vpbroadcastd m1, [pw_2048] + pmulhrsw m0, m1 + pmulhrsw m1, m2 +.end2: + mova xm2, [dstq+strideq*0] + vinserti128 m2, [dstq+strideq*1], 1 + lea r6, [dstq+strideq*2] + mova xm3, [r6 +strideq*0] + vinserti128 m3, [r6 +strideq*1], 1 + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [r6 +strideq*0], xm1 + vextracti128 [r6 +strideq*1], m1, 1 + RET +ALIGN function_align +.pass2_main: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + lea rax, [deint_shuf+128] + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + jmp m(iadst_8x4_internal_8bpc).main +ALIGN function_align +.main: + vpbroadcastd m1, [pd_2896] + pmulld m0, m1, [cq+32*0] + pmulld m3, m1, [cq+32*3] + pmulld m2, m1, [cq+32*2] + pmulld m1, [cq+32*1] + vpbroadcastd m4, [pd_2048] + REPX {paddd x, m4}, m0, m3, m2, m1 + REPX {psrad x, 12}, m0, m3, m2, m1 + vbroadcasti128 m6, [pd_1321] + vbroadcasti128 m7, [pd_2482] + pmulld m4, m0, m6 ; 1321*in0 + pmulld m5, m3, m7 ; 2482*in3 + paddd m4, m5 ; 1321*in0 + 2482*in3 + pmulld m5, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m6 ; pd_3803 + pmulld m6, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [pd_m3344] + psubd m5, m6 ; 2482*in0 - 1321*in2 + vpbroadcastd m6, [pd_2048] + psubd m5, m3 ; t1 + pmulld m2, m0 ; t2 + pmulld m1, m0 ; -t3 + paddd m4, m7 ; t0 + paddd m5, m6 + paddd m3, m4, m5 + paddd m4, m6 + psubd m4, m1 ; out0 (unshifted) + psubd m5, m1 ; out1 (unshifted) + paddd m2, m6 ; out2 (unshifted) + paddd m3, m1 ; out3 (unshifted) + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_16bpc, 0, 5, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).main + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_16bpc).pass2_main + vpermq m2, m0, q2031 + vpermq m0, m1, q2031 + jmp m(iadst_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m4, [pd_2896] + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpermq m2, [cq+32*2], q3120 + vpermq m3, [cq+32*3], q3120 + vpbroadcastd m7, [pd_2048] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {paddd x, x }, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m4, [pw_1697x8] + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m1, m4, m0 + pmulhrsw m4, m2 + paddsw m0, m1 + paddsw m2, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + packssdw m7, m7 ; pw_2048 + lea r6, [dstq+strideq*2] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova xm2, [dstq+strideq*0] + vinserti128 m2, [r6 +strideq*0], 1 + mova xm3, [dstq+strideq*1] + vinserti128 m3, [r6 +strideq*1], 1 + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 
0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [r6 +strideq*0], m0, 1 + vextracti128 [r6 +strideq*1], m1, 1 + RET + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 8x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 +.dconly: + add r6d, 6144 + sar r6d, 13 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 + vpbroadcastd m3, [pixel_max] + pxor m2, m2 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + paddw m1, m0 + pmaxsw m1, m2 + pminsw m1, m3 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2] + ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a + psubd m%9, m%3, m%7 ; t6 + paddd m%3, m%7 ; t2 + psubd m%7, m%1, m%5 ; t4 + paddd m%1, m%5 ; t0 + psubd m%5, m%6, m%2 ; t7 + paddd m%6, m%2 ; t3 + psubd m%2, m%8, m%4 ; t5 + paddd m%8, m%4 ; t1 + REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a + psubd m%10, m%7, m%9 ; t7 + paddd m%7, m%9 ; out6 + vpbroadcastd m%9, [pd_2896] + psubd m%4, m%8, m%6 ; t3 + paddd m%8, m%6 ; -out7 + psubd m%6, m%1, m%3 ; t2 + paddd m%1, m%3 ; out0 + psubd m%3, m%2, m%5 ; t6 + paddd m%2, m%5 ; -out1 + REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 + REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 + REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 + psubd m%5, m%6, m%4 ; (t2 - t3) * 2896 + paddd m%4, m%6 ; (t2 + t3) * 2896 + psubd m%6, m%3, m%10 ; (t6 - t7) * 2896 + paddd m%3, m%10 ; (t6 + t7) * 2896 +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + call .main + call .round_shift1 + jmp tx2q +.pass2: + call .transpose_8x8_packed + call m(idct_8x8_internal_8bpc).main + vpbroadcastd m12, [pw_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_8x4 + RET +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_max] + lea r6, [strideq*3] + pxor m10, m10 +.write_8x4: + mova xm8, [dstq+strideq*0] + vinserti128 m8, [dstq+strideq*1], 1 + mova xm9, [dstq+strideq*2] + vinserti128 m9, [dstq+r6 ], 1 + mova [cq+32*0], m10 + mova [cq+32*1], m10 + mova [cq+32*2], m10 + mova [cq+32*3], m10 + add cq, 32*4 + paddw m0, m8 + paddw m1, m9 + pmaxsw m0, m10 + pmaxsw m1, m10 + pminsw m0, m11 + pminsw m1, m11 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + 
vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.transpose_8x8_packed: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea rax, [deint_shuf+128] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m4, m1 + punpckldq m4, m1 + vinserti128 m1, m3, xm2, 1 + vperm2i128 m3, m2, 0x31 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret +ALIGN function_align +.main_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main: + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + vpbroadcastd m3, [pd_2896] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, m0, m4, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +ALIGN function_align +.round_shift1: + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call .main + call .main_end + jmp tx2q +.pass2: + call m(idct_8x8_internal_16bpc).transpose_8x8_packed + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m5, [pw_2048] + vpbroadcastd xm12, [pw_4096] + psubw m12, m5 + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +ALIGN function_align +.main: + mova m0, [cq+32*0] + mova m7, [cq+32*7] + mova m1, [cq+32*1] + mova m6, [cq+32*6] + mova m2, [cq+32*2] + mova m5, [cq+32*5] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] +.main2: + IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + psrld m8, 11 ; pd_1 + vpbroadcastd m9, [pd_6144] + ret +ALIGN function_align +.main_end: + paddd m0, m8 + psubd m1, m8, m1 + paddd m6, m8 + psubd m7, m8, m7 + REPX {psrad x, 1 }, m0, m1, m6, m7 + ; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13 + ; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13 + psubd m8, m9, m8 ; pd_6143 + paddd m2, m9 + psubd m3, m8, m3 + paddd m4, m9 + psubd m5, m8, m5 + REPX {psrad x, 13}, m2, m3, m4, m5 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_8x8_internal_16bpc).main + call .main_end + jmp tx2q +.pass2: + call 
m(idct_8x8_internal_16bpc).transpose_8x8_packed + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m12, [pw_2048] + vpbroadcastd xm5, [pw_4096] + psubw m12, m5 + vpermq m8, m3, q2031 + vpermq m9, m2, q2031 + vpermq m2, m1, q2031 + vpermq m3, m0, q2031 + pmulhrsw m0, m8, m12 + pmulhrsw m1, m9, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +ALIGN function_align +.main_end: + paddd m10, m8, m0 + psubd m0, m8, m7 + psubd m7, m8, m1 + paddd m1, m8, m6 + psrad m0, 1 + psrad m1, 1 + psrad m6, m7, 1 + psrad m7, m10, 1 + psubd m8, m9, m8 ; pd_6143 + psubd m10, m8, m5 + paddd m5, m9, m2 + psubd m2, m8, m3 + paddd m3, m9, m4 + psrad m4, m2, 13 + psrad m2, m10, 13 + psrad m3, 13 + psrad m5, 13 + ret + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m12, [pw_4096] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m1 + punpckhdq m4, m1 + punpckhqdq m1, m0, m2 ; 1 5 + punpcklqdq m0, m2 ; 0 4 + punpcklqdq m2, m3, m4 ; 2 6 + punpckhqdq m3, m4 ; 3 7 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_2x8x2_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_2x8x2_zero + RET +.write_2x8x2_start: + vpbroadcastd m7, [pixel_max] + lea r6, [strideq*5] + pxor m6, m6 +.write_2x8x2_zero: + mova [cq+32*0], m6 + mova [cq+32*1], m6 + mova [cq+32*2], m6 + mova [cq+32*3], m6 + add cq, 32*4 +.write_2x8x2: + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + mova xm5, [dstq+strideq*1] + vinserti128 m5, [dstq+r6 ], 1 + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m6 + pmaxsw m1, m6 + pminsw m0, m7 + pminsw m1, m7 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*4], m0, 1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*2] + ret + +%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, 35 +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +cglobal idct_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + 
call .transpose + call m(idct_8x16_internal_8bpc).main + vpbroadcastd m12, [pw_2048] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7 +.end: + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + pmulhrsw m0, m4, m12 + pmulhrsw m1, m5, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + pmulhrsw m0, m6, m12 + pmulhrsw m1, m7, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +ALIGN function_align +.transpose: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 + lea rax, [deint_shuf+128] + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m3, m6 + punpckldq m3, m6 + punpckhdq m6, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + vperm2i128 m2, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vperm2i128 m3, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m7, m5, m6, 0x31 + vinserti128 m5, xm6, 1 + vperm2i128 m6, m8, m4, 0x31 + vinserti128 m4, m8, xm4, 1 + ret +ALIGN function_align +.pass1_main: + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct_8x8_internal_16bpc).main_rect2 + jmp m(idct_8x8_internal_16bpc).round_shift1 +ALIGN function_align +.main_evenhalf: + paddd m1, m6, m7 ; idct8 out1 + psubd m6, m7 ; idct8 out6 + psubd m7, m0, m9 ; idct8 out7 + paddd m0, m9 ; idct8 out0 + paddd m2, m5, m4 ; idct8 out2 + psubd m5, m4 ; idct8 out5 + psubd m4, m3, m8 ; idct8 out4 + paddd m3, m8 ; idct8 out3 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +.main_oddhalf_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_fast: ; lower half zero + vpbroadcastd m7, [pd_4076] + vpbroadcastd m8, [pd_401] + vpbroadcastd m6, [pd_m1189] + vpbroadcastd m9, [pd_3920] + vpbroadcastd m5, [pd_3612] + vpbroadcastd m10, [pd_1931] + vpbroadcastd m4, [pd_m2598] + vpbroadcastd m15, [pd_3166] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_fast2 +.main_oddhalf_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf: + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a +.main_oddhalf_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t9 + paddd m0, m4 ; t8 + psubd m4, m6, m2 ; t10 + paddd m2, m6 ; t11 + psubd m6, m1, m5 ; t13 + paddd m5, m1 ; t12 + psubd m1, m7, m3 ; t14 + paddd m7, m3 ; t15 + REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 + REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 
15, 4 + psubd m3, m1, m4 ; t10 + paddd m1, m4 ; t9 + psubd m4, m0, m2 ; t11a + paddd m0, m2 ; t8a + psubd m2, m8, m6 ; t13 + paddd m6, m8 ; t14 + psubd m8, m7, m5 ; t12a + paddd m7, m5 ; t15a + REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pmulld x, m14}, m2, m8, m3, m4 + paddd m2, m11 + paddd m8, m11 + paddd m5, m2, m3 ; t13a + psubd m2, m3 ; t10a + psubd m3, m8, m4 ; t11 + paddd m4, m8 ; t12 + REPX {psrad x, 12}, m5, m2, m3, m4 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, 35 + +cglobal iadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + call m(iadst_8x8_internal_16bpc).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + call m(iadst_8x8_internal_16bpc).main_end + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + call m(iadst_8x8_internal_16bpc).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_8x16_internal_16bpc).transpose + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end + vpbroadcastd m8, [pw_2048] + vpbroadcastd xm12, [pw_4096] + REPX {vpermq x, x, q2031}, m0, m1, m2, m3 + REPX {vpermq x, x, q3120}, m4, m5, m6, m7 + psubw m12, m8 + jmp m(idct_8x16_internal_16bpc).end +ALIGN function_align +.pass1_main: + pmulld m0, m14, [cq+32* 0] + pmulld m7, m14, [cq+32*14] + pmulld m1, m14, [cq+32* 2] + pmulld m6, m14, [cq+32*12] + pmulld m2, m14, [cq+32* 4] + pmulld m5, m14, [cq+32*10] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp m(iadst_8x8_internal_16bpc).main2 + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, 35 + +cglobal iflipadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + cmp eobd, 43 + jl .fast + add cq, 32 + call m(iadst_8x16_internal_16bpc).pass1_main + call m(iflipadst_8x8_internal_16bpc).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call m(iadst_8x16_internal_16bpc).pass1_main + call m(iflipadst_8x8_internal_16bpc).main_end + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call m(iadst_8x16_internal_16bpc).pass1_main + call m(iflipadst_8x8_internal_16bpc).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + 
jmp tx2q +.pass2: + call m(idct_8x16_internal_16bpc).transpose + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end + vpbroadcastd m12, [pw_2048] + vpbroadcastd xm13, [pw_4096] + mova m11, m0 + vpermq m0, m7, q2031 + mova m10, m1 + vpermq m1, m6, q2031 + mova m9, m2 + vpermq m2, m5, q2031 + mova m8, m3 + vpermq m3, m4, q2031 + vpermq m4, m8, q3120 + vpermq m5, m9, q3120 + vpermq m6, m10, q3120 + vpermq m7, m11, q3120 + psubw m12, m13 + jmp m(idct_8x16_internal_16bpc).end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 0] + pmulld m8, m15, [cq+32* 1] + pmulld m1, m15, [cq+32* 2] + pmulld m9, m15, [cq+32* 3] + pmulld m2, m15, [cq+32* 4] + pmulld m10, m15, [cq+32* 5] + pmulld m3, m15, [cq+32* 6] + pmulld m11, m15, [cq+32* 7] + pmulld m4, m15, [cq+32* 8] + pmulld m12, m15, [cq+32* 9] + pmulld m5, m15, [cq+32*10] + pmulld m13, m15, [cq+32*11] + pmulld m6, m15, [cq+32*12] + pmulld m14, m15, [cq+32*13] + pmulld m7, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [cq], m7 + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 + vpbroadcastd m8, [pw_1697x16] + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + punpckhwd m9, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m4, m5 + punpcklwd m4, m5 + punpcklwd m5, m2, m3 + punpckhwd m2, m3 + vpbroadcastd m12, [pw_2048] + punpckhdq m3, m0, m5 + punpckldq m0, m5 + punpckhdq m11, m9, m2 + punpckldq m9, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m7, m1 + punpckhdq m7, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m8, m9, m6 + punpckhqdq m9, m6 + punpcklqdq m10, m11, m7 + punpckhqdq m11, m7 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(iidentity_8x8_internal_16bpc).write_2x8x2_start + pmulhrsw m0, m12, m2 + pmulhrsw m1, m12, m3 + call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + pmulhrsw m0, m12, m8 + pmulhrsw m1, m12, m9 + lea dstq, [dstq+strideq*4] + call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + pmulhrsw m0, m12, m10 + pmulhrsw m1, m12, m11 + call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero + RET + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 16x4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 4 +.dconly: + add r6d, 6144 + sar r6d, 13 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 + vpbroadcastd m4, [pixel_max] + pxor m3, m3 +.dconly_loop: + paddw m1, m0, [dstq+strideq*0] + paddw m2, m0, [dstq+strideq*1] + pmaxsw m1, m3 + pmaxsw m2, m3 + pminsw m1, m4 + pminsw m2, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN 
dct, dct +INV_TXFM_16X4_FN dct, identity +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst + +cglobal idct_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 4] + vbroadcasti128 m1, [cq+16* 2] + vbroadcasti128 m7, [cq+16* 6] + vbroadcasti128 m5, [cq+16*10] + vbroadcasti128 m2, [cq+16* 8] + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m3, [cq+16*14] + shufpd m0, m4, 0x0c ; 0 4 + shufpd m1, m5, 0x0c ; 2 10 + shufpd m2, m6, 0x0c ; 8 12 + shufpd m3, m7, 0x0c ; 14 6 + vpbroadcastd m7, [pd_2048] + call m(idct_8x4_internal_16bpc).main + pcmpeqd m6, m6 + psubd m0, m6 + psubd m2, m6 + psubd m3, m0, m4 ; idct8 out7 out6 + paddd m0, m4 ; idct8 out0 out1 + paddd m1, m2, m5 ; idct8 out3 out2 + psubd m2, m5 ; idct8 out4 out5 + vbroadcasti128 m10, [cq+16* 1] + vbroadcasti128 m4, [cq+16* 5] + vbroadcasti128 m11, [cq+16*15] + vbroadcasti128 m5, [cq+16*11] + shufpd m10, m4, 0x0c ; 1 5 + shufpd m11, m5, 0x0c ; 15 11 + vbroadcasti128 m5, [cq+16* 9] + vbroadcasti128 m4, [cq+16*13] + shufpd m5, m4, 0x0c ; 9 13 + vbroadcasti128 m6, [cq+16* 7] + vbroadcasti128 m4, [cq+16* 3] + shufpd m6, m4, 0x0c ; 7 3 + ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 + ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 + psubd m4, m10, m5 ; t9 -t10 + paddd m10, m5 ; t8 t11 + psubd m5, m11, m6 ; t14 -t13 + paddd m11, m6 ; t15 t12 + REPX {pmaxsd x, m8}, m4, m5, m10, m11 + REPX {pminsd x, m9}, m4, m5, m10, m11 + ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2 + vpbroadcastd m12, [pd_2896] + punpckhqdq m6, m11, m5 + punpcklqdq m11, m4 + punpckhqdq m4, m10, m4 + punpcklqdq m10, m5 + psubd m5, m11, m6 ; t12a t13 + paddd m11, m6 ; t15a t14 + psubd m6, m10, m4 ; t11a t10 + paddd m10, m4 ; t8a t9 + REPX {pmaxsd x, m8}, m5, m6 + REPX {pminsd x, m9}, m5, m6 + pmulld m5, m12 + pmulld m6, m12 + REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10 + REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10 + paddd m5, m7 + psubd m4, m5, m6 + paddd m5, m6 + psrad m4, 12 ; t11 t10a + psrad m5, 12 ; t12 t13a + psubd m7, m0, m11 ; out15 out14 + paddd m0, m11 ; out0 out1 + psubd m6, m1, m5 ; out12 out13 + paddd m1, m5 ; out3 out2 + psubd m5, m2, m4 ; out11 out10 + paddd m2, m4 ; out4 out5 + psubd m4, m3, m10 ; out8 out9 + paddd m3, m10 ; out7 out6 + REPX {pshufd x, x, q1032}, m1, m3, m5, m7 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .transpose_4x16_packed + lea rax, [deint_shuf+128] + call m(idct_16x4_internal_8bpc).main +.end: + vpbroadcastd m4, [pw_2048] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 +.end2: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] +.end3: + lea r6, [dstq+strideq*2] + paddw m2, [r6 +strideq*0] + paddw m3, [r6 +strideq*1] + vpbroadcastd m5, [pixel_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + REPX {pminsw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [r6 +strideq*0], m2 + mova [r6 +strideq*1], m3 + RET +ALIGN function_align +.transpose_4x16_packed: + vbroadcasti128 m8, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + REPX {pshufb x, m8}, m0, m2, m4, m6 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m2, m4, m6 + punpcklqdq m4, m6 + vperm2i128 m3, m1, m2, 0x31 + vinserti128 m1, xm2, 1 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, 
identity + +cglobal iadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_4x16_internal_16bpc).main + psrad m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + paddd m4, m5, m11 + paddd m5, m6, m11 + paddd m6, m7, m11 + paddd m7, m8, m11 +.pass1_end: + REPX {pshufd x, x, q1032}, m0, m2, m4, m6 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call m(idct_16x4_internal_16bpc).transpose_4x16_packed + lea rax, [deint_shuf+128] + call m(iadst_16x4_internal_8bpc).main + jmp m(idct_16x4_internal_16bpc).end +ALIGN function_align +.main: + vbroadcasti128 m6, [pd_1321] + mova m0, [cq+32*0] + mova m1, [cq+32*1] + vbroadcasti128 m7, [pd_2482] + mova m2, [cq+32*6] + mova m3, [cq+32*7] + pmulld m4, m0, m6 + pmulld m5, m1, m6 ; 1321*in0 + pmulld m9, m2, m7 + pmulld m8, m3, m7 ; 2482*in3 + paddd m4, m9 + paddd m8, m5 ; 1321*in0 + 2482*in3 + pmulld m5, m0, m7 + pmulld m9, m1, m7 ; 2482*in0 + paddd m0, m2 + paddd m1, m3 ; in0 + in3 + paddd m7, m6 ; pd_3803 + pmulld m2, m7 + pmulld m3, m7 ; 3803*in3 + psubd m5, m2 + psubd m9, m3 ; 2482*in0 - 3803*in3 + mova m2, [cq+32*4] + pmulld m10, m7, m2 + pmulld m3, m6, m2 + psubd m2, m0 + mova m0, [cq+32*5] + pmulld m7, m0 ; 3803*in2 + pmulld m6, m0 ; 1321*in2 + psubd m0, m1 ; in2 - in0 - in3 + vpbroadcastd m1, [pd_m3344] + paddd m4, m10 + paddd m7, m8 ; t0 + psubd m5, m3 + psubd m9, m6 ; t1 + vpbroadcastd m6, [pd_6144] + pmulld m2, m1 + pmulld m0, m1 ; t2 + pmulld m3, m1, [cq+32*2] + pmulld m1, [cq+32*3] ; -t3 + paddd m5, m6 + paddd m9, m6 + paddd m10, m4, m5 + paddd m4, m6 + paddd m8, m7, m6 + paddd m7, m9 + psubd m4, m3 ; out0 (unshifted) + psubd m5, m3 ; out1 (unshifted) + paddd m2, m6 ; out2 (unshifted) + paddd m3, m10 ; out3 (unshifted) + psubd m8, m1 ; out4 (unshifted) + psubd m9, m1 ; out5 (unshifted) + paddd m6, m0 ; out6 (unshifted) + paddd m7, m1 ; out7 (unshifted) + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_4x16_internal_16bpc).main + psrad m11, 11 ; pd_1 + paddd m4, m3, m11 + paddd m3, m5, m11 + paddd m5, m2, m11 + paddd m2, m6, m11 + paddd m6, m1, m11 + paddd m1, m7, m11 + paddd m7, m0, m11 + paddd m0, m8, m11 + jmp m(iadst_16x4_internal_16bpc).pass1_end +.pass2: + call m(idct_16x4_internal_16bpc).transpose_4x16_packed + lea rax, [deint_shuf+128] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m4, [pw_2048] + pmulhrsw m5, m3, m4 + pmulhrsw m6, m2, m4 + pmulhrsw m2, m1, m4 + pmulhrsw m3, m0, m4 + paddw m0, m5, [dstq+strideq*0] + paddw m1, m6, [dstq+strideq*1] + jmp m(idct_16x4_internal_16bpc).end3 + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [pd_11586] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m4, [cq+32*4], q3120 ; 8 9 + vpermq m5, [cq+32*5], q3120 ; a b + vpermq m6, [cq+32*6], q3120 ; c d + vpermq m7, [cq+32*7], q3120 ; e f + vpbroadcastd m9, [pd_6144] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call m(idct_16x4_internal_16bpc).transpose_4x16_packed + vpbroadcastd m7, 
[pw_1697x8] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_16x4_internal_16bpc).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 16x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst + +cglobal idct_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m14, [pd_2896] + pmulld m0, m14, [cq+32* 1] + pmulld m1, m14, [cq+32* 3] + pmulld m2, m14, [cq+32* 5] + pmulld m3, m14, [cq+32* 7] + pmulld m4, m14, [cq+32* 9] + pmulld m5, m14, [cq+32*11] + pmulld m6, m14, [cq+32*13] + pmulld m7, m14, [cq+32*15] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + lea r6, [rsp+32*4] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + psrld m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m10, [pw_2048] +.end: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call .write_16x4_start + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m10 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m10 + call .write_16x4_zero + RET +ALIGN function_align +.transpose: + lea rax, [deint_shuf+128] +.transpose2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 +.transpose3: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m4, m6 + punpckldq m4, m6 + punpckldq m6, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpcklqdq m5, m6, m3 + punpckhqdq m6, m3 + punpckhqdq m3, m2, m7 + punpcklqdq m2, m7 + punpcklqdq m7, m8, m1 + punpckhqdq m8, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m5, 0x31 + vinserti128 m0, xm5, 1 + vperm2i128 m5, m1, m6, 0x31 + vinserti128 m1, xm6, 1 + vperm2i128 m6, m2, m7, 0x31 + vinserti128 m2, xm7, 1 + vperm2i128 m7, m3, m8, 0x31 + vinserti128 m3, xm8, 1 + ret +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_max] + 
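+; write_16x4_start/write_16x4: add the four residual rows in m0-m3 to the +; corresponding destination rows, clamp each sample to [0, pixel_max] with +; pmaxsw/pminsw, then store; write_16x4_zero additionally clears the eight +; consumed coefficient rows in cq before falling through.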
lea r3, [strideq*3] + pxor m8, m8 +.write_16x4_zero: + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7 + add cq, 32*8 +.write_16x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3 ] + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + lea r6, [rsp+32*4] + call .main + vpbroadcastd m14, [pd_6144] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_6143 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp tx2q +.pass2: + call m(idct_16x8_internal_16bpc).transpose + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + vpbroadcastd m10, [pw_2048] + pxor m11, m11 + psubw m11, m10 + pmulhrsw m0, m10 + pmulhrsw m1, m11 + pmulhrsw m2, m10 + pmulhrsw m3, m11 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m11 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m11 + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET +ALIGN function_align +.main: + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 2] + pmulld m1, m15, [cq+32*13] + pmulld m2, m15, [cq+32* 6] + pmulld m3, m15, [cq+32* 9] + pmulld m4, m15, [cq+32*10] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32*14] + pmulld m7, m15, [cq+32* 1] + vpbroadcastd m12, [pd_2048] + vpbroadcastd m13, [clip_min] + vpbroadcastd m14, [clip_max] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + call .main_part1 + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32*15] + pmulld m2, m15, [cq+32* 4] + pmulld m3, m15, [cq+32*11] + pmulld m4, m15, [cq+32* 8] + pmulld m5, m15, [cq+32* 7] + pmulld m6, m15, [cq+32*12] + pmulld m7, m15, [cq+32* 3] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_part2: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 + psubd m8, m0, m4 ; t8a + paddd m0, m4 ; t0a + psubd m4, m1, m5 ; t9a + paddd m1, m5 ; t1a + psubd m5, m2, m6 ; t12a + paddd m2, m6 ; t4a + psubd m6, m3, m7 ; t13a + paddd m7, m3 ; t5a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [pd_4017] + vpbroadcastd m10, [pd_799] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 + psubd m3, m0, m2 ; t4 + paddd m0, m2 ; t0 + psubd m2, m1, m7 ; t5 + paddd m1, m7 ; t1 + psubd m7, m4, m6 ; t12a + paddd m4, m6 ; t8a + psubd m6, m8, m5 ; t13a + paddd m5, m8 ; t9a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 + vpbroadcastd m11, [pd_3784] + 
vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11 + ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11 + pminsd m10, m14, [r6-32*4] ; t2 + pminsd m8, m14, [r6-32*3] ; t3 + psubd m9, m0, m10 ; t2a + paddd m0, m10 ; out0 + psubd m10, m1, m8 ; t3a + paddd m1, m8 ; -out15 + pmaxsd m9, m13 + pmaxsd m10, m13 + pminsd m9, m14 + pminsd m10, m14 + pmulld m9, m15 + pmulld m10, m15 + mova [r6-32*4], m1 + mova m11, [r6-32*1] ; t7a + mova m1, [r6-32*2] ; t6a + psubd m8, m3, m11 ; t7 + paddd m11, m3 ; out12 + paddd m3, m2, m1 ; -out3 + psubd m2, m1 ; t6 + pmaxsd m8, m13 + pmaxsd m2, m13 + pminsd m8, m14 + pminsd m2, m14 + pmulld m8, m15 + mova [r6-32*1], m11 + mova [r6-32*3], m2 + mova m1, [r6+32*3] ; t15 + mova m2, [r6+32*2] ; t14 + paddd m12, m7, m1 ; -out13 + psubd m7, m1 ; t15a + psubd m11, m6, m2 ; t14a + paddd m2, m6 ; out2 + pmaxsd m7, m13 + pmaxsd m11, m13 + pminsd m7, m14 + pminsd m11, m14 + pmulld m7, m15 + pmulld m11, m15 + mova [r6-32*2], m12 + pminsd m1, m14, [r6+32*0] ; t10a + pminsd m12, m14, [r6+32*1] ; t11a + psubd m6, m4, m1 ; t10 + paddd m1, m4 ; -out1 + psubd m4, m5, m12 ; t11 + paddd m5, m12 ; out14 + pmulld m12, m15, [r6-32*3] ; t6 + pmaxsd m6, m13 + pmaxsd m4, m13 + pminsd m6, m14 + pminsd m4, m14 + pmulld m6, m15 + pmulld m4, m15 + mova [r6-32*3], m5 + paddd m5, m11, m7 ; -out5 (unshifted) + psubd m11, m7 ; out10 (unshifted) + paddd m7, m9, m10 ; -out7 (unshifted) + psubd m9, m10 ; out8 (unshifted) + psubd m10, m6, m4 ; -out9 (unshifted) + paddd m6, m4 ; out6 (unshifted) + paddd m4, m12, m8 ; out4 (unshifted) + psubd m12, m8 ; -out11 (unshifted) + ret +.main_part1: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 + psubd m8, m0, m4 ; t10a + paddd m0, m4 ; t2a + psubd m4, m1, m5 ; t11a + paddd m1, m5 ; t3a + psubd m5, m2, m6 ; t14a + paddd m2, m6 ; t6a + psubd m6, m3, m7 ; t15a + paddd m7, m3 ; t7a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [pd_2276] + vpbroadcastd m10, [pd_3406] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 + psubd m3, m0, m2 ; t6 + paddd m0, m2 ; t2 + psubd m2, m1, m7 ; t7 + paddd m1, m7 ; t3 + psubd m7, m4, m6 ; t14a + paddd m4, m6 ; t10a + psubd m6, m8, m5 ; t15a + paddd m5, m8 ; t11a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later + vpbroadcastd m11, [pd_1567] + vpbroadcastd m10, [pd_3784] + ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + lea r6, [rsp+32*4] + call m(iadst_16x8_internal_16bpc).main + vpbroadcastd m14, [pd_6144] + psrld m15, 11 + psubd m13, m14, m15 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + 
paddd m1, [r6-32*3] + jmp m(iadst_16x8_internal_16bpc).pass1_end +.pass2: + call m(idct_16x8_internal_16bpc).transpose + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + vpbroadcastd m10, [pw_2048] + pxor m11, m11 + psubw m11, m10 + mova m12, m0 + pmulhrsw m0, m7, m11 + mova m7, m1 + pmulhrsw m1, m6, m10 + mova m6, m2 + pmulhrsw m2, m5, m11 + mova m5, m3 + pmulhrsw m3, m4, m10 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m5, m11 + pmulhrsw m1, m6, m10 + pmulhrsw m2, m7, m11 + pmulhrsw m3, m12, m10 + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32* 1] + pmulld m2, m15, [cq+32* 2] + pmulld m3, m15, [cq+32* 3] + pmulld m4, m15, [cq+32* 4] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32* 6] + pmulld m7, m15, [cq+32* 7] + pmulld m8, m15, [cq+32* 8] + pmulld m9, m15, [cq+32* 9] + pmulld m10, m15, [cq+32*10] + pmulld m11, m15, [cq+32*11] + pmulld m12, m15, [cq+32*12] + pmulld m13, m15, [cq+32*13] + pmulld m14, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [rsp], m7 + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + mova [rsp], m15 + vpbroadcastd m15, [pd_11586] + REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pmulld m15, [rsp] + mova [rsp], m7 + vpbroadcastd m7, [pd_6144] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x8_internal_16bpc).transpose + vpbroadcastd m10, [pw_4096] + jmp m(idct_16x8_internal_16bpc).end + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 10240 + sar r6d, 14 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity, 28 +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst + +cglobal idct_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + mova m10, [r6-32*4] + mova m9, [r6-32*3] + mova m8, [r6-32*2] + psubd m15, m0, m10 ; out15 + paddd m0, m10 ; out0 + psubd m10, m1, m9 ; out14 + paddd m1, m9 ; out1 + psubd m9, m2, m8 ; out13 + paddd m2, m8 ; out2 + REPX {psrad x, 2}, m0, m1, m2 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova m2, [r6-32*1] + mova m1, [r6+32*0] + mova m0, [r6+32*1] + REPX {psrad x, 2}, m9, m10, m15 + psubd m8, m3, m2 ; out12 + paddd m3, m2 ; out3 + psubd m2, m4, m1 ; out11 + paddd m4, m1 ; out4 + psubd m1, m5, m0 ; out10 + paddd m5, m0 ; out5 + REPX {psrad x, 2}, m3, m4, m5 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova m4, [r6+32*2] + mova m3, [r6+32*3] + REPX {psrad x, 2}, m1, m2, m8 + 
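+; pass-1 rounding: .main already added pd_2 to the even-half terms, so each +; butterfly result combined here is effectively (even +/- odd + 2) >> 2.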
psubd m5, m6, m4 ; out9 + paddd m6, m4 ; out6 + psubd m4, m7, m3 ; out8 + paddd m7, m3 ; out7 + REPX {psrad x, 2}, m6, m7, m4, m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + mova [r6-32*4], m4 + mova [r6-32*3], m5 + mova [r6-32*2], m1 + mova [r6-32*1], m2 + mova [r6+32*0], m8 + mova [r6+32*1], m9 + mova [r6+32*2], m10 + mova [r6+32*3], m15 +.fast: + add r6, 32*8 + call .main + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + sub r6, 32*8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + lea rax, [pw_5+128] + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] +.end: + call .write_16x16 + RET +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_2048] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct_16x8_internal_16bpc).write_16x4_zero +ALIGN function_align +.transpose: + test eobd, eobd + jl .transpose_fast + packssdw m8, [r6-32*4] + packssdw m9, [r6-32*3] + packssdw m10, [r6-32*2] + packssdw m11, [r6-32*1] + packssdw m12, [r6+32*0] + packssdw m13, [r6+32*1] + packssdw m14, [r6+32*2] + packssdw m15, [r6+32*3] + sub r6, 32*8 + packssdw m0, [r6-32*4] + packssdw m1, [r6-32*3] + packssdw m2, [r6-32*2] + packssdw m3, [r6-32*1] + packssdw m4, [r6+32*0] + packssdw m5, [r6+32*1] + packssdw m6, [r6+32*2] + packssdw m7, [r6+32*3] + mova [r6], m8 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m6, m7 + punpcklwd m6, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m7, m6 + punpckldq m7, m6 + punpckhdq m6, m4, m3 + punpckldq m4, m3 + punpckhqdq m3, m2, m1 + punpcklqdq m2, m1 + punpckhqdq m1, m0, m7 + punpcklqdq m0, m7 + punpcklqdq m7, m8, m6 + punpckhqdq m8, m6 + punpckhqdq m6, m5, m4 + punpcklqdq m5, m4 + mova m4, [r6] + mova [r6], m8 + punpcklwd m8, m4, m9 + punpckhwd m4, m9 + punpcklwd m9, m10, m11 + punpckhwd m10, m11 + punpckhwd m11, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m12, m13 + punpcklwd m12, m13 + punpckldq m13, m4, m10 + punpckhdq m4, m10 + punpckhdq m10, m8, m9 + punpckldq m8, m9 + punpckhdq m9, m12, m14 + punpckldq m12, m14 + punpckhdq m14, m15, m11 + punpckldq m15, m11 + punpckhqdq m11, m10, m9 + punpcklqdq m10, m9 + punpckhqdq m9, m8, m12 + punpcklqdq m8, m12 + punpcklqdq m12, m13, m15 + 
punpckhqdq m13, m15 + punpckhqdq m15, m4, m14 + punpcklqdq m14, m4, m14 + vperm2i128 m4, m0, m8, 0x31 + vinserti128 m0, xm8, 1 + vinserti128 m8, m5, xm12, 1 + vperm2i128 m12, m5, 0x13 + vperm2i128 m5, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vinserti128 m9, m6, xm13, 1 + vperm2i128 m13, m6, 0x13 + vperm2i128 m6, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vinserti128 m10, m7, xm14, 1 + vperm2i128 m14, m7, 0x13 + vperm2i128 m7, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + mova xm11, [r6] + vinserti128 m11, xm15, 1 + vinserti128 m15, [r6+16], 0 + ret +.transpose_fast: + call m(idct_16x8_internal_16bpc).transpose2 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + ret +ALIGN function_align +.main: + mova m0, [cq+64* 1] + mova m1, [cq+64* 3] + mova m2, [cq+64* 5] + mova m3, [cq+64* 7] + mova m4, [cq+64* 9] + mova m5, [cq+64*11] + mova m6, [cq+64*13] + mova m7, [cq+64*15] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf + psrld m10, m11, 10 ; pd_2 + REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_min] + vpbroadcastd m14, [clip_max] + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + vpbroadcastd m8, [pd_10240] + paddd m4, m8 + paddd m6, m8 + paddd m9, m8 + paddd m11, m8 + vpbroadcastd m8, [pd_10239] + psubd m5, m8, m5 + psubd m7, m8, m7 + psubd m10, m8, m10 + psubd m12, m8, m12 + REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + psrld m4, m15, 10 ; pd_2 + paddd m0, m4 + psubd m1, m4, m1 + paddd m2, m4 + psubd m3, m4, m3 + psubd m7, m4, [r6-32*4] + paddd m6, m4, [r6-32*3] + psubd m5, m4, [r6-32*2] + paddd m4, [r6-32*1] + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + add r6, 32*8 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova [r6-32*2], m11 + mova [r6-32*1], m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 +.fast: + add r6, 32*8 + call .main + vpbroadcastd m14, [pd_10240] + vpbroadcastd m13, [pd_10239] + psrld m15, 10 ; pd_2 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 + sub r6, 32*8 + jmp tx2q +.pass2: + call m(idct_16x16_internal_16bpc).transpose + lea rax, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end + mova [rsp+32*0], m8 + mova [rsp+32*2], m12 + mova [rsp+32*3], m13 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m12 + pmulhrsw m1, m13, [rsp+32*1] + mova [rsp+32*1], m9 + pmulhrsw m2, m12 + pmulhrsw m3, m13 + call 
m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m13, m7 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*0] + pmulhrsw m1, m13, [rsp+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m13, m11 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*2] + pmulhrsw m1, m13, [rsp+32*3] + pmulhrsw m2, m12, m14 + pmulhrsw m3, m13, m15 + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET +ALIGN function_align +.main: + mova m0, [cq+64* 2] + mova m1, [cq+64*13] + mova m2, [cq+64* 6] + mova m3, [cq+64* 9] + mova m4, [cq+64*10] + mova m5, [cq+64* 5] + mova m6, [cq+64*14] + mova m7, [cq+64* 1] + vpbroadcastd m12, [pd_2048] + call m(iadst_16x8_internal_16bpc).main_part1 + mova m0, [cq+64* 0] + mova m1, [cq+64*15] + mova m2, [cq+64* 4] + mova m3, [cq+64*11] + mova m4, [cq+64* 8] + mova m5, [cq+64* 7] + mova m6, [cq+64*12] + mova m7, [cq+64* 3] + jmp m(iadst_16x8_internal_16bpc).main_part2 + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_min] + vpbroadcastd m14, [clip_max] + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call m(iadst_16x16_internal_16bpc).main + sub cq, 32 + vpbroadcastd m8, [pd_10240] + paddd m11, m8 + paddd m9, m8 + paddd m6, m8 + paddd m4, m8 + vpbroadcastd m8, [pd_10239] + psubd m12, m8, m12 + psubd m10, m8, m10 + psubd m7, m8, m7 + psubd m5, m8, m5 + REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4 + mova [r6+32*0], m12 + mova [r6+32*1], m11 + mova [r6+32*2], m10 + mova [r6+32*3], m9 + psrld m9, m15, 10 ; pd_2 + psubd m3, m9, m3 + paddd m2, m9 + psubd m1, m9, m1 + paddd m0, m9 + psubd m12, m9, [r6-32*4] + paddd m11, m9, [r6-32*3] + psubd m10, m9, [r6-32*2] + paddd m9, [r6-32*1] + REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 + mova [r6-32*4], m12 + mova [r6-32*3], m11 + mova [r6-32*2], m10 + mova [r6-32*1], m9 + add r6, 32*8 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 +.fast: + add r6, 32*8 + call m(iadst_16x16_internal_16bpc).main + vpbroadcastd m14, [pd_10240] + vpbroadcastd m13, [pd_10239] + psrld m15, 10 ; pd_2 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + jmp m(iadst_16x16_internal_16bpc).pass1_end +.pass2: + call m(idct_16x16_internal_16bpc).transpose + lea rax, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end + mova [rsp+32*3], m3 + mova [rsp+32*2], m2 + mova [rsp+32*0], m0 + mova m2, m13 + mova m3, m12 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m13, m15 + pmulhrsw m1, m12, m14 + pmulhrsw m2, m13 + pmulhrsw m3, m12 + mova m14, m8 + mova m15, m9 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m13, m11 + pmulhrsw m1, m12, m10 + pmulhrsw m2, m13, m15 + pmulhrsw m3, m12, m14 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m13, m7 + pmulhrsw m1, m12, m6 + 
pmulhrsw m2, m13, m5 + pmulhrsw m3, m12, m4 + call m(idct_16x8_internal_16bpc).write_16x4_zero + pmulhrsw m0, m13, [rsp+32*3] + pmulhrsw m1, m12, [rsp+32*2] + pmulhrsw m2, m13, [rsp+32*1] + pmulhrsw m3, m12, [rsp+32*0] + call m(idct_16x8_internal_16bpc).write_16x4_zero + RET + +INV_TXFM_16X16_FN identity, dct, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_11586] + vpbroadcastd m7, [pd_10240] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + pmulld m0, m15, [cq+r3+32*33] + pmulld m1, m15, [cq+r3+32*35] + pmulld m2, m15, [cq+r3+32*37] + pmulld m3, m15, [cq+r3+32*39] + add r6, 32*4 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 14}, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + pmulld m0, m15, [cq+64* 0] + pmulld m1, m15, [cq+64* 1] + pmulld m2, m15, [cq+64* 2] + pmulld m3, m15, [cq+64* 3] + pmulld m4, m15, [cq+64* 4] + pmulld m5, m15, [cq+64* 5] + pmulld m6, m15, [cq+64* 6] + pmulld m8, m15, [cq+64* 7] + mova [cq], m8 + pmulld m8, m15, [cq+64* 8] + pmulld m9, m15, [cq+64* 9] + pmulld m10, m15, [cq+64*10] + pmulld m11, m15, [cq+64*11] + pmulld m12, m15, [cq+64*12] + pmulld m13, m15, [cq+64*13] + pmulld m14, m15, [cq+64*14] + pmulld m15, [cq+64*15] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x16_internal_16bpc).transpose + + mova [cq+32*0], m15 + mova [cq+32*1], m0 + vpbroadcastd m15, [pw_1697x16] + + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [cq+32*1] + mova [cq+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [cq+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + mova m1, [cq+32*1] + jmp m(idct_16x16_internal_16bpc).end + +%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift + mova m%4, [r6+32*(%1-4)] + mova m%2, [r5+32*(3-%1)] + mova m%5, [r4+32*(%1-4)] + psubd m%3, m%1, m%4 ; idct16 out15 - n + paddd m%1, m%4 ; idct16 out0 + n + pmaxsd m%1, m12 + pmaxsd m%3, m12 + pminsd m%1, m13 + pminsd m%3, m13 + paddd m%1, m11 + paddd m%3, m11 + psubd m%4, m%1, m%2 ; out31 - n + paddd m%1, m%2 ; out0 + n + paddd m%2, m%3, m%5 ; out15 - n + psubd m%3, m%5 ; out16 + n + REPX {psrad x, %6}, m%1, m%3, m%2, m%4 + packssdw m%1, m%3 ; out0 + n, out16 + n + packssdw m%2, m%4 ; out15 - n, out31 - n +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vbroadcasti128 m14, [idct32_shuf] + mov r4, cq + call .pass1_main + mova [rsp+32*0], m2 + mova [rsp+32*1], m3 + cmp eobd, 43 + jge .eob43 + pxor m4, m4 + REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 + jmp .pass1_end_fast +.eob43: + lea r6, [rsp+32*8] + mova [r6-32*4], m0 + mova [r6-32*3], m1 + call .pass1_main + mova [rsp+32*2], m2 + cmp eobd, 107 + jge .eob107 + mova m11, m3 + mova m2, m0 + mova m3, m1 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + pxor m4, m4 +.pass1_end_fast: + vpbroadcastd m10, [pw_2048] + lea rax, [deint_shuf+128] + REPX {mova x, m4}, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .end +.eob107: + mova [rsp+32*3], m3 + mova [r6-32*2], m0 + mova 
[r6-32*1], m1 + call .pass1_main + cmp eobd, 171 + jge .eob171 + pshufd m12, m2, q1032 + pshufd m13, m3, q1032 + mova m4, m0 + mova m5, m1 + pxor m6, m6 + REPX {mova x, m6}, m7, m14, m15 + jmp .pass1_end +.eob171: + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + call .pass1_main + pshufd m12, [r6+32*2], q1032 ; out19 out17 + pshufd m13, [r6+32*3], q1032 ; out23 out21 + mova m4, [r6+32*0] ; out16 out18 + mova m5, [r6+32*1] ; out20 out22 + pshufd m14, m2, q1032 ; out27 out25 + pshufd m15, m3, q1032 ; out31 out29 + mova m6, m0 ; out24 out26 + mova m7, m1 ; out28 out30 +.pass1_end: + mova m0, [r6-32*4] ; out0 out2 + mova m1, [r6-32*3] ; out4 out6 + mova m2, [r6-32*2] ; out8 out10 + mova m3, [r6-32*1] ; out12 out14 + lea rax, [deint_shuf+128] + mova m11, [rsp+32*3] ; out13 out15 + vpbroadcastd m10, [pw_2048] + call m(inv_txfm_add_dct_dct_8x32_8bpc).main +.end: ; [rsp+0*32] = m12 + vpbroadcastd m12, [pw_2048] + mov cq, r4 + mova [rsp+32*1], m8 + mova [rsp+32*2], m9 + mova [rsp+32*3], m10 + mova [rsp+32*4], m11 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4_start + vpermq m0, m2, q3120 + vpermq m1, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, [rsp+32*1], q3120 + vpermq m1, [rsp+32*2], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, [rsp+32*3], q3120 + vpermq m1, [rsp+32*4], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, [rsp+32*0], q3120 + vpermq m1, m13, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + vpermq m0, m14, q3120 + vpermq m1, m15, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_16bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + add r6d, 10240 + sar r6d, 14 + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly2 +ALIGN function_align +.pass1_main: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + add cq, 32 + call m(idct_8x8_internal_16bpc).main + psrld m1, m11, 10 ; pd_2 + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + pshufb m0, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m6, m14 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + vperm2i128 m1, m0, m2, 0x31 ; 4 6 + vinserti128 m0, xm2, 1 ; 0 2 + vinserti128 m2, m3, xm4, 1 ; 1 3 + vperm2i128 m3, m4, 0x31 ; 5 7 + ret +.main_oddhalf_part1_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part1_fast: ; lower half zero + vpbroadcastd m7, [pd_4091] + vpbroadcastd m8, [pd_201] + vpbroadcastd m6, [pd_m1380] + vpbroadcastd m9, [pd_3857] + vpbroadcastd m5, [pd_3703] + vpbroadcastd m10, [pd_1751] + 
vpbroadcastd m4, [pd_m2751] + vpbroadcastd m15, [pd_3035] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part1_fast2 +.main_oddhalf_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a +.main_oddhalf_part1_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t17 + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m7, m3 ; t30 + paddd m7, m3 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_4017] + vpbroadcastd m10, [pd_799] + ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m1 ; t28a + paddd m7, m1 ; t31a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m8, m2 ; t29 + paddd m8, m2 ; t30 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a + ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 + mova [r6-32*4], m0 + mova [r6-32*3], m5 + mova [r6-32*2], m4 + mova [r6-32*1], m6 + mova [r6+32*0], m3 + mova [r6+32*1], m1 + mova [r6+32*2], m8 + mova [r6+32*3], m7 + ret +.main_oddhalf_part2_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part2_fast: ; lower half zero + vpbroadcastd m7, [pd_m601] + vpbroadcastd m8, [pd_4052] + vpbroadcastd m6, [pd_3973] + vpbroadcastd m9, [pd_995] + vpbroadcastd m5, [pd_m2106] + vpbroadcastd m10, [pd_3513] + vpbroadcastd m4, [pd_3290] + vpbroadcastd m15, [pd_2440] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part2_fast2 +.main_oddhalf_part2_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a +.main_oddhalf_part2_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t25 + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m7, m3 ; t22 + paddd m7, m3 ; t23 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_2276] + vpbroadcastd m10, [pd_3406] + ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; t24a + psubd m6, 
m7, m1 ; t20a + paddd m7, m1 ; t23a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + psubd m4, m8, m2 ; t26 + paddd m8, m2 ; t25 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 + mova m9, [r6-32*4] ; t16a + mova m10, [r6-32*3] ; t17 + psubd m2, m9, m7 ; t23 + paddd m9, m7 ; t16 + psubd m7, m10, m5 ; t22a + paddd m10, m5 ; t17a + REPX {pmaxsd x, m12}, m9, m10, m2, m7 + REPX {pminsd x, m13}, m9, m10, m2, m7 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova m9, [r6-32*2] ; t18a + mova m10, [r6-32*1] ; t19 + psubd m5, m9, m1 ; t21 + paddd m9, m1 ; t18 + psubd m1, m10, m6 ; t20a + paddd m10, m6 ; t19a + REPX {pmaxsd x, m12}, m9, m10, m5, m1 + REPX {pminsd x, m13}, m9, m10, m5, m1 + mova [r6-32*2], m9 + mova [r6-32*1], m10 + mova m9, [r6+32*0] ; t28 + mova m10, [r6+32*1] ; t29a + psubd m6, m9, m3 ; t27a + paddd m9, m3 ; t28a + psubd m3, m10, m4 ; t26 + paddd m10, m4 ; t29 + REPX {pmaxsd x, m12}, m9, m10, m6, m3 + REPX {pminsd x, m13}, m9, m10, m6, m3 + REPX {pmulld x, m14}, m6, m3, m1, m5 + paddd m6, m11 + paddd m3, m11 + psubd m4, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m5 ; t21a + paddd m3, m5 ; t26a + REPX {psrad x, 12 }, m4, m1, m3, m6 + mova [r6+32*0], m4 + mova [r6+32*1], m1 + mova m4, [r6+32*2] ; t30 + mova m1, [r6+32*3] ; t31a + psubd m5, m4, m8 ; t25a + paddd m4, m8 ; t30a + psubd m8, m1, m0 ; t24 + paddd m1, m0 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m1 + REPX {pminsd x, m13}, m8, m5, m4, m1 + REPX {pmulld x, m14}, m5, m8, m7, m2 + paddd m5, m11 + paddd m8, m11 + psubd m0, m5, m7 ; t22 + paddd m5, m7 ; t25 + psubd m7, m8, m2 ; t23a + paddd m2, m8 ; t24a + REPX {psrad x, 12 }, m0, m7, m2, m5 + mova [r6+32*2], m0 + mova [r6+32*3], m7 + mov r4, r6 + add r6, 32*8 + mova [r6-32*4], m2 + mova [r6-32*3], m5 + mova [r6-32*2], m3 + mova [r6-32*1], m6 + mova [r6+32*0], m9 + mova [r6+32*1], m10 + mova [r6+32*2], m4 + mova [r6+32*3], m1 + mov r5, r6 + add r6, 32*8 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2 + IDCT32_END 1, 14, 8, 9, 10, 2 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 2 + IDCT32_END 3, 14, 8, 9, 10, 2 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 2 + IDCT32_END 5, 14, 8, 9, 10, 2 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 2 + IDCT32_END 7, 14, 8, 9, 10, 2 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 +.transpose: + punpckhdq m15, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m4, m6 + punpckldq m4, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpcklqdq m5, m2, m15 + punpckhqdq m2, m15 + punpckhqdq m15, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m6, m1 + punpcklqdq m6, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m7, 0x31 + vinserti128 m0, xm7, 1 + vperm2i128 m7, m3, m2, 0x31 + 
vinserti128 m3, xm2, 1 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m15, 0x31 + vinserti128 m1, xm15, 1 + ret + +cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m5, [pw_5] + vpbroadcastd m7, [pixel_max] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {paddsw x, m5}, m0, m1, m2, m3 + REPX {psraw x, 3 }, m0, m1, m2, m3 + call .main_zero + add cq, 32 + lea dstq, [dstq+strideq*8] + sub eobd, 64 + jge .loop + RET +ALIGN function_align +.main_zero: + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m1 + punpcklwd m2, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + paddw m0, m4 + mova xm4, [dstq+strideq*1] + vinserti128 m4, [dstq+r5 ], 1 + paddw m1, m4 + mova xm4, [dstq+strideq*2] + vinserti128 m4, [dstq+r6*2 ], 1 + paddw m2, m4 + mova xm4, [dstq+r6 ] + vinserti128 m4, [dstq+r4 ], 1 + paddw m3, m4 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + REPX {pminsw x, m7}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*4], m0, 1 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+r5 ], m1, 1 + mova [dstq+strideq*2], xm2 + vextracti128 [dstq+r6*2 ], m2, 1 + mova [dstq+r6 ], xm3 + vextracti128 [dstq+r4 ], m3, 1 + ret + +cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 8 +.dconly: + add r6d, 10240 + sar r6d, 14 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d + vpbroadcastw m0, xm0 + vpbroadcastd m4, [pixel_max] + pxor m3, m3 +.dconly_loop: + paddw m1, m0, [dstq+32*0] + paddw m2, m0, [dstq+32*1] + pmaxsw m1, m3 + pmaxsw m2, m3 + pminsw m1, m4 + pminsw m2, m4 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + mova m0, [cq+32* 1] + mova m1, [cq+32* 7] + mova m2, [cq+32* 9] + mova m3, [cq+32*15] + mova m4, [cq+32*17] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32*31] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1 + mova m0, [cq+32* 3] + mova m1, [cq+32* 5] + mova m2, [cq+32*11] + mova m3, [cq+32*13] + mova m4, [cq+32*19] + mova m5, [cq+32*21] + mova m6, [cq+32*27] + mova m7, [cq+32*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2 + mova m0, [cq+32* 2] + mova m1, [cq+32* 6] + mova m2, [cq+32*10] + mova m3, [cq+32*14] + mova m4, [cq+32*18] + mova m5, [cq+32*22] + mova m6, [cq+32*26] + mova m7, [cq+32*30] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+32* 0] + mova m1, [cq+32* 4] + mova m2, [cq+32* 8] + mova m3, [cq+32*12] + mova m4, [cq+32*16] + mova m5, [cq+32*20] + mova m6, [cq+32*24] + mova m7, [cq+32*28] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf 
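+; rows 1,3,...,31 were processed by main_oddhalf_part1/part2 above, rows +; 2,6,...,30 by the 16-point main_oddhalf and rows 0,4,...,28 by the idct8 +; main + main_evenhalf; main_end (shared with 8x32) merges them into the 32 +; packed idct outputs, and .pass2 is then run once per 16x8 half.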
+ call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end + lea rax, [deint_shuf+128] + vpbroadcastd m11, [pw_2048] + mov r4, dstq + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + lea dstq, [r4+32] + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct_16x8_internal_8bpc).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct_16x8_internal_16bpc).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct_16x8_internal_16bpc).write_16x4_zero + +cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m5, [pw_4096] + vpbroadcastd m7, [pixel_max] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+32*0] + packssdw m0, [cq+32*1] + mova m1, [cq+32*2] + packssdw m1, [cq+32*3] + REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 + add cq, 32*8 + mova m2, [cq-32*4] + packssdw m2, [cq-32*3] + mova m3, [cq-32*2] + packssdw m3, [cq-32*1] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 + call m(inv_txfm_add_identity_identity_8x32_16bpc).main + add dstq, 16 + sub eobd, 64 + jge .loop + RET + +%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 +%if %1 == 0 + pxor m6, m6 +%endif + pmulhrsw m%3, m15 + pmulhrsw m%1, m15 + paddw m%3, [dstq+%5] + paddw m%1, [r2+%6] + pmaxsw m%3, m6 + pmaxsw m%1, m6 + pminsw m%3, m7 + pminsw m%1, m7 + mova [dstq+%5], m%3 + mova [r2+%6], m%1 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*16] + lea r4, [r6+32*8] + lea r5, [r6+32*16] + call .main + sub eobd, 44 + jge .eob44 + vperm2i128 m2, m0, m3, 0x31 ; 5 + vinserti128 m0, xm3, 1 ; 1 + vperm2i128 m3, m1, m4, 0x31 ; 7 + vinserti128 m1, xm4, 1 ; 3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 + jmp .fast +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly +.eob44: + mova [r4+16*0], xm0 + mova [r4+16*1], xm3 + mova [r4+16*2], xm1 + mova [r4+16*3], xm4 + vextracti128 [r4+16*4], m0, 1 + vextracti128 [r4+16*5], m3, 1 + vextracti128 [r4+16*6], m1, 1 + vextracti128 [r4+16*7], m4, 1 + call .main + sub eobd, 107 + jge .eob151 + vperm2i128 m7, m1, m4, 0x31 ; 15 + vinserti128 m5, m1, xm4, 1 ; 11 + vperm2i128 m6, m0, m3, 0x31 ; 13 + vinserti128 m4, m0, xm3, 1 ; 9 + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] +.fast: + lea rax, [pw_5+128] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct16 +.eob151: + mova [r4-16*8], xm0 + mova [r4-16*7], xm3 + mova [r4-16*6], xm1 + mova [r4-16*5], xm4 + vextracti128 [r4-16*4], m0, 1 + vextracti128 [r4-16*3], m3, 1 + vextracti128 [r4-16*2], m1, 1 + vextracti128 [r4-16*1], m4, 1 + call .main + sub eobd, 128 + jge 
.eob279 + vperm2i128 m10, m0, m3, 0x31 ; 21 + vinserti128 m8, m0, xm3, 1 ; 17 + vperm2i128 m11, m1, m4, 0x31 ; 23 + vinserti128 m9, m1, xm4, 1 ; 19 + pxor m12, m12 + REPX {mova x, m12}, m13, m14, m15 + REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 + jmp .full +.eob279: + mova [r5+16*0], xm0 + mova [r5+16*1], xm3 + mova [r5+16*2], xm1 + mova [r5+16*3], xm4 + vextracti128 [r5+16*4], m0, 1 + vextracti128 [r5+16*5], m3, 1 + vextracti128 [r5+16*6], m1, 1 + vextracti128 [r5+16*7], m4, 1 + call .main + vperm2i128 m14, m0, m3, 0x31 ; 29 + vinserti128 m12, m0, xm3, 1 ; 25 + vperm2i128 m15, m1, m4, 0x31 ; 31 + vinserti128 m13, m1, xm4, 1 ; 27 + mova m8, [r5+32*0] + mova m9, [r5+32*1] + mova m10, [r5+32*2] + mova m11, [r5+32*3] +.full: + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] + mova m4, [r4-32*4] + mova m5, [r4-32*3] + mova m6, [r4-32*2] + mova m7, [r4-32*1] + lea rax, [pw_5 + 128] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + lea r3, [rsp+32*8] + mova m8, [r3+32*0] + mova m9, [r3+32*1] + mova m10, [r3+32*2] + mova m11, [r3+32*3] + mova m12, [r3-32*4] + mova m13, [r3-32*3] + mova m14, [r3-32*2] + mova m15, [r3-32*1] +.idct16: + lea r3, [rsp+32*16] + mova m0, [r3+32*0] + mova m1, [r3+32*1] + mova m2, [r3+32*2] + mova m3, [r3+32*3] + mova m4, [r3-32*4] + mova m5, [r3-32*3] + mova m6, [r3-32*2] + mova m7, [r3-32*1] + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +.main: + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 3] + pmulld m2, m14, [cq+128* 5] + pmulld m3, m14, [cq+128* 7] + pmulld m4, m14, [cq+128* 9] + pmulld m5, m14, [cq+128*11] + pmulld m6, m14, [cq+128*13] + pmulld m7, m14, [cq+128*15] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 2] + pmulld m2, m14, [cq+128* 4] + pmulld m3, m14, [cq+128* 6] + pmulld m4, m14, [cq+128* 8] + pmulld m5, m14, [cq+128*10] + pmulld m6, m14, [cq+128*12] + pmulld m7, m14, [cq+128*14] + call m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + psrld m15, m11, 11 ; pd_1 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*2] + paddd m15, m1, m9 ; out1 + psubd m1, m9 ; out14 + mova m9, [r6-32*1] + REPX {psrad x, 1}, m0, m15, m10, m1 + packssdw m0, m15 + packssdw m1, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6+32*0] + paddd m15, m3, m9 ; out3 + psubd m3, m9 ; out12 + mova m9, [r6+32*1] + REPX {psrad x, 1}, m2, m15, m10, m3 + packssdw m2, m15 + packssdw m3, m10 + psubd m10, m4, m8 ; out11 + paddd m4, m8 ; out4 + mova m8, [r6+32*2] + paddd m15, m5, m9 ; out5 + psubd m5, m9 ; out10 + mova m9, [r6+32*3] + REPX {psrad x, 1}, m4, m10, m15, m5 + packssdw m4, m15 + packssdw m5, m10 + psubd m10, m6, m8 ; out9 + paddd m6, m8 ; out6 + paddd m15, m7, m9 ; out7 + psubd m7, m9 ; out8 + REPX {psrad x, 1}, m6, m10, m15, m7 + packssdw m6, m15 + packssdw m7, m10 + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m4, m6 + punpcklwd m4, m6 + punpcklwd m6, m7, m5 + punpckhwd m7, m5 + pxor m5, m5 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m5 + mova [cq+r7+128*0], m5 + mova [cq+r7+128*1], m5 + mova [cq+r7+128*2], m5 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + punpcklwd m5, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, 
m1 + punpckhwd m4, m1 + punpckhwd m1, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m6, m7 + punpcklwd m6, m7 + punpcklqdq m7, m1, m4 + punpckhqdq m1, m4 + punpckhqdq m4, m8, m3 + punpcklqdq m8, m3 + punpckhqdq m3, m6, m5 + punpcklqdq m6, m5 + punpcklqdq m5, m0, m2 + punpckhqdq m0, m2 + mova [r6+16*0], xm5 + mova [r6+16*1], xm6 + mova [r6+16*2], xm7 + mova [r6+16*3], xm8 + vextracti128 [r6+16*4], m5, 1 + vextracti128 [r6+16*5], m6, 1 + vextracti128 [r6+16*6], m7, 1 + vextracti128 [r6+16*7], m8, 1 + sub r6, 32*4 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*2], m7 + mova [rsp+gprsize+32*3], m15 + vpbroadcastd m15, [pw_2048] + vpbroadcastd m7, [pixel_max] + IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 + IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 + IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*0] + IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*2] + mova m2, [rsp+gprsize+32*3] + IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 + ret + +cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m8, [pw_2896x8] + vpbroadcastd m9, [pw_1697x16] + vpbroadcastd m11, [pw_8192] + vpbroadcastd m7, [pixel_max] + lea r6, [strideq*5] + pxor m6, m6 + paddw m10, m11, m11 ; pw_16384 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main2: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpcklwd m4, m2, m1 + punpckhwd m2, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + call m(iidentity_8x8_internal_16bpc).write_2x8x2 + punpcklqdq m0, m3, m2 + punpckhqdq m1, m3, m2 + jmp m(iidentity_8x8_internal_16bpc).write_2x8x2 + +cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, 
eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob + %undef cmp + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + lea r6, [rsp+32*4] + call .main + cmp eobd, 36 + jge .full + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + lea rax, [pw_5+128] + mov r7, dstq + call m(idct_16x16_internal_8bpc).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + jmp .end +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 +.full: + add cq, 32 + mova [r4+32*3], m0 + mova [r4+32*2], m1 + mova [r4+32*1], m2 + mova [r4+32*0], m3 + mova [r4-32*1], m4 + mova [r4-32*2], m5 + mova [r4-32*3], m6 + mova [r4-32*4], m7 + call .main + sub r4, 32*16 ; topleft 16x8 + call .transpose_16x16 + lea rax, [pw_5+128] + mov r7, dstq + call m(idct_16x16_internal_8bpc).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + add r4, 32*8 ; bottomleft 16x8 + call .transpose_16x16 +.end: + lea dstq, [r7+32] + call m(idct_16x16_internal_8bpc).main + call .write_16x16 + RET +ALIGN function_align +.transpose_16x16: + punpckhdq m8, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpckhdq m5, m4, m6 + punpckldq m4, m6 + punpckhqdq m6, m0, m4 + punpcklqdq m0, m4 + punpckhqdq m4, m1, m5 + punpcklqdq m1, m5 + punpckhqdq m5, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + vinserti128 m8, m0, xm7, 1 + vperm2i128 m12, m0, m7, 0x31 + vinserti128 m9, m6, xm5, 1 + vperm2i128 m13, m6, m5, 0x31 + vinserti128 m10, m1, xm2, 1 + vperm2i128 m14, m1, m2, 0x31 + vinserti128 m11, m4, xm3, 1 + vperm2i128 m15, m4, m3, 0x31 + mova m0, [r4+32*3] + mova m1, [r4+32*2] + mova m2, [r4+32*1] + mova m3, [r4+32*0] + mova m4, [r4-32*1] + mova m5, [r4-32*2] + mova m6, [r4-32*3] + mova m7, [r4-32*4] + mova [rsp+gprsize], m15 + jmp m(inv_txfm_add_dct_dct_8x32_16bpc).transpose +ALIGN function_align +.main: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + pmulld m0, m14, [cq+64* 1] + pmulld m1, m14, [cq+64* 7] + pmulld m2, m14, [cq+64* 9] + pmulld m3, m14, [cq+64*15] + pmulld m4, m14, [cq+64*17] + pmulld m5, m14, [cq+64*23] + pmulld m6, m14, [cq+64*25] + pmulld m7, m14, [cq+64*31] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+64* 3] + pmulld m1, m14, [cq+64* 5] + pmulld m2, m14, [cq+64*11] + pmulld m3, m14, [cq+64*13] + pmulld m4, m14, [cq+64*19] + pmulld m5, m14, [cq+64*21] + pmulld m6, m14, [cq+64*27] + pmulld m7, m14, [cq+64*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+64* 2] + pmulld m1, m14, [cq+64* 6] + pmulld m2, m14, [cq+64*10] + pmulld m3, m14, [cq+64*14] + pmulld m4, m14, [cq+64*18] + pmulld m5, m14, [cq+64*22] + pmulld m6, m14, [cq+64*26] + pmulld m7, m14, [cq+64*30] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+64* 0] + pmulld m1, m14, [cq+64* 4] + pmulld m2, m14, [cq+64* 8] + pmulld m3, 
m14, [cq+64*12] + pmulld m4, m14, [cq+64*16] + pmulld m5, m14, [cq+64*20] + pmulld m6, m14, [cq+64*24] + pmulld m7, m14, [cq+64*28] + call m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + pxor m8, m8 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m8 + mova [cq+r7-64*1], m8 + mova [cq+r7+64*0], m8 + mova [cq+r7+64*1], m8 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m11, 11 ; pd_1 + IDCT32_END 0, 15, 8, 9, 10, 1 + IDCT32_END 1, 14, 8, 9, 10, 1 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 1 + IDCT32_END 3, 14, 8, 9, 10, 1 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 1 + IDCT32_END 5, 14, 8, 9, 10, 1 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 1 + IDCT32_END 7, 14, 8, 9, 10, 1 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 + ret +ALIGN function_align +.write_16x16: + mova m1, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_2048] + vpbroadcastd m9, [pixel_max] + lea r3, [strideq*3] + pxor m8, m8 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_16bpc).write_16x4 + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct_16x8_internal_16bpc).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct_16x8_internal_16bpc).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct_16x8_internal_16bpc).write_16x4 + +cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m8, [pw_2896x8] + vpbroadcastd m9, [pw_1697x16] + vpbroadcastd m10, [pw_2048] + vpbroadcastd m7, [pixel_max] + lea r6, [strideq*5] + pxor m6, m6 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*1] + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*2] + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*3] + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {paddsw x, x }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(inv_txfm_add_identity_identity_16x32_16bpc).main2 + +cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 0, dst, stride, c, eob + 
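For the 32x32 case declared just above, the first pass is tiled by eob: each call to .main appears to consume one 8-coefficient-wide slice of the input block (cq is advanced by 32 bytes per call), and the comparisons against 36, 136 and 300 just below choose how many slices are computed before the .fast path zero-fills the remaining staging buffers. A rough C model of that dispatch; the function name and the "slice" interpretation are illustrative assumptions, not upstream code:

    /* Hypothetical model of the eob thresholds used below: the further the
     * last nonzero coefficient lies in scan order, the more first-pass
     * slices have to be run; everything past them is treated as zero. */
    static int nonzero_slices_32x32(int eob)
    {
        if (eob < 36)  return 1;
        if (eob < 136) return 2;
        if (eob < 300) return 3;
        return 4;
    }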
test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob + %undef cmp + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly +.fast: + lea r4, [rsp+32*71] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r3, [rsp+32*3] + mov r4, r6 + lea r5, [r6+32*8] + lea rax, [pw_5+128] + call .pass2_oddhalf + call .pass2_evenhalf + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + lea r3, [rsp+32*11] + call .pass2_oddhalf + call .pass2_evenhalf + lea r3, [strideq*3] + call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 7] + mova m2, [cq+128* 9] + mova m3, [cq+128*15] + mova m4, [cq+128*17] + mova m5, [cq+128*23] + mova m6, [cq+128*25] + mova m7, [cq+128*31] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128* 5] + mova m2, [cq+128*11] + mova m3, [cq+128*13] + mova m4, [cq+128*19] + mova m5, [cq+128*21] + mova m6, [cq+128*27] + mova m7, [cq+128*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128* 6] + mova m2, [cq+128*10] + mova m3, [cq+128*14] + mova m4, [cq+128*18] + mova m5, [cq+128*22] + mova m6, [cq+128*26] + mova m7, [cq+128*30] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mova m4, [cq+128*16] + mova m5, [cq+128*20] + mova m6, [cq+128*24] + mova m7, [cq+128*28] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret +ALIGN function_align +.pass2_oddhalf: + mova m0, [r3+32* 1] ; 1 + mova m1, [r3+32* 3] ; 3 + mova m2, [r3+32* 5] ; 5 + mova m3, [r3+32* 7] ; 7 + mova m4, [r3+32*17] ; 9 + mova m5, [r3+32*19] ; 11 + mova m6, [r3+32*21] ; 13 + mova m7, [r3+32*23] ; 15 + mova m8, [r3+32*33] ; 17 + mova m9, [r3+32*35] ; 19 + mova m10, [r3+32*37] ; 21 + mova m11, [r3+32*39] ; 23 + mova m12, [r3+32*49] ; 25 + mova m13, [r3+32*51] ; 27 + mova m14, [r3+32*53] ; 29 + mova m15, [r3+32*55] ; 31 + jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf +ALIGN function_align +.pass2_evenhalf: + mova m0, [r3+32* 0] ; 0 + mova m1, [r3+32* 2] ; 2 
+ mova m2, [r3+32* 4] ; 4 + mova m3, [r3+32* 6] ; 6 + mova m4, [r3+32*16] ; 8 + mova m5, [r3+32*18] ; 10 + mova m6, [r3+32*20] ; 12 + mova m7, [r3+32*22] ; 14 + mova m8, [r3+32*32] ; 16 + mova m9, [r3+32*34] ; 18 + mova m10, [r3+32*36] ; 20 + mova m11, [r3+32*38] ; 22 + mova m12, [r3+32*48] ; 24 + mova m13, [r3+32*50] ; 26 + mova m14, [r3+32*52] ; 28 + mova m15, [r3+32*54] ; 30 + mova [rsp+gprsize], m15 + jmp m(idct_16x16_internal_8bpc).main + +cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob + %undef cmp + vpbroadcastd m5, [pw_8192] + vpbroadcastd m7, [pixel_max] + pxor m6, m6 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 + call .main ; 0 + cmp eobd, 36 + jl .ret + add cq, 128*8 ; 0 1 + mov r7, dstq ; 1 + add dstq, 16 + call .main + call .main2 + cmp eobd, 136 + jl .ret + add cq, 128*16-32 ; 0 1 2 + lea dstq, [r7+16*2] ; 1 2 + call .main ; 2 + call .main2 + call .main2 + cmp eobd, 300 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + add r7, 16*3 ; 1 2 3 + mov dstq, r7 ; 2 3 + call .main ; 3 + call .main2 + call .main2 + call .main2 + cmp eobd, 535 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + mov r7, dstq ; 2 3 4 + call .main ; 3 4 + call .main2 + call .main2 + cmp eobd, 755 + jl .ret + add cq, 128*16-32 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + call .main ; 2 3 4 5 + call .main2 ; 3 4 5 + cmp eobd, 911 + jl .ret + add cq, 128*8 ; 0 1 2 3 + add dstq, 16 ; 1 2 3 4 + call .main ; 2 3 4 5 +.ret: ; 3 4 5 6 + RET +ALIGN function_align +.main2: + sub cq, 128*8-32 + lea dstq, [dstq+strideq*8-16] +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + jmp m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero + +%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n + mova m%4, [r4-32*(14+%1)] ; idct32 out31-n +%else + mova m%5, [r4-32*(45-%1)] + mova m%4, [r5-32*(20+%1)] +%endif + paddsw m%6, m%5, m%4 ; idct32 out 0+n + psubsw m%5, m%4 ; idct32 out31-n + paddsw m%4, m%5, m%3 ; out31-n + psubsw m%5, m%3 ; out32+n + paddsw m%3, m%6, m%2 ; out 0+n + psubsw m%6, m%2 ; out63-n + REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + paddw m%3, [%%d0+%7 ] + paddw m%4, [%%d1+%8 ] + paddw m%5, [%%d0+%9 ] + paddw m%6, [%%d1+%10] + pxor m%2, m%2 + REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 + vpbroadcastd m%2, [pixel_max] + REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 + mova [%%d0+%7 ], m%3 + mova [%%d1+%8 ], m%4 + mova [%%d0+%9 ], m%5 + mova [%%d1+%10], m%6 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*6] + call .main + sub eobd, 44 + jl .fast + call .main + sub eobd, 107 + jl .fast + call .main + sub eobd, 128 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 64 + add r6d, 10240 + sar r6d, 14 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 +.fast: + lea r4, [rsp+32*38] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 
3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea rax, [pw_5+128] + mova m0, [rsp+32* 2] ; in0 + mova m1, [rsp+32* 6] ; in4 + mova m2, [rsp+32*10] ; in8 + mova m3, [rsp+32*14] ; in12 + mova m4, [rsp+32*18] ; in16 + mova m5, [rsp+32*22] ; in20 + mova m6, [rsp+32*26] ; in24 + mova m7, [rsp+32*30] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*38] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [rsp+32* 4] ; in2 + mova m1, [rsp+32* 8] ; in6 + mova m2, [rsp+32*12] ; in10 + mova m3, [rsp+32*16] ; in14 + mova m4, [rsp+32*20] ; in18 + mova m5, [rsp+32*24] ; in22 + mova m6, [rsp+32*28] ; in26 + mova m7, [rsp+32*32] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova m0, [rsp+32* 3] ; in1 + mova m1, [rsp+32*33] ; in31 + mova m2, [rsp+32*19] ; in17 + mova m3, [rsp+32*17] ; in15 + mova m4, [rsp+32*11] ; in9 + mova m5, [rsp+32*25] ; in23 + mova m6, [rsp+32*27] ; in25 + mova m7, [rsp+32* 9] ; in7 + lea rax, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + mova m0, [rsp+32* 7] ; in5 + mova m1, [rsp+32*29] ; in27 + mova m2, [rsp+32*23] ; in21 + mova m3, [rsp+32*13] ; in11 + mova m4, [rsp+32*15] ; in13 + mova m5, [rsp+32*21] ; in19 + mova m6, [rsp+32*31] ; in29 + mova m7, [rsp+32* 5] ; in3 + add rax, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + call .main_part2_pass2 + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + call m(idct_8x16_internal_16bpc).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct_8x8_internal_16bpc).main + call m(idct_8x16_internal_16bpc).main_evenhalf + pxor m15, m15 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + psrld m15, m11, 10 ; pd_2 + mova m8, [r6-32*4] + mova m9, [r6+32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*3] + psubd m15, m7, m9 ; out8 + paddd m7, m9 ; out7 + mova m9, [r6+32*2] + REPX {psrad x, 2}, m0, m15, m10, m7 + packssdw m0, m15 + packssdw m7, m10 + psubd m10, m1, m8 ; out14 + paddd m1, m8 ; out1 + mova m8, [r6-32*2] + psubd m15, m6, m9 ; out9 + paddd m6, m9 ; out6 + mova m9, [r6+32*1] + REPX {psrad x, 2}, m1, m15, m10, m6 + packssdw m1, m15 + packssdw m6, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6-32*1] + psubd m15, m5, m9 ; out10 + paddd m5, m9 ; out5 + mova m9, [r6+32*0] + REPX {psrad x, 2}, m2, m15, m10, m5 + packssdw m2, m15 + packssdw m5, m10 + psubd m10, m3, m8 ; out12 + paddd 
m3, m8 ; out3 + psubd m15, m4, m9 ; out11 + paddd m4, m9 ; out4 + REPX {psrad x, 2}, m3, m15, m10, m4 + packssdw m3, m15 + packssdw m4, m10 + call m(idct_16x8_internal_16bpc).transpose3 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + ret +.main_part2_pass2: + vpbroadcastd m11, [pw_1567_3784] + vpbroadcastd m12, [pw_m3784_1567] + vpbroadcastd m13, [pw_2896_2896] + lea rax, [pw_5+128] + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [pw_m2896_2896] + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal + vpbroadcastd m14, [pw_2048] + IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp r4, r5 + jne .main_part2_pass2_loop + ret +ALIGN function_align +.main_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_part1: ; idct64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vpbroadcastd m7, [r5+4*0] + vpbroadcastd m8, [r5+4*1] + vpbroadcastd m6, [r5+4*2] + vpbroadcastd m9, [r5+4*3] + vpbroadcastd m5, [r5+4*4] + vpbroadcastd m10, [r5+4*5] + vpbroadcastd m4, [r5+4*6] + vpbroadcastd m15, [r5+4*7] + pmulld m7, m0 ; t63a + pmulld m0, m8 ; t32a + pmulld m6, m1 ; t62a + pmulld m1, m9 ; t33a + pmulld m5, m2 ; t61a + pmulld m2, m10 ; t34a + pmulld m4, m3 ; t60a + pmulld m3, m15 ; t35a + vpbroadcastd m10, [r5+4*8] + vpbroadcastd m15, [r5+4*9] + REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m5 ; t61 + paddd m4, m5 ; t60 + REPX {pmaxsd x, m12}, m8, m1, m6, m2 + REPX {pminsd x, m13}, m8, m1, m6, m2 + ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a + REPX {pmaxsd x, m12}, m0, m3, m7, m4 + REPX {pminsd x, m13}, m0, m3, m7, m4 + vpbroadcastd m10, [r5+4*10] + vpbroadcastd m15, [r5+4*11] + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m8, m2 ; t61 + paddd m8, m2 ; t62 + REPX {pmaxsd x, m12}, m5, m3, m4, m6 + REPX {pminsd x, m13}, m5, m3, m4, m6 + ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 + ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a + REPX {pmaxsd x, m12}, m0, m7, m1, m8 + REPX {pminsd x, m13}, m0, m7, m1, m8 + add r5, 4*12 + mova [r6-32*4], m0 + mova [r6+32*3], m7 + mova [r6-32*3], m1 + mova [r6+32*2], m8 + mova [r6-32*2], m6 + mova [r6+32*1], m4 + mova [r6-32*1], m3 + mova [r6+32*0], m5 + add r6, 32*8 + ret +.main_part2: ; idct64 steps 6-9 + lea r5, [r6+32*3] + sub r6, 32*4 + vpbroadcastd m10, [pd_1567] + vpbroadcastd m15, [pd_3784] +.main_part2_loop: + mova m0, [r6-32*32] ; t32a + mova m1, [r5-32*24] ; t39a + mova m2, [r5-32*32] ; t63a + mova m3, [r6-32*24] ; t56a + mova m4, [r6-32*16] ; t40a + mova m5, [r5-32* 8] ; t47a + mova m6, [r5-32*16] ; t55a + mova m7, [r6-32* 8] ; t48a + psubd m8, m0, m1 ; t39 + paddd m0, 
m1 ; t32 + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 ; t48 + REPX {pmaxsd x, m12}, m8, m1, m3, m4 + REPX {pminsd x, m13}, m8, m1, m3, m4 + ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a + REPX {pmaxsd x, m12}, m0, m2, m5, m7 + REPX {pminsd x, m13}, m0, m5, m2, m7 + psubd m6, m2, m7 ; t48a + paddd m2, m7 ; t63a + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; t32a + psubd m5, m8, m4 ; t55 + paddd m8, m4 ; t56 + psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + REPX {pmaxsd x, m12}, m6, m7, m5, m4 + REPX {pminsd x, m13}, m6, m7, m5, m4 + REPX {pmulld x, m14}, m6, m7, m5, m4 + REPX {pmaxsd x, m12}, m2, m0, m8, m1 + REPX {pminsd x, m13}, m2, m0, m8, m1 + paddd m6, m11 + paddd m5, m11 + psubd m3, m6, m7 ; t47 + paddd m6, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m3, m6, m7, m5 + mova [r5-32* 8], m2 + mova [r6-32*32], m0 + mova [r6-32* 8], m8 + mova [r5-32*32], m1 + mova [r5-32*24], m3 + mova [r6-32*16], m6 + mova [r6-32*24], m7 + mova [r5-32*16], m5 + add r6, 32 + sub r5, 32 + cmp r6, r5 + jl .main_part2_loop + ret + +cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob + %undef cmp + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + lea r6, [rsp+32*6] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 64 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 +.fast: + lea r4, [rsp+32*70] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea rax, [pw_5 + 128] + mov r10, rsp + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10+32* 2] ; in0 + mova m1, [r10+32* 6] ; in4 + mova m2, [r10+32*18] ; in8 + mova m3, [r10+32*22] ; in12 + mova m4, [r10+32*34] ; in16 + mova m5, [r10+32*38] ; in20 + mova m6, [r10+32*50] ; in24 + mova m7, [r10+32*54] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*70] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10+32* 4] ; in2 + mova m1, [r10+32* 8] ; in6 + mova m2, [r10+32*20] ; in10 + mova m3, [r10+32*24] ; in14 + mova m4, [r10+32*36] ; in18 + mova m5, [r10+32*40] ; in22 + mova m6, [r10+32*52] ; in26 + mova m7, [r10+32*56] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova m0, [r10+32* 3] ; in1 + mova m1, [r10+32*57] ; in31 + mova m2, [r10+32*35] ; in17 + mova m3, [r10+32*25] ; in15 + mova m4, [r10+32*19] ; in9 + mova m5, [r10+32*41] ; in23 + mova m6, [r10+32*51] ; in25 + mova m7, [r10+32* 9] ; in7 + lea rax, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call 
m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + mova m0, [r10+32* 7] ; in5 + mova m1, [r10+32*53] ; in27 + mova m2, [r10+32*39] ; in21 + mova m3, [r10+32*21] ; in11 + mova m4, [r10+32*23] ; in13 + mova m5, [r10+32*37] ; in19 + mova m6, [r10+32*55] ; in29 + mova m7, [r10+32* 5] ; in3 + add rax, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 + add r10, 32*8 + sub r4, 32*98 ; rsp+32*16 + sub dstq, r8 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 7] + pmulld m2, m14, [cq+128* 9] + pmulld m3, m14, [cq+128*15] + pmulld m4, m14, [cq+128*17] + pmulld m5, m14, [cq+128*23] + pmulld m6, m14, [cq+128*25] + pmulld m7, m14, [cq+128*31] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128* 5] + pmulld m2, m14, [cq+128*11] + pmulld m3, m14, [cq+128*13] + pmulld m4, m14, [cq+128*19] + pmulld m5, m14, [cq+128*21] + pmulld m6, m14, [cq+128*27] + pmulld m7, m14, [cq+128*29] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128* 6] + pmulld m2, m14, [cq+128*10] + pmulld m3, m14, [cq+128*14] + pmulld m4, m14, [cq+128*18] + pmulld m5, m14, [cq+128*22] + pmulld m6, m14, [cq+128*26] + pmulld m7, m14, [cq+128*30] + call m(idct_8x16_internal_16bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 4] + pmulld m2, m14, [cq+128* 8] + pmulld m3, m14, [cq+128*12] + pmulld m4, m14, [cq+128*16] + pmulld m5, m14, [cq+128*20] + pmulld m6, m14, [cq+128*24] + pmulld m7, m14, [cq+128*28] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + call m(idct_8x8_internal_16bpc).main_rect2 + call m(idct_8x16_internal_16bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_32x16_16bpc).main_end + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret + +cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .normal + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 16 +.dconly: + add r6d, 10240 + sar r6d, 14 +.dconly2: + imul r6d, 2896 + add r6d, 34816 + sar r6d, 16 + movd xm0, r6d +%if WIN64 + movaps [rsp+8], xmm6 +%endif + vpbroadcastw m0, xm0 + vpbroadcastd m6, [pixel_max] + pxor m5, m5 +.dconly_loop: + paddw m1, m0, [dstq+32*0] + paddw m2, m0, [dstq+32*1] + paddw m3, m0, [dstq+32*2] + paddw m4, m0, [dstq+32*3] + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + mova [dstq+32*2], m3 + mova [dstq+32*3], m4 + add dstq, strideq + dec r3d + jg .dconly_loop +%if WIN64 + movaps xmm6, [rsp+8] +%endif + RET +.normal: + 
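The .dconly / .dconly2 / .dconly_loop block above is the whole 64x16 transform when only the DC coefficient is nonzero: the coefficient is scaled and rounded twice, broadcast, then added to every output pixel and clamped to the valid pixel range. A scalar C sketch of that path; the function and helper names are illustrative, the stride is taken in pixels rather than bytes, and pixel_max is passed in (0x3ff for 10-bit content, cf. pixel_10bpc_max defined later in this diff):

    #include <stdint.h>
    #include <stddef.h>

    static int clamp(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

    /* Scalar model of the 64x16 DC-only path: two rounded fixed-point
     * scalings of the DC coefficient, then one add-and-clamp per pixel.
     * The right shifts are assumed to be arithmetic, matching the asm's sar. */
    static void dc_only_add_64x16(uint16_t *dst, ptrdiff_t stride_px,
                                  int dc_coeff, int pixel_max)
    {
        int dc = dc_coeff * 2896;        /* imul r6d, [cq], 2896          */
        dc = (dc + 10240) >> 14;         /* .dconly:  round, shift by 14  */
        dc = (dc * 2896 + 34816) >> 16;  /* .dconly2: scale again, round  */
        for (int y = 0; y < 16; y++, dst += stride_px)
            for (int x = 0; x < 64; x++)
                dst[x] = (uint16_t)clamp(dst[x] + dc, 0, pixel_max);
    }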
PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + call .main + call .shift_transpose + cmp eobd, 36 + jl .fast + call .main + call .shift_transpose + jmp .pass2 +.fast: + pxor m0, m0 + mov r3d, 4 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + dec r3d + jg .fast_loop +.pass2: + lea r7, [r6-32*64] + lea r4, [r6-32*32] + lea rax, [pw_5+128] + mov r5, dstq +.pass2_loop: + mova m0, [r7-32*4] + mova m1, [r7-32*3] + mova m2, [r7-32*2] + mova m3, [r7-32*1] + mova m4, [r7+32*0] + mova m5, [r7+32*1] + mova m6, [r7+32*2] + mova m7, [r7+32*3] + add r7, 32*32 + mova m8, [r7-32*4] + mova m9, [r7-32*3] + mova m10, [r7-32*2] + mova m11, [r7-32*1] + mova m12, [r7+32*0] + mova m13, [r7+32*1] + mova m14, [r7+32*2] + mova m15, [r7+32*3] + sub r7, 32*24 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16 + add r5, 32 + mov dstq, r5 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + mova m0, [cq+64* 2] + mova m1, [cq+64*14] + mova m2, [cq+64*18] + mova m3, [cq+64*30] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast + mova m0, [cq+64* 6] + mova m1, [cq+64*10] + mova m2, [cq+64*22] + mova m3, [cq+64*26] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast + mova m0, [cq+64* 4] + mova m1, [cq+64*12] + mova m2, [cq+64*20] + mova m3, [cq+64*28] + call m(idct_8x16_internal_16bpc).main_oddhalf_fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + pxor m15, m15 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m15 + mova [cq+r7-64*1], m15 + mova [cq+r7+64*0], m15 + mova [cq+r7+64*1], m15 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m15, m11, 10 ; pd_2 +.main_end2: + add cq, 32 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_16bpc).main + add r6, 32*8 + call m(idct_8x16_internal_16bpc).main_evenhalf + mova [r6+32*2], m1 + mova [r6+32*1], m2 + mova [r6+32*0], m3 + mova [r6-32*1], m4 + mova [r6-32*2], m5 + mova [r6-32*3], m6 + mova [r6-32*4], m7 + jmp .main_end_loop_start +.main_end_loop: + mova m0, [r6+32* 3] ; idct8 0 + n +.main_end_loop_start: + mova m1, [r5+32* 4] ; idct16 15 - n + mova m2, [r5-32*12] ; idct32 16 + n + mova m3, [r6-32*13] ; idct32 31 - n + mova m4, [r6-32*29] ; idct64 63 - n + mova m5, [r5-32*28] ; idct64 48 + n + mova m6, [r6-32*45] ; idct64 47 - n + mova m7, [r5-32*44] ; idct64 32 + n + paddd m8, m0, m1 ; idct16 out0 + n + psubd m0, m1 ; idct16 out15 - n + REPX {pmaxsd x, m12}, m8, m0 + REPX {pminsd x, m13}, m8, m0 + paddd m1, m8, m3 ; idct32 out0 + n + psubd m8, m3 ; idct32 out31 - n + paddd m3, m0, m2 ; idct32 out15 - n + psubd m0, m2 ; idct32 out16 + n + REPX 
{pmaxsd x, m12}, m1, m8, m3, m0 + REPX {pminsd x, m13}, m1, m3, m8, m0 + REPX {paddd x, m15}, m1, m3, m0, m8 + paddd m2, m1, m4 ; idct64 out0 + n (unshifted) + psubd m1, m4 ; idct64 out63 - n (unshifted) + paddd m4, m3, m5 ; idct64 out15 - n (unshifted) + psubd m3, m5 ; idct64 out48 + n (unshifted) + paddd m5, m0, m6 ; idct64 out16 + n (unshifted) + psubd m0, m6 ; idct64 out47 - n (unshifted) + paddd m6, m8, m7 ; idct64 out31 - n (unshifted) + psubd m8, m7 ; idct64 out32 + n (unshifted) + mova [r5-32*44], m2 + mova [r6+32* 3], m1 + mova [r6-32*45], m4 + mova [r5+32* 4], m3 + mova [r5-32*28], m5 + mova [r6-32*13], m0 + mova [r6-32*29], m6 + mova [r5-32*12], m8 + add r5, 32 + sub r6, 32 + cmp r5, r6 + jl .main_end_loop + ret +.shift_transpose: +%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift + sub r6, 32*48 + mov r5, r6 +%%loop: + mova m0, [r6-32* 4] + mova m4, [r6+32* 4] + mova m1, [r6-32* 3] + mova m5, [r6+32* 5] + mova m2, [r6-32* 2] + mova m6, [r6+32* 6] + mova m3, [r6-32* 1] + mova m7, [r6+32* 7] + REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m4, [r6+32* 0] + mova m6, [r6+32* 8] + mova m5, [r6+32* 1] + mova m7, [r6+32* 9] + REPX {psrad x, %1}, m4, m6, m5, m7 + packssdw m4, m6 + packssdw m5, m7 + mova m6, [r6+32* 2] + mova m8, [r6+32*10] + mova m7, [r6+32* 3] + mova m9, [r6+32*11] + REPX {psrad x, %1}, m6, m8, m7, m9 + packssdw m6, m8 + packssdw m7, m9 + call m(idct_16x8_internal_16bpc).transpose3 + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + add r6, 32*16 + add r5, 32*8 + cmp r5, r4 + jl %%loop + mov r6, r4 +%endmacro + IDCT64_SHIFT_TRANSPOSE 2 + ret + +cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 32 + add r6d, 2048 + sar r6d, 12 + imul r6d, 2896 + add r6d, 6144 + sar r6d, 13 + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r7, [r6-32*32] + lea r5, [r6+32*8] + lea rax, [pw_5+128] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq +.pass2_loop: + mova m0, [r7-32*99] + mova m1, [r7-32*97] + mova m2, [r7-32*95] + mova m3, [r7-32*93] + mova m4, [r7-32*67] + mova m5, [r7-32*65] + mova m6, [r7-32*63] + mova m7, [r7-32*61] + mova m8, [r7-32*35] + mova m9, [r7-32*33] + mova m10, [r7-32*31] + mova m11, [r7-32*29] + mova m12, [r7-32* 3] + mova m13, [r7-32* 1] + mova m14, [r7+32* 1] + mova m15, [r7+32* 3] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + mova m0, [r7-32*100] + mova m1, [r7-32*98] + mova m2, [r7-32*96] + mova m3, [r7-32*94] + mova m4, [r7-32*68] + mova m5, [r7-32*66] + mova m6, [r7-32*64] + mova m7, [r7-32*62] + mova m8, [r7-32*36] + mova m9, [r7-32*34] + mova m10, [r7-32*32] + mova m11, [r7-32*30] + mova m12, [r7-32* 4] + mova m13, [r7-32* 2] + mova m14, [r7+32* 0] + mova m15, [r7+32* 2] + add r7, 32*8 + mova [rsp], m15 + call 
m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128*31] + pmulld m2, m14, [cq+128*17] + pmulld m3, m14, [cq+128*15] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 7] + pmulld m1, m14, [cq+128*25] + pmulld m2, m14, [cq+128*23] + pmulld m3, m14, [cq+128* 9] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 5] + pmulld m1, m14, [cq+128*27] + pmulld m2, m14, [cq+128*21] + pmulld m3, m14, [cq+128*11] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128*29] + pmulld m2, m14, [cq+128*19] + pmulld m3, m14, [cq+128*13] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128*14] + pmulld m2, m14, [cq+128*18] + pmulld m3, m14, [cq+128*30] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast_rect2 + pmulld m0, m14, [cq+128* 6] + pmulld m1, m14, [cq+128*10] + pmulld m2, m14, [cq+128*22] + pmulld m3, m14, [cq+128*26] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast_rect2 + pmulld m0, m14, [cq+128* 4] + pmulld m1, m14, [cq+128*12] + pmulld m2, m14, [cq+128*20] + pmulld m3, m14, [cq+128*28] + call m(idct_8x16_internal_16bpc).main_oddhalf_fast_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 8] + pmulld m2, m14, [cq+128*16] + pmulld m3, m14, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + psrld m15, m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end2 + IDCT64_SHIFT_TRANSPOSE 1 + ret + +cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob + %undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_min] + vpbroadcastd m13, [clip_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r10, [r6-32*32] + lea rax, [pw_5+128] + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10-32*100] ; in0 + mova m1, [r10-32*96] ; in4 + mova m2, [r10-32*68] ; in8 + mova m3, [r10-32*64] ; in12 + mova m4, [r10-32*36] ; in16 + mova m5, [r10-32*32] ; in20 + mova m6, [r10-32* 4] ; in24 + mova m7, [r10+32* 0] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + 
mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10-32*98] ; in2 + mova m1, [r10-32*94] ; in6 + mova m2, [r10-32*66] ; in10 + mova m3, [r10-32*62] ; in14 + mova m4, [r10-32*34] ; in18 + mova m5, [r10-32*30] ; in22 + mova m6, [r10-32* 2] ; in26 + mova m7, [r10+32* 2] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova m0, [r10-32*99] ; in1 + mova m1, [r10+32* 3] ; in31 + mova m2, [r10-32*35] ; in17 + mova m3, [r10-32*61] ; in15 + mova m4, [r10-32*67] ; in9 + mova m5, [r10-32*29] ; in23 + mova m6, [r10-32* 3] ; in25 + mova m7, [r10-32*93] ; in7 + lea rax, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + mova m0, [r10-32*95] ; in5 + mova m1, [r10-32* 1] ; in27 + mova m2, [r10-32*31] ; in21 + mova m3, [r10-32*65] ; in11 + mova m4, [r10-32*63] ; in13 + mova m5, [r10-32*33] ; in19 + mova m6, [r10+32* 1] ; in29 + mova m7, [r10-32*97] ; in3 + add rax, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2 + add r10, 32*8 + sub dstq, r8 + sub r4, 32*44 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + mova m0, [cq+128* 1] + mova m1, [cq+128*31] + mova m2, [cq+128*17] + mova m3, [cq+128*15] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+128* 7] + mova m1, [cq+128*25] + mova m2, [cq+128*23] + mova m3, [cq+128* 9] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+128* 5] + mova m1, [cq+128*27] + mova m2, [cq+128*21] + mova m3, [cq+128*11] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128*29] + mova m2, [cq+128*19] + mova m3, [cq+128*13] + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128*14] + mova m2, [cq+128*18] + mova m3, [cq+128*30] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast + mova m0, [cq+128* 6] + mova m1, [cq+128*10] + mova m2, [cq+128*22] + mova m3, [cq+128*26] + call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast + mova m0, [cq+128* 4] + mova m1, [cq+128*12] + mova m2, [cq+128*20] + mova m3, [cq+128*28] + call m(idct_8x16_internal_16bpc).main_oddhalf_fast + mova m0, [cq+128* 0] + mova m1, [cq+128* 8] + mova m2, [cq+128*16] + mova m3, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).shift_transpose + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/itx16_sse.asm dav1d-0.9.1/src/x86/itx16_sse.asm --- dav1d-0.7.1/src/x86/itx16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/itx16_sse.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,2345 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright © 2017-2021, The rav1e contributors +; Copyright © 2020, Nathan Egge +; Copyright © 2021, Matthias Dressel +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA +%macro COEF 1 +pd_%1: times 4 dd %1 +%endmacro + +COEF 201 +COEF 401 +COEF 601 +COEF 799 +COEF 995 +COEF 1189 +COEF 1380 +COEF 1567 +COEF 1751 +COEF 1931 +COEF 2106 +COEF 2276 +COEF 2440 +COEF 2598 +COEF 2751 +COEF 2896 +COEF 3035 +COEF 3166 +COEF 3290 +COEF 3406 +COEF 3513 +COEF 3612 +COEF 3703 +COEF 3784 +COEF 3857 +COEF 3920 +COEF 3973 +COEF 4017 +COEF 4052 +COEF 4076 +COEF 4091 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +pd_1321: times 4 dd 1321 +pd_2482: times 4 dd 2482 +pd_m3344: times 4 dd -3344 +pd_2048: times 4 dd 2048 +pw_4x2048_4xm2048: times 4 dw 2048 + times 4 dw -2048 +pw_4xm2048_4x2048: times 4 dw -2048 + times 4 dw 2048 +pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 +pd_3803: times 4 dd 3803 +pw_4096: times 8 dw 4096 +pd_5793: times 4 dd 5793 +pd_6144: times 4 dd 6144 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pw_1697x16: times 8 dw 1697*16 +pw_16384: times 8 dw 16384 +pixel_10bpc_max: times 8 dw 0x03ff + +pw_1567_3784: times 4 dw 1567, 3784 +pw_m3784_1567: times 4 dw -3784, 1567 + +clip_min: times 4 dd -0x20000 +clip_max: times 4 dd 0x1ffff + +cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 +cextern iadst_4x4_internal_8bpc_ssse3.main +cextern idct_4x8_internal_8bpc_ssse3.main +cextern iadst_4x8_internal_8bpc_ssse3.main +cextern idct_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end +cextern idct_8x4_internal_8bpc_ssse3.main +cextern iadst_8x4_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.pass1_end3 +cextern iadst_8x8_internal_8bpc_ssse3.main +cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end +cextern idct_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end + +tbl_4x16_2d: db 0, 13, 29, 45 +tbl_4x16_h: db 0, 16, 32, 48 +tbl_4x16_v: db 0, 4, 8, 12 + +tbl_8x16_2d: db 0, 14, 30, 46 +tbl_8x16_v: db 0, 4, 8, 12 +tbl_8x16_h: db 0, 32, 64, 96 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define 
m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx) +%define m(x) m_suffix(x, SUFFIX) + +; This refers to the first function in itx_sse i.e. the start of the text section +; which is needed as a base pointer for constants. +%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r6-$$+x ; PIC +%endif + +%macro IWHT4_1D 0 + ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 + paddd m0, m1 ; in0 += in1 + psubd m4, m2, m3 ; tmp0 = in2 - in3 + psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 + psrad m5, 1 + psubd m2, m5, m1 ; in2 = tmp1 - in1 + psubd m5, m3 ; in1 = tmp1 - in3 + psubd m0, m5 ; in0 -= in1 + paddd m4, m2 ; in3 = tmp0 + in2 + ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 + ; m4 = out3, m5 = out1 +%endmacro + +INIT_XMM sse2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + REPX {psrad x, 2}, m0, m1, m2, m3 + IWHT4_1D + punpckldq m1, m0, m5 + punpckhdq m3, m0, m5 + punpckldq m5, m2, m4 + punpckhdq m2, m4 + punpcklqdq m0, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m4, m3, m2 + punpckhqdq m3, m2 + mova m2, m4 + IWHT4_1D + packssdw m0, m4 ; low: out3, high: out0 + packssdw m2, m5 ; low: out2, high: out1 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + lea r2, [dstq+strideq*2] + movq m1, [dstq+strideq*0] + movhps m1, [r2 +strideq*1] + movq m3, [r2 +strideq*0] + movhps m3, [dstq+strideq*1] + movd m5, bdmaxm + pshuflw m5, m5, q0000 ; broadcast + punpcklqdq m5, m5 ; broadcast + paddsw m0, m1 + paddsw m2, m3 + pmaxsw m0, m4 + pmaxsw m2, m4 + pminsw m0, m5 + pminsw m2, m5 + movhps [r2 +strideq*1], m0 ; write out0 + movhps [dstq+strideq*1], m2 ; write out1 + movq [r2 +strideq*0], m2 ; write out2 + movq [dstq+strideq*0], m0 ; write out3 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 2 = inv_dst1, 4 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +; %1 dst/src[1] +; %2 dst/src[2] +; %3 tmp[1] +; %4 tmp[2] +; %5 tmp[3] +; %6 rnd +; %7 coef[1] +; %8 coef[2] +; %9 flags +%ifnidn %7,%8 ; optimize when coef1 == coef2 +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else + mova m%3, [o(pd_%8)] + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else + mova m%5, [o(pd_%7)] + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 4 ; invert dst2 + paddd m%4, m%2 + psubd m%2, m%6, m%4 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%4, m%6 +%else + paddd m%1, m%6 +%endif +%endif +%ifnidn %7,%8 + paddd m%2, m%4 +%else + mova m%3, m%2 + paddd m%2, m%1 +%endif +%endif +%if %9 & 2 ; invert dst1 + psubd m%3, m%1 + paddd m%1, m%3, m%6 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%1, m%6 +%endif +%endif + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack +cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_16bpc) +%if ARCH_X86_32 + LEA r6, $$ +%endif +%if has_epilogue +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 +%if %3 + add eobd, %3 +%endif +%else + lea r5, [o(%3)] +%endif + call %%p1 + RET +%%end: +%else + ; Jump to the 1st txfm function if we're not 
taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 +%if %3 + add eobd, %3 +%endif +%else + lea r5, [o(%3)] +%endif +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x4 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + movd m1, [o(pw_2896x8)] + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + movd m0, r5d + packssdw m0, m0 + pmulhrsw m0, m1 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + mova m1, m0 + TAIL_CALL m(iadst_4x4_internal_16bpc).end +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ; butterfly rotation + ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0 + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3 + ; Hadamard rotation + psubd m%5, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%3, m%4 + psubd m%3, m%4 + ; %1 (src1) = out0 + ; %2 (src2) = out1 + ; %3 (src3) = out3 + ; $5 (tmp1) = out2 +%endmacro + +INIT_XMM sse4 + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [o(pd_2048)] + call .pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + ; transpose + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass1_main: + IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 + ret +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_m3784_1567)] + punpckhwd m2, m1, m0 + psubw m3, m0, m1 + paddw m0, m1 + punpcklqdq m0, m3 + pmaddwd m4, m2 + pmaddwd m2, [o(pw_1567_3784)] + pmulhrsw m0, [o(pw_2896x8)] ; t0 t1 + paddd m4, m5 + paddd m2, m5 + psrad m4, 12 + psrad m2, 12 + packssdw m2, m4 ; t3 t2 + psubsw m1, m0, m2 ; tmp3 tmp2 + paddsw m0, m2 ; tmp0 tmp1 + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*1] + movhps m3, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movhps [r5 +strideq*0], m1 + movq [r5 +strideq*1], m1 + RET + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main +.end: + mova m4, [o(pw_2048)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m5, 
[o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET +ALIGN function_align +.main: + mova m1, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [cq+16*0] + lea r3, [cq+16*1] +.main2: + mova m0, [o(pd_1321)] ; SINPI_1_9 + mova m2, [o(pd_2482)] ; SINPI_2_9 + mova m6, [o(pd_3803)] ; SINPI_4_9 + pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] + pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] + pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] + pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0] + psubd m1, m3 ; T[2] - T[3] + pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3] + pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0] + paddd m0, m6 ; s[0] += s[3] + paddd m0, m3 ; s[0] += s[5] + mova m3, [o(pd_m3344)] ; -SINPI_3_9 + psubd m2, m4 ; s[1] -= s[4] + psubd m2, m7 ; s[1] -= s[6] + psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0] + pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 + pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 + paddd m4, m0, m2 ; x[3] = s[0] + s[1] + psubd m2, m3 ; x[1] = s[1] + s[3] + psubd m0, m3 ; x[0] = s[0] + s[3] + paddd m4, m3 ; x[3] -= s[3] + paddd m2, m5 ; x[1] + 2048 + REPX {psrad x, 12}, m0, m2, m1, m4 + ret + + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_16bpc).main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main + mova m4, [o(pw_2048)] + movq m3, [dstq+strideq*1] + movhps m3, [dstq+strideq*0] + lea r5, [dstq+strideq*2] + movq m2, [r5 +strideq*1] + movhps m2, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movhps [dstq+strideq*0], m1 + movq [dstq+strideq*1], m1 + movhps [r5 +strideq*0], m0 + movq [r5 +strideq*1], m0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m3, [o(pd_5793)] + pmulld m0, m3, [cq+16*0] + pmulld m1, m3, [cq+16*1] + pmulld m2, m3, [cq+16*2] + pmulld m3, [cq+16*3] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + ; transpose + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_1697x8)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + pmulhrsw m3, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m3 + paddsw m1, m4 + movq m3, [r5 +strideq*0] + movhps 
m3, [r5 +strideq*1] + mova m4, [o(pixel_10bpc_max)] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + pxor m5, m5 + mova [cq+16*0], m5 + mova [cq+16*1], m5 + mova [cq+16*2], m5 + mova [cq+16*3], m5 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m5 + pmaxsw m1, m5 + pminsw m0, m4 + pminsw m1, m4 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET + +%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2 + INV_TXFM_FN %1, %2, %3, 4x8 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 2 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 2048 + sar r5d, 12 +.end: + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + pxor m4, m4 + mova m3, [o(pixel_10bpc_max)] + lea r2, [strideq*3] +.loop: + movq m1, [dstq+strideq*0] + movq m2, [dstq+strideq*2] + movhps m1, [dstq+strideq*1] + movhps m2, [dstq+r2] + paddw m1, m0 + paddw m2, m0 + REPX {pminsw x, m3}, m1, m2 + REPX {pmaxsw x, m4}, m1, m2 + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + movq [dstq+strideq*2], m2 + movhps [dstq+r2 ], m2 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity, 9 +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + mova m3, [o(pd_2896)] + pmulld m0, m3, [cq+32*0+r5] + pmulld m1, m3, [cq+32*1+r5] + pmulld m2, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + call m(idct_4x4_internal_16bpc).pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova [cq+32*1+16], m4 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*0+16] + mova m6, [cq+32*1+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_4x8_internal_8bpc, _ssse3).main + ; m0-3 is now out0/1,3/2,4/5,7/6 + mova m4, [o(pw_2048)] + shufps m1, m1, q1032 + shufps m3, m3, q1032 +.end: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + mova m7, [o(pixel_10bpc_max)] + lea r2, [strideq*3] + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r2] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r2] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r2 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r2 ], m3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity, 9 + +cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .pass1_main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, 
m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*2+16] + mova m6, [cq+32*3+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass1_main: +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 + lea r3, [cq+32*1+16] +.loop_pass1: + mova m0, [o(pd_2048)] + mova m3, [o(pd_2896)] + pmulld m5, m3, [cq+32*0+r5] + pmulld m2, m3, [cq+32*1+r5] + pmulld m1, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m0}, m5, m2, m1, m3 + REPX {psrad x, 12}, m5, m2, m1, m3 + mova [r3], m2 + call m(iadst_4x4_internal_16bpc).main2 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*2+16], m0 + mova [cq+32*3+16], m1 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + ret +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, [o(pw_4x2048_4xm2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity, 9 + +cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).pass1_main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + mova m6, [cq+32*2+16] + mova m2, [cq+32*3+16] + punpcklwd m4, m2, m6 + punpckhwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m4, [o(pw_4xm2048_4x2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity, 3 + +cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] + mova m4, [o(pd_2896)] + mova m6, [o(pd_5793)] + ; clear m7 in case we skip the bottom square + pxor m7, m7 +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 16 + setge r5b +%else + mov r5d, 1 + cmp eobd, 16 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + pmulld m0, m4, [cq+32*0+r5] + pmulld m1, m4, [cq+32*1+r5] + pmulld m2, m4, [cq+32*2+r5] + pmulld m3, m4, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova m7, m2 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m4, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m4 + punpcklwd m0, m4 + mova m2, [cq+32*0+16] + punpckhwd m4, m2, m7 + punpcklwd m2, m7 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + mova m4, [o(pw_4096)] + jmp m(idct_4x8_internal_16bpc).end + +%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2 + INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 4 + add r5d, 6144 + sar r5d, 13 + jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct 
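The dct_dct branches of these INV_TXFM_*_FN macros take a DC-only shortcut: when the block reduces to a single DC coefficient, both transform passes collapse into scaling that one value through the imul/sar sequence, broadcasting it, and adding it to every pixel with a clamp against pixel_10bpc_max. A minimal scalar C model of the add-and-clamp step only (the function name, the pixel-unit stride, and the way dc would be derived are illustrative, not dav1d's actual API):

    #include <stddef.h>
    #include <stdint.h>

    /* Add a precomputed DC offset to a w x h block of 10-bit pixels and
     * clamp, mirroring the paddw/pmaxsw/pminsw loops in the asm above. */
    static void dc_only_add_10bpc(uint16_t *dst, ptrdiff_t stride, /* in pixels */
                                  int dc, int w, int h)
    {
        const int pixel_max = (1 << 10) - 1; /* pixel_10bpc_max */
        for (int y = 0; y < h; y++, dst += stride) {
            for (int x = 0; x < w; x++) {
                int px = dst[x] + dc;
                if (px < 0) px = 0;
                if (px > pixel_max) px = pixel_max;
                dst[x] = (uint16_t)px;
            }
        }
    }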
+INV_TXFM_4X16_FN dct, identity, v +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_2048)] +.loop_pass1: + mova m0, [cq+64*0+r5] + mova m1, [cq+64*1+r5] + mova m2, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(idct_4x4_internal_16bpc).pass1_main + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m1, m4, m2 + REPX {psrad x, 1}, m0, m1, m4, m2 + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz .end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m2, [cq+64*0+16] + mova m3, [cq+64*1+16] + mova m4, [cq+64*0+32] + mova m5, [cq+64*1+32] + mova m6, [cq+64*0+48] + mova m7, [cq+64*1+48] + ; m0-7 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_16x4_internal_8bpc, _ssse3).main + ; m0-6 is out0-13 [with odd registers having inversed output] + ; [coeffq+16*7] has out15/14 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [cq+16*7] + REPX {shufps x, x, q1032}, m1, m3, m5, m7 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova [cq+16*2], m6 + mova [cq+16*3], m7 +.end: + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova m7, [o(pixel_10bpc_max)] + mov r5d, 2 + lea r3, [strideq*3] +.loop: + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r3] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r3] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r3 ], m3 + dec r5d + jz .end2 + lea dstq, [dstq+strideq*8] + mova m0, [cq+0*16] + mova m1, [cq+1*16] + mova m2, [cq+2*16] + mova m3, [cq+3*16] + REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 + jmp .loop +.end2: + RET + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity, v + +cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r6+r5] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, 
_ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8 + ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out4/11,5/10,6/9,7/8 + ; m0/3/6/1 = out0/15,3/12,1/14,2/13 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movhps [cq+0*8], m4 + movhps [cq+1*8], m2 + movhps [cq+2*8], m5 + movhps [cq+3*8], m7 + movhps [cq+4*8], m3 + movhps [cq+5*8], m1 + movhps [cq+6*8], m6 + movhps [cq+7*8], m0 + punpcklqdq m0, m6 + punpcklqdq m1, m3 + punpcklqdq m3, m2, m4 + punpcklqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity, v + +cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out3 out2 + packssdw m1, m4 ; out1 out0 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 + ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out11/4,10/5,9/6,8/7 + ; m0/3/6/1 = out15/0,12/3,14/1,13/2 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movq [cq+0*8], m4 + movq [cq+1*8], m2 + movq [cq+2*8], m5 + movq [cq+3*8], m7 + movq [cq+4*8], m3 + movq [cq+5*8], m1 + movq [cq+6*8], m6 + movq [cq+7*8], m0 + punpckhqdq m0, m6 + punpckhqdq m1, m3 + punpckhqdq m3, m2, m4 + punpckhqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN identity, dct, h +INV_TXFM_4X16_FN identity, adst, h +INV_TXFM_4X16_FN identity, flipadst, h +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_6144)] + mova m4, [o(pd_5793)] +.loop_pass1: + pmulld m0, m4, [cq+64*0+r5] + pmulld m1, m4, [cq+64*1+r5] + pmulld m2, m4, [cq+64*2+r5] + pmulld m3, m4, [cq+64*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + 
mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: + mova [cq+16*4], m0 + mova [cq+16*5], m1 + mova [cq+16*6], m2 + mova [cq+16*7], m7 + mova m0, [o(pw_1697x16)] + mova m7, [o(pw_2048)] + pmulhrsw m1, m0, m4 + pmulhrsw m2, m0, m5 + REPX {paddsw x, x}, m4, m5 + paddsw m4, m1 + paddsw m5, m2 + REPX {pmulhrsw x, m7}, m4, m5 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova m4, [cq+16*7] + pmulhrsw m1, m0, m6 + pmulhrsw m2, m0, m4 + REPX {paddsw x, x}, m6, m4 + paddsw m6, m1 + paddsw m4, m2 + REPX {pmulhrsw x, m7}, m6, m4 + mova [cq+16*2], m6 + mova [cq+16*3], m4 + mova m4, [cq+16*4] + mova m1, [cq+16*5] + mova m2, [cq+16*6] + pmulhrsw m5, m0, m2 + pmulhrsw m6, m0, m3 + REPX {paddsw x, x}, m2, m3 + paddsw m2, m5 + paddsw m3, m6 + pmulhrsw m6, m0, m1 + pmulhrsw m0, m4 + REPX {paddsw x, x}, m1, m4 + paddsw m1, m6 + paddsw m0, m4 + REPX {pmulhrsw x, m7}, m2, m3, m1, m0 + jmp m(idct_4x16_internal_16bpc).end + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, 0, 8x4, 14 +%else + INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + RET +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif + call .main_pass1 + call .round +.pack_transpose: + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 +.transpose: + ; transpose + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m7, m5, m4 + punpcklwd m5, m4 + + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + punpcklwd m2, m3, m7 + punpckhwd m3, m7 + ; m0-3 = packed & transposed output + jmp tx2q +.load: + mova m7, [o(pd_2896)] + pmulld m0, m7, [cq+0*16] + pmulld m1, m7, [cq+1*16] + pmulld m2, m7, [cq+2*16] + pmulld m3, m7, [cq+3*16] + pmulld m4, m7, [cq+4*16] + pmulld m5, m7, [cq+5*16] + pmulld m6, m7, [cq+6*16] + pmulld m7, [cq+7*16] +%if ARCH_X86_64 + mova m8, [o(pd_2048)] + REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [cq+0*16], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [cq+0*16] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +.main_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_min)] + mova m13, [o(clip_max)] + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m3, [o(pd_2896)] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, m0, m4, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + 
paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 +%else + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m4 + mova [r3+3*16], m6 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a + paddd m2, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m4, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m6, [o(clip_min)] + REPX {pmaxsd x, m6 }, m1, m2, m7, m4 + mova m6, [o(clip_max)] + REPX {pminsd x, m6 }, m1, m2, m7, m4 + mova m6, [r3+3*16] + mova [r3+3*16], m2 + mova m2, [r3+1*16] + mova [r3+1*16], m4 + + ITX_MULSUB_2D 2, 6, 4, 3, 5, 0, 1567, 3784 ; t2 t3 + mova m3, [o(pd_2896)] + mova m5, [r3+0*16] + mova m4, [r3+2*16] + REPX {pmulld x, m3 }, m5, m4, m7, m1 + paddd m7, m0 + paddd m0, m5 + + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + + mova m1, [o(clip_min)] + REPX {pmaxsd x, m1 }, m0, m6, m5, m3 + mova m1, [o(clip_max)] + REPX {pminsd x, m1 }, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + mova [r3+0*16], m6 + mova m6, [r3+1*16] + psubd m7, m0, m6 ; out7 + paddd m0, m6 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + mova m6, [r3+3*16] + psubd m4, m3, m6 ; out4 + paddd m3, m6 ; out3 + mova m6, [r3+0*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x4_internal_8bpc, _ssse3).main +.end: + lea r3, [strideq*3] +.end2: + ; output is in m0-3 + mova m4, [o(pw_2048)] +.end3: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + mova m7, [o(pixel_10bpc_max)] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + RET + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif + call .main_pass1 + call .round + jmp m(idct_8x4_internal_16bpc).pack_transpose +.main_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_min)] + mova m13, [o(clip_max)] + + ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7 + ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 
3784, 10 ; t6a, t7a + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + mova m8, [o(pd_2896)] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m12}, m5, m3, m2, m9 + REPX {pminsd x, m13}, m5, m3, m2, m9 + REPX {pmulld x, m8 }, m5, m3, m2, m9 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m2, m9 ; (t6 - t7) * 2896 + paddd m2, m9 ; (t6 + t7) * 2896 + ret +.round: + + ; m0=out0,m1=-out1,m6=out6,m7=-out7 + + pcmpeqd m8, m8 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 +%else + mova [r3+0*16], m2 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m5 + mova m5, [o(pd_2048)] + + ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a + mova m2, [r3+0*16] + mova m3, [r3+1*16] + mova m4, [r3+2*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m6 + mova m1, [r3+3*16] + mova [r3+3*16], m7 + ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a + mova m0, [r3+0*16] + mova m6, [r3+2*16] + psubd m7, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + mova [r3+0*16], m7 + mova m5, [r3+1*16] + mova m7, [r3+3*16] + psubd m4, m1, m5 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + mova m3, [o(clip_min)] + REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 + mova [r3+1*16], m7 + mova m7, [o(clip_max)] + pmaxsd m3, [r3+0*16] + REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 + pminsd m7, [r3+1*16] + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m5 + mova [r3+3*16], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a + mova m5, [r3+2*16] + mova m7, [r3+3*16] + psubd m2, m6, m3 ; t7 + paddd m6, m3 ; out6 + mova [r3+3*16], m6 + mova m0, [r3+0*16] + mova m6, [r3+1*16] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m6 ; t2 + paddd m0, m6 ; out0 + psubd m6, m1, m4 ; t6 + paddd m1, m4 ; -out1 + mova m4, [o(clip_min)] + REPX {pmaxsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(clip_max)] + REPX {pminsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(pd_2896)] + REPX {pmulld x, m4 }, m5, m3, m6, m2 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m6, m2 ; (t6 - t7) * 2896 + paddd m2, m6 ; (t6 + t7) * 2896 + ret +.round: + mova [r3+2*16], m0 + + pcmpeqd m0, m0 + mova m6, [o(pd_2048)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 + + mova m6, [r3+3*16] + mova m0, [r3+2*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + jmp m(idct_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x4_internal_16bpc).round + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x4_internal_16bpc).transpose 
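iflipadst_8x4 above does not implement a separate transform: it reuses the iadst main_pass1/round code and only reverses the register order before the shared transpose, since flipadst is adst with its outputs emitted in reverse. A scalar sketch of that relationship (the adst4 callback is a stand-in for whichever 1-D adst routine is in use):

    #include <stdint.h>

    /* flipadst = adst with the output order reversed; the register-swapped
     * packssdw sequence above achieves the same reversal in SIMD form. */
    static void flipadst4(int32_t out[4], const int32_t in[4],
                          void (*adst4)(int32_t out[4], const int32_t in[4]))
    {
        int32_t tmp[4];
        adst4(tmp, in);
        for (int i = 0; i < 4; i++)
            out[i] = tmp[3 - i];
    }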
+.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + lea r3, [strideq*3] + add dstq, r3 + neg strideq + neg r3 + jmp m(idct_8x4_internal_16bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(idct_8x4_internal_16bpc).load + REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp m(idct_8x4_internal_16bpc).pack_transpose +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x4_internal_16bpc).end + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, %3, 8x8, 14, 0-3*16 +%else + INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + mov r3d, 2 +.end: + add r5d, 6144 + sar r5d, 13 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] +.loop: + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity, 6 +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + DECLARE_REG_TMP 1 + mov [rsp+4*16+1*gprsize], r1 +%else + DECLARE_REG_TMP 6 +%endif + lea t0, [o(.pass1_main)] + +.pass1_full: +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 10 + setge r5b +%else + mov r5d, 1 + cmp eobd, 10 + sbb r5d, 0 +%endif + shl r5d, 4 +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m0, [cq+0*32+r5] + mova m1, [cq+1*32+r5] + mova m2, [cq+2*32+r5] + mova m3, [cq+3*32+r5] + mova m4, [cq+4*32+r5] + mova m5, [cq+5*32+r5] + mova m6, [cq+6*32+r5] + mova m7, [cq+7*32+r5] + call t0 + + test r5d, r5d + jz .end_pass1 + + mova [cq+0*32+16], m0 + mova [cq+1*32+16], m1 + mova [cq+2*32+16], m2 + mova [cq+3*32+16], m3 + + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m4, [cq+0*32+16] + mova m5, [cq+1*32+16] + mova m6, [cq+2*32+16] + mova m7, [cq+3*32+16] +%if ARCH_X86_32 + mov r1, [rsp+4*16+1*gprsize] +%endif + jmp tx2q +.pass1_main: + call m(idct_8x4_internal_16bpc).main_pass1 + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + call m(idct_8x4_internal_16bpc).round + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 +.pack_and_transpose: + packssdw m2, m3 + packssdw m6, m7 + packssdw m0, m1 + packssdw m4, m5 +.transpose: + punpcklwd m7, m2, m6 + punpckhwd m2, m6 + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + + punpckhwd m4, m5, m2 + punpcklwd m5, m2 + punpckhwd m2, m0, m7 + punpcklwd m0, m7 + + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+0*16] +.end: + lea r3, [strideq*3] +%if 
ARCH_X86_64 +%define mzero m8 +%define mlim m11 +%else + mova [rsp+0*16+gprsize], m6 + mova [rsp+1*16+gprsize], m7 +%define mzero m6 +%define mlim m7 +%endif + pxor mzero, mzero + REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova mlim, [o(pixel_10bpc_max)] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, mlim }, m0, m1, m2, m3 + REPX {pmaxsw x, mzero}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] +%if ARCH_X86_32 + SWAP 2, 6 + SWAP 3, 7 + mova m6, [rsp+0*16+gprsize] + mova m7, [rsp+1*16+gprsize] +%define mzero m2 +%define mlim m3 +%endif + paddw m4, [dstq+strideq*0] + paddw m5, [dstq+strideq*1] + paddw m6, [dstq+strideq*2] + paddw m7, [dstq+r3] + REPX {pminsw x, mlim }, m4, m5, m6, m7 + REPX {pmaxsw x, mzero}, m4, m5, m6, m7 + mova [dstq+strideq*0], m4 + mova [dstq+strideq*1], m5 + mova [dstq+strideq*2], m6 + mova [dstq+r3 ], m7 +%undef mzero +%undef mlim + RET + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity, 6 + +cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call .round + jmp m(idct_8x8_internal_16bpc).pack_and_transpose +.round: +%if ARCH_X86_64 + pcmpeqd m8, m8 ; -1 + mova m11, [o(pd_6144)] + REPX {psubd x, m8 }, m0, m6 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m0, m1, m6, m7 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 +%else + mova [r3+2*16], m0 + + pcmpeqd m0, m0 ; -1 + mova m6, [o(pd_6144)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m1, m7 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 + + mova m0, [r3+2*16] + psrld m6, 12 ; +1 + paddd m0, m6 + paddd m6, [r3+3*16] + REPX {psrad x, 1 }, m0, m6 +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova m7, [o(pw_m2048)] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + jmp m(idct_8x8_internal_16bpc).end + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity, 6 + +cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x8_internal_16bpc).round + ; invert registers + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x8_internal_16bpc).transpose + +.pass2: + lea dstq, [dstq+strideq*8] + sub dstq, strideq + neg strideq + jmp m(iadst_8x8_internal_16bpc).pass2 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, 
[cq+0*32] + mova m1, [cq+1*32] + mova m2, [cq+2*32] + mova m3, [cq+3*32] + mova m4, [cq+4*32] + mova m5, [cq+5*32] + mova m6, [cq+6*32] + mova m7, [cq+7*32] + packssdw m0, [cq+0*32+16] + packssdw m1, [cq+1*32+16] + packssdw m2, [cq+2*32+16] + packssdw m3, [cq+3*32+16] + packssdw m4, [cq+4*32+16] + packssdw m5, [cq+5*32+16] + packssdw m6, [cq+6*32+16] + packssdw m7, [cq+7*32+16] + mova [rsp+gprsize+16*1], m6 + jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3 + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_4096)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize+0*16], m7 + mova m7, [o(pw_4096)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+0*16] +%endif + jmp m(idct_8x8_internal_16bpc).end + +%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 14, 0-16*16 +%else + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 2896 + mov [cq], eobd ; 0 + add r5d, 2048 + sar r5d, 12 + imul r5d, 2896 + mov r3d, 4 +%if stack_size_padded > 0 + ; adjust to caller's stack allocation + add rsp, (12+ARCH_X86_64)*16 +%endif + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, v +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%endif + +cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)] +.pass1_full: +%undef cmp + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, [rsp+16*16+2*gprsize] + ; setup stack pointer + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m7, [o(pd_2896)] + pmulld m0, m7, [cq+0*64+r5] + pmulld m1, m7, [cq+1*64+r5] + pmulld m2, m7, [cq+2*64+r5] + pmulld m3, m7, [cq+3*64+r5] + pmulld m4, m7, [cq+4*64+r5] + pmulld m5, m7, [cq+5*64+r5] + pmulld m6, m7, [cq+6*64+r5] + pmulld m7, [cq+7*64+r5] +%if ARCH_X86_64 + mova m8, [o(pd_2048)] + REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize+0*16], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [rsp+gprsize+0*16] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + call t0 + + mova [cq+0*64+r5], m0 + mova [cq+1*64+r5], m1 + mova [cq+2*64+r5], m2 + mova [cq+3*64+r5], m3 + sub r5d, 16 + jge .loop_pass1 +%if WIN64 + POP r7 +%elif ARCH_X86_32 + mov r1, [rsp+16*16+1*gprsize] +%endif + jmp tx2q + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + + ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15 + ; some are still pre-loaded from the final loop iteration in pass=1 + + mova m1, m2 + mova m2, [cq+ 1*16] + mova m3, [cq+ 9*16] + mova m4, [cq+ 2*16] + mova m5, [cq+10*16] + mova m6, [cq+ 3*16] + mova m7, [cq+11*16] + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova [rsp+gprsize+3*16], m0 + mova [rsp+gprsize+4*16], m1 + mova [rsp+gprsize+5*16], m2 + mova [rsp+gprsize+6*16], m3 + mova [rsp+gprsize+7*16], m4 + mova [rsp+gprsize+8*16], m5 + mova [rsp+gprsize+9*16], m6 + ; m7 is already stored in [rsp+gprsize+0*16] + mova m0, [cq+ 4*16] + mova m1, [cq+12*16] + mova m2, [cq+ 5*16] + mova m3, [cq+13*16] + mova m4, [cq+ 6*16] + mova m5, [cq+14*16] + mova m6, [cq+ 7*16] + mova m7, [cq+15*16] + 
call m_suffix(idct_16x8_internal_8bpc, _ssse3).main + + ; out0-7 is in rsp+gprsize+3-10*mmsize + ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize + +%if ARCH_X86_64 +%define mzero m8 +%define mlim m9 +%define mula m10 +%define mulb m11 +%else +%define mzero m4 +%define mlim m5 +%define mula m6 +%define mulb m7 +%endif + mova m7, [rsp+gprsize+0*16] +%if ARCH_X86_32 + mova [rsp+gprsize+11*16], m4 + mova [rsp+gprsize+12*16], m5 + mova [rsp+gprsize+13*16], m6 + mova [rsp+gprsize+14*16], m7 +%endif + + mova mula, [o(pw_2048)] + mova mulb, mula +.end: + lea r3, [strideq*3] + lea r5, [dstq+strideq*8] + pxor mzero, mzero + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + mova mlim, [o(pixel_10bpc_max)] + call .write_8x4 + lea r5, [dstq+r3*4] +%if ARCH_X86_64 + mova m0, m4 + mova m1, m5 + mova m2, m6 + mova m3, m7 +%else + mova m0, [rsp+gprsize+11*16] + mova m1, [rsp+gprsize+12*16] + mova m2, [rsp+gprsize+13*16] + mova m3, [rsp+gprsize+14*16] +%endif + call .write_8x4 + mov r5, dstq + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + call .write_8x4 + lea r5, [dstq+strideq*4] + mova m0, [rsp+gprsize+ 7*16] + mova m1, [rsp+gprsize+ 8*16] + mova m2, [rsp+gprsize+ 9*16] + mova m3, [rsp+gprsize+10*16] + call .write_8x4 + RET +.write_8x4: + REPX {pmulhrsw x, mula}, m0, m2 + REPX {pmulhrsw x, mulb}, m1, m3 + paddw m0, [r5+strideq*0] + paddw m1, [r5+strideq*1] + paddw m2, [r5+strideq*2] + paddw m3, [r5+r3] + REPX {pminsw x, mlim }, m0, m1, m2, m3 + REPX {pmaxsw x, mzero}, m0, m1, m2, m3 + mova [r5+strideq*0], m0 + mova [r5+strideq*1], m1 + mova [r5+strideq*2], m2 + mova [r5+r3 ], m3 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, v + +cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + mova m4, [cq+ 9*16] + mova m5, [cq+13*16] + mova [rsp+gprsize+7*16], m0 + mova [rsp+gprsize+8*16], m1 + mova [rsp+gprsize+5*16], m4 + mova [rsp+gprsize+6*16], m5 + mova m0, m2 + mova m1, m3 + mova m2, [cq+ 1*16] + mova m3, [cq+ 5*16] + mova m4, [cq+ 2*16] + mova m5, [cq+ 6*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + mova [rsp+gprsize+ 3*16], m4 + mova [rsp+gprsize+ 4*16], m5 + mova [rsp+gprsize+ 9*16], m6 + mova [rsp+gprsize+10*16], m7 + mova m4, [cq+10*16] + mova m5, [cq+14*16] + mova m6, [cq+ 3*16] + mova m7, [cq+ 7*16] + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end + mova m7, [rsp+gprsize+0*16] +%if ARCH_X86_32 + mova [rsp+gprsize+11*16], m4 + mova [rsp+gprsize+12*16], m5 + mova [rsp+gprsize+13*16], m6 + mova [rsp+gprsize+14*16], m7 +%endif + mova mula, [o(pw_2048)] + mova mulb, [o(pw_m2048)] + jmp m(idct_8x16_internal_16bpc).end + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, v + +cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, 
[o(m(iflipadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: + lea r3, [strideq*3] + lea r3, [r3*5] + add dstq, r3 + neg strideq + jmp m(iadst_8x16_internal_16bpc).pass2 + +INV_TXFM_8X16_FN identity, dct, h +INV_TXFM_8X16_FN identity, adst, h +INV_TXFM_8X16_FN identity, flipadst, h +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_1697x16)] +%endif + call .main + mova [rsp+ 3*16+gprsize], m0 + mova [rsp+ 4*16+gprsize], m1 + mova [rsp+ 5*16+gprsize], m2 + mova [rsp+ 6*16+gprsize], m3 + mova m0, [cq+ 1*16] + mova m1, [cq+ 5*16] + mova m2, [cq+ 9*16] + mova m3, [cq+13*16] + call .main + mova [rsp+ 7*16+gprsize], m0 + mova [rsp+ 8*16+gprsize], m1 + mova [rsp+ 9*16+gprsize], m2 + mova [rsp+10*16+gprsize], m3 +%if ARCH_X86_32 + mova m0, [cq+ 3*16] + mova m1, [cq+ 7*16] + mova m2, [cq+11*16] + mova m3, [cq+15*16] + call .main + mova [rsp+11*16+gprsize], m0 + mova [rsp+12*16+gprsize], m1 + mova [rsp+13*16+gprsize], m2 + mova [rsp+14*16+gprsize], m3 +%endif + mova m0, [cq+ 2*16] + mova m1, [cq+ 6*16] + mova m2, [cq+10*16] + mova m3, [cq+14*16] + call .main +%if ARCH_X86_64 + mova m4, [cq+ 3*16] + mova m5, [cq+ 7*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + pmulhrsw m9, m8, m4 + pmulhrsw m10, m8, m5 + pmulhrsw m11, m8, m6 + pmulhrsw m8, m7 + REPX {paddsw x, x}, m4, m5, m6, m7 + paddsw m4, m9 + paddsw m5, m10 + paddsw m6, m11 + paddsw m7, m8 +%endif + mova mula, [o(pw_2048)] + mova mulb, mula + jmp m(idct_8x16_internal_16bpc).end +.main: + ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y) +%if ARCH_X86_32 + mova m7, [o(pw_1697x16)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 +%else + pmulhrsw m4, m8, m0 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 +%endif + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + ret +%undef mula +%undef mulb +%undef mlim +%undef mzero diff -Nru dav1d-0.7.1/src/x86/itx.asm dav1d-0.9.1/src/x86/itx.asm --- dav1d-0.7.1/src/x86/itx.asm 2020-06-21 11:48:55.024126500 +0000 +++ dav1d-0.9.1/src/x86/itx.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,5562 +0,0 @@ -; Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 16 - -; Note: The order of (at least some of) those constants matter! - -deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 - -%macro COEF_PAIR 2 -pw_%1_%2: dw %1, %2 -pw_m%2_%1: dw -%2, %1 -%endmacro - -; ADST-only -pw_3803_1321: dw 3803, 1321 -pw_m1321_2482: dw -1321, 2482 -pw_2482_3344: dw 2482, 3344 -pw_m3344_3344: dw -3344, 3344 -pw_m3803_3344: dw -3803, 3344 -pw_m3803_m6688: dw -3803, -6688 -pw_2896_m2896: dw 2896, -2896 - -pw_5: times 2 dw 5 -pw_2048: times 2 dw 2048 -pw_4096: times 2 dw 4096 -pw_8192: times 2 dw 8192 -pw_16384: times 2 dw 16384 -pw_1697x16: times 2 dw 1697*16 -pw_1697x8: times 2 dw 1697*8 -pw_2896x8: times 2 dw 2896*8 - -pd_2048: dd 2048 - -COEF_PAIR 2896, 2896 -COEF_PAIR 1567, 3784 -COEF_PAIR 3784, 1567 -COEF_PAIR 201, 4091 -COEF_PAIR 995, 3973 -COEF_PAIR 1751, 3703 -COEF_PAIR 2440, 3290 -COEF_PAIR 3035, 2751 -COEF_PAIR 3513, 2106 -COEF_PAIR 3857, 1380 -COEF_PAIR 4052, 601 -COEF_PAIR 401, 4076 -COEF_PAIR 1931, 3612 -COEF_PAIR 3166, 2598 -COEF_PAIR 3920, 1189 -COEF_PAIR 799, 4017 -COEF_PAIR 3406, 2276 -pw_m799_m4017: dw -799, -4017 -pw_m1567_m3784: dw -1567, -3784 -pw_m3406_m2276: dw -3406, -2276 -pw_m401_m4076: dw -401, -4076 -pw_m3166_m2598: dw -3166, -2598 -pw_m1931_m3612: dw -1931, -3612 -pw_m3920_m1189: dw -3920, -1189 -COEF_PAIR 2276, 3406 -COEF_PAIR 4017, 799 - -%macro COEF_X8 1-* -%rep %0 - dw %1*8, %1*8 - %rotate 1 -%endrep -%endmacro - -pw_3703x8: COEF_X8 3703 -pw_1751x8: COEF_X8 1751 -pw_m1380x8: COEF_X8 -1380 -pw_3857x8: COEF_X8 3857 -pw_3973x8: COEF_X8 3973 -pw_995x8: COEF_X8 995 -pw_m2106x8: COEF_X8 -2106 -pw_3513x8: COEF_X8 3513 -pw_3290x8: COEF_X8 3290 -pw_2440x8: COEF_X8 2440 -pw_m601x8: COEF_X8 -601 -pw_4052x8: COEF_X8 4052 - -idct64_mul: COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 - COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 - COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 - COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 - -pw_201_4091x8: dw 201*8, 4091*8 -pw_m601_4052x8: dw -601*8, 4052*8 -pw_995_3973x8: dw 995*8, 3973*8 -pw_m1380_3857x8: dw -1380*8, 3857*8 -pw_1751_3703x8: dw 1751*8, 3703*8 -pw_m2106_3513x8: dw -2106*8, 3513*8 -pw_2440_3290x8: dw 2440*8, 3290*8 -pw_m2751_3035x8: dw -2751*8, 3035*8 - -%define o_idct64_offset idct64_mul - (o_base) - 8 - -SECTION .text - -; Code size reduction trickery: Intead of using rip-relative loads with -; mandatory 4-byte offsets everywhere, we can set up a base pointer with a -; single rip-relative lea and then address things relative from that with -; 1-byte offsets as long as data is within +-128 bytes of the base pointer. 
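The COEF_PAIR/pw_* tables above pack pairs of 12-bit fixed-point sine/cosine constants so that a single pmaddwd can apply a rotation to two interleaved inputs; the ITX_MULSUB/ITX_MUL*_PACK macros defined next implement the pair of dot products documented in their comments. A scalar C equivalent of one such butterfly (the helper name is illustrative):

    #include <stdint.h>

    /* dst1 = (src1*coef1 - src2*coef2 + 2048) >> 12
     * dst2 = (src1*coef2 + src2*coef1 + 2048) >> 12 */
    static inline void itx_mulsub(int32_t *dst1, int32_t *dst2,
                                  int32_t src1, int32_t src2,
                                  int coef1, int coef2)
    {
        *dst1 = (src1 * coef1 - src2 * coef2 + 2048) >> 12;
        *dst2 = (src1 * coef2 + src2 * coef1 + 2048) >> 12;
    }

With coef1 = coef2 = 2896 (roughly sqrt(2)/2 in Q12) this degenerates into the scaled sum/difference pair used for the even half of the DCT.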
-%define o_base deint_shuf + 128 -%define o(x) (rax - (o_base) + (x)) - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) - -; flags: 1 = swap, 2 = interleave, 4: coef_regs -%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags -%if %7 & 4 - pmaddwd m%2, m%5, m%1 - pmaddwd m%1, m%6 -%else -%if %7 & 1 - vpbroadcastd m%2, [o(pw_%5_%6)] - vpbroadcastd m%3, [o(pw_m%6_%5)] -%else - vpbroadcastd m%2, [o(pw_m%6_%5)] - vpbroadcastd m%3, [o(pw_%5_%6)] -%endif - pmaddwd m%2, m%1 - pmaddwd m%1, m%3 -%endif - paddd m%2, m%4 - paddd m%1, m%4 -%if %7 & 2 - pslld m%2, 4 - psrld m%1, 12 - pblendw m%1, m%2, 0xaa -%else - psrad m%2, 12 - psrad m%1, 12 - packssdw m%1, m%2 -%endif -%endmacro - -; flags: 1 = swap, 2 = interleave, 4 = coef_regs -%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags -%if %10 & 1 - vpbroadcastd m%3, [o(pw_%8_%9)] - vpbroadcastd m%4, [o(pw_m%9_%8)] - vpbroadcastd xm%2, [o(pw_%6_%7)] - vpblendd m%2, m%3, 0xf0 - vpbroadcastd xm%3, [o(pw_m%7_%6)] -%else - vpbroadcastd m%3, [o(pw_m%9_%8)] - vpbroadcastd m%4, [o(pw_%8_%9)] - vpbroadcastd xm%2, [o(pw_m%7_%6)] - vpblendd m%2, m%3, 0xf0 - vpbroadcastd xm%3, [o(pw_%6_%7)] -%endif - vpblendd m%3, m%4, 0xf0 - ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) -%endmacro - -; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 -; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 -%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 - punpckhwd m%3, m%2, m%1 - punpcklwd m%2, m%1 -%if %7 < 32 - pmaddwd m%1, m%7, m%2 - pmaddwd m%4, m%7, m%3 -%else - vpbroadcastd m%1, [o(pw_m%7_%6)] - pmaddwd m%4, m%3, m%1 - pmaddwd m%1, m%2 -%endif - paddd m%4, m%5 - paddd m%1, m%5 - psrad m%4, 12 - psrad m%1, 12 - packssdw m%1, m%4 -%if %7 < 32 - pmaddwd m%3, m%6 - pmaddwd m%2, m%6 -%else - vpbroadcastd m%4, [o(pw_%6_%7)] - pmaddwd m%3, m%4 - pmaddwd m%2, m%4 -%endif - paddd m%3, m%5 - paddd m%2, m%5 - psrad m%3, 12 - psrad m%2, 12 -%if %0 == 8 - packssdw m%8, m%2, m%3 -%else - packssdw m%2, m%3 -%endif -%endmacro - -%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 - ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 - ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 - psubsw m%3, m%1, m%2 - paddsw m%2, m%1 - paddsw m%1, m%4, m%5 - psubsw m%4, m%5 -%endmacro - -%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 - ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a - ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a - ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 - paddsw m%9, m%2, m%6 ; t4 - psubsw m%2, m%6 ; t5a - paddsw m%10, m%8, m%4 ; t7 - psubsw m%8, m%4 ; t6a - ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 - ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 - psubsw m%6, m%1, m%3 ; dct4 out2 - paddsw m%3, m%1 ; dct4 out1 - paddsw m%1, m%5, m%7 ; dct4 out0 - psubsw m%5, m%7 ; dct4 out3 - psubsw m%7, m%3, m%2 ; out6 - paddsw m%2, m%3 ; out1 - paddsw m%3, m%6, m%8 ; out2 - psubsw m%6, m%8 ; out5 - psubsw m%8, m%1, m%10 ; out7 - paddsw m%1, m%10 ; out0 - paddsw m%4, m%5, m%9 ; out3 - psubsw m%5, m%9 ; out4 -%endmacro - -; in1 = %1, in3 = %2, in5 = %3, in7 = %4 -; in9 = %5, in11 = %6, in13 = %7, in15 = %8 -%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 - ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a - ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a - ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a - ITX_MULSUB_2W %7, %2, 
%9, %10, %11, 3920, 1189 ; t11a, t12a - psubsw m%9, m%2, m%6 ; t13 - paddsw m%6, m%2 ; t12 - psubsw m%2, m%8, m%4 ; t14 - paddsw m%8, m%4 ; t15 - psubsw m%4, m%7, m%3 ; t10 - paddsw m%3, m%7 ; t11 - psubsw m%7, m%1, m%5 ; t9 - paddsw m%1, m%5 ; t8 - ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a - ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a - psubsw m%5, m%1, m%3 ; t11a - paddsw m%1, m%3 ; t8a - psubsw m%3, m%7, m%4 ; t13 - paddsw m%7, m%4 ; t14 - psubsw m%4, m%8, m%6 ; t12a - paddsw m%8, m%6 ; t15a - psubsw m%6, m%2, m%9 ; t10 - paddsw m%2, m%9 ; t9 - ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a - ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 -%endmacro - -%macro WRAP_XMM 1+ - INIT_XMM cpuname - %1 - INIT_YMM cpuname -%endmacro - -%macro ITX4_END 4-5 2048 ; row[1-4], rnd -%if %5 - vpbroadcastd m2, [o(pw_%5)] - pmulhrsw m0, m2 - pmulhrsw m1, m2 -%endif - lea r2, [dstq+strideq*2] -%assign %%i 1 -%rep 4 - %if %1 & 2 - CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) - %else - CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) - %endif - %assign %%i %%i + 1 - %rotate 1 -%endrep - movd m2, [%%row_adr1] - pinsrd m2, [%%row_adr2], 1 - movd m3, [%%row_adr3] - pinsrd m3, [%%row_adr4], 1 - pmovzxbw m2, m2 - pmovzxbw m3, m3 - paddw m0, m2 - paddw m1, m3 - packuswb m0, m1 - movd [%%row_adr1], m0 - pextrd [%%row_adr2], m0, 1 - pextrd [%%row_adr3], m0, 2 - pextrd [%%row_adr4], m0, 3 - ret -%endmacro - -%macro IWHT4_1D_PACKED 0 - punpckhqdq m3, m0, m1 ; in1 in3 - punpcklqdq m0, m1 ; in0 in2 - psubw m2, m0, m3 - paddw m0, m3 - punpckhqdq m2, m2 ; t2 t2 - punpcklqdq m0, m0 ; t0 t0 - psubw m1, m0, m2 - psraw m1, 1 - psubw m1, m3 ; t1 t3 - psubw m0, m1 ; ____ out0 - paddw m2, m1 ; out3 ____ -%endmacro - -INIT_XMM avx2 -cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c - mova m0, [cq+16*0] - mova m1, [cq+16*1] - pxor m2, m2 - mova [cq+16*0], m2 - mova [cq+16*1], m2 - psraw m0, 2 - psraw m1, 2 - IWHT4_1D_PACKED - punpckhwd m0, m1 - punpcklwd m3, m1, m2 - punpckhdq m1, m0, m3 - punpckldq m0, m3 - IWHT4_1D_PACKED - vpblendd m0, m2, 0x03 - ITX4_END 3, 0, 2, 1, 0 - -%macro INV_TXFM_FN 3 ; type1, type2, size -cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2 - %define %%p1 m(i%1_%3_internal) - lea rax, [o_base] - ; Jump to the 1st txfm function if we're not taking the fast path, which - ; in turn performs an indirect jump to the 2nd txfm function. 
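As the comment above notes, INV_TXFM_FN wires each transform up as two chained 1-D passes: the first-type pass-1 routine leaves transposed intermediates behind and then jumps indirectly through tx2q into the second-type pass-2 routine. A rough C model of that dispatch (types and names are illustrative only; the asm tail-jumps rather than calling):

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*pass2_fn)(uint8_t *dst, ptrdiff_t stride, int16_t *coef);

    /* pass1 = 1st 1-D transform + transpose of the coefficients,
     * pass2 = 2nd 1-D transform + add-to-destination. */
    static void inv_txfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *coef,
                             void (*pass1)(int16_t *coef), pass2_fn pass2)
    {
        pass1(coef);
        pass2(dst, stride, coef);
    }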
- lea tx2q, [m(i%2_%3_internal).pass2] -%ifidn %1_%2, dct_dct - test eobd, eobd - jnz %%p1 -%else - ; jump to the 1st txfm function unless it's located directly after this - times ((%%end - %%p1) >> 31) & 1 jmp %%p1 -ALIGN function_align -%%end: -%endif -%endmacro - -%macro INV_TXFM_4X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 4x4 -%ifidn %1_%2, dct_dct - vpbroadcastw m0, [cq] - vpbroadcastd m1, [o(pw_2896x8)] - pmulhrsw m0, m1 - mov [cq], eobd ; 0 - pmulhrsw m0, m1 - mova m1, m0 - jmp m(iadst_4x4_internal).end2 -%endif -%endmacro - -%macro IDCT4_1D_PACKED 0 - vpbroadcastd m4, [o(pd_2048)] - punpckhwd m2, m1, m0 - punpcklwd m1, m0 - ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 - ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 - paddsw m0, m1, m2 ; out0 out1 - psubsw m1, m2 ; out3 out2 -%endmacro - -%macro IADST4_1D_PACKED 0 - punpcklwd m2, m1, m0 - punpckhwd m3, m1, m0 - vpbroadcastd m5, [o(pw_m3344_3344)] - vpbroadcastd m0, [o(pw_3803_1321)] - vpbroadcastd m4, [o(pw_m1321_2482)] - pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 - psrld m5, 16 - pmaddwd m0, m2 - pmaddwd m2, m4 - pmaddwd m5, m3 ; 3344*in0 - paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 - vpbroadcastd m4, [o(pw_2482_3344)] - vpbroadcastd m5, [o(pw_m3803_3344)] - pmaddwd m4, m3 - pmaddwd m5, m3 - paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 - vpbroadcastd m0, [o(pw_m3803_m6688)] - pmaddwd m3, m0 - vpbroadcastd m0, [o(pd_2048)] - paddd m2, m0 - paddd m1, m0 - paddd m0, m4 - paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 - paddd m2, m4 - paddd m2, m3 - REPX {psrad x, 12}, m1, m2, m0, m5 - packssdw m0, m5 ; out0 out1 - packssdw m1, m2 ; out2 out3 -%endmacro - -INV_TXFM_4X4_FN dct, dct -INV_TXFM_4X4_FN dct, adst -INV_TXFM_4X4_FN dct, flipadst -INV_TXFM_4X4_FN dct, identity - -cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 - mova m0, [cq+16*0] - mova m1, [cq+16*1] - IDCT4_1D_PACKED - mova m2, [o(deint_shuf)] - shufps m3, m0, m1, q1331 - shufps m0, m1, q0220 - pshufb m0, m2 - pshufb m1, m3, m2 - jmp tx2q -.pass2: - IDCT4_1D_PACKED - pxor m2, m2 - mova [cq+16*0], m2 - mova [cq+16*1], m2 - ITX4_END 0, 1, 3, 2 - -INV_TXFM_4X4_FN adst, dct -INV_TXFM_4X4_FN adst, adst -INV_TXFM_4X4_FN adst, flipadst -INV_TXFM_4X4_FN adst, identity - -cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 - mova m0, [cq+16*0] - mova m1, [cq+16*1] - call .main - punpckhwd m3, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m0, m3 - punpcklwd m0, m3 - jmp tx2q -.pass2: - call .main -.end: - pxor m2, m2 - mova [cq+16*0], m2 - mova [cq+16*1], m2 -.end2: - ITX4_END 0, 1, 2, 3 -ALIGN function_align -.main: - IADST4_1D_PACKED - ret - -INV_TXFM_4X4_FN flipadst, dct -INV_TXFM_4X4_FN flipadst, adst -INV_TXFM_4X4_FN flipadst, flipadst -INV_TXFM_4X4_FN flipadst, identity - -cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 - mova m0, [cq+16*0] - mova m1, [cq+16*1] - call m(iadst_4x4_internal).main - punpcklwd m2, m1, m0 - punpckhwd m1, m0 - punpcklwd m0, m1, m2 - punpckhwd m1, m2 - jmp tx2q -.pass2: - call m(iadst_4x4_internal).main -.end: - pxor m2, m2 - mova [cq+16*0], m2 - mova [cq+16*1], m2 -.end2: - ITX4_END 3, 2, 1, 0 - -INV_TXFM_4X4_FN identity, dct -INV_TXFM_4X4_FN identity, adst -INV_TXFM_4X4_FN identity, flipadst -INV_TXFM_4X4_FN identity, identity - -cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 - mova m0, [cq+16*0] - mova m1, [cq+16*1] - vpbroadcastd m3, [o(pw_1697x8)] - pmulhrsw m2, m3, m0 - pmulhrsw m3, m1 - paddsw m0, m2 - paddsw m1, m3 - punpckhwd m2, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m0, m2 - 
punpcklwd m0, m2 - jmp tx2q -.pass2: - vpbroadcastd m3, [o(pw_1697x8)] - pmulhrsw m2, m3, m0 - pmulhrsw m3, m1 - paddsw m0, m2 - paddsw m1, m3 - jmp m(iadst_4x4_internal).end - -%macro WRITE_4X8 2 ; coefs[1-2] - movd xm4, [dstq+strideq*0] - pinsrd xm4, [dstq+strideq*1], 1 - movd xm5, [dstq+strideq*2] - pinsrd xm5, [dstq+r3 ], 1 - pinsrd xm4, [r2 +strideq*0], 2 - pinsrd xm4, [r2 +strideq*1], 3 - pinsrd xm5, [r2 +strideq*2], 2 - pinsrd xm5, [r2 +r3 ], 3 - pmovzxbw m4, xm4 - pmovzxbw m5, xm5 - paddw m4, m%1 - paddw m5, m%2 - packuswb m4, m5 - vextracti128 xm5, m4, 1 - movd [dstq+strideq*0], xm4 - pextrd [dstq+strideq*1], xm4, 1 - pextrd [dstq+strideq*2], xm4, 2 - pextrd [dstq+r3 ], xm4, 3 - movd [r2 +strideq*0], xm5 - pextrd [r2 +strideq*1], xm5, 1 - pextrd [r2 +strideq*2], xm5, 2 - pextrd [r2 +r3 ], xm5, 3 -%endmacro - -%macro INV_TXFM_4X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 4x8 -%ifidn %1_%2, dct_dct - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_2048)] - mov [cq], eobd - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mova m1, m0 - jmp m(iadst_4x8_internal).end3 -%endif -%endmacro - -%macro IDCT8_1D_PACKED 0 - vpbroadcastd m6, [o(pd_2048)] - punpckhwd m5, m3, m0 ; in7 in1 - punpckhwd m4, m1, m2 ; in3 in5 - punpcklwd m3, m1 ; in6 in2 - punpcklwd m2, m0 ; in4 in0 - ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a - ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a - ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 - psubsw m0, m5, m4 ; t5a t6a (interleaved) - paddsw m4, m5 ; t4 t7 (interleaved) - ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 - vpbroadcastd m1, [o(pw_m2896_2896)] - ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 -%if mmsize > 16 - vbroadcasti128 m1, [o(deint_shuf)] - pshufb m4, m1 -%else - pshufb m4, [o(deint_shuf)] -%endif - psubsw m1, m2, m3 ; tmp3 tmp2 - paddsw m3, m2 ; tmp0 tmp1 - shufps m2, m4, m0, q1032 ; t7 t6 - vpblendd m4, m0, 0xcc ; t4 t5 - paddsw m0, m3, m2 ; out0 out1 - psubsw m3, m2 ; out7 out6 - psubsw m2, m1, m4 ; out4 out5 - paddsw m1, m4 ; out3 out2 -%endmacro - -%macro IADST8_1D_PACKED 1 ; pass - vpbroadcastd m6, [o(pd_2048)] - punpckhwd m0, m4, m3 ; 0 7 - punpckhwd m1, m5, m2 ; 2 5 - punpcklwd m2, m5 ; 4 3 - punpcklwd m3, m4 ; 6 1 -%if %1 == 1 - ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a - ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a - ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a - ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a - psubsw m4, m0, m2 ; t5 t4 - paddsw m0, m2 ; t1 t0 - psubsw m5, m1, m3 ; t6 t7 - paddsw m1, m3 ; t2 t3 - ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a - ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a -%if mmsize > 16 - vbroadcasti128 m2, [o(deint_shuf)] -%else - mova m2, [o(deint_shuf)] -%endif - pshuflw m1, m1, q2301 - pshufhw m1, m1, q2301 - psubsw m3, m0, m1 ; t3 t2 - paddsw m0, m1 ; -out7 out0 - psubsw m1, m4, m5 ; t7 t6 - paddsw m4, m5 ; out6 -out1 - pshufb m0, m2 - pshufb m4, m2 - vpbroadcastd m5, [o(pw_m2896_2896)] - pmaddwd m2, m5, m3 - pmaddwd m5, m1 - paddd m2, m6 - paddd m5, m6 - psrad m2, 12 - psrad m5, 12 - packssdw m2, m5 ; out4 -out5 - vpbroadcastd m5, [o(pw_2896_2896)] - pmaddwd m3, m5 - pmaddwd m1, m5 - paddd m3, m6 - paddd m1, m6 - psrad m3, 12 - psrad m1, 12 - packssdw m1, m3 ; out2 -out3 - punpcklqdq m3, m4, m0 ; out6 -out7 - punpckhqdq m0, m4 ; out0 -out1 -%else - ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a - ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a - ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a - 
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a - psubsw m4, m0, m2 ; t4 t5 - paddsw m0, m2 ; t0 t1 - psubsw m5, m1, m3 ; t6 t7 - paddsw m1, m3 ; t2 t3 - shufps m2, m5, m4, q1032 - punpckhwd m4, m2 - punpcklwd m5, m2 - ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a - ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a - psubsw m2, m0, m1 ; t2 t3 - paddsw m0, m1 ; out0 -out7 - psubsw m1, m4, m5 ; t7 t6 - paddsw m4, m5 ; out6 -out1 - vpbroadcastd m5, [o(pw_2896x8)] - vpblendd m3, m0, m4, 0x33 ; out6 -out7 - vpblendd m0, m4, 0xcc ; out0 -out1 - shufps m4, m2, m1, q1032 ; t3 t7 - vpblendd m1, m2, 0x33 ; t2 t6 - psubsw m2, m1, m4 ; t2-t3 t6-t7 - paddsw m1, m4 ; t2+t3 t6+t7 - pmulhrsw m2, m5 ; out4 -out5 - pshufd m1, m1, q1032 - pmulhrsw m1, m5 ; out2 -out3 -%endif -%endmacro - -INIT_YMM avx2 -INV_TXFM_4X8_FN dct, dct -INV_TXFM_4X8_FN dct, adst -INV_TXFM_4X8_FN dct, flipadst -INV_TXFM_4X8_FN dct, identity - -cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - vpermq m0, [cq+32*0], q3120 - vpermq m1, [cq+32*1], q3120 - vpbroadcastd m2, [o(pw_2896x8)] - pmulhrsw m0, m2 - pmulhrsw m1, m2 - IDCT4_1D_PACKED - vbroadcasti128 m2, [o(deint_shuf)] - shufps m3, m0, m1, q1331 - shufps m0, m1, q0220 - pshufb m0, m2 - pshufb m1, m3, m2 - jmp tx2q -.pass2: - vextracti128 xm2, m0, 1 - vextracti128 xm3, m1, 1 - call .main - vpbroadcastd m4, [o(pw_2048)] - vinserti128 m0, xm2, 1 - vinserti128 m1, xm3, 1 - pshufd m1, m1, q1032 - jmp m(iadst_4x8_internal).end2 -ALIGN function_align -.main: - WRAP_XMM IDCT8_1D_PACKED - ret - -INV_TXFM_4X8_FN adst, dct -INV_TXFM_4X8_FN adst, adst -INV_TXFM_4X8_FN adst, flipadst -INV_TXFM_4X8_FN adst, identity - -cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - vpermq m0, [cq+32*0], q3120 - vpermq m1, [cq+32*1], q3120 - vpbroadcastd m2, [o(pw_2896x8)] - pmulhrsw m0, m2 - pmulhrsw m1, m2 - call m(iadst_8x4_internal).main - punpckhwd m3, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m0, m3 - punpcklwd m0, m3 - jmp tx2q -.pass2: - vextracti128 xm2, m0, 1 - vextracti128 xm3, m1, 1 - pshufd xm4, xm0, q1032 - pshufd xm5, xm1, q1032 - call .main_pass2 - vpbroadcastd m4, [o(pw_2048)] - vinserti128 m0, xm2, 1 - vinserti128 m1, xm3, 1 - pxor m5, m5 - psubw m5, m4 -.end: - vpblendd m4, m5, 0xcc -.end2: - pmulhrsw m0, m4 - pmulhrsw m1, m4 - WIN64_RESTORE_XMM - pxor m2, m2 - mova [cq+32*0], m2 - mova [cq+32*1], m2 -.end3: - lea r2, [dstq+strideq*4] - lea r3, [strideq*3] - WRITE_4X8 0, 1 - RET -ALIGN function_align -.main_pass1: - WRAP_XMM IADST8_1D_PACKED 1 - ret -ALIGN function_align -.main_pass2: - WRAP_XMM IADST8_1D_PACKED 2 - ret - -INV_TXFM_4X8_FN flipadst, dct -INV_TXFM_4X8_FN flipadst, adst -INV_TXFM_4X8_FN flipadst, flipadst -INV_TXFM_4X8_FN flipadst, identity - -cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - vpermq m0, [cq+32*0], q3120 - vpermq m1, [cq+32*1], q3120 - vpbroadcastd m2, [o(pw_2896x8)] - pmulhrsw m0, m2 - pmulhrsw m1, m2 - call m(iadst_8x4_internal).main - punpcklwd m3, m1, m0 - punpckhwd m1, m0 - punpcklwd m0, m1, m3 - punpckhwd m1, m3 - jmp tx2q -.pass2: - vextracti128 xm2, m0, 1 - vextracti128 xm3, m1, 1 - pshufd xm4, xm0, q1032 - pshufd xm5, xm1, q1032 - call m(iadst_4x8_internal).main_pass2 - vpbroadcastd m5, [o(pw_2048)] - vinserti128 m3, xm1, 1 - vinserti128 m2, xm0, 1 - pxor m4, m4 - psubw m4, m5 - pshufd m0, m3, q1032 - pshufd m1, m2, q1032 - jmp m(iadst_4x8_internal).end - -INV_TXFM_4X8_FN identity, dct -INV_TXFM_4X8_FN identity, adst -INV_TXFM_4X8_FN identity, flipadst -INV_TXFM_4X8_FN identity, identity - -cglobal 
iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - vpermq m2, [cq+32*0], q3120 - vpermq m0, [cq+32*1], q3120 - vpbroadcastd m3, [o(pw_2896x8)] - vpbroadcastd m4, [o(pw_1697x8)] - punpcklwd m1, m2, m0 - punpckhwd m2, m0 - pmulhrsw m1, m3 - pmulhrsw m2, m3 - punpcklwd m0, m1, m2 - punpckhwd m1, m2 - pmulhrsw m2, m4, m0 - pmulhrsw m4, m1 - paddsw m0, m2 - paddsw m1, m4 - jmp tx2q -.pass2: - vpbroadcastd m4, [o(pw_4096)] - jmp m(iadst_4x8_internal).end2 - -%macro INV_TXFM_4X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 4x16 -%ifidn %1_%2, dct_dct - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - movd xm3, [o(pw_2048)] - mov [cq], eobd - pmulhrsw xm0, xm2 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm3 - vpbroadcastw m0, xm0 - mova m1, m0 - mova m2, m0 - mova m3, m0 - jmp m(iadst_4x16_internal).end3 -%endif -%endmacro - -%macro IDCT16_1D_PACKED 0 - vpbroadcastd m10, [o(pd_2048)] -.main2: - punpckhwd m8, m7, m0 ; dct16 in15 in1 - punpcklwd m9, m4, m0 ; dct4 in2 in0 - punpckhwd m0, m3, m4 ; dct16 in7 in9 - punpcklwd m7, m1 ; dct8 in7 in1 - punpckhwd m1, m6 ; dct16 in3 in13 - punpcklwd m3, m5 ; dct8 in3 in5 - punpckhwd m5, m2 ; dct16 in11 in5 - punpcklwd m6, m2 ; dct4 in3 in1 - ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a - ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a - ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a - ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a - ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a - ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a - ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 - psubsw m2, m8, m0 ; t9 t14 - paddsw m8, m0 ; t8 t15 - psubsw m0, m1, m5 ; t10 t13 - paddsw m1, m5 ; t11 t12 - vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 - ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a - vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 - ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a - psubsw m4, m8, m1 ; t11a t12a - paddsw m8, m1 ; t8a t15a - psubsw m1, m7, m3 ; t5a t6a - paddsw m7, m3 ; t4 t7 - paddsw m3, m2, m0 ; t9 t14 - psubsw m2, m0 ; t10 t13 -%if mmsize > 16 - vbroadcasti128 m0, [o(deint_shuf)] -%else - mova m0, [o(deint_shuf)] -%endif - pshufb m8, m0 - pshufb m7, m0 - pshufb m3, m0 - ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 - vpbroadcastd m0, [o(pw_m2896_2896)] - ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 - vpbroadcastd m5, [o(pw_2896_2896)] - ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 - vpbroadcastd m0, [o(pw_m2896_2896)] - ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a - punpckhqdq m0, m8, m3 ; t15a t14 - punpcklqdq m8, m3 ; t8a t9 - shufps m5, m4, m2, q1032 ; t12 t13a - vpblendd m4, m2, 0xcc ; t11 t10a - shufps m2, m7, m1, q1032 ; t7 t6 - vpblendd m7, m1, 0xcc ; t4 t5 - psubsw m1, m9, m6 ; dct4 out3 out2 - paddsw m9, m6 ; dct4 out0 out1 - psubsw m3, m9, m2 ; dct8 out7 out6 - paddsw m9, m2 ; dct8 out0 out1 - psubsw m2, m1, m7 ; dct8 out4 out5 - paddsw m1, m7 ; dct8 out3 out2 - psubsw m7, m9, m0 ; out15 out14 - paddsw m0, m9 ; out0 out1 - psubsw m6, m1, m5 ; out12 out13 - paddsw m1, m5 ; out3 out2 - psubsw m5, m2, m4 ; out11 out10 - paddsw m2, m4 ; out4 out5 - psubsw m4, m3, m8 ; out8 out9 - paddsw m3, m8 ; out7 out6 -%endmacro - -INV_TXFM_4X16_FN dct, dct -INV_TXFM_4X16_FN dct, adst -INV_TXFM_4X16_FN dct, flipadst -INV_TXFM_4X16_FN dct, identity - -cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 - mova m0, [cq+32*0] - mova m1, [cq+32*1] - mova m2, [cq+32*2] - mova m3, [cq+32*3] - call m(idct_16x4_internal).main - vpbroadcastd m5, [o(pw_16384)] - 
punpckhwd m4, m2, m3 - punpcklwd m2, m3 - punpckhwd m3, m0, m1 - punpcklwd m0, m1 - REPX {pmulhrsw x, m5}, m0, m4, m2, m3 - punpckhdq m1, m0, m2 - punpckldq m0, m2 - punpckldq m2, m3, m4 - punpckhdq m3, m4 - jmp tx2q -.pass2: - vextracti128 xm4, m0, 1 - vextracti128 xm5, m1, 1 - vextracti128 xm6, m2, 1 - vextracti128 xm7, m3, 1 - call .main - vinserti128 m0, xm4, 1 - vinserti128 m1, xm5, 1 - vpbroadcastd m5, [o(pw_2048)] - vinserti128 m2, xm6, 1 - vinserti128 m3, xm7, 1 - pshufd m1, m1, q1032 - pshufd m3, m3, q1032 - jmp m(iadst_4x16_internal).end2 -ALIGN function_align -.main: - WRAP_XMM IDCT16_1D_PACKED - ret - -INV_TXFM_4X16_FN adst, dct -INV_TXFM_4X16_FN adst, adst -INV_TXFM_4X16_FN adst, flipadst -INV_TXFM_4X16_FN adst, identity - -cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 - mova m0, [cq+32*0] - mova m1, [cq+32*1] - mova m2, [cq+32*2] - mova m3, [cq+32*3] - call m(iadst_16x4_internal).main - vpbroadcastd m5, [o(pw_16384)] - punpckhwd m4, m2, m3 - punpcklwd m2, m3 - punpckhwd m3, m0, m1 - punpcklwd m0, m1 - REPX {pmulhrsw x, m5}, m4, m2, m3, m0 - punpckhdq m1, m0, m2 - punpckldq m0, m2 - punpckldq m2, m3, m4 - punpckhdq m3, m4 - jmp tx2q -.pass2: - call .main - vpbroadcastd m5, [o(pw_2896x8)] - paddsw m1, m2, m4 - psubsw m2, m4 - pmulhrsw m1, m5 ; -out7 out4 out6 -out5 - pmulhrsw m2, m5 ; out8 -out11 -out9 out10 - vpbroadcastd m5, [o(pw_2048)] - pshufd m1, m1, q1032 - vpblendd m4, m1, m0, 0x33 - vpblendd m0, m2, 0x33 - vpblendd m2, m3, 0x33 - vpblendd m3, m1, 0x33 - vpermq m0, m0, q2031 - vpermq m1, m2, q1302 - vpermq m2, m3, q3120 - vpermq m3, m4, q0213 - psubw m6, m7, m5 -.end: - vpblendd m5, m6, 0xcc -.end2: - REPX {pmulhrsw x, m5}, m0, m1, m2, m3 - WIN64_RESTORE_XMM - pxor m4, m4 - mova [cq+32*0], m4 - mova [cq+32*1], m4 - mova [cq+32*2], m4 - mova [cq+32*3], m4 -.end3: - lea r2, [dstq+strideq*8] - lea r3, [strideq*3] - WRITE_4X8 0, 1 - lea dstq, [dstq+strideq*4] - lea r2, [r2 +strideq*4] - WRITE_4X8 2, 3 - RET -ALIGN function_align -.main: - vpblendd m4, m1, m0, 0xcc - vpblendd m1, m0, 0x33 - vpblendd m5, m2, m3, 0xcc - vpblendd m2, m3, 0x33 - vperm2i128 m3, m5, m2, 0x31 - vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 - vperm2i128 m4, m1, m4, 0x31 - vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 - pshufd m3, m3, q1032 ; in12 in15 in13 in14 - pshufd m2, m4, q1032 ; in11 in8 in9 in10 -.main2: - vpbroadcastd m8, [o(pd_2048)] - pxor m7, m7 - punpckhwd m4, m3, m0 ; in12 in3 in14 in1 - punpcklwd m0, m3 ; in0 in15 in2 in13 - punpckhwd m3, m2, m1 ; in8 in7 in10 in5 - punpcklwd m1, m2 ; in4 in11 in6 in9 - ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 - ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 - ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 - ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 - psubsw m2, m0, m3 ; t9a t8a t11a t10a - paddsw m0, m3 ; t1a t0a t3a t2a - psubsw m3, m1, m4 ; t13a t12a t15a t14a - paddsw m1, m4 ; t5a t4a t7a t6a - ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 - psubw m6, m7, m5 - ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 - vpbroadcastd m6, [o(pw_m3784_1567)] - vpbroadcastd m5, [o(pw_1567_3784)] - psubsw m4, m0, m1 ; t5 t4 t7 t6 - paddsw m0, m1 ; t1 t0 t3 t2 - psubsw m1, m2, m3 ; t13a t12a t15a t14a - paddsw m2, m3 ; t9a t8a t11a t10a - psubw m3, m7, m6 ; pw_3784_m1567 - vpblendd m6, m3, 0xf0 - ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a - ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 - vbroadcasti128 m5, [o(deint_shuf)] - pshufb m0, m5 - pshufb m2, m5 - vperm2i128 m3, m0, m2, 0x31 ; t3 t2 
t11a t10a - vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a - vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 - vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 - pshufd m2, m2, q1032 ; t6a t7a t14 t15 - psubsw m1, m0, m3 ; t3a t2a t11 t10 - paddsw m0, m3 ; -out15 out0 out14 -out1 - paddsw m3, m4, m2 ; -out3 out12 out2 -out13 - psubsw m4, m2 ; t6 t7 t14a t15a - shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a - vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a - ret -ALIGN function_align -.main_pass1_end: - vpbroadcastd m5, [o(pw_m2896_2896)] - vpbroadcastd m6, [o(pw_2896_2896)] - punpcklwd m1, m4, m2 - punpckhwd m4, m2 - pmaddwd m2, m5, m4 - pmaddwd m4, m6 - pmaddwd m5, m1 - pmaddwd m1, m6 - REPX {paddd x, m8}, m5, m1, m2, m4 - REPX {psrad x, 12}, m5, m2, m1, m4 - packssdw m2, m5 ; -out11 out8 out10 -out9 - packssdw m1, m4 ; -out7 out4 out6 -out5 - ret - -INV_TXFM_4X16_FN flipadst, dct -INV_TXFM_4X16_FN flipadst, adst -INV_TXFM_4X16_FN flipadst, flipadst -INV_TXFM_4X16_FN flipadst, identity - -cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 - mova m0, [cq+32*0] - mova m1, [cq+32*1] - mova m2, [cq+32*2] - mova m3, [cq+32*3] - call m(iadst_16x4_internal).main - vpbroadcastd m5, [o(pw_16384)] - punpcklwd m4, m1, m0 - punpckhwd m1, m0 - punpcklwd m0, m3, m2 - punpckhwd m3, m2 - REPX {pmulhrsw x, m5}, m4, m1, m0, m3 - punpckldq m2, m3, m1 - punpckhdq m3, m1 - punpckhdq m1, m0, m4 - punpckldq m0, m4 - jmp tx2q -.pass2: - call m(iadst_4x16_internal).main - vpbroadcastd m5, [o(pw_2896x8)] - paddsw m1, m2, m4 - psubsw m2, m4 - pmulhrsw m1, m5 ; -out7 out4 out6 -out5 - pmulhrsw m2, m5 ; out8 -out11 -out9 out10 - vpbroadcastd m6, [o(pw_2048)] - pshufd m1, m1, q1032 - vpblendd m4, m0, m2, 0x33 - vpblendd m0, m1, 0xcc - vpblendd m1, m3, 0xcc - vpblendd m2, m3, 0x33 - vpermq m0, m0, q3120 - vpermq m1, m1, q0213 - vpermq m2, m2, q2031 - vpermq m3, m4, q1302 - psubw m5, m7, m6 - jmp m(iadst_4x16_internal).end - -INV_TXFM_4X16_FN identity, dct -INV_TXFM_4X16_FN identity, adst -INV_TXFM_4X16_FN identity, flipadst -INV_TXFM_4X16_FN identity, identity - -cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 - mova m3, [cq+32*0] - mova m2, [cq+32*1] - mova m4, [cq+32*2] - mova m5, [cq+32*3] - vpbroadcastd m8, [o(pw_1697x8)] - pcmpeqw m0, m0 ; -1 - punpcklwd m1, m3, m2 - punpckhwd m3, m2 - punpcklwd m2, m4, m5 - punpckhwd m4, m5 - pmulhrsw m5, m8, m1 - pmulhrsw m6, m8, m2 - pmulhrsw m7, m8, m3 - pmulhrsw m8, m4 - pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is - pxor m1, m9 ; unsigned. as long as both signs are equal - pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the - pxor m2, m9 ; pmulhrsw result will become 0 which causes - pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless - pxor m3, m9 ; we explicitly deal with that case here. 
-    pcmpeqw              m0, m4
-    pxor                 m4, m0
-    pavgw                m1, m5
-    pavgw                m2, m6
-    pavgw                m3, m7
-    pavgw                m4, m8
-    punpckldq            m0, m1, m2
-    punpckhdq            m1, m2
-    punpckldq            m2, m3, m4
-    punpckhdq            m3, m4
-    jmp                  tx2q
-.pass2:
-    vpbroadcastd         m8, [o(pw_1697x16)]
-    vpbroadcastd         m5, [o(pw_2048)]
-    pmulhrsw             m4, m8, m0
-    pmulhrsw             m6, m8, m1
-    pmulhrsw             m7, m8, m2
-    pmulhrsw             m8, m3
-    REPX                 {paddsw x, x}, m0, m1, m2, m3
-    paddsw               m0, m4
-    paddsw               m1, m6
-    paddsw               m2, m7
-    paddsw               m3, m8
-    jmp                  m(iadst_4x16_internal).end2
-
-%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
-    movq                 xm%3, [dstq ]
-    movhps               xm%3, [dstq+%5]
-    movq                 xm%4, [dstq+%6]
-    movhps               xm%4, [dstq+%7]
-    pmovzxbw             m%3, xm%3
-    pmovzxbw             m%4, xm%4
-%ifnum %1
-    paddw                m%3, m%1
-%else
-    paddw                m%3, %1
-%endif
-%ifnum %2
-    paddw                m%4, m%2
-%else
-    paddw                m%4, %2
-%endif
-    packuswb             m%3, m%4
-    vextracti128         xm%4, m%3, 1
-    movq                 [dstq ], xm%3
-    movhps               [dstq+%6], xm%3
-    movq                 [dstq+%5], xm%4
-    movhps               [dstq+%7], xm%4
-%endmacro
-
-%macro INV_TXFM_8X4_FN 2 ; type1, type2
-    INV_TXFM_FN          %1, %2, 8x4
-%ifidn %1_%2, dct_dct
-    movd                 xm1, [o(pw_2896x8)]
-    pmulhrsw             xm0, xm1, [cq]
-    pmulhrsw             xm0, xm1
-    movd                 xm2, [o(pw_2048)]
-    pmulhrsw             xm0, xm1
-    pmulhrsw             xm0, xm2
-    vpbroadcastw         m0, xm0
-    mova                 m1, m0
-    jmp                  m(iadst_8x4_internal).end3
-%endif
-%endmacro
-
-INV_TXFM_8X4_FN dct, dct
-INV_TXFM_8X4_FN dct, adst
-INV_TXFM_8X4_FN dct, flipadst
-INV_TXFM_8X4_FN dct, identity
-
-cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
-    vpbroadcastd         xm3, [o(pw_2896x8)]
-    pmulhrsw             xm0, xm3, [cq+16*0]
-    pmulhrsw             xm1, xm3, [cq+16*1]
-    pmulhrsw             xm2, xm3, [cq+16*2]
-    pmulhrsw             xm3, [cq+16*3]
-    call                 m(idct_4x8_internal).main
-    vbroadcasti128       m4, [o(deint_shuf)]
-    vinserti128          m3, m1, xm3, 1
-    vinserti128          m1, m0, xm2, 1
-    shufps               m0, m1, m3, q0220
-    shufps               m1, m3, q1331
-    pshufb               m0, m4
-    pshufb               m1, m4
-    jmp                  tx2q
-.pass2:
-    IDCT4_1D_PACKED
-    vpermq               m0, m0, q3120
-    vpermq               m1, m1, q2031
-    jmp                  m(iadst_8x4_internal).end2
-
-INV_TXFM_8X4_FN adst, dct
-INV_TXFM_8X4_FN adst, adst
-INV_TXFM_8X4_FN adst, flipadst
-INV_TXFM_8X4_FN adst, identity
-
-cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
-    vpbroadcastd         xm0, [o(pw_2896x8)]
-    pshufd               xm4, [cq+16*0], q1032
-    pmulhrsw             xm3, xm0, [cq+16*3]
-    pshufd               xm5, [cq+16*1], q1032
-    pmulhrsw             xm2, xm0, [cq+16*2]
-    pmulhrsw             xm4, xm0
-    pmulhrsw             xm5, xm0
-    call                 m(iadst_4x8_internal).main_pass1
-    vinserti128          m0, xm2, 1
-    vinserti128          m1, xm3, 1
-    punpckhwd            m2, m0, m1
-    punpcklwd            m0, m1
-    pxor                 m3, m3
-    psubsw               m3, m2
-    punpckhwd            m1, m0, m3
-    punpcklwd            m0, m3
-    jmp                  tx2q
-.pass2:
-    call                 .main
-.end:
-    vpermq               m0, m0, q3120
-    vpermq               m1, m1, q3120
-.end2:
-    vpbroadcastd         m2, [o(pw_2048)]
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m2
-    WIN64_RESTORE_XMM
-.end3:
-    pxor                 m2, m2
-    mova                 [cq+32*0], m2
-    mova                 [cq+32*1], m2
-    lea                  r3, [strideq*3]
-    WRITE_8X4            0, 1, 4, 5
-    RET
-ALIGN function_align
-.main:
-    IADST4_1D_PACKED
-    ret
-
-INV_TXFM_8X4_FN flipadst, dct
-INV_TXFM_8X4_FN flipadst, adst
-INV_TXFM_8X4_FN flipadst, flipadst
-INV_TXFM_8X4_FN flipadst, identity
-
-cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
-    vpbroadcastd         xm0, [o(pw_2896x8)]
-    pshufd               xm4, [cq+16*0], q1032
-    pmulhrsw             xm3, xm0, [cq+16*3]
-    pshufd               xm5, [cq+16*1], q1032
-    pmulhrsw             xm2, xm0, [cq+16*2]
-    pmulhrsw             xm4, xm0
-    pmulhrsw             xm5, xm0
-    call                 m(iadst_4x8_internal).main_pass1
-    vinserti128          m3, xm1, 1
-    vinserti128          m2, xm0, 1
-    punpckhwd            m1, m3, m2
-    punpcklwd            m3, m2
-    pxor                 m0, m0
-    psubsw               m0, m1
-    punpckhwd            m1, m0, m3
-    punpcklwd            m0, m3
-    jmp                  tx2q
-.pass2:
-    call                 m(iadst_8x4_internal).main
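
The pcmpeqw/pxor fixup before pavgw in the identity-transform code above is explained by the comment in the source: pavgw is an unsigned average, and the only input value for which the signs of x and pmulhrsw(x, 1697*8) can differ is -1. A minimal scalar C sketch of that step, assuming arithmetic right shift of negative values; the helper names are mine, not anything from dav1d:

    #include <stdint.h>

    /* round(a*b/32768) per 16-bit lane, i.e. what pmulhrsw computes */
    static int16_t mulhrs16(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
    }

    /* (a+b+1)>>1 on the raw unsigned bit patterns, i.e. what pavgw computes */
    static uint16_t avg16u(uint16_t a, uint16_t b)
    {
        return (uint16_t)((a + b + 1) >> 1);
    }

    /* Identity scaling step: average x with round(x*1697*8/32768).
     * The unsigned average matches the signed one while both inputs share a
     * sign. For x == -1 the pmulhrsw result is 0, and avg16u(0xFFFF, 0)
     * would give 0x8000 (-32768) instead of 0, hence the pcmpeqw/pxor fixup
     * that turns -1 into 0 before averaging. */
    static int16_t idtx_step(int16_t x)
    {
        int16_t m = mulhrs16(x, 1697 * 8);
        if (x == -1)
            x = 0;
        return (int16_t)avg16u((uint16_t)x, (uint16_t)m);
    }
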
- mova m2, m1 - vpermq m1, m0, q2031 - vpermq m0, m2, q2031 - jmp m(iadst_8x4_internal).end2 - -INV_TXFM_8X4_FN identity, dct -INV_TXFM_8X4_FN identity, adst -INV_TXFM_8X4_FN identity, flipadst -INV_TXFM_8X4_FN identity, identity - -cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 - mova xm2, [cq+16*0] - mova xm0, [cq+16*1] - vinserti128 m2, [cq+16*2], 1 - vinserti128 m0, [cq+16*3], 1 - vpbroadcastd m3, [o(pw_2896x8)] - punpcklwd m1, m2, m0 - punpckhwd m2, m0 - pmulhrsw m1, m3 - pmulhrsw m2, m3 - punpcklwd m0, m1, m2 - punpckhwd m1, m2 - paddsw m0, m0 - paddsw m1, m1 - jmp tx2q -.pass2: - vpbroadcastd m3, [o(pw_1697x8)] - pmulhrsw m2, m3, m0 - pmulhrsw m3, m1 - paddsw m0, m2 - paddsw m1, m3 - jmp m(iadst_8x4_internal).end - -%macro INV_TXFM_8X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 8x8 -%ifidn %1_%2, dct_dct - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 -.end: - mov r2d, 2 -.end2: - lea r3, [strideq*3] -.loop: - WRITE_8X4 0, 0, 1, 2 - lea dstq, [dstq+strideq*4] - dec r2d - jg .loop - RET -%endif -%endmacro - -INV_TXFM_8X8_FN dct, dct -INV_TXFM_8X8_FN dct, adst -INV_TXFM_8X8_FN dct, flipadst -INV_TXFM_8X8_FN dct, identity - -cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - vpermq m0, [cq+32*0], q3120 ; 0 1 - vpermq m3, [cq+32*3], q3120 ; 6 7 - vpermq m2, [cq+32*2], q3120 ; 4 5 - vpermq m1, [cq+32*1], q3120 ; 2 3 - call .main - shufps m4, m0, m1, q0220 - shufps m5, m0, m1, q1331 - shufps m1, m2, m3, q0220 - shufps m3, m2, m3, q1331 - vbroadcasti128 m0, [o(deint_shuf)] - vpbroadcastd m2, [o(pw_16384)] - REPX {pshufb x, m0}, m4, m5, m1, m3 - REPX {pmulhrsw x, m2}, m4, m5, m1, m3 - vinserti128 m0, m4, xm1, 1 - vperm2i128 m2, m4, m1, 0x31 - vinserti128 m1, m5, xm3, 1 - vperm2i128 m3, m5, m3, 0x31 - jmp tx2q -.pass2: - call .main - vpbroadcastd m4, [o(pw_2048)] - vpermq m0, m0, q3120 - vpermq m1, m1, q2031 - vpermq m2, m2, q3120 - vpermq m3, m3, q2031 - jmp m(iadst_8x8_internal).end2 -ALIGN function_align -.main: - IDCT8_1D_PACKED - ret - -INV_TXFM_8X8_FN adst, dct -INV_TXFM_8X8_FN adst, adst -INV_TXFM_8X8_FN adst, flipadst -INV_TXFM_8X8_FN adst, identity - -cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - vpermq m4, [cq+32*0], q1302 ; 1 0 - vpermq m3, [cq+32*3], q3120 ; 6 7 - vpermq m5, [cq+32*1], q1302 ; 3 2 - vpermq m2, [cq+32*2], q3120 ; 4 5 - call .main_pass1 - vpbroadcastd m5, [o(pw_16384)] - punpcklwd m4, m0, m1 - punpckhwd m0, m1 - punpcklwd m1, m2, m3 - punpckhwd m2, m3 - pxor m3, m3 - psubw m3, m5 ; negate odd elements during rounding - pmulhrsw m4, m5 - pmulhrsw m0, m3 - pmulhrsw m1, m5 - pmulhrsw m2, m3 - punpcklwd m3, m4, m0 - punpckhwd m4, m0 - punpcklwd m0, m1, m2 - punpckhwd m1, m2 - vperm2i128 m2, m3, m0, 0x31 - vinserti128 m0, m3, xm0, 1 - vperm2i128 m3, m4, m1, 0x31 - vinserti128 m1, m4, xm1, 1 - jmp tx2q -.pass2: - pshufd m4, m0, q1032 - pshufd m5, m1, q1032 - call .main_pass2 - vpbroadcastd m5, [o(pw_2048)] - vpbroadcastd xm4, [o(pw_4096)] - psubw m4, m5 ; lower half = 2048, upper half = -2048 -.end: - REPX {vpermq x, x, q3120}, m0, m1, m2, m3 -.end2: - pmulhrsw m0, m4 - pmulhrsw m1, m4 -.end3: - pmulhrsw m2, m4 - pmulhrsw m3, m4 - WIN64_RESTORE_XMM -.end4: - pxor m4, m4 - mova [cq+32*0], m4 - mova [cq+32*1], m4 - mova [cq+32*2], m4 - mova [cq+32*3], m4 - lea r3, [strideq*3] - WRITE_8X4 0, 1, 4, 5 - lea dstq, [dstq+strideq*4] - WRITE_8X4 2, 3, 4, 5 - RET -ALIGN 
function_align -.main_pass1: - IADST8_1D_PACKED 1 - ret -ALIGN function_align -.main_pass2: - IADST8_1D_PACKED 2 - ret - -INV_TXFM_8X8_FN flipadst, dct -INV_TXFM_8X8_FN flipadst, adst -INV_TXFM_8X8_FN flipadst, flipadst -INV_TXFM_8X8_FN flipadst, identity - -cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - vpermq m4, [cq+32*0], q1302 ; 1 0 - vpermq m3, [cq+32*3], q3120 ; 6 7 - vpermq m5, [cq+32*1], q1302 ; 3 2 - vpermq m2, [cq+32*2], q3120 ; 4 5 - call m(iadst_8x8_internal).main_pass1 - vpbroadcastd m5, [o(pw_16384)] - punpckhwd m4, m3, m2 - punpcklwd m3, m2 - punpckhwd m2, m1, m0 - punpcklwd m1, m0 - pxor m0, m0 - psubw m0, m5 - pmulhrsw m4, m0 - pmulhrsw m3, m5 - pmulhrsw m2, m0 - pmulhrsw m1, m5 - punpckhwd m0, m4, m3 - punpcklwd m4, m3 - punpckhwd m3, m2, m1 - punpcklwd m2, m1 - vinserti128 m1, m0, xm3, 1 - vperm2i128 m3, m0, m3, 0x31 - vinserti128 m0, m4, xm2, 1 - vperm2i128 m2, m4, m2, 0x31 - jmp tx2q -.pass2: - pshufd m4, m0, q1032 - pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main_pass2 - vpbroadcastd m4, [o(pw_2048)] - vpbroadcastd xm5, [o(pw_4096)] - psubw m4, m5 ; lower half = -2048, upper half = 2048 - vpermq m5, m3, q2031 - vpermq m3, m0, q2031 - vpermq m0, m2, q2031 - vpermq m2, m1, q2031 - pmulhrsw m1, m0, m4 - pmulhrsw m0, m5, m4 - jmp m(iadst_8x8_internal).end3 - -INV_TXFM_8X8_FN identity, dct -INV_TXFM_8X8_FN identity, adst -INV_TXFM_8X8_FN identity, flipadst -INV_TXFM_8X8_FN identity, identity - -cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - mova xm3, [cq+16*0] - mova xm2, [cq+16*1] - vinserti128 m3, [cq+16*4], 1 - vinserti128 m2, [cq+16*5], 1 - mova xm4, [cq+16*2] - mova xm0, [cq+16*3] - vinserti128 m4, [cq+16*6], 1 - vinserti128 m0, [cq+16*7], 1 - punpcklwd m1, m3, m2 - punpckhwd m3, m2 - punpcklwd m2, m4, m0 - punpckhwd m4, m0 - punpckldq m0, m1, m2 - punpckhdq m1, m2 - punpckldq m2, m3, m4 - punpckhdq m3, m4 - jmp tx2q -.pass2: - vpbroadcastd m4, [o(pw_4096)] - jmp m(iadst_8x8_internal).end - -%macro INV_TXFM_8X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 8x16 -%ifidn %1_%2, dct_dct - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_8x8).end2 -%endif -%endmacro - -%macro ITX_8X16_LOAD_COEFS 0 - vpbroadcastd m4, [o(pw_2896x8)] - pmulhrsw m0, m4, [cq+32*0] - add cq, 32*4 - pmulhrsw m7, m4, [cq+32*3] - pmulhrsw m1, m4, [cq-32*3] - pmulhrsw m6, m4, [cq+32*2] - pmulhrsw m2, m4, [cq-32*2] - pmulhrsw m5, m4, [cq+32*1] - pmulhrsw m3, m4, [cq-32*1] - pmulhrsw m4, [cq+32*0] -%endmacro - -INV_TXFM_8X16_FN dct, dct -INV_TXFM_8X16_FN dct, adst -INV_TXFM_8X16_FN dct, flipadst -INV_TXFM_8X16_FN dct, identity - -cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 - ITX_8X16_LOAD_COEFS - call m(idct_16x8_internal).main - vpbroadcastd m10, [o(pw_16384)] -.pass1_end: - vperm2i128 m9, m3, m7, 0x31 - vinserti128 m3, xm7, 1 - vperm2i128 m8, m2, m6, 0x31 - vinserti128 m2, xm6, 1 - vperm2i128 m6, m1, m5, 0x31 - vinserti128 m1, xm5, 1 - vperm2i128 m5, m0, m4, 0x31 - vinserti128 m0, xm4, 1 - punpckhwd m4, m2, m3 - punpcklwd m2, m3 - punpckhwd m3, m0, m1 - punpcklwd m0, m1 -.pass1_end2: - punpckhwd m7, m5, m6 - punpcklwd m5, m6 - punpcklwd m6, m8, m9 - punpckhwd m8, m9 - REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 - punpckhdq m1, m0, m2 - punpckldq m0, m2 - punpckldq m2, m3, m4 - punpckhdq m3, m4 - punpckldq m4, 
m5, m6 - punpckhdq m5, m6 - punpckldq m6, m7, m8 - punpckhdq m7, m8 - jmp tx2q -.pass2: - call .main - REPX {vpermq x, x, q3120}, m0, m2, m4, m6 - REPX {vpermq x, x, q2031}, m1, m3, m5, m7 -.end: - vpbroadcastd m8, [o(pw_2048)] -.end2: - REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 -.end3: - pxor m8, m8 - REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 - lea r3, [strideq*3] - WRITE_8X4 0, 1, 8, 9 - lea dstq, [dstq+strideq*4] - WRITE_8X4 2, 3, 0, 1 - lea dstq, [dstq+strideq*4] - WRITE_8X4 4, 5, 0, 1 - lea dstq, [dstq+strideq*4] - WRITE_8X4 6, 7, 0, 1 - RET -ALIGN function_align -.main: - IDCT16_1D_PACKED - ret - -INV_TXFM_8X16_FN adst, dct -INV_TXFM_8X16_FN adst, adst -INV_TXFM_8X16_FN adst, flipadst -INV_TXFM_8X16_FN adst, identity - -cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 - ITX_8X16_LOAD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end - vpbroadcastd m10, [o(pw_16384)] - pslld m9, m10, 17 - psubw m10, m9 ; 16384, -16384 - jmp m(idct_8x16_internal).pass1_end -ALIGN function_align -.pass2: - call .main - call .main_pass2_end - vpbroadcastd m9, [o(pw_2048)] - vpbroadcastd xm8, [o(pw_4096)] - psubw m8, m9 - REPX {vpermq x, x, q2031}, m0, m1, m2, m3 - REPX {vpermq x, x, q3120}, m4, m5, m6, m7 - jmp m(idct_8x16_internal).end2 -ALIGN function_align -.main: - REPX {pshufd x, x, q1032}, m7, m1, m5, m3 -.main2: - vpbroadcastd m10, [o(pd_2048)] - punpckhwd m8, m7, m0 ; in14 in1 - punpcklwd m0, m7 ; in0 in15 - punpcklwd m7, m6, m1 ; in12 in3 - punpckhwd m1, m6 ; in2 in13 - punpckhwd m6, m5, m2 ; in10 in5 - punpcklwd m2, m5 ; in4 in11 - punpcklwd m5, m4, m3 ; in8 in7 - punpckhwd m3, m4 ; in6 in9 - ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 - ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 - ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 - ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 - ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 - ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 - ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 - ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 - psubsw m4, m0, m5 ; t9a t8a - paddsw m0, m5 ; t1a t0a - psubsw m5, m1, m6 ; t11a t10a - paddsw m1, m6 ; t3a t2a - psubsw m6, m2, m7 ; t13a t12a - paddsw m2, m7 ; t5a t4a - psubsw m7, m3, m8 ; t15a t14a - paddsw m3, m8 ; t7a t6a - vpbroadcastd m11, [o(pw_m4017_799)] - vpbroadcastd m12, [o(pw_799_4017)] - pxor m9, m9 - ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 - psubw m8, m9, m11 ; pw_4017_m799 - ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 - vpbroadcastd m11, [o(pw_m2276_3406)] - vpbroadcastd m12, [o(pw_3406_2276)] - ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 - psubw m8, m9, m11 ; pw_2276_m3406 - ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 - psubsw m8, m1, m3 ; t7 t6 - paddsw m1, m3 ; t3 t2 - psubsw m3, m0, m2 ; t5 t4 - paddsw m0, m2 ; t1 t0 - psubsw m2, m5, m7 ; t14a t15a - paddsw m7, m5 ; t10a t11a - psubsw m5, m4, m6 ; t12a t13a - paddsw m4, m6 ; t8a t9a - vpbroadcastd m11, [o(pw_m3784_1567)] - vpbroadcastd m12, [o(pw_1567_3784)] - ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a - psubw m6, m9, m11 ; pw_3784_m1567 - ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a - vpbroadcastd m11, [o(pw_m1567_3784)] - vpbroadcastd m12, [o(pw_3784_1567)] - ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 - psubw m6, m9, m11 ; pw_1567_m3784 - ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 - vbroadcasti128 m12, [o(deint_shuf)] - paddsw m6, m4, m7 ; -out1 out14 - psubsw m4, m7 ; t10 t11 - psubsw m11, m3, m8 ; t7 t6 - paddsw 
m8, m3 ; out12 -out3 - psubsw m3, m0, m1 ; t3a t2a - paddsw m0, m1 ; -out15 out0 - paddsw m1, m2, m5 ; -out13 out2 - psubsw m5, m2 ; t15a t14a - pshufb m0, m12 - pshufb m6, m12 - pshufb m8, m12 - pshufb m1, m12 - shufps m7, m6, m0, q1032 ; out14 -out15 - vpblendd m0, m6, 0x33 ; -out1 out0 - punpcklqdq m6, m8, m1 ; out12 -out13 - punpckhqdq m1, m8, m1 ; -out3 out2 - ret -ALIGN function_align -.main_pass1_end: - vpbroadcastd m8, [o(pw_m2896_2896)] - vpbroadcastd m12, [o(pw_2896_2896)] - pmaddwd m9, m8, m11 ; -out11 - pmaddwd m2, m12, m5 ; -out5 - pmaddwd m5, m8 ; out10 - pmaddwd m11, m12 ; out4 - REPX {paddd x, m10}, m9, m5, m2, m11 - REPX {psrad x, 12 }, m9, m5, m2, m11 - packssdw m5, m9 ; out10 -out11 - packssdw m2, m11 ; -out5 out4 - pmaddwd m11, m8, m3 ; out8 - vpbroadcastd m8, [o(pw_2896_m2896)] - pmaddwd m3, m12 ; -out7 - pmaddwd m8, m4 ; -out9 - pmaddwd m4, m12 ; out6 - REPX {paddd x, m10}, m11, m3, m8, m4 - REPX {psrad x, 12 }, m11, m3, m8, m4 - packssdw m3, m4 ; -out7 out6 - packssdw m4, m11, m8 ; out8 -out9 - vpbroadcastd m10, [o(pw_16384)] - pxor m9, m9 - ret -ALIGN function_align -.main_pass2_end: - vpbroadcastd m8, [o(pw_2896x8)] - pshufb m2, m11, m12 - pshufb m5, m12 - pshufb m3, m12 - pshufb m4, m12 - punpcklqdq m11, m5, m2 ; t15a t7 - punpckhqdq m5, m2 ; t14a t6 - shufps m2, m3, m4, q1032 ; t2a t10 - vpblendd m3, m4, 0xcc ; t3a t11 - psubsw m4, m2, m3 ; out8 -out9 - paddsw m3, m2 ; -out7 out6 - paddsw m2, m5, m11 ; -out5 out4 - psubsw m5, m11 ; out10 -out11 - REPX {pmulhrsw x, m8}, m2, m3, m4, m5 - ret - -INV_TXFM_8X16_FN flipadst, dct -INV_TXFM_8X16_FN flipadst, adst -INV_TXFM_8X16_FN flipadst, flipadst -INV_TXFM_8X16_FN flipadst, identity - -cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 - ITX_8X16_LOAD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end - vpbroadcastd m9, [o(pw_16384)] - pslld m10, m9, 17 - psubw m10, m9 ; -16384, 16384 - vperm2i128 m9, m4, m0, 0x31 - vinserti128 m0, m4, xm0, 1 - vperm2i128 m8, m5, m1, 0x31 - vinserti128 m4, m5, xm1, 1 - vperm2i128 m5, m7, m3, 0x31 - vinserti128 m3, m7, xm3, 1 - vinserti128 m1, m6, xm2, 1 - vperm2i128 m6, m6, m2, 0x31 - punpcklwd m2, m4, m0 - punpckhwd m4, m0 - punpcklwd m0, m3, m1 - punpckhwd m3, m1 - jmp m(idct_8x16_internal).pass1_end2 -.pass2: - call m(iadst_8x16_internal).main - call m(iadst_8x16_internal).main_pass2_end - vpbroadcastd m8, [o(pw_2048)] - vpbroadcastd xm9, [o(pw_4096)] - psubw m8, m9 - vpermq m9, m0, q3120 - vpermq m0, m7, q2031 - vpermq m7, m1, q3120 - vpermq m1, m6, q2031 - vpermq m6, m2, q3120 - vpermq m2, m5, q2031 - vpermq m5, m3, q3120 - vpermq m3, m4, q2031 - pmulhrsw m0, m8 - pmulhrsw m1, m8 - pmulhrsw m2, m8 - pmulhrsw m3, m8 - pmulhrsw m4, m5, m8 - pmulhrsw m5, m6, m8 - pmulhrsw m6, m7, m8 - pmulhrsw m7, m9, m8 - jmp m(idct_8x16_internal).end3 - -INV_TXFM_8X16_FN identity, dct -INV_TXFM_8X16_FN identity, adst -INV_TXFM_8X16_FN identity, flipadst -INV_TXFM_8X16_FN identity, identity - -%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] - pmulhrsw m%2, m%3, m%1 -%if %0 == 4 ; if downshifting by 1 - pmulhrsw m%2, m%4 -%else - paddsw m%1, m%1 -%endif - paddsw m%1, m%2 -%endmacro - -cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 - mova xm3, [cq+16*0] - mova xm2, [cq+16*2] - add cq, 16*8 - vinserti128 m3, [cq+16*0], 1 - vinserti128 m2, [cq+16*2], 1 - vpbroadcastd m9, [o(pw_2896x8)] - mova xm4, [cq-16*4] - mova xm5, [cq-16*2] - vinserti128 m4, [cq+16*4], 1 - vinserti128 m5, [cq+16*6], 1 - mova xm7, [cq-16*7] - mova 
xm6, [cq-16*5] - vinserti128 m7, [cq+16*1], 1 - vinserti128 m6, [cq+16*3], 1 - mova xm8, [cq-16*3] - mova xm0, [cq-16*1] - vinserti128 m8, [cq+16*5], 1 - vinserti128 m0, [cq+16*7], 1 - punpcklwd m1, m3, m2 - punpckhwd m3, m2 - punpcklwd m2, m4, m5 - punpckhwd m4, m5 - punpcklwd m5, m7, m6 - punpckhwd m7, m6 - punpcklwd m6, m8, m0 - punpckhwd m8, m0 - REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 - punpckldq m0, m1, m2 - punpckhdq m1, m2 - punpckldq m2, m3, m4 - punpckhdq m3, m4 - punpckldq m4, m5, m6 - punpckhdq m5, m6 - punpckldq m6, m7, m8 - punpckhdq m7, m8 - jmp tx2q -.pass2: - vpbroadcastd m8, [o(pw_1697x16)] - REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 - jmp m(idct_8x16_internal).end - -%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] - pmovzxbw m%3, [dstq+%5] -%ifnum %1 - paddw m%3, m%1 -%else - paddw m%3, %1 -%endif - pmovzxbw m%4, [dstq+%6] -%ifnum %2 - paddw m%4, m%2 -%else - paddw m%4, %2 -%endif - packuswb m%3, m%4 - vpermq m%3, m%3, q3120 - mova [dstq+%5], xm%3 - vextracti128 [dstq+%6], m%3, 1 -%endmacro - -%macro INV_TXFM_16X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 16x4 -%ifidn %1_%2, dct_dct - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - mov r2d, 2 -.dconly: - pmulhrsw xm0, xm2 - movd xm2, [pw_2048] ; intentionally rip-relative - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - pxor m3, m3 -.dconly_loop: - mova xm1, [dstq] - vinserti128 m1, [dstq+strideq], 1 - punpckhbw m2, m1, m3 - punpcklbw m1, m3 - paddw m2, m0 - paddw m1, m0 - packuswb m1, m2 - mova [dstq], xm1 - vextracti128 [dstq+strideq], m1, 1 - lea dstq, [dstq+strideq*2] - dec r2d - jg .dconly_loop - RET -%endif -%endmacro - -INV_TXFM_16X4_FN dct, dct -INV_TXFM_16X4_FN dct, adst -INV_TXFM_16X4_FN dct, flipadst -INV_TXFM_16X4_FN dct, identity - -cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 - mova xm0, [cq+16*0] - mova xm1, [cq+16*1] - mova xm2, [cq+16*2] - mova xm3, [cq+16*3] - mova xm4, [cq+16*4] - mova xm5, [cq+16*5] - mova xm6, [cq+16*6] - mova xm7, [cq+16*7] - call m(idct_4x16_internal).main - vinserti128 m6, m2, xm6, 1 - vinserti128 m2, m0, xm4, 1 - vinserti128 m0, m1, xm5, 1 - vinserti128 m1, m3, xm7, 1 - punpcklwd m3, m2, m6 - punpckhwd m2, m6 - vpbroadcastd m6, [o(pw_16384)] - punpckhwd m4, m0, m1 - punpcklwd m0, m1 - mova m1, m6 - jmp m(iadst_16x4_internal).pass1_end -.pass2: - call .main - jmp m(iadst_16x4_internal).end -ALIGN function_align -.main: - vpbroadcastd m6, [o(pd_2048)] - IDCT4_1D 0, 1, 2, 3, 4, 5, 6 - ret - -INV_TXFM_16X4_FN adst, dct -INV_TXFM_16X4_FN adst, adst -INV_TXFM_16X4_FN adst, flipadst -INV_TXFM_16X4_FN adst, identity - -cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 - vpermq m0, [cq+32*0], q1230 - vpermq m3, [cq+32*3], q2103 - vpermq m1, [cq+32*1], q1230 - vpermq m2, [cq+32*2], q2103 - call m(iadst_4x16_internal).main2 - call m(iadst_4x16_internal).main_pass1_end - punpcklwd m4, m3, m1 - punpcklwd m5, m2, m0 - punpckhwd m0, m1 - punpckhwd m2, m3 - vpbroadcastd m1, [o(pw_16384)] - vinserti128 m3, m0, xm2, 1 - vperm2i128 m2, m0, m2, 0x31 - vinserti128 m0, m4, xm5, 1 - vperm2i128 m4, m4, m5, 0x31 - psubw m6, m7, m1 -.pass1_end: - pmulhrsw m3, m1 - pmulhrsw m2, m6 - pmulhrsw m4, m1 - pmulhrsw m0, m6 - punpcklwd m1, m3, m2 - punpckhwd m3, m2 - punpcklwd m2, m4, m0 - punpckhwd m4, m0 - punpckldq m0, m1, m2 - punpckhdq m1, m2 - punpckldq m2, m3, m4 - punpckhdq m3, m4 - jmp tx2q -.pass2: - call .main -.end: - 
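
The dct_dct branch of INV_TXFM_16X4_FN above is the usual DC-only shortcut: when the end-of-block marker says only the DC coefficient is present, the inverse transform collapses to a single rounded offset that is added to every pixel (each pmulhrsw stage is a round(x*const/32768) step, and packuswb performs the final clamp to 8 bits). A scalar sketch of the add loop only, with illustrative names of my own rather than dav1d's:

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Add a precomputed DC offset to a w x h block; the .dconly_loop above
     * does the same thing two rows at a time with paddw + packuswb. */
    static void dc_only_add(uint8_t *dst, ptrdiff_t stride, int w, int h, int dc)
    {
        for (int y = 0; y < h; y++, dst += stride)
            for (int x = 0; x < w; x++)
                dst[x] = clip_u8(dst[x] + dc);
    }
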
vpbroadcastd m4, [o(pw_2048)] - REPX {pmulhrsw x, m4}, m0, m1, m2, m3 - WIN64_RESTORE_XMM -.end2: - pxor m4, m4 - mova [cq+32*0], m4 - mova [cq+32*1], m4 - mova [cq+32*2], m4 - mova [cq+32*3], m4 -.end3: - WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 - lea dstq, [dstq+strideq*2] - WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 - RET -ALIGN function_align -.main: - vpbroadcastd m6, [o(pw_m3344_3344)] - vpbroadcastd m7, [o(pw_3803_1321)] - vpbroadcastd m8, [o(pw_m1321_2482)] - vpbroadcastd m9, [o(pw_2482_3344)] - punpcklwd m4, m2, m0 ; in2 in0 l - punpckhwd m2, m0 ; in2 in0 h - psrld m5, m6, 16 - pmaddwd m10, m6, m4 ; t2:02 l - pmaddwd m6, m2 ; t2:02 h - pmaddwd m0, m7, m4 ; t0:02 l - pmaddwd m7, m2 ; t0:02 h - pmaddwd m4, m8 ; t1:02 l - pmaddwd m8, m2 ; t1:02 h - punpckhwd m2, m3, m1 ; in3 in1 h - punpcklwd m3, m1 ; in3 in1 l - pmaddwd m1, m5, m2 ; t2:3 h - pmaddwd m5, m3 ; t2:3 l - paddd m6, m1 - vpbroadcastd m1, [o(pd_2048)] - paddd m10, m5 - pmaddwd m5, m9, m3 - pmaddwd m9, m2 - paddd m0, m1 - paddd m7, m1 - paddd m0, m5 ; t0 + t3 + 2048 l - paddd m7, m9 ; t0 + t3 + 2048 h - vpbroadcastd m9, [o(pw_m3803_3344)] - pmaddwd m5, m9, m2 - pmaddwd m9, m3 - paddd m10, m1 ; t2 + 2048 l - paddd m6, m1 ; t2 + 2048 h - paddd m5, m1 ; t1:13 + 2048 h - paddd m1, m9 ; t1:13 + 2048 l - vpbroadcastd m9, [o(pw_m3803_m6688)] - pmaddwd m2, m9 - pmaddwd m3, m9 - paddd m5, m8 ; t1 + t3 + 2048 h - paddd m1, m4 ; t1 + t3 + 2048 l - paddd m8, m7 - paddd m4, m0 - paddd m2, m8 ; t0 + t1 - t3 + 2048 h - paddd m3, m4 ; t0 + t1 - t3 + 2048 l - REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 - packssdw m0, m7 - packssdw m1, m5 - packssdw m3, m2 - packssdw m2, m10, m6 - ret - -INV_TXFM_16X4_FN flipadst, dct -INV_TXFM_16X4_FN flipadst, adst -INV_TXFM_16X4_FN flipadst, flipadst -INV_TXFM_16X4_FN flipadst, identity - -cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 - vpermq m0, [cq+32*0], q1230 - vpermq m3, [cq+32*3], q2103 - vpermq m1, [cq+32*1], q1230 - vpermq m2, [cq+32*2], q2103 - call m(iadst_4x16_internal).main2 - call m(iadst_4x16_internal).main_pass1_end - punpckhwd m4, m3, m2 - punpckhwd m5, m1, m0 - punpcklwd m0, m2 - punpcklwd m1, m3 - vpbroadcastd m6, [o(pw_16384)] - vinserti128 m3, m0, xm1, 1 - vperm2i128 m2, m0, m1, 0x31 - vinserti128 m0, m4, xm5, 1 - vperm2i128 m4, m4, m5, 0x31 - psubw m1, m7, m6 - jmp m(iadst_16x4_internal).pass1_end -ALIGN function_align -.pass2: - call m(iadst_16x4_internal).main - vpbroadcastd m4, [o(pw_2048)] - REPX {pmulhrsw x, m4}, m3, m2, m1, m0 - pxor m4, m4 - mova [cq+32*0], m4 - mova [cq+32*1], m4 - mova [cq+32*2], m4 - mova [cq+32*3], m4 - WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 - lea dstq, [dstq+strideq*2] - WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 - RET - -INV_TXFM_16X4_FN identity, dct -INV_TXFM_16X4_FN identity, adst -INV_TXFM_16X4_FN identity, flipadst -INV_TXFM_16X4_FN identity, identity - -cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 - mova xm2, [cq+16*0] - mova xm4, [cq+16*1] - vinserti128 m2, [cq+16*4], 1 - vinserti128 m4, [cq+16*5], 1 - mova xm0, [cq+16*2] - mova xm1, [cq+16*3] - vinserti128 m0, [cq+16*6], 1 - vinserti128 m1, [cq+16*7], 1 - vpbroadcastd m7, [o(pw_1697x16)] - vpbroadcastd m8, [o(pw_16384)] - punpcklwd m3, m2, m4 - punpckhwd m2, m4 - punpcklwd m4, m0, m1 - punpckhwd m0, m1 - punpcklwd m1, m3, m2 - punpckhwd m3, m2 - punpcklwd m2, m4, m0 - punpckhwd m4, m0 - pmulhrsw m0, m7, m1 - pmulhrsw m5, m7, m2 - pmulhrsw m6, m7, m3 - pmulhrsw m7, m4 - REPX {pmulhrsw x, m8}, m0, m5, m6, m7 - paddsw m1, m0 - paddsw m2, 
m5 - paddsw m3, m6 - paddsw m4, m7 - punpcklqdq m0, m1, m2 - punpckhqdq m1, m2 - punpcklqdq m2, m3, m4 - punpckhqdq m3, m4 - jmp tx2q -.pass2: - vpbroadcastd m7, [o(pw_1697x8)] - pmulhrsw m4, m7, m0 - pmulhrsw m5, m7, m1 - pmulhrsw m6, m7, m2 - pmulhrsw m7, m3 - paddsw m0, m4 - paddsw m1, m5 - paddsw m2, m6 - paddsw m3, m7 - jmp m(iadst_16x4_internal).end - -%macro INV_TXFM_16X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 16x8 -%ifidn %1_%2, dct_dct - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - pmulhrsw xm0, xm1 - mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_16x4).dconly -%endif -%endmacro - -%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd - vpbroadcastd m8, [o(pw_2896x8)] - vpermq m0, [cq+32*0], q3120 - add cq, 32*4 - vpermq m7, [cq+32*3], q%1 - vpermq m1, [cq-32*3], q%1 - vpermq m6, [cq+32*2], q3120 - vpermq m2, [cq-32*2], q3120 - vpermq m5, [cq+32*1], q%1 - vpermq m3, [cq-32*1], q%1 - vpermq m4, [cq+32*0], q3120 - REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 -%endmacro - -INV_TXFM_16X8_FN dct, dct -INV_TXFM_16X8_FN dct, adst -INV_TXFM_16X8_FN dct, flipadst -INV_TXFM_16X8_FN dct, identity - -cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 - ITX_16X8_LOAD_COEFS 3120 - call m(idct_8x16_internal).main - vpbroadcastd m10, [o(pw_16384)] - punpckhwd m8, m0, m2 - punpcklwd m0, m2 - punpckhwd m2, m1, m3 - punpcklwd m1, m3 - punpcklwd m9, m4, m6 - punpckhwd m4, m6 - punpcklwd m6, m5, m7 - punpckhwd m5, m7 - REPX {pmulhrsw x, m10}, m8, m1, m4, m6 -.pass1_end: - REPX {pmulhrsw x, m10}, m0, m2, m9, m5 - punpckhwd m3, m0, m8 - punpcklwd m0, m8 - punpckhwd m8, m2, m1 - punpcklwd m2, m1 - punpcklwd m7, m9, m4 - punpckhwd m9, m4 - punpcklwd m4, m5, m6 - punpckhwd m5, m6 - punpckhdq m1, m0, m2 - punpckldq m0, m2 - punpckldq m2, m3, m8 - punpckhdq m3, m8 - punpckldq m6, m7, m4 - punpckhdq m7, m4 - punpckldq m8, m9, m5 - punpckhdq m9, m5 - vperm2i128 m4, m0, m6, 0x31 - vinserti128 m0, xm6, 1 - vperm2i128 m5, m1, m7, 0x31 - vinserti128 m1, xm7, 1 - vperm2i128 m6, m2, m8, 0x31 - vinserti128 m2, xm8, 1 - vperm2i128 m7, m3, m9, 0x31 - vinserti128 m3, xm9, 1 - jmp tx2q -.pass2: - call .main - vpbroadcastd m8, [o(pw_2048)] -.end: - REPX {pmulhrsw x, m8}, m0, m2, m4, m6 -.end2: - REPX {pmulhrsw x, m8}, m1, m3, m5, m7 - lea r3, [strideq*3] - WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 - WRITE_16X2 2, 3, 0, 1, strideq*2, r3 -.end3: - pxor m0, m0 - REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 -.end4: - lea dstq, [dstq+strideq*4] - WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 - WRITE_16X2 6, 7, 0, 1, strideq*2, r3 - RET -ALIGN function_align -.main: - vpbroadcastd m10, [o(pd_2048)] -.main2: - IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 - ret - -INV_TXFM_16X8_FN adst, dct -INV_TXFM_16X8_FN adst, adst -INV_TXFM_16X8_FN adst, flipadst -INV_TXFM_16X8_FN adst, identity - -cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 - ITX_16X8_LOAD_COEFS 1302 - call m(iadst_8x16_internal).main2 - call m(iadst_8x16_internal).main_pass1_end - psubw m11, m9, m10 - punpcklwd m8, m0, m2 - punpckhwd m0, m2 - punpckhwd m2, m1, m3 - punpcklwd m1, m3 - punpcklwd m9, m4, m6 - punpckhwd m4, m6 - punpckhwd m6, m5, m7 - punpcklwd m5, m7 - REPX {pmulhrsw x, m11}, m8, m1, m4, m6 - jmp m(idct_16x8_internal).pass1_end -ALIGN function_align -.pass2: - call .main - call .main_pass2_end - pxor m8, m8 - psubw m8, m9 - REPX {pmulhrsw x, m9}, m0, m2, m4, m6 - jmp m(idct_16x8_internal).end2 -ALIGN function_align -.main: - vpbroadcastd m10, [o(pd_2048)] - ITX_MULSUB_2W 7, 0, 8, 
9, 10, 401, 4076 ; t1a, t0a - ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a - ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a - ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a - psubsw m8, m2, m6 ; t6 - paddsw m2, m6 ; t2 - psubsw m6, m0, m4 ; t4 - paddsw m0, m4 ; t0 - psubsw m4, m5, m1 ; t7 - paddsw m5, m1 ; t3 - psubsw m1, m7, m3 ; t5 - paddsw m7, m3 ; t1 - ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a - ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a - psubsw m9, m6, m8 ; t7 - paddsw m6, m8 ; out6 - psubsw m3, m7, m5 ; t3 - paddsw m7, m5 ; -out7 - psubsw m5, m0, m2 ; t2 - paddsw m0, m2 ; out0 - psubsw m2, m1, m4 ; t6 - paddsw m1, m4 ; -out1 - ret -ALIGN function_align -.main_pass1_end: - vpbroadcastd m11, [o(pw_m2896_2896)] - vpbroadcastd m12, [o(pw_2896_2896)] - punpckhwd m4, m3, m5 - punpcklwd m3, m5 - pmaddwd m5, m11, m4 - pmaddwd m4, m12 - pmaddwd m8, m11, m3 - pmaddwd m3, m12 - REPX {paddd x, m10}, m5, m4, m8, m3 - REPX {psrad x, 12 }, m5, m8, m4, m3 - packssdw m3, m4 ; -out3 - packssdw m4, m8, m5 ; out4 - punpcklwd m5, m9, m2 - punpckhwd m9, m2 - pmaddwd m2, m12, m5 - pmaddwd m5, m11 - pmaddwd m12, m9 - pmaddwd m11, m9 - REPX {paddd x, m10}, m2, m5, m12, m11 - REPX {psrad x, 12 }, m2, m12, m5, m11 - packssdw m2, m12 ; out2 - packssdw m5, m11 ; -out5 - ret -ALIGN function_align -.main_pass2_end: - vpbroadcastd m8, [o(pw_2896x8)] - psubsw m4, m5, m3 - paddsw m3, m5 - psubsw m5, m2, m9 - paddsw m2, m9 - pmulhrsw m2, m8 ; out2 - pmulhrsw m3, m8 ; -out3 - pmulhrsw m4, m8 ; out4 - pmulhrsw m5, m8 ; -out5 - vpbroadcastd m9, [o(pw_2048)] - ret - -INV_TXFM_16X8_FN flipadst, dct -INV_TXFM_16X8_FN flipadst, adst -INV_TXFM_16X8_FN flipadst, flipadst -INV_TXFM_16X8_FN flipadst, identity - -cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 - ITX_16X8_LOAD_COEFS 1302 - call m(iadst_8x16_internal).main2 - call m(iadst_8x16_internal).main_pass1_end - psubw m9, m10 - punpcklwd m8, m6, m4 - punpckhwd m6, m4 - punpcklwd m4, m7, m5 - punpckhwd m7, m5 - punpckhwd m5, m3, m1 - punpcklwd m3, m1 - punpckhwd m1, m2, m0 - punpcklwd m2, m0 - REPX {pmulhrsw x, m10}, m8, m4, m5, m1 - REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 - punpcklwd m0, m7, m4 - punpckhwd m7, m4 - punpckhwd m4, m6, m8 - punpcklwd m6, m8 - punpckhwd m8, m3, m5 - punpcklwd m3, m5 - punpcklwd m5, m2, m1 - punpckhwd m2, m1 - punpckhdq m1, m0, m6 - punpckldq m0, m6 - punpckldq m6, m7, m4 - punpckhdq m7, m4 - punpckhdq m4, m3, m5 - punpckldq m3, m5 - punpckldq m5, m8, m2 - punpckhdq m8, m2 - vinserti128 m2, m6, xm5, 1 - vperm2i128 m6, m5, 0x31 - vperm2i128 m5, m1, m4, 0x31 - vinserti128 m1, xm4, 1 - vperm2i128 m4, m0, m3, 0x31 - vinserti128 m0, xm3, 1 - vinserti128 m3, m7, xm8, 1 - vperm2i128 m7, m8, 0x31 - jmp tx2q -.pass2: - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end - pxor m8, m8 - psubw m8, m9 - pmulhrsw m10, m7, m8 - pmulhrsw m7, m0, m9 - pmulhrsw m0, m6, m9 - pmulhrsw m6, m1, m8 - pmulhrsw m1, m5, m8 - pmulhrsw m5, m2, m9 - pmulhrsw m2, m4, m9 - pmulhrsw m4, m3, m8 - lea r3, [strideq*3] - WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 - WRITE_16X2 1, 2, 0, 1, strideq*2, r3 - jmp m(idct_16x8_internal).end3 - -INV_TXFM_16X8_FN identity, dct -INV_TXFM_16X8_FN identity, adst -INV_TXFM_16X8_FN identity, flipadst -INV_TXFM_16X8_FN identity, identity - -cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 - mova xm7, [cq+16*0] - mova xm2, [cq+16*1] - add cq, 16*8 - vpbroadcastd m3, [o(pw_2896x8)] - vinserti128 m7, [cq+16*0], 1 - vinserti128 m2, [cq+16*1], 1 - 
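
The ITX_MULSUB_2W pairs above (401/4076, 3166/2598, 1567/3784, ...) are the standard AV1 butterfly rotations: each pair of constants comes from the cosine table scaled by 4096, the 32-bit products are rounded with +2048 and shifted right by 12, then narrowed back to 16 bits with saturation. The exact operand order and signs differ between call sites; the sketch below (my own naming, assuming arithmetic right shift, and truncating rather than saturating on the final narrowing) only shows the generic form:

    #include <stdint.h>

    /* One fixed-point rotation: c0 and c1 are roughly 4096*sin and 4096*cos
     * of the same angle (e.g. 401 and 4076), so the pair is a near-orthonormal
     * rotation after the >>12. */
    static void rotate_pair(int16_t *pa, int16_t *pb, int32_t c0, int32_t c1)
    {
        int32_t a = *pa, b = *pb;
        *pa = (int16_t)((a * c1 + b * c0 + 2048) >> 12);
        *pb = (int16_t)((a * c0 - b * c1 + 2048) >> 12);
    }
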
mova xm6, [cq-16*6] - mova xm4, [cq-16*5] - vinserti128 m6, [cq+16*2], 1 - vinserti128 m4, [cq+16*3], 1 - mova xm8, [cq-16*4] - mova xm5, [cq-16*3] - vinserti128 m8, [cq+16*4], 1 - vinserti128 m5, [cq+16*5], 1 - mova xm0, [cq-16*2] - mova xm1, [cq-16*1] - vinserti128 m0, [cq+16*6], 1 - vinserti128 m1, [cq+16*7], 1 - vpbroadcastd m10, [o(pw_1697x16)] - vpbroadcastd m11, [o(pw_16384)] - REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 - punpcklwd m3, m7, m2 - punpckhwd m7, m2 - punpcklwd m2, m6, m4 - punpckhwd m6, m4 - punpcklwd m4, m8, m5 - punpckhwd m8, m5 - punpcklwd m5, m0, m1 - punpckhwd m0, m1 - punpckldq m1, m3, m2 - punpckhdq m3, m2 - punpckldq m2, m4, m5 - punpckhdq m4, m5 - punpckldq m5, m7, m6 - punpckhdq m7, m6 - punpckldq m6, m8, m0 - punpckhdq m8, m0 - REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 - punpcklqdq m0, m1, m2 - punpckhqdq m1, m2 - punpcklqdq m2, m3, m4 - punpckhqdq m3, m4 - punpcklqdq m4, m5, m6 - punpckhqdq m5, m6 - punpcklqdq m6, m7, m8 - punpckhqdq m7, m8 - jmp tx2q -.pass2: - vpbroadcastd m8, [o(pw_4096)] - jmp m(idct_16x8_internal).end - -%define o_base pw_5 + 128 - -%macro INV_TXFM_16X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 16x16 -%ifidn %1_%2, dct_dct - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_8192)] - mov [cq], eobd - mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_16x4).dconly -%endif -%endmacro - -%macro ITX_16X16_LOAD_COEFS 0 - mova m0, [cq+32*0] - mova m1, [cq+32*1] - mova m2, [cq+32*2] - mova m3, [cq+32*3] - add cq, 32*8 - mova m4, [cq-32*4] - mova m5, [cq-32*3] - mova m6, [cq-32*2] - mova m7, [cq-32*1] - mova m8, [cq+32*0] - mova m9, [cq+32*1] - mova m10, [cq+32*2] - mova m11, [cq+32*3] - mova m12, [cq+32*4] - mova m13, [cq+32*5] - mova m14, [cq+32*6] - mova m15, [cq+32*7] - mova [rsp], m15 -%endmacro - -INV_TXFM_16X16_FN dct, dct -INV_TXFM_16X16_FN dct, adst -INV_TXFM_16X16_FN dct, flipadst -INV_TXFM_16X16_FN dct, identity - -cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 - ITX_16X16_LOAD_COEFS - call .main -.pass1_end: - vpbroadcastd m1, [o(pw_8192)] - REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 - vextracti128 [rsp+16*5], m8, 1 - mova [rsp+16*1], xm8 -.pass1_end2: - vextracti128 [rsp+16*4], m0, 1 - mova [rsp+16*0], xm0 - REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 - pmulhrsw m1, [rsp+32*1] - vperm2i128 m8, m1, m9, 0x31 - vinserti128 m1, xm9, 1 - vperm2i128 m9, m2, m10, 0x31 - vinserti128 m2, xm10, 1 - vperm2i128 m10, m3, m11, 0x31 - vinserti128 m3, xm11, 1 - vperm2i128 m11, m4, m12, 0x31 - vinserti128 m4, xm12, 1 - vperm2i128 m12, m5, m13, 0x31 - vinserti128 m5, xm13, 1 - vperm2i128 m13, m6, m14, 0x31 - vinserti128 m6, xm14, 1 - vperm2i128 m14, m7, m15, 0x31 - vinserti128 m7, xm15, 1 - mova m15, [rsp+32*2] -.pass1_end3: - punpcklwd m0, m9, m10 - punpckhwd m9, m10 - punpcklwd m10, m15, m8 - punpckhwd m15, m8 - punpckhwd m8, m11, m12 - punpcklwd m11, m12 - punpckhwd m12, m13, m14 - punpcklwd m13, m14 - punpckhdq m14, m11, m13 - punpckldq m11, m13 - punpckldq m13, m15, m9 - punpckhdq m15, m9 - punpckldq m9, m10, m0 - punpckhdq m10, m0 - punpckhdq m0, m8, m12 - punpckldq m8, m12 - punpcklqdq m12, m13, m8 - punpckhqdq m13, m8 - punpcklqdq m8, m9, m11 - punpckhqdq m9, m11 - punpckhqdq m11, m10, m14 - punpcklqdq m10, m14 - punpcklqdq m14, m15, m0 - punpckhqdq m15, m0 - mova m0, [rsp] - mova [rsp], m15 - punpckhwd m15, m4, m5 - punpcklwd m4, m5 - punpckhwd m5, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m6, m7 - punpcklwd m6, m7 - punpckhwd m7, m2, m3 - punpcklwd m2, m3 - punpckhdq 
m3, m0, m2 - punpckldq m0, m2 - punpckldq m2, m4, m6 - punpckhdq m4, m6 - punpckhdq m6, m5, m7 - punpckldq m5, m7 - punpckldq m7, m15, m1 - punpckhdq m15, m1 - punpckhqdq m1, m0, m2 - punpcklqdq m0, m2 - punpcklqdq m2, m3, m4 - punpckhqdq m3, m4 - punpcklqdq m4, m5, m7 - punpckhqdq m5, m7 - punpckhqdq m7, m6, m15 - punpcklqdq m6, m15 - jmp tx2q -.pass2: - call .main -.end: - vpbroadcastd m1, [o(pw_2048)] - REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 - mova [rsp], m6 -.end2: - REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 - pmulhrsw m1, [rsp+32*1] - lea r3, [strideq*3] - WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 - WRITE_16X2 2, 3, 0, 1, strideq*2, r3 - lea dstq, [dstq+strideq*4] - WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 - WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 -.end3: - pxor m2, m2 - REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 - lea dstq, [dstq+strideq*4] - WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 - WRITE_16X2 10, 11, 0, 1, strideq*2, r3 - REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 - lea dstq, [dstq+strideq*4] - WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 - WRITE_16X2 14, 15, 0, 1, strideq*2, r3 - RET -ALIGN function_align -.main: - vpbroadcastd m15, [o(pd_2048)] - mova [rsp+gprsize+32*1], m1 - mova [rsp+gprsize+32*2], m9 - IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 - mova m1, [rsp+gprsize+32*2] ; in9 - mova [rsp+gprsize+32*2], m14 ; tmp7 - mova m9, [rsp+gprsize+32*1] ; in1 - mova [rsp+gprsize+32*1], m10 ; tmp5 - mova m14, [rsp+gprsize+32*0] ; in15 - mova [rsp+gprsize+32*0], m6 ; tmp3 - IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 - mova m6, [rsp+gprsize+32*1] ; tmp5 - psubsw m15, m0, m14 ; out15 - paddsw m0, m14 ; out0 - psubsw m14, m2, m13 ; out14 - paddsw m2, m13 ; out1 - mova [rsp+gprsize+32*1], m2 - psubsw m13, m4, m11 ; out13 - paddsw m2, m4, m11 ; out2 - psubsw m11, m8, m7 ; out11 - paddsw m4, m8, m7 ; out4 - mova m7, [rsp+gprsize+32*2] ; tmp7 - psubsw m10, m6, m5 ; out10 - paddsw m5, m6 ; out5 - psubsw m8, m7, m9 ; out8 - paddsw m7, m9 ; out7 - psubsw m9, m12, m3 ; out9 - paddsw m6, m12, m3 ; out6 - mova m3, [rsp+gprsize+32*0] ; tmp3 - psubsw m12, m3, m1 ; out12 - paddsw m3, m1 ; out3 - ret - -INV_TXFM_16X16_FN adst, dct -INV_TXFM_16X16_FN adst, adst -INV_TXFM_16X16_FN adst, flipadst - -cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 - ITX_16X16_LOAD_COEFS - call .main - call .main_pass1_end - pmulhrsw m0, m1, [cq+32*0] - pmulhrsw m2, m1, [cq+32*1] - REPX {pmulhrsw x, m1}, m4, m6, m8, m10 - pmulhrsw m12, m1, [cq+32*2] - pmulhrsw m14, m1, [cq+32*3] - vextracti128 [rsp+16*5], m8, 1 - mova [rsp+16*1], xm8 - pxor m8, m8 - psubw m1, m8, m1 - jmp m(idct_16x16_internal).pass1_end2 -ALIGN function_align -.pass2: - call .main - call .main_pass2_end - REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 - mova [rsp+32*0], m6 - pxor m6, m6 - psubw m1, m6, m1 - jmp m(idct_16x16_internal).end2 -ALIGN function_align -.main: - vpbroadcastd m15, [o(pd_2048)] - mova [rsp+gprsize+32*1], m0 - mova [rsp+gprsize+32*2], m4 - ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2 - ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 - ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 - ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 - psubsw m0, m2, m10 ; t10a - paddsw m2, m10 ; t2a - psubsw m10, m13, m5 ; t11a - paddsw m13, m5 ; t3a - psubsw m5, m6, m14 ; t14a - paddsw m6, m14 ; t6a - psubsw m14, m9, m1 ; t15a - paddsw m9, m1 ; t7a - ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 - ITX_MULSUB_2W 14, 5, 1, 
4, 15, 2276, 3406 ; t14, t15 - psubsw m1, m10, m14 ; t14a - paddsw m10, m14 ; t10a - psubsw m14, m0, m5 ; t15a - paddsw m0, m5 ; t11a - psubsw m5, m2, m6 ; t6 - paddsw m2, m6 ; t2 - psubsw m6, m13, m9 ; t7 - paddsw m13, m9 ; t3 - ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a - ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15 - mova m9, [rsp+gprsize+32*0] ; in15 - mova [rsp+gprsize+32*0], m10 ; t10a - mova m4, [rsp+gprsize+32*1] ; in0 - mova [rsp+gprsize+32*1], m6 ; t6a - mova m6, [rsp+gprsize+32*2] ; in4 - mova [rsp+gprsize+32*2], m2 ; t2 - ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0 - ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4 - ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8 - ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12 - psubsw m10, m4, m8 ; t8a - paddsw m8, m4 ; t0a - psubsw m4, m9, m7 ; t9a - paddsw m9, m7 ; t1a - psubsw m7, m6, m12 ; t12a - paddsw m6, m12 ; t4a - psubsw m12, m11, m3 ; t13a - paddsw m11, m3 ; t5a - ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8 - ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13 - psubsw m3, m9, m11 ; t5 - paddsw m9, m11 ; t1 - psubsw m11, m4, m12 ; t12a - paddsw m4, m12 ; t8a - paddsw m12, m8, m6 ; t0 - psubsw m8, m6 ; t4 - paddsw m6, m10, m7 ; t9a - psubsw m10, m7 ; t13a - ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a - ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12 - mova m7, [rsp+gprsize+32*0] ; t10a - mova m2, [rsp+gprsize+32*1] ; t6a - paddsw m15, m9, m13 ; -out15 - psubsw m9, m13 ; t3a - paddsw m13, m11, m1 ; -out13 - psubsw m11, m1 ; t15a - psubsw m1, m4, m7 ; t10 - paddsw m7, m4 ; -out1 - psubsw m4, m3, m2 ; t6 - paddsw m3, m2 ; -out3 - paddsw m2, m10, m14 ; out2 - psubsw m10, m14 ; t14a - paddsw m14, m6, m0 ; out14 - psubsw m6, m0 ; t11 - mova m0, [rsp+gprsize+32*2] ; t2 - mova [rsp+gprsize+32*1], m7 - psubsw m7, m12, m0 ; t2a - paddsw m0, m12 ; out0 - paddsw m12, m8, m5 ; out12 - psubsw m8, m5 ; t7 - ret -ALIGN function_align -.main_pass1_end: - mova [cq+32*0], m0 - mova [cq+32*1], m2 - mova [cq+32*2], m12 - mova [cq+32*3], m14 - vpbroadcastd m14, [pw_m2896_2896] - vpbroadcastd m12, [pw_2896_2896] - vpbroadcastd m2, [pd_2048] - punpcklwd m5, m11, m10 - punpckhwd m11, m10 - pmaddwd m10, m14, m5 - pmaddwd m0, m14, m11 - pmaddwd m5, m12 - pmaddwd m11, m12 - REPX {paddd x, m2}, m10, m0, m5, m11 - REPX {psrad x, 12}, m10, m0, m5, m11 - packssdw m10, m0 ; out10 - packssdw m5, m11 ; -out5 - punpcklwd m11, m8, m4 - punpckhwd m8, m4 - pmaddwd m4, m12, m11 - pmaddwd m0, m12, m8 - pmaddwd m11, m14 - pmaddwd m8, m14 - REPX {paddd x, m2}, m4, m0, m11, m8 - REPX {psrad x, 12}, m4, m0, m11, m8 - packssdw m4, m0 ; out4 - packssdw m11, m8 ; -out11 - punpcklwd m8, m9, m7 - punpckhwd m9, m7 - pmaddwd m7, m12, m8 - pmaddwd m0, m12, m9 - pmaddwd m8, m14 - pmaddwd m9, m14 - REPX {paddd x, m2}, m7, m0, m8, m9 - REPX {psrad x, 12}, m7, m0, m8, m9 - packssdw m7, m0 ; -out7 - packssdw m8, m9 ; out8 - punpckhwd m0, m6, m1 - punpcklwd m6, m1 - pmaddwd m1, m14, m0 - pmaddwd m9, m14, m6 - pmaddwd m0, m12 - pmaddwd m6, m12 - REPX {paddd x, m2}, m1, m9, m0, m6 - REPX {psrad x, 12}, m1, m9, m0, m6 - packssdw m9, m1 ; -out7 - packssdw m6, m0 ; out8 - vpbroadcastd m1, [o(pw_8192)] - ret -ALIGN function_align -.main_pass2_end: - ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to - ; 16-bit here will produce the same result as using 32-bit intermediates. 
- paddsw m5, m10, m11 ; -out5 - psubsw m10, m11 ; out10 - psubsw m11, m4, m8 ; -out11 - paddsw m4, m8 ; out4 - psubsw m8, m7, m9 ; out8 - paddsw m7, m9 ; -out7 - psubsw m9, m1, m6 ; -out9 - paddsw m6, m1 ; out6 - vpbroadcastd m1, [o(pw_2896x8)] - REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 - vpbroadcastd m1, [o(pw_2048)] - ret - -INV_TXFM_16X16_FN flipadst, dct -INV_TXFM_16X16_FN flipadst, adst -INV_TXFM_16X16_FN flipadst, flipadst - -cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 - ITX_16X16_LOAD_COEFS - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass1_end - pmulhrsw m6, m1 - pmulhrsw m2, m1, m8 - mova [rsp+32*2], m6 - pmulhrsw m6, m1, m4 - pmulhrsw m4, m1, m10 - pmulhrsw m8, m1, [cq+32*3] - pmulhrsw m10, m1, [cq+32*2] - pmulhrsw m12, m1, [cq+32*1] - pmulhrsw m14, m1, [cq+32*0] - pxor m0, m0 - psubw m0, m1 - REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 - pmulhrsw m1, m0, m9 - pmulhrsw m9, m0, m13 - pmulhrsw m0, [rsp+32*1] - mova [rsp+16*0], xm15 - mova [rsp+16*1], xm7 - vperm2i128 m15, m15, m7, 0x31 - vinserti128 m7, m2, xm14, 1 - vperm2i128 m14, m2, m14, 0x31 - vinserti128 m2, m9, xm5, 1 - vperm2i128 m9, m9, m5, 0x31 - vinserti128 m5, m4, xm12, 1 - vperm2i128 m12, m4, m12, 0x31 - vinserti128 m4, m11, xm3, 1 - vperm2i128 m11, m11, m3, 0x31 - vinserti128 m3, m10, xm6, 1 - vperm2i128 m10, m10, m6, 0x31 - vinserti128 m6, m1, xm0, 1 - vperm2i128 m13, m1, m0, 0x31 - vinserti128 m1, m8, [rsp+32*2], 1 - vperm2i128 m8, m8, [rsp+32*2], 0x31 - jmp m(idct_16x16_internal).pass1_end3 -.pass2: - call m(iadst_16x16_internal).main - call m(iadst_16x16_internal).main_pass2_end - pmulhrsw m0, m1 - pmulhrsw m8, m1 - mova [rsp+32*0], m0 - mova [rsp+32*2], m8 - pxor m0, m0 - psubw m0, m1 - pmulhrsw m8, m0, m7 - pmulhrsw m7, m0, m9 - pmulhrsw m9, m1, m6 - pmulhrsw m6, m1, m10 - pmulhrsw m10, m0, m5 - pmulhrsw m5, m0, m11 - pmulhrsw m11, m1, m4 - pmulhrsw m4, m1, m12 - pmulhrsw m12, m0, m3 - pmulhrsw m3, m0, m13 - pmulhrsw m13, m1, m2 - pmulhrsw m1, m14 - pmulhrsw m14, m0, [rsp+32*1] - pmulhrsw m0, m15 - lea r3, [strideq*3] - WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 - mova m15, [rsp+32*0] - WRITE_16X2 3, 4, 0, 1, strideq*2, r3 - lea dstq, [dstq+strideq*4] - WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 - WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 - jmp m(idct_16x16_internal).end3 - -%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 - pmulhrsw m%2, m%3, m%1 - psraw m%2, 1 - pavgw m%1, m%2 ; signs are guaranteed to be equal -%endmacro - -INV_TXFM_16X16_FN identity, dct -INV_TXFM_16X16_FN identity, identity - -cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 - vpbroadcastd m7, [o(pw_1697x16)] - mova xm0, [cq+16* 0] - vinserti128 m0, [cq+16*16], 1 - mova xm15, [cq+16* 1] - vinserti128 m15, [cq+16*17], 1 - mova xm1, [cq+16* 2] - vinserti128 m1, [cq+16*18], 1 - mova xm8, [cq+16* 3] - vinserti128 m8, [cq+16*19], 1 - mova xm2, [cq+16* 4] - vinserti128 m2, [cq+16*20], 1 - mova xm9, [cq+16* 5] - vinserti128 m9, [cq+16*21], 1 - mova xm3, [cq+16* 6] - vinserti128 m3, [cq+16*22], 1 - mova xm10, [cq+16* 7] - add cq, 16*16 - vinserti128 m10, [cq+16* 7], 1 - mova xm4, [cq-16* 8] - vinserti128 m4, [cq+16* 8], 1 - mova xm11, [cq-16* 7] - vinserti128 m11, [cq+16* 9], 1 - mova xm5, [cq-16* 6] - vinserti128 m5, [cq+16*10], 1 - mova xm12, [cq-16* 5] - vinserti128 m12, [cq+16*11], 1 - mova xm13, [cq-16* 3] - vinserti128 m13, [cq+16*13], 1 - mova xm14, [cq-16* 1] - vinserti128 m14, [cq+16*15], 1 - REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ - 
10, 4, 11, 5, 12, 13, 14 - mova xm6, [cq-16* 4] - vinserti128 m6, [cq+16*12], 1 - mova [rsp], m0 - IDTX16B 6, 0, 7 - mova xm0, [cq-16* 2] - vinserti128 m0, [cq+16*14], 1 - pmulhrsw m7, m0 - psraw m7, 1 - pavgw m7, m0 - jmp m(idct_16x16_internal).pass1_end3 -ALIGN function_align -.pass2: - vpbroadcastd m15, [o(pw_1697x16)] - mova [rsp+32*1], m0 - REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ - 8, 9, 10, 11, 12, 13, 14 - mova m0, [rsp+32*1] - mova [rsp+32*1], m1 - IDTX16 0, 1, 15 - mova m1, [rsp+32*0] - pmulhrsw m15, m1 - paddsw m1, m1 - paddsw m15, m1 - jmp m(idct_16x16_internal).end - -%define o_base deint_shuf + 128 - -%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 -%if %3 - vpbroadcastd m15, [o(pw_2896x8)] - pmulhrsw m0, m15, [%1+%2*0] - pmulhrsw m1, m15, [%1+%2*1] - pmulhrsw m2, m15, [%1+%2*2] - pmulhrsw m3, m15, [%1+%2*3] - pmulhrsw m4, m15, [%1+%2*4] - pmulhrsw m5, m15, [%1+%2*5] - pmulhrsw m6, m15, [%1+%2*6] - pmulhrsw m7, m15, [%1+%2*7] -%else - mova m0, [%1+%2*0] - mova m1, [%1+%2*1] - mova m2, [%1+%2*2] - mova m3, [%1+%2*3] - mova m4, [%1+%2*4] - mova m5, [%1+%2*5] - mova m6, [%1+%2*6] - mova m7, [%1+%2*7] -%endif -%endmacro - -%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 -%if %3 -%if %3 == 1 - vpbroadcastd m15, [o(pw_2896x8)] -%endif - pmulhrsw m8, m15, [%1+%2*0] - pmulhrsw m9, m15, [%1+%2*1] - pmulhrsw m10, m15, [%1+%2*2] - pmulhrsw m11, m15, [%1+%2*3] - pmulhrsw m12, m15, [%1+%2*4] - pmulhrsw m13, m15, [%1+%2*5] - pmulhrsw m14, m15, [%1+%2*6] - pmulhrsw m15, [%1+%2*7] -%else - mova m8, [%1+%2*0] - mova m9, [%1+%2*1] - mova m10, [%1+%2*2] - mova m11, [%1+%2*3] - mova m12, [%1+%2*4] - mova m13, [%1+%2*5] - mova m14, [%1+%2*6] - mova m15, [%1+%2*7] -%endif -%endmacro - -%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] - vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8] - punpcklwd m%1, m%2, m%2 - pmulhrsw m%1, m%3 - vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8] - punpckhwd m%2, m%2 - pmulhrsw m%2, m%3 -%endmacro - -cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jz .dconly - PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob - %undef cmp - cmp eobd, 106 - jle .fast - LOAD_8ROWS cq+32*1, 32*2 - call m(idct_16x8_internal).main - vperm2i128 m11, m0, m4, 0x31 - vinserti128 m0, xm4, 1 - vperm2i128 m4, m1, m5, 0x31 - vinserti128 m1, xm5, 1 - vperm2i128 m5, m2, m6, 0x31 - vinserti128 m2, xm6, 1 - vperm2i128 m6, m3, m7, 0x31 - vinserti128 m3, xm7, 1 - pxor m7, m7 - REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 - punpckhwd m7, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m2, m3 - punpcklwd m2, m3 - punpcklwd m3, m11, m4 - punpckhwd m11, m4 - punpckhwd m4, m5, m6 - punpcklwd m5, m6 - punpckhdq m6, m0, m2 - punpckldq m0, m2 - punpckldq m2, m3, m5 - punpckhdq m3, m5 - punpckhdq m5, m11, m4 - punpckldq m11, m4 - punpckldq m4, m7, m1 - punpckhdq m7, m1 - punpckhqdq m12, m6, m0 - punpcklqdq m0, m6 ; out4 - punpckhqdq m13, m7, m4 - punpcklqdq m4, m7 ; out5 - punpckhqdq m14, m3, m2 - punpcklqdq m2, m3 ; out6 - punpckhqdq m15, m5, m11 - punpcklqdq m11, m5 ; out7 - mova [rsp+32*0], m0 - mova [rsp+32*1], m4 - mova [rsp+32*2], m2 -.fast: - LOAD_8ROWS cq+32*0, 32*2 - call m(idct_16x8_internal).main - vperm2i128 m8, m0, m4, 0x31 - vinserti128 m0, xm4, 1 - vperm2i128 m4, m1, m5, 0x31 - vinserti128 m1, xm5, 1 - vperm2i128 m5, m2, m6, 0x31 - vinserti128 m2, xm6, 1 - vperm2i128 m6, m3, m7, 0x31 - vinserti128 m3, xm7, 1 - vpbroadcastd m9, [o(pw_8192)] - pxor m7, m7 - REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 - punpckhwd m7, 
m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m2, m3 - punpcklwd m2, m3 - punpckhwd m3, m8, m4 - punpcklwd m8, m4 - punpckhwd m4, m5, m6 - punpcklwd m5, m6 - punpckhdq m6, m0, m2 - punpckldq m0, m2 - punpckldq m2, m8, m5 - punpckhdq m8, m5 - punpckhdq m5, m3, m4 - punpckldq m3, m4 - punpckhdq m4, m7, m1 - punpckldq m7, m1 - punpcklqdq m1, m7, m4 - punpckhqdq m7, m4 ; out9 - punpckhqdq m4, m2, m8 ; out10 - punpcklqdq m2, m8 - punpckhqdq m8, m3, m5 - punpcklqdq m3, m5 - punpckhqdq m5, m0, m6 ; out8 - punpcklqdq m0, m6 - REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 - cmp eobd, 106 - jg .full - mova [rsp+32*0], m5 - mova [rsp+32*1], m7 - mova [rsp+32*2], m4 - pmulhrsw m11, m9, m8 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call .main_fast - jmp .pass2 -.dconly: - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_8192)] - mov [cq], eobd - pmulhrsw xm0, xm2 - psrlw xm2, 2 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_8x8).end2 -.full: - REPX {pmulhrsw x, m9}, m12, m13, m14, m15 - pmulhrsw m6, m9, [rsp+32*2] - mova [rsp+32*2], m4 - pmulhrsw m4, m9, [rsp+32*0] - mova [rsp+32*0], m5 - pmulhrsw m5, m9, [rsp+32*1] - mova [rsp+32*1], m7 - pmulhrsw m7, m9, m11 - pmulhrsw m11, m9, m8 - call .main -.pass2: - vpbroadcastd m12, [o(pw_2048)] - REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ - m8, m9, m10, m11, m13, m14, m15 - pmulhrsw m12, [rsp] - REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 - REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 - mova [rsp+32*0], m4 - mova [rsp+32*1], m6 - lea r3, [strideq*3] - WRITE_8X4 0, 1, 4, 6 - lea dstq, [dstq+strideq*4] - WRITE_8X4 2, 3, 4, 6 - lea dstq, [dstq+strideq*4] - WRITE_8X4 [rsp+32*0], 5, 4, 6 - lea dstq, [dstq+strideq*4] - WRITE_8X4 [rsp+32*1], 7, 4, 6 - lea dstq, [dstq+strideq*4] - WRITE_8X4 8, 9, 4, 6 - lea dstq, [dstq+strideq*4] - WRITE_8X4 10, 11, 4, 6 - lea dstq, [dstq+strideq*4] - WRITE_8X4 12, 13, 4, 6 - lea dstq, [dstq+strideq*4] - WRITE_8X4 14, 15, 4, 6 - RET -ALIGN function_align -.main_fast: ; bottom half is zero - call m(idct_8x16_internal).main - mova m8, [rsp+gprsize+0*32] - mova [rsp+gprsize+0*32], m0 - mova m9, [rsp+gprsize+1*32] - mova [rsp+gprsize+1*32], m1 - mova m0, [rsp+gprsize+2*32] - mova [rsp+gprsize+2*32], m6 - lea r5, [rax-(o_base)+pw_201_4091x8] - ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a - ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a - ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a - ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a - jmp .main2 -ALIGN function_align -.main: - call m(idct_8x16_internal).main - mova m8, [rsp+gprsize+0*32] - mova [rsp+gprsize+0*32], m0 - mova m9, [rsp+gprsize+1*32] - mova [rsp+gprsize+1*32], m1 - mova m0, [rsp+gprsize+2*32] - mova [rsp+gprsize+2*32], m6 - punpcklwd m1, m15, m8 ; in31 in1 - punpckhwd m8, m15 ; in3 in29 - punpcklwd m15, m14, m9 ; in27 in5 - punpckhwd m9, m14 ; in7 in25 - punpcklwd m14, m13, m0 ; in23 in9 - punpckhwd m0, m13 ; in11 in21 - punpcklwd m13, m12, m11 ; in19 in13 - punpckhwd m11, m12 ; in15 in17 - ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a - ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a - ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a - ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a - ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a - ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; 
t21a, t26a - ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a - ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a -.main2: - psubsw m6, m1, m11 ; t17 t30 - paddsw m1, m11 ; t16 t31 - psubsw m11, m9, m14 ; t18 t29 - paddsw m9, m14 ; t19 t28 - psubsw m14, m15, m0 ; t21 t26 - paddsw m15, m0 ; t20 t27 - psubsw m0, m8, m13 ; t22 t25 - paddsw m8, m13 ; t23 t24 - ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a - ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a - ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a - ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a - psubsw m13, m1, m9 ; t19a t28a - paddsw m1, m9 ; t16a t31a - psubsw m9, m8, m15 ; t20a t27a - paddsw m8, m15 ; t23a t24a - psubsw m15, m6, m11 ; t18 t29 - paddsw m6, m11 ; t17 t30 - psubsw m11, m0, m14 ; t21 t26 - paddsw m0, m14 ; t22 t25 - ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a - ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28 - ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27 - ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a - vbroadcasti128 m12, [o(deint_shuf)] - psubsw m14, m1, m8 ; t23 t24 - paddsw m1, m8 ; t16 t31 - psubsw m8, m6, m0 ; t22a t25a - paddsw m6, m0 ; t17a t30a - psubsw m0, m15, m11 ; t21 t26 - paddsw m15, m11 ; t18 t29 - psubsw m11, m13, m9 ; t20a t27a - paddsw m13, m9 ; t19a t28a - REPX {pshufb x, m12}, m1, m6, m15, m13 - ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a - vpbroadcastd m9, [o(pw_m2896_2896)] - ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 - vpbroadcastd m12, [o(pw_2896_2896)] - ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a - vpbroadcastd m12, [o(pw_2896_2896)] - ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 - shufps m9, m14, m8, q1032 ; t23a t22 - vpblendd m14, m8, 0xcc ; t24a t25 - shufps m8, m11, m0, q1032 ; t20 t21a - vpblendd m11, m0, 0xcc ; t27 t26a - punpcklqdq m0, m1, m6 ; t16 t17a - punpckhqdq m1, m6 ; t31 t30a - psubsw m10, m5, m8 ; out20 out21 - paddsw m5, m8 ; out11 out10 - psubsw m6, m3, m14 ; out24 out25 - paddsw m3, m14 ; out7 out6 - psubsw m8, m7, m0 ; out16 out17 - paddsw m7, m0 ; out15 out14 - mova m0, [rsp+gprsize+0*32] - punpcklqdq m12, m13, m15 ; t19a t18 - punpckhqdq m13, m15 ; t28a t29 - psubsw m15, m0, m1 ; out31 out30 - paddsw m0, m1 ; out0 out1 - mova m1, [rsp+gprsize+1*32] - mova [rsp+gprsize+0*32], m6 - mova m6, [rsp+gprsize+2*32] - psubsw m14, m1, m13 ; out28 out29 - paddsw m1, m13 ; out3 out2 - psubsw m13, m2, m11 ; out27 out26 - paddsw m2, m11 ; out4 out5 - psubsw m11, m4, m9 ; out23 out22 - paddsw m4, m9 ; out8 out9 - psubsw m9, m6, m12 ; out19 out18 - paddsw m6, m12 ; out12 out13 - ret - -%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] - vbroadcasti128 m%1, [cq+16*%3] - vbroadcasti128 m%2, [cq+16*%4] - shufpd m%1, m%2, 0x0c -%endmacro - -cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_8192)] - mov [cq], eobd - mov r2d, 8 -.dconly: - pmulhrsw xm0, xm2 - movd xm2, [pw_2048] ; intentionally rip-relative - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - pxor m3, m3 -.dconly_loop: - mova m1, [dstq] - punpckhbw m2, m1, m3 - punpcklbw m1, m3 - paddw m2, m0 - paddw m1, m0 - packuswb m1, m2 - mova [dstq], m1 - add dstq, strideq - dec r2d - jg .dconly_loop - RET -.normal: - PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob - %undef cmp - LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 - LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 - LOAD_PACKED_16X2 1, 7, 
4, 6 ; in4 in6 - LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 - pxor m8, m8 - REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 - add cq, 16*16 - LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 - LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 - LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 - LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 - REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 - mova [rsp+32*0], m4 - mova [rsp+32*1], m5 - mova [rsp+32*2], m6 - cmp eobd, 106 - jg .full - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(inv_txfm_add_dct_dct_8x32).main_fast - jmp .pass2 -.full: - LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 - LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 - LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 - LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 - REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 - add cq, 16*8 - LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 - LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 - LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 - LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 - pxor m8, m8 - REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 - call m(inv_txfm_add_dct_dct_8x32).main -.pass2: - vpbroadcastd m12, [o(pw_8192)] - REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 - mova [rsp+32*1], m9 - mova [rsp+32*2], m10 - punpckhwd m9, m0, m2 - punpcklwd m0, m2 - punpckhwd m2, m1, m3 - punpcklwd m1, m3 - punpcklwd m10, m4, m6 - punpckhwd m4, m6 - punpcklwd m6, m5, m7 - punpckhwd m5, m7 - punpckhwd m3, m0, m9 - punpcklwd m0, m9 - punpckhwd m9, m2, m1 - punpcklwd m2, m1 - punpcklwd m7, m10, m4 - punpckhwd m10, m4 - punpcklwd m4, m5, m6 - punpckhwd m5, m6 - punpckhdq m1, m0, m2 - punpckldq m0, m2 - punpckldq m2, m3, m9 - punpckhdq m3, m9 - punpckldq m6, m7, m4 - punpckhdq m7, m4 - punpckldq m9, m10, m5 - punpckhdq m10, m5 - REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 - pmulhrsw m12, [rsp+32*0] - mova [rsp+32*0], m8 - vperm2i128 m4, m0, m6, 0x31 - vinserti128 m0, xm6, 1 - vperm2i128 m5, m1, m7, 0x31 - vinserti128 m1, xm7, 1 - vperm2i128 m6, m2, m9, 0x31 - vinserti128 m2, xm9, 1 - vperm2i128 m7, m3, m10, 0x31 - vinserti128 m3, xm10, 1 - call m(idct_16x8_internal).main - vpbroadcastd m8, [o(pw_2048)] - REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 - lea r2, [strideq*3] - WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 - WRITE_16X2 2, 3, 0, 1, strideq*2, r2 - lea r3, [dstq+strideq*4] - %define dstq r3 - WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 - WRITE_16X2 6, 7, 0, 1, strideq*2, r2 - mova m0, [rsp+32*0] - mova m1, [rsp+32*1] - mova m2, [rsp+32*2] - punpckhwd m7, m0, m2 - punpcklwd m0, m2 - punpckhwd m2, m1, m11 - punpcklwd m1, m11 - punpckhwd m4, m12, m14 - punpcklwd m12, m14 - punpckhwd m5, m13, m15 - punpcklwd m13, m15 - punpckhwd m3, m0, m7 - punpcklwd m0, m7 - punpckhwd m9, m2, m1 - punpcklwd m2, m1 - punpcklwd m7, m12, m4 - punpckhwd m12, m4 - punpcklwd m4, m5, m13 - punpckhwd m5, m13 - punpckhdq m1, m0, m2 - punpckldq m0, m2 - punpckldq m2, m3, m9 - punpckhdq m3, m9 - punpckldq m6, m7, m4 - punpckhdq m7, m4 - punpckldq m9, m12, m5 - punpckhdq m12, m5 - vperm2i128 m4, m0, m6, 0x31 - vinserti128 m0, xm6, 1 - vperm2i128 m5, m1, m7, 0x31 - vinserti128 m1, xm7, 1 - vperm2i128 m6, m2, m9, 0x31 - vinserti128 m2, xm9, 1 - vperm2i128 m7, m3, m12, 0x31 - vinserti128 m3, xm12, 1 - call m(idct_16x8_internal).main2 - vpbroadcastd m8, [o(pw_2048)] - REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 - add r0, 16 - add r3, 16 - %define dstq r0 - WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 - WRITE_16X2 2, 3, 0, 1, strideq*2, r2 - %define dstq r3 - WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 - WRITE_16X2 6, 7, 0, 1, strideq*2, r2 - RET - 
-cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
- vpbroadcastd m9, [pw_5]
- lea r4, [strideq*3]
- sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
-.loop:
- mova xm0, [cq+16* 0]
- mova xm1, [cq+16* 4]
- vinserti128 m0, [cq+16* 1], 1
- vinserti128 m1, [cq+16* 5], 1
- pxor m8, m8
- mova [cq+32*0], m8
- mova [cq+32*2], m8
- add cq, 16*16
- mova xm2, [cq-16* 8]
- mova xm3, [cq-16* 4]
- vinserti128 m2, [cq-16* 7], 1
- vinserti128 m3, [cq-16* 3], 1
- mova xm4, [cq+16* 0]
- mova xm5, [cq+16* 4]
- vinserti128 m4, [cq+16* 1], 1
- vinserti128 m5, [cq+16* 5], 1
- mova xm6, [cq+16* 8]
- mova xm7, [cq+16*12]
- vinserti128 m6, [cq+16* 9], 1
- vinserti128 m7, [cq+16*13], 1
- REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
- REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
- call .transpose8x8
- REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
- WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4
- add dstq, strideq
- WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4
- add dstq, strideq
- WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4
- add dstq, strideq
- WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4
- add dstq, strideq
- sub cq, 16*16-32
- lea dstq, [dstq+r4*4]
- add eobd, 0x80000000
- jnc .loop
- RET
-ALIGN function_align
-.transpose8x8:
- punpckhwd m8, m4, m5
- punpcklwd m4, m5
- punpckhwd m5, m0, m1
- punpcklwd m0, m1
- punpckhwd m1, m6, m7
- punpcklwd m6, m7
- punpckhwd m7, m2, m3
- punpcklwd m2, m3
- punpckhdq m3, m0, m2
- punpckldq m0, m2
- punpckldq m2, m4, m6
- punpckhdq m4, m6
- punpckhdq m6, m5, m7
- punpckldq m5, m7
- punpckldq m7, m8, m1
- punpckhdq m8, m1
- punpckhqdq m1, m0, m2
- punpcklqdq m0, m2
- punpcklqdq m2, m3, m4
- punpckhqdq m3, m4
- punpcklqdq m4, m5, m7
- punpckhqdq m5, m7
- punpckhqdq m7, m6, m8
- punpcklqdq m6, m8
- ret
-
-cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob
- add cq, 16*8
- vpbroadcastd m9, [pw_4096]
- lea r4, [strideq*3]
- lea r5, [dstq+strideq*4]
- sub eobd, 107
-.loop:
- mova xm0, [cq-16*8]
- mova xm1, [cq-16*7]
- vinserti128 m0, [cq+16*0], 1
- vinserti128 m1, [cq+16*1], 1
- mova xm2, [cq-16*6]
- mova xm3, [cq-16*5]
- vinserti128 m2, [cq+16*2], 1
- vinserti128 m3, [cq+16*3], 1
- mova xm4, [cq-16*4]
- mova xm5, [cq-16*3]
- vinserti128 m4, [cq+16*4], 1
- vinserti128 m5, [cq+16*5], 1
- mova xm6, [cq-16*2]
- mova xm7, [cq-16*1]
- vinserti128 m6, [cq+16*6], 1
- vinserti128 m7, [cq+16*7], 1
- pxor m8, m8
- REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
- call m(inv_txfm_add_identity_identity_8x32).transpose8x8
- REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
- WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
- WRITE_16X2 2, 3, 0, 1, strideq*2, r4
- %define dstq r5
- WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
- WRITE_16X2 6, 7, 0, 1, strideq*2, r4
- add cq, 16*16
- add r0, 16
- add r5, 16
- add eobd, 0x80000000
- jnc .loop
- RET
-
-%define o_base pw_5 + 128
-
-%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
-%if %3
- vpbroadcastd m15, [o(pw_2896x8)]
- pmulhrsw m0, m15, [%1+%2* 0]
- pmulhrsw m1, m15, [%1+%2* 1]
- pmulhrsw m2, m15, [%1+%2* 2]
- pmulhrsw m3, m15, [%1+%2* 3]
- pmulhrsw m4, m15, [%1+%2* 4]
- pmulhrsw m5, m15, [%1+%2* 5]
- pmulhrsw m6, m15, [%1+%2* 6]
- pmulhrsw m7, m15, [%1+%2* 7]
- pmulhrsw m8, m15, [%1+%2* 8]
- pmulhrsw m9, m15, [%1+%2* 9]
- pmulhrsw m10, m15, [%1+%2*10]
- pmulhrsw m11, m15, [%1+%2*11]
- pmulhrsw m12, m15, [%1+%2*12]
- pmulhrsw m13, m15, [%1+%2*13]
- pmulhrsw m14, m15, [%1+%2*14]
- pmulhrsw m15, [%1+%2*15]
-%else
- mova m0, [%1+%2* 0]
- mova m1,
[%1+%2* 1] - mova m2, [%1+%2* 2] - mova m3, [%1+%2* 3] - mova m4, [%1+%2* 4] - mova m5, [%1+%2* 5] - mova m6, [%1+%2* 6] - mova m7, [%1+%2* 7] - mova m8, [%1+%2* 8] - mova m9, [%1+%2* 9] - mova m10, [%1+%2*10] - mova m11, [%1+%2*11] - mova m12, [%1+%2*12] - mova m13, [%1+%2*13] - mova m14, [%1+%2*14] - mova m15, [%1+%2*15] -%endif - mova [rsp], m15 -%if %4 - pxor m15, m15 - REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ - 8, 9, 10, 11, 12, 13, 14, 15 -%endif -%endmacro - -%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] - mova m%4, [%2] - paddsw m%3, m%1, m%4 - psubsw m%1, m%4 - pmovzxbw m%4, [dstq+%6] - pmulhrsw m%3, m%5 - pmulhrsw m%1, m%5 - paddw m%3, m%4 - pmovzxbw m%4, [r2+%7] - paddw m%1, m%4 - packuswb m%3, m%1 - vpermq m%3, m%3, q3120 - mova [dstq+%6], xm%3 - vextracti128 [r2+%7], m%3, 1 -%endmacro - -cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jz .dconly - PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ - base, tmp3 - %undef cmp - LOAD_16ROWS cq, 64, 1 - call m(idct_16x16_internal).main - lea tmp1q, [rsp+32*7] - lea tmp2q, [tmp1q+32*8] - lea tmp3q, [tmp1q+32*16] - mova m1, [rsp+32*1] - mova [rsp+32*0], m6 - mova [rsp+32*1], m7 - vpbroadcastd m7, [o(pw_16384)] - call .transpose_2x8x8_round - mova m15, [rsp+32*0] - mova [tmp3q-32*4+ 0], xm0 - vextracti128 [tmp3q+32*0+ 0], m0, 1 - mova [tmp3q-32*3+ 0], xm2 - vextracti128 [tmp3q+32*1+ 0], m2, 1 - mova [tmp3q-32*2+ 0], xm4 - vextracti128 [tmp3q+32*2+ 0], m4, 1 - mova [tmp3q-32*1+ 0], xm6 - vextracti128 [tmp3q+32*3+ 0], m6, 1 - mova [tmp3q-32*4+16], xm8 - vextracti128 [tmp3q+32*0+16], m8, 1 - mova [tmp3q-32*3+16], xm10 - vextracti128 [tmp3q+32*1+16], m10, 1 - mova [tmp3q-32*2+16], xm12 - vextracti128 [tmp3q+32*2+16], m12, 1 - mova [tmp3q-32*1+16], xm14 - vextracti128 [tmp3q+32*3+16], m14, 1 - cmp eobd, 150 - jg .full - vinserti128 m0, m1, xm9, 1 - vperm2i128 m4, m1, m9, 0x31 - vinserti128 m2, m5, xm13, 1 - vperm2i128 m6, m5, m13, 0x31 - vinserti128 m1, m3, xm11, 1 - vperm2i128 m5, m3, m11, 0x31 - vinserti128 m3, m7, xm15, 1 - vperm2i128 m7, m7, m15, 0x31 - call .main_oddhalf_fast - pxor m8, m8 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 - jmp .idct16 -.dconly: - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - pmulhrsw xm0, xm1 - mov r2d, 16 - jmp m(inv_txfm_add_dct_dct_16x4).dconly -.full: - mova [tmp1q-32*4], m1 - mova [tmp1q-32*3], m3 - mova [tmp1q-32*2], m5 - mova [tmp1q-32*1], m7 - mova [tmp1q+32*0], m9 - mova [tmp1q+32*1], m11 - mova [tmp1q+32*2], m13 - mova [tmp1q+32*3], m15 - LOAD_16ROWS cq+32, 64, 1 - call m(idct_16x16_internal).main - lea r2, [tmp3q+32*8] - mova m1, [rsp+32*1] - mova [rsp+32*0], m6 - mova [rsp+32*1], m7 - vpbroadcastd m7, [o(pw_16384)] - call .transpose_2x8x8_round - mova m15, [rsp+32*0] - mova [r2-32*4+ 0], xm0 - vextracti128 [r2+32*0+ 0], m0, 1 - mova [r2-32*3+ 0], xm2 - vextracti128 [r2+32*1+ 0], m2, 1 - mova [r2-32*2+ 0], xm4 - vextracti128 [r2+32*2+ 0], m4, 1 - mova [r2-32*1+ 0], xm6 - vextracti128 [r2+32*3+ 0], m6, 1 - mova [r2-32*4+16], xm8 - vextracti128 [r2+32*0+16], m8, 1 - mova [r2-32*3+16], xm10 - vextracti128 [r2+32*1+16], m10, 1 - mova [r2-32*2+16], xm12 - vextracti128 [r2+32*2+16], m12, 1 - mova [r2-32*1+16], xm14 - vextracti128 [r2+32*3+16], m14, 1 - vinserti128 m8, m1, xm9, 1 - vperm2i128 m12, m1, m9, 0x31 - mova xm0, [tmp1q-32*4] - mova xm1, [tmp1q-32*3] - vinserti128 m0, [tmp1q+32*0], 1 - vinserti128 m1, [tmp1q+32*1], 1 - vinserti128 m10, 
m5, xm13, 1 - vperm2i128 m14, m5, m13, 0x31 - mova xm4, [tmp1q-32*4+16] - mova xm5, [tmp1q-32*3+16] - vinserti128 m4, [tmp1q+32*0+16], 1 - vinserti128 m5, [tmp1q+32*1+16], 1 - vinserti128 m9, m3, xm11, 1 - vperm2i128 m13, m3, m11, 0x31 - mova xm2, [tmp1q-32*2] - mova xm3, [tmp1q-32*1] - vinserti128 m2, [tmp1q+32*2], 1 - vinserti128 m3, [tmp1q+32*3], 1 - vinserti128 m11, m7, xm15, 1 - vperm2i128 m15, m7, m15, 0x31 - mova xm6, [tmp1q-32*2+16] - mova xm7, [tmp1q-32*1+16] - vinserti128 m6, [tmp1q+32*2+16], 1 - vinserti128 m7, [tmp1q+32*3+16], 1 - call .main_oddhalf - LOAD_8ROWS_H r2-32*4, 32 -.idct16: - LOAD_8ROWS tmp3q-32*4, 32 - mova [rsp], m15 - call m(idct_16x16_internal).main - imul r2, strideq, 19 - lea r3, [strideq*3] - add r2, dstq - call .pass2_end - RET -ALIGN function_align -.main_oddhalf_fast: ; lower half is zero - mova [rsp+gprsize+32*1], m7 - pxor m7, m7 - mova [rsp+gprsize+32*0], m7 - mova [rsp+gprsize+32*2], m7 - vpbroadcastd m11, [o(pw_3703x8)] - vpbroadcastd m7, [o(pw_1751x8)] - vpbroadcastd m12, [o(pw_m1380x8)] - vpbroadcastd m8, [o(pw_3857x8)] - vpbroadcastd m13, [o(pw_3973x8)] - vpbroadcastd m15, [o(pw_995x8)] - pmulhrsw m11, m4 ; t29a - pmulhrsw m4, m7 ; t18a - pmulhrsw m12, m3 ; t19a - pmulhrsw m3, m8 ; t28a - pmulhrsw m13, m2 ; t27a - pmulhrsw m2, m15 ; t20a - vpbroadcastd m10, [o(pw_m2106x8)] - vpbroadcastd m7, [o(pw_3513x8)] - vpbroadcastd m9, [o(pw_3290x8)] - vpbroadcastd m8, [o(pw_2440x8)] - vpbroadcastd m14, [o(pw_m601x8)] - vpbroadcastd m15, [o(pw_4052x8)] - pmulhrsw m10, m5 ; t21a - pmulhrsw m5, m7 ; t26a - pmulhrsw m9, m6 ; t25a - pmulhrsw m6, m8 ; t22a - pmulhrsw m14, m1 ; t23a - pmulhrsw m1, m15 ; t24a - vpbroadcastd m15, [o(pd_2048)] - jmp .main2 -ALIGN function_align -.main_oddhalf: - mova [rsp+gprsize+32*0], m15 - mova [rsp+gprsize+32*1], m7 - mova [rsp+gprsize+32*2], m8 - vpbroadcastd m15, [o(pd_2048)] - ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a - ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a - ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a - ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a - ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a - ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a -.main2: - psubsw m7, m12, m4 ; t18 - paddsw m12, m4 ; t19 - psubsw m4, m2, m10 ; t21 - paddsw m2, m10 ; t20 - psubsw m10, m14, m6 ; t22 - paddsw m14, m6 ; t23 - psubsw m6, m1, m9 ; t25 - paddsw m1, m9 ; t24 - psubsw m9, m13, m5 ; t26 - paddsw m13, m5 ; t27 - psubsw m5, m3, m11 ; t29 - paddsw m3, m11 ; t28 - ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a - ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a - ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a - psubsw m8, m14, m2 ; t20a - paddsw m14, m2 ; t23a - psubsw m2, m1, m13 ; t27a - paddsw m1, m13 ; t24a - psubsw m13, m6, m9 ; t21 - paddsw m6, m9 ; t22 - psubsw m9, m10, m4 ; t26 - paddsw m10, m4 ; t25 - ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 - ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a - mova m4, [rsp+gprsize+32*0] ; in31 - mova [rsp+gprsize+32*0], m6 ; t22 - mova m6, [rsp+gprsize+32*1] ; in15 - mova [rsp+gprsize+32*1], m14 ; t23a - mova m14, [rsp+gprsize+32*2] ; in17 - mova [rsp+gprsize+32*2], m1 ; t24a - ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a - ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a - psubsw m1, m0, m14 ; t17 - paddsw m0, m14 ; t16 - psubsw m14, m4, m6 ; t30 - paddsw m4, m6 ; t31 - ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a - psubsw m6, m0, m12 ; t19a - paddsw m0, m12 ; t16a - 
psubsw m12, m4, m3 ; t28a - paddsw m4, m3 ; t31a - psubsw m3, m14, m5 ; t18 - paddsw m14, m5 ; t17 - psubsw m5, m1, m7 ; t29 - paddsw m1, m7 ; t30 - ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a - ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 - psubsw m7, m1, m10 ; t25a - paddsw m1, m10 ; t30a - psubsw m10, m5, m9 ; t21 - paddsw m5, m9 ; t18 - psubsw m9, m12, m2 ; t20a - paddsw m12, m2 ; t19a - psubsw m2, m3, m13 ; t26 - paddsw m3, m13 ; t29 - psubsw m13, m6, m8 ; t27a - paddsw m6, m8 ; t28a - mova [tmp1q-32*2], m5 - mova [tmp1q-32*1], m12 - mova [tmp2q+32*0], m6 - mova [tmp2q+32*1], m3 - mova [tmp2q+32*2], m1 - mova m5, [rsp+gprsize+32*0] ; t22 - mova m6, [rsp+gprsize+32*1] ; t23 - mova m3, [rsp+gprsize+32*2] ; t24a - psubsw m1, m14, m5 ; t22a - paddsw m14, m5 ; t17a - psubsw m5, m0, m6 ; t23 - paddsw m0, m6 ; t16 - psubsw m6, m4, m3 ; t24 - paddsw m4, m3 ; t31 - vpbroadcastd m8, [o(pw_m2896_2896)] - vpbroadcastd m3, [o(pw_2896_2896)] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m14 - mova [tmp2q+32*3], m4 - ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 - ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a - ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 - ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a - mova [tmp1q+32*0], m13 - mova [tmp1q+32*1], m2 - mova [tmp1q+32*2], m7 - mova [tmp1q+32*3], m6 - mova [tmp2q-32*4], m5 - mova [tmp2q-32*3], m1 - mova [tmp2q-32*2], m10 - mova [tmp2q-32*1], m9 - ret -ALIGN function_align -.transpose_2x8x8_round: - punpckhwd m6, m12, m13 - punpcklwd m12, m13 - punpckhwd m13, m8, m9 - punpcklwd m8, m9 - punpckhwd m9, m14, m15 - punpcklwd m14, m15 - punpckhwd m15, m10, m11 - punpcklwd m10, m11 - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 - punpckhdq m11, m8, m10 - punpckldq m8, m10 - punpckldq m10, m12, m14 - punpckhdq m12, m14 - punpckhdq m14, m13, m15 - punpckldq m13, m15 - punpckldq m15, m6, m9 - punpckhdq m6, m9 - punpckhqdq m9, m8, m10 - punpcklqdq m8, m10 - punpcklqdq m10, m11, m12 - punpckhqdq m11, m12 - punpcklqdq m12, m13, m15 - punpckhqdq m13, m15 - punpckhqdq m15, m14, m6 - punpcklqdq m14, m6 - pmulhrsw m6, m7, [rsp+gprsize+32*0] - REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 - pmulhrsw m7, [rsp+gprsize+32*1] - mova [rsp+gprsize+32*0], m15 - punpckhwd m15, m4, m5 - punpcklwd m4, m5 - punpckhwd m5, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m6, m7 - punpcklwd m6, m7 - punpckhwd m7, m2, m3 - punpcklwd m2, m3 - punpckhdq m3, m0, m2 - punpckldq m0, m2 - punpckldq m2, m4, m6 - punpckhdq m4, m6 - punpckhdq m6, m5, m7 - punpckldq m5, m7 - punpckldq m7, m15, m1 - punpckhdq m15, m1 - punpckhqdq m1, m0, m2 - punpcklqdq m0, m2 - punpcklqdq m2, m3, m4 - punpckhqdq m3, m4 - punpcklqdq m4, m5, m7 - punpckhqdq m5, m7 - punpckhqdq m7, m6, m15 - punpcklqdq m6, m15 - ret -ALIGN function_align -.pass2_end: - mova [rsp+gprsize+32*0], m7 - mova [rsp+gprsize+32*2], m15 - vpbroadcastd m15, [o(pw_2048)] - IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 - IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 - IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 - IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 - add dstq, strideq - sub r2, strideq - mova m1, [rsp+gprsize+32*1] - IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 - IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 - IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 - IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 - add dstq, strideq - sub r2, strideq - IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 
15, strideq*0, r3*4 - IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 - IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 - IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 - add dstq, strideq - sub r2, strideq - mova m7, [rsp+gprsize+32*0] - mova m1, [rsp+gprsize+32*2] - IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 - IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 - IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 - IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 - ret - -; Perform the final sumsub step and YMM lane shuffling -%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] - mova m%3, [tmp2q+32*( 3-%1)] - psubsw m%4, m%1, m%3 - paddsw m%1, m%3 - mova m%3, [tmp1q+32*(11-%2)] - mova [tmp1q+32*(11-%2)+16], xm%4 - vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 - paddsw m%4, m%2, m%3 - psubsw m%2, m%3 - mova [tmp1q+32*(11-%2)], xm%2 - vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 - vperm2i128 m%2, m%1, m%4, 0x31 - vinserti128 m%1, xm%4, 1 -%endmacro - -cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - pmulhrsw xm0, xm1 - mov r2d, 16 - jmp m(inv_txfm_add_dct_dct_32x8).dconly -.normal: - PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 - vpbroadcastd m15, [o(pw_2896x8)] - pmulhrsw m0, m15, [cq+32* 1] - pmulhrsw m1, m15, [cq+32* 3] - pmulhrsw m2, m15, [cq+32* 5] - pmulhrsw m3, m15, [cq+32* 7] - pmulhrsw m4, m15, [cq+32* 9] - pmulhrsw m5, m15, [cq+32*11] - pmulhrsw m6, m15, [cq+32*13] - pmulhrsw m7, m15, [cq+32*15] - pmulhrsw m8, m15, [cq+32*17] - pmulhrsw m9, m15, [cq+32*19] - pmulhrsw m10, m15, [cq+32*21] - pmulhrsw m11, m15, [cq+32*23] - pmulhrsw m12, m15, [cq+32*25] - pmulhrsw m13, m15, [cq+32*27] - pmulhrsw m14, m15, [cq+32*29] - pmulhrsw m15, [cq+32*31] - lea tmp1q, [rsp+32*7] - lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf - LOAD_16ROWS cq+32*0, 32*2, 1, 0 - pxor m15, m15 - mov r3d, 8 -.zero_loop: - mova [cq+32*0], m15 - mova [cq+32*1], m15 - mova [cq+32*2], m15 - mova [cq+32*3], m15 - add cq, 32*4 - dec r3d - jg .zero_loop - call m(idct_16x16_internal).main - call .pass1_end - lea r2, [strideq*3] - mov r3, dstq -.pass2: - vpbroadcastd m7, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - call m(idct_16x16_internal).main - mova [rsp+32*2], m15 - vpbroadcastd m15, [o(pw_2048)] - REPX {pmulhrsw x, m15}, m2, m3, m0 - WRITE_16X2 2, 3, 1, 2, strideq*2, r2 - pmulhrsw m1, m15, [rsp+32*1] - WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 - lea dstq, [dstq+strideq*4] - REPX {pmulhrsw x, m15}, m4, m5, m6, m7 - WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 - WRITE_16X2 6, 7, 2, 3, strideq*2, r2 - lea dstq, [dstq+strideq*4] - REPX {pmulhrsw x, m15}, m8, m9, m10, m11 - WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 - WRITE_16X2 10, 11, 2, 3, strideq*2, r2 - lea dstq, [dstq+strideq*4] - REPX {pmulhrsw x, m15}, m11, m12, m13, m14 - pmulhrsw m15, [rsp+32*2] - WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 - WRITE_16X2 14, 15, 2, 3, strideq*2, r2 - test r3, r3 - jnz .right_half - RET -.right_half: - LOAD_8ROWS tmp1q-32*4, 32 - LOAD_8ROWS_H tmp2q-32*4, 32 - lea dstq, [r3+16] - xor r3d, r3d - mova [rsp+32*0], m6 - mova [rsp+32*1], m7 - jmp .pass2 -ALIGN function_align -.pass1_end: - mova [rsp+gprsize+32*0], m9 - IDCT32_PASS1_END 0, 8, 1, 9 - IDCT32_PASS1_END 2, 10, 1, 9 - IDCT32_PASS1_END 3, 11, 1, 9 - 
IDCT32_PASS1_END 4, 12, 1, 9 - IDCT32_PASS1_END 5, 13, 1, 9 - IDCT32_PASS1_END 6, 14, 1, 9 - IDCT32_PASS1_END 7, 15, 1, 9 - mova m1, [rsp+gprsize+32*1] - mova m9, [rsp+gprsize+32*0] - mova [rsp+gprsize+32*0], m6 - mova [rsp+gprsize+32*1], m7 - IDCT32_PASS1_END 1, 9, 6, 7 - ret - -cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob -%undef cmp - lea rax, [o_base] - vpbroadcastd m9, [o(pw_2896x8)] - vpbroadcastd m10, [o(pw_1697x16)] - vpbroadcastd m12, [o(pw_8192)] - cmp eobd, 43 ; if (eob > 43) - setg r4b ; iteration_count++ - cmp eobd, 150 ; if (eob > 150) - setg al ; iteration_count++ - add eobd, -279 ; if (eob > 278) - adc r4b, al ; iteration_count++ - lea r3, [strideq*3] - mov rax, cq - paddw m11, m12, m12 ; pw_16384 -.loop: - mova xm0, [cq+64* 0] - mova xm1, [cq+64* 1] - vinserti128 m0, [cq+64* 8], 1 - vinserti128 m1, [cq+64* 9], 1 - mova xm2, [cq+64* 2] - mova xm3, [cq+64* 3] - vinserti128 m2, [cq+64*10], 1 - vinserti128 m3, [cq+64*11], 1 - mova xm4, [cq+64* 4] - mova xm5, [cq+64* 5] - vinserti128 m4, [cq+64*12], 1 - vinserti128 m5, [cq+64*13], 1 - mova xm6, [cq+64* 6] - mova xm7, [cq+64* 7] - vinserti128 m6, [cq+64*14], 1 - vinserti128 m7, [cq+64*15], 1 - REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 - WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 - WRITE_16X2 2, 3, 0, 1, strideq*2, r3 - lea dstq, [dstq+strideq*4] - WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 - WRITE_16X2 6, 7, 0, 1, strideq*2, r3 - lea dstq, [dstq+strideq*4] - add cq, 16 - dec r4b - jge .loop - sub cq, 32 - pxor m0, m0 - mov r0d, 8 - cmp cq, rax - ja .zero_loop -.zero_loop_half: - mova [rax+64*0], m0 - mova [rax+64*1], m0 - add rax, 64*4 - mova [rax-64*2], m0 - mova [rax-64*1], m0 - sub r0d, 2 - jg .zero_loop_half - RET -.zero_loop: - mova [rax+32*0], m0 - mova [rax+32*1], m0 - mova [rax+32*2], m0 - mova [rax+32*3], m0 - add rax, 32*4 - dec r0d - jg .zero_loop - RET - -cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob -%undef cmp - lea rax, [o_base] - vpbroadcastd m9, [o(pw_2896x8)] - vpbroadcastd m10, [o(pw_1697x16)] - vpbroadcastd m11, [o(pw_2048)] - cmp eobd, 35 ; if (eob > 35) - setg r4b ; iteration_count++ - cmp eobd, 150 ; if (eob > 150) - setg r3b ; iteration_count += 2 - lea r4d, [r4+r3*2] - lea r3, [strideq*3] - mov r5, dstq - mov rax, cq -.loop: - mova xm0, [cq+32* 0] - mova xm1, [cq+32* 1] - vinserti128 m0, [cq+32* 8], 1 - vinserti128 m1, [cq+32* 9], 1 - mova xm2, [cq+32* 2] - mova xm3, [cq+32* 3] - vinserti128 m2, [cq+32*10], 1 - vinserti128 m3, [cq+32*11], 1 - mova xm4, [cq+32* 4] - mova xm5, [cq+32* 5] - vinserti128 m4, [cq+32*12], 1 - vinserti128 m5, [cq+32*13], 1 - mova xm6, [cq+32* 6] - mova xm7, [cq+32* 7] - vinserti128 m6, [cq+32*14], 1 - vinserti128 m7, [cq+32*15], 1 - REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 - REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 - WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 - WRITE_16X2 2, 3, 0, 1, strideq*2, r3 - lea dstq, [dstq+strideq*4] - WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 - WRITE_16X2 6, 7, 0, 1, strideq*2, r3 - lea dstq, [dstq+strideq*4] - add cq, 16 - dec r4b - jl .ret - test r4b, 1 - jz .loop - add cq, 32*15 - lea dstq, [r5+16] - jmp .loop -.ret: - 
sub cd, eax - pxor m0, m0 - add cd, 384 -.zero_loop: - mova [rax+32*0], m0 - mova [rax+32*1], m0 - mova [rax+32*2], m0 - mova [rax+32*3], m0 - add rax, 32*4 - sub cd, 128 - jge .zero_loop - RET - -cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_8192)] - mov [cq], eobd - mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_32x8).dconly -.normal: - PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ - base, tmp3, tmp4 - %undef cmp - lea tmp1q, [rsp+32*7] - lea tmp2q, [tmp1q+32*8] - sub eobd, 136 - mov tmp4d, eobd -.pass1_loop: - LOAD_8ROWS cq+64*1, 64*2 - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 - test tmp4d, tmp4d - jl .fast - LOAD_8ROWS_H cq+64*17, 64*2 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf - LOAD_8ROWS_H cq+64*16, 64*2 - pxor m0, m0 - REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ - 24, 25, 26, 27, 28, 29, 30, 31 - mova [rsp], m15 - jmp .idct16 -.fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - pxor m8, m8 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 - mova [rsp], m8 -.idct16: - LOAD_8ROWS cq+64*0, 64*2 - pxor m15, m15 - REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_32x16).pass1_end - vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - lea tmp3q, [tmp1q+32*32] - mova m15, [rsp] - mova [tmp3q-32*4], m0 - mova [tmp3q-32*3], m2 - mova [tmp3q-32*2], m4 - mova [tmp3q-32*1], m6 - mova [tmp3q+32*0], m8 - mova [tmp3q+32*1], m10 - mova [tmp3q+32*2], m12 - mova [tmp3q+32*3], m14 - add tmp3q, 32*8 - mova [tmp3q-32*4], m1 - mova [tmp3q-32*3], m3 - mova [tmp3q-32*2], m5 - mova [tmp3q-32*1], m7 - mova [tmp3q+32*0], m9 - mova [tmp3q+32*1], m11 - mova [tmp3q+32*2], m13 - mova [tmp3q+32*3], m15 - vpbroadcastd m9, [o(pw_8192)] - pmulhrsw m0, m9, [tmp1q-32*4] - pmulhrsw m1, m9, [tmp1q-32*3] - pmulhrsw m2, m9, [tmp1q-32*2] - pmulhrsw m3, m9, [tmp1q-32*1] - pmulhrsw m4, m9, [tmp1q+32*0] - pmulhrsw m5, m9, [tmp1q+32*1] - pmulhrsw m6, m9, [tmp1q+32*2] - pmulhrsw m7, m9, [tmp1q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - mova [tmp1q-32*4], m0 - pmulhrsw m0, m9, [tmp2q-32*4] - mova [tmp2q-32*4], m1 - pmulhrsw m1, m9, [tmp2q-32*3] - mova [tmp1q-32*3], m2 - pmulhrsw m2, m9, [tmp2q-32*2] - mova [tmp2q-32*3], m3 - pmulhrsw m3, m9, [tmp2q-32*1] - mova [tmp1q-32*2], m4 - pmulhrsw m4, m9, [tmp2q+32*0] - mova [tmp2q-32*2], m5 - pmulhrsw m5, m9, [tmp2q+32*1] - mova [tmp1q-32*1], m6 - pmulhrsw m6, m9, [tmp2q+32*2] - mova [tmp2q-32*1], m7 - pmulhrsw m7, m9, [tmp2q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - mova [tmp1q+32*0], m0 - mova [tmp2q+32*0], m1 - mova [tmp1q+32*1], m2 - mova [tmp2q+32*1], m3 - mova [tmp1q+32*2], m4 - mova [tmp2q+32*2], m5 - mova [tmp1q+32*3], m6 - mova [tmp2q+32*3], m7 - add cq, 32 - add tmp1q, 32*16 - add tmp2q, 32*16 - add eobd, 0x80000000 - jnc .pass1_loop - add tmp1q, 32*24 - imul r2, strideq, 19 - lea r3, [strideq*3] - add r2, dstq - test tmp4d, tmp4d - jge .pass2_loop - add tmp1q, 32*16 - add tmp2q, 32*16 - add tmp3q, 32*16 -.pass2_loop: - LOAD_8ROWS tmp2q-32*4, 32 - test tmp4d, tmp4d - jl .fast2 - LOAD_8ROWS_H tmp3q-32*4, 32 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf - sub tmp3q, 32*8 - LOAD_8ROWS_H tmp3q-32*4, 32 - sub tmp3q, 32*16 - jmp .pass2_loop_end -.fast2: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - sub 
tmp3q, 32*24 - pxor m8, m8 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 -.pass2_loop_end: - LOAD_8ROWS tmp3q-32*4, 32 - mova [rsp], m15 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_16x32).pass2_end - lea tmp3q, [tmp1q-32*32] - cmp tmp2q, tmp3q - jb .ret - sub tmp2q, 32*32 - sub dstq, r3 - lea r2, [r2+r3+16] - add dstq, 16 - jmp .pass2_loop -.ret: - RET - -cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob - %undef cmp - vpbroadcastd m9, [pw_8192] - sub eobd, 136 ; if (eob < 136) - shr eobd, 30 ; topleft 16x16 only - lea eobd, [eobq*2-8] - lea r4, [strideq*3] - mov r5, dstq - lea rax, [cq+32] -.loop: - mova xm0, [cq+64* 0] - mova xm1, [cq+64* 1] - vinserti128 m0, [cq+64* 8], 1 - vinserti128 m1, [cq+64* 9], 1 - mova xm2, [cq+64* 2] - mova xm3, [cq+64* 3] - vinserti128 m2, [cq+64*10], 1 - vinserti128 m3, [cq+64*11], 1 - mova xm4, [cq+64* 4] - mova xm5, [cq+64* 5] - vinserti128 m4, [cq+64*12], 1 - vinserti128 m5, [cq+64*13], 1 - mova xm6, [cq+64* 6] - mova xm7, [cq+64* 7] - vinserti128 m6, [cq+64*14], 1 - vinserti128 m7, [cq+64*15], 1 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 - WRITE_16X2 2, 3, 0, 1, strideq*2, r4 - lea dstq, [dstq+strideq*4] - WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 - WRITE_16X2 6, 7, 0, 1, strideq*2, r4 - lea dstq, [dstq+strideq*4] - add cq, 16 - inc eobd - jz .ret - test eobd, 3 - jnz .loop - add cq, 64*15 - lea dstq, [r5+16] - jmp .loop -.ret: - pxor m0, m0 - mov r0d, 16 - cmp cq, rax - jne .zero_loop -.zero_loop_topleft: - mova [rax-32*1], m0 - mova [rax+32*1], m0 - mova [rax+32*3], m0 - mova [rax+32*5], m0 - add rax, 64*4 - sub r0d, 4 - jg .zero_loop_topleft - RET -.zero_loop: - mova [rax-32*1], m0 - mova [rax+32*0], m0 - mova [rax+32*1], m0 - mova [rax+32*2], m0 - add rax, 32*4 - dec r0d - jg .zero_loop - RET - -%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) -%if %1 & 1 - mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n - mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n -%else - mova m%5, [tmp1q-32*(45-%1)] - mova m%4, [tmp2q-32*(20+%1)] -%endif - psubsw m%6, m%5, m%4 ; idct32 out31-n - paddsw m%5, m%4 ; idct32 out 0+n - psubsw m%4, m%6, m%3 ; out32+n - paddsw m%6, m%3 ; out31-n - psubsw m%3, m%5, m%2 ; out63-n - paddsw m%5, m%2 ; out 0+n -%if %0 == 6 ; pass 1 -%if %1 & 1 - mova [tmp2q-32*(19-%1)], m%4 - mova [tmp1q-32*(14+%1)], m%6 - mova [tmp1q+32*(18-%1)], m%3 - mova [tmp2q-32*(51-%1)], m%5 -%else - mova [tmp1q-32*(13-%1)], m%4 - mova [tmp2q-32*(20+%1)], m%6 - mova [tmp2q+32*(12-%1)], m%3 - mova [tmp1q-32*(45-%1)], m%5 -%endif -%else ; pass 2 - REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 -%if %1 & 1 - %define %%d0 r2 - %define %%d1 dstq -%else - %define %%d0 dstq - %define %%d1 r2 -%endif - pmovzxbw m%2, [%%d0+%9 ] - paddw m%2, m%4 - pmovzxbw m%4, [%%d1+%8 ] - paddw m%4, m%6 - pmovzxbw m%6, [%%d1+%10] - paddw m%3, m%6 - pmovzxbw m%6, [%%d0+%7 ] - paddw m%5, m%6 - packuswb m%2, m%4 - packuswb m%3, m%5 - vpermq m%2, m%2, q3120 - vpermq m%3, m%3, q3120 - mova [%%d0+%9 ], xm%2 - vextracti128 [%%d1+%8 ], m%2, 1 - mova [%%d1+%10], xm%3 - vextracti128 [%%d0+%7 ], m%3, 1 -%endif -%endmacro - -cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_8192)] - mov [cq], eobd - mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_16x4).dconly -.normal: - PROLOGUE 0, 10, 
16, 32*67, dst, stride, c, eob, tmp1, tmp2 - %undef cmp - lea tmp1q, [rsp+32*23] - lea tmp2q, [tmp1q+32*24] - sub eobd, 151 - mov r7d, eobd -.pass1_loop: - LOAD_16ROWS cq, 64 - call m(idct_16x16_internal).main - mova m1, [rsp+32*1] - mova [rsp+32*0], m6 - mova [rsp+32*1], m7 - vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - mova m15, [rsp+32*0] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m2 - mova [tmp1q-32*2], m4 - mova [tmp1q-32*1], m6 - mova [tmp1q+32*0], m8 - mova [tmp1q+32*1], m10 - mova [tmp1q+32*2], m12 - mova [tmp1q+32*3], m14 - mova [tmp2q-32*4], m1 - mova [tmp2q-32*3], m3 - mova [tmp2q-32*2], m5 - mova [tmp2q-32*1], m7 - mova [tmp2q+32*0], m9 - mova [tmp2q+32*1], m11 - mova [tmp2q+32*2], m13 - mova [tmp2q+32*3], m15 - add cq, 32 - add tmp1q, 32*8 - add tmp2q, 32*8 - add eobd, 0x80000000 - jnc .pass1_loop - lea r2, [rsp+32*23] - mova xm0, [r2-32*4+ 0] - mova xm1, [r2-32*2+ 0] - vinserti128 m0, [r2+32*0+ 0], 1 - vinserti128 m1, [r2+32*2+ 0], 1 - mova xm2, [r2-32*4+16] - mova xm3, [r2-32*2+16] - vinserti128 m2, [r2+32*0+16], 1 - vinserti128 m3, [r2+32*2+16], 1 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 - test r7d, r7d - jl .fast - lea r3, [r2+32*8] - mova xm4, [r3-32*4+ 0] - mova xm5, [r3-32*2+ 0] - vinserti128 m4, [r3+32*0+ 0], 1 - vinserti128 m5, [r3+32*2+ 0], 1 - mova xm6, [r3-32*4+16] - mova xm7, [r3-32*2+16] - vinserti128 m6, [r3+32*0+16], 1 - vinserti128 m7, [r3+32*2+16], 1 -.fast: - mova [rsp], m8 - lea tmp1q, [rsp+32*7] - call m(idct_16x16_internal).main - mova m1, [rsp+32*1] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m1 - mova [tmp1q-32*2], m2 - mova [tmp1q-32*1], m3 - mova [tmp1q+32*0], m4 - mova [tmp1q+32*1], m5 - mova [tmp1q+32*2], m6 - mova [tmp1q+32*3], m7 - add tmp1q, 32*8 - mova [tmp1q-32*4], m8 - mova [tmp1q-32*3], m9 - mova [tmp1q-32*2], m10 - mova [tmp1q-32*1], m11 - mova [tmp1q+32*0], m12 - mova [tmp1q+32*1], m13 - mova [tmp1q+32*2], m14 - mova [tmp1q+32*3], m15 - mova xm0, [r2-32*3+ 0] - mova xm1, [r2-32*1+ 0] - vinserti128 m0, [r2+32*1+ 0], 1 - vinserti128 m1, [r2+32*3+ 0], 1 - mova xm2, [r2-32*3+16] - mova xm3, [r2-32*1+16] - vinserti128 m2, [r2+32*1+16], 1 - vinserti128 m3, [r2+32*3+16], 1 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - test r7d, r7d - jl .fast2 - mova xm4, [r3-32*3+ 0] - mova xm5, [r3-32*1+ 0] - vinserti128 m4, [r3+32*1+ 0], 1 - vinserti128 m5, [r3+32*3+ 0], 1 - mova xm6, [r3-32*3+16] - mova xm7, [r3-32*1+16] - vinserti128 m6, [r3+32*1+16], 1 - vinserti128 m7, [r3+32*3+16], 1 -.fast2: - add tmp1q, 32*8 - lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - add r2, 32*24 - vpbroadcastd m15, [o(pd_2048)] - add tmp1q, 32*16 - add tmp2q, 32*32 - mova xm0, [r2-32*4+ 0] - mova xm3, [r2-32*1+16] - vinserti128 m0, [r2+32*0+ 0], 1 - vinserti128 m3, [r2+32*3+16], 1 - mova xm4, [r2-32*4+16] - mova xm7, [r2-32*1+ 0] - vinserti128 m4, [r2+32*0+16], 1 - vinserti128 m7, [r2+32*3+ 0], 1 - pxor m1, m1 - REPX {mova x, m1}, m2, m5, m6 - test r7d, r7d - jl .fast3 - add r3, 32*24 - mova xm1, [r3-32*1+16] - mova xm2, [r3-32*4+ 0] - vinserti128 m1, [r3+32*3+16], 1 - vinserti128 m2, [r3+32*0+ 0], 1 - mova xm5, [r3-32*1+ 0] - mova xm6, [r3-32*4+16] - vinserti128 m5, [r3+32*3+ 0], 1 - vinserti128 m6, [r3+32*0+16], 1 -.fast3: - add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 - add rax, 8 - add tmp1q, 32*8 - sub tmp2q, 32*8 - mova xm0, [r2-32*2+ 0] - mova xm3, [r2-32*3+16] - vinserti128 m0, [r2+32*2+ 0], 1 - vinserti128 m3, 
[r2+32*1+16], 1 - mova xm4, [r2-32*2+16] - mova xm7, [r2-32*3+ 0] - vinserti128 m4, [r2+32*2+16], 1 - vinserti128 m7, [r2+32*1+ 0], 1 - pxor m1, m1 - REPX {mova x, m1}, m2, m5, m6 - test r7d, r7d - jl .fast4 - mova xm1, [r3-32*3+16] - mova xm2, [r3-32*2+ 0] - vinserti128 m1, [r3+32*1+16], 1 - vinserti128 m2, [r3+32*2+ 0], 1 - mova xm5, [r3-32*3+ 0] - mova xm6, [r3-32*2+16] - vinserti128 m5, [r3+32*1+ 0], 1 - vinserti128 m6, [r3+32*2+16], 1 -.fast4: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 - RET -ALIGN function_align -%define o_base idct64_mul - 8 -.main_part1: - ; idct64 steps 1-5: - ; in1/31/17/15/ 9/23/25/ 7 -> - ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a - ; in5/27/21/11/13/19/29/ 3 -> - ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a - vpbroadcastd m11, [o(idct64_mul+4* 0)] - vpbroadcastd m13, [o(idct64_mul+4* 1)] - vpbroadcastd m10, [o(idct64_mul+4* 4)] - vpbroadcastd m12, [o(idct64_mul+4* 5)] - pmulhrsw m11, m0 ; t63a - pmulhrsw m0, m13 ; t32a - pmulhrsw m10, m1 ; t62a - pmulhrsw m1, m12 ; t33a - vpbroadcastd m9, [o(idct64_mul+4* 8)] - vpbroadcastd m13, [o(idct64_mul+4* 9)] - vpbroadcastd m8, [o(idct64_mul+4*12)] - vpbroadcastd m12, [o(idct64_mul+4*13)] - pmulhrsw m9, m2 ; t61a - pmulhrsw m2, m13 ; t34a - pmulhrsw m8, m3 ; t60a - pmulhrsw m3, m12 ; t35a - psubsw m12, m0, m1 ; t33 - paddsw m0, m1 ; t32 - psubsw m1, m3, m2 ; t34 - paddsw m3, m2 ; t35 - psubsw m2, m8, m9 ; t61 - paddsw m8, m9 ; t60 - psubsw m9, m11, m10 ; t62 - paddsw m11, m10 ; t63 - ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a - vpbroadcastd m14, [o(pw_401_4076)] - ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a - psubsw m10, m0, m3 ; t35a - paddsw m0, m3 ; t32a - psubsw m3, m11, m8 ; t60a - paddsw m11, m8 ; t63a - psubsw m8, m9, m2 ; t34 - paddsw m9, m2 ; t33 - psubsw m2, m12, m1 ; t61 - paddsw m12, m1 ; t62 - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m9 - mova [tmp2q+32*2], m12 - mova [tmp2q+32*3], m11 - vpbroadcastd m13, [o(pw_m4017_799)] - vpbroadcastd m14, [o(pw_799_4017)] - ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a - ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60 - mova [tmp1q-32*2], m2 - mova [tmp1q-32*1], m3 - mova [tmp2q+32*0], m10 - mova [tmp2q+32*1], m8 - vpbroadcastd m3, [o(idct64_mul+4*16)] - vpbroadcastd m11, [o(idct64_mul+4*17)] - vpbroadcastd m2, [o(idct64_mul+4*20)] - vpbroadcastd m10, [o(idct64_mul+4*21)] - vpbroadcastd m1, [o(idct64_mul+4*24)] - vpbroadcastd m9, [o(idct64_mul+4*25)] - vpbroadcastd m0, [o(idct64_mul+4*28)] - vpbroadcastd m8, [o(idct64_mul+4*29)] - pmulhrsw m3, m4 ; t59a - pmulhrsw m4, m11 ; t36a - pmulhrsw m2, m5 ; t58a - pmulhrsw m5, m10 ; t37a - pmulhrsw m1, m6 ; t57a - pmulhrsw m6, m9 ; t38a - pmulhrsw m0, m7 ; t56a - pmulhrsw m7, m8 ; t39a - psubsw m8, m4, m5 ; t37 - paddsw m4, m5 ; t36 - psubsw m5, m7, m6 ; t38 - paddsw m7, m6 ; t39 - psubsw m6, m0, m1 ; t57 - paddsw m0, m1 ; t56 - psubsw m1, m3, m2 ; t58 - paddsw m3, m2 ; t59 - ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a - vpbroadcastd m10, [o(pw_3166_2598)] - ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a - psubsw m2, m7, m4 ; t36a - paddsw m7, m4 ; t39a - psubsw m4, m0, m3 ; t59a - paddsw m0, m3 ; t56a - psubsw m3, m6, m1 ; t37 - paddsw m6, m1 ; t38 - psubsw m1, m5, m8 ; t58 - paddsw m5, m8 ; t57 - mova [tmp1q+32*2], m6 - mova [tmp1q+32*3], m7 - mova [tmp2q-32*4], m0 - mova [tmp2q-32*3], m5 - vpbroadcastd m6, [o(pw_m799_m4017)] - vpbroadcastd m7, [o(pw_m4017_799)] - ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 
; t36, t59 - ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a - mova [tmp1q+32*0], m4 - mova [tmp1q+32*1], m1 - mova [tmp2q-32*2], m3 - mova [tmp2q-32*1], m2 - ret -%define o_base pw_5 + 128 -.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub - sub rax, o_idct64_offset + 8 - vpbroadcastd m11, [o(pw_1567_3784)] - vpbroadcastd m12, [o(pw_m3784_1567)] - vpbroadcastd m13, [o(pw_2896_2896)] - vpbroadcastd m14, [o(pw_m2896_2896)] -.main_part2_pass1_loop: - call .main_part2_internal - IDCT64_PART2_END 0, 7, 0, 6, 9, 10 - IDCT64_PART2_END 7, 8, 5, 0, 6, 7 - IDCT64_PART2_END 8, 2, 1, 0, 6, 7 - IDCT64_PART2_END 15, 3, 4, 0, 6, 7 - cmp tmp1q, tmp2q - jne .main_part2_pass1_loop - ret -.main_part2_internal: - mova m0, [tmp1q-32*12] ; t32a - mova m6, [tmp2q-32*13] ; t39a - mova m1, [tmp1q-32* 4] ; t40a - mova m5, [tmp2q+32* 3] ; t55a - add tmp1q, 32 - sub tmp2q, 32 - mova m2, [tmp1q+32* 3] ; t48a - mova m4, [tmp2q-32* 4] ; t47a - mova m3, [tmp1q+32*11] ; t56a - mova m7, [tmp2q+32*12] ; t63a - psubsw m8, m0, m6 ; t39 - paddsw m0, m6 ; t32 - psubsw m6, m4, m1 ; t40 - paddsw m4, m1 ; t47 - psubsw m1, m2, m5 ; t55 - paddsw m2, m5 ; t48 - psubsw m5, m7, m3 ; t56 - paddsw m7, m3 ; t63 - ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a - vpbroadcastd m9, [o(pw_m1567_m3784)] - ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a - psubsw m3, m0, m4 ; t47a - paddsw m0, m4 ; t32a - psubsw m4, m7, m2 ; t48a - paddsw m7, m2 ; t63a - psubsw m2, m5, m1 ; t40 - paddsw m5, m1 ; t39 - psubsw m1, m8, m6 ; t55 - paddsw m8, m6 ; t56 - ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48 - ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a - ret -.main_part2_pass2: - sub rax, o_idct64_offset + 8 - vpbroadcastd m11, [o(pw_1567_3784)] - vpbroadcastd m12, [o(pw_m3784_1567)] - vpbroadcastd m13, [o(pw_2896_2896)] - lea r9, [strideq*5] ; stride*5 - lea r3, [r9+strideq*1] ; stride*6 - lea r7, [r9+strideq*2] ; stride*7 - lea r8, [r3+strideq*2] ; stride*8 - lea r2, [dstq+r7] -.main_part2_pass2_loop: - vpbroadcastd m14, [o(pw_m2896_2896)] - call .main_part2_internal - vpbroadcastd m14, [o(pw_2048)] - IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 - IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 - IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 - IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 - add dstq, strideq - sub r2, strideq - cmp tmp1q, tmp2q - jne .main_part2_pass2_loop - ret - -cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_8192)] - mov [cq], eobd - mov r2d, 16 -.dconly: - pmulhrsw xm0, xm2 - movd xm2, [o(pw_2048)] - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - pxor m1, m1 -.dconly_loop: - mova m2, [dstq+32*0] - mova m3, [dstq+32*1] - punpckhbw m4, m2, m1 - punpcklbw m2, m1 - punpckhbw m5, m3, m1 - punpcklbw m3, m1 - paddw m4, m0 - paddw m2, m0 - paddw m5, m0 - paddw m3, m0 - packuswb m2, m4 - packuswb m3, m5 - mova [dstq+32*0], m2 - mova [dstq+32*1], m3 - add dstq, strideq - dec r2d - jg .dconly_loop - RET -.normal: - PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 - LOAD_8ROWS cq+32*0, 32*4 - pxor m8, m8 - REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 - mova [rsp], m8 - lea tmp1q, [rsp+32*7] - call m(idct_16x16_internal).main - mova m1, [rsp+32*1] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m1 - mova [tmp1q-32*2], m2 - mova [tmp1q-32*1], 
m3 - mova [tmp1q+32*0], m4 - mova [tmp1q+32*1], m5 - mova [tmp1q+32*2], m6 - mova [tmp1q+32*3], m7 - add tmp1q, 32*8 - mova [tmp1q-32*4], m8 - mova [tmp1q-32*3], m9 - mova [tmp1q-32*2], m10 - mova [tmp1q-32*1], m11 - mova [tmp1q+32*0], m12 - mova [tmp1q+32*1], m13 - mova [tmp1q+32*2], m14 - mova [tmp1q+32*3], m15 - LOAD_8ROWS cq+32*2, 32*4 - pxor m8, m8 - REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 - add tmp1q, 32*8 - lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - vpbroadcastd m15, [o(pd_2048)] - add tmp1q, 32*16 - add tmp2q, 32*32 - mova m0, [cq+32* 1] - mova m1, [cq+32*31] - mova m2, [cq+32*17] - mova m3, [cq+32*15] - mova m4, [cq+32* 9] - mova m5, [cq+32*23] - mova m6, [cq+32*25] - mova m7, [cq+32* 7] - pxor m8, m8 - REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 - add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 - add rax, 8 - add tmp1q, 32*8 - sub tmp2q, 32*8 - mova m0, [cq+32* 5] - mova m1, [cq+32*27] - mova m2, [cq+32*21] - mova m3, [cq+32*11] - mova m4, [cq+32*13] - mova m5, [cq+32*19] - mova m6, [cq+32*29] - mova m7, [cq+32* 3] - pxor m8, m8 - REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 - sub tmp1q, 32*36 - lea r2, [strideq*3] - mov tmp2d, 4 -.pass2_loop: - lea r3, [tmp1q-32*8] - mova xm0, [r3 -32*4] - mova xm1, [r3 -32*3] - vinserti128 m0, [tmp1q-32*4], 1 - vinserti128 m1, [tmp1q-32*3], 1 - mova xm2, [r3 -32*2] - mova xm3, [r3 -32*1] - vinserti128 m2, [tmp1q-32*2], 1 - vinserti128 m3, [tmp1q-32*1], 1 - mova xm4, [r3 +32*0] - mova xm5, [r3 +32*1] - vinserti128 m4, [tmp1q+32*0], 1 - vinserti128 m5, [tmp1q+32*1], 1 - mova xm6, [r3 +32*2] - mova xm7, [r3 +32*3] - vinserti128 m6, [tmp1q+32*2], 1 - vinserti128 m7, [tmp1q+32*3], 1 - mova xm8, [r3 -32*4+16] - mova xm9, [r3 -32*3+16] - vinserti128 m8, [tmp1q-32*4+16], 1 - vinserti128 m9, [tmp1q-32*3+16], 1 - mova xm10, [r3 -32*2+16] - mova xm11, [r3 -32*1+16] - vinserti128 m10, [tmp1q-32*2+16], 1 - vinserti128 m11, [tmp1q-32*1+16], 1 - mova xm12, [r3 +32*0+16] - mova xm13, [r3 +32*1+16] - vinserti128 m12, [tmp1q+32*0+16], 1 - vinserti128 m13, [tmp1q+32*1+16], 1 - mova xm14, [r3 +32*2+16] - mova xm15, [r3 +32*3+16] - vinserti128 m14, [tmp1q+32*2+16], 1 - vinserti128 m15, [tmp1q+32*3+16], 1 - mova [rsp+32*0], m6 - mova [rsp+32*1], m7 - vpbroadcastd m7, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - call m(idct_16x16_internal).main - mova [rsp+32*0], m15 - vpbroadcastd m15, [o(pw_2048)] - REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 - WRITE_16X2 2, 3, 1, 2, strideq*2, r2 - pmulhrsw m1, m15, [rsp+32*1] - WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 - lea r3, [dstq+strideq*4] - %define dstq r3 - WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 - WRITE_16X2 6, 7, 2, 3, strideq*2, r2 - REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 - lea r3, [r3+strideq*4] - WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 - WRITE_16X2 10, 11, 2, 3, strideq*2, r2 - pmulhrsw m15, [rsp+32*0] - lea r3, [r3+strideq*4] - WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 - WRITE_16X2 14, 15, 2, 3, strideq*2, r2 - add tmp1q, 32*16 - add r0, 16 - dec tmp2d - jg .pass2_loop - RET - -cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - pmulhrsw xm0, xm1 - mov r2d, 64 - jmp 
m(inv_txfm_add_dct_dct_32x8).dconly -.normal: - PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 - lea tmp1q, [rsp+32*7] - lea r10d, [eobq-136] - sar r10d, 31 -.pass1_loop: - lea tmp2q, [tmp1q+32*16] - LOAD_8ROWS cq+64*1, 64*2, 1 - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 - test r10b, r10b - jnz .fast - LOAD_8ROWS_H cq+64*17, 64*2, 2 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf - LOAD_8ROWS_H cq+64*16, 64*2, 1 - mova [rsp], m15 - pxor m15, m15 - REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ - 24, 25, 26, 27, 28, 29, 30, 31 - jmp .idct16 -.fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - pxor m8, m8 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 - mova [rsp], m8 -.idct16: - LOAD_8ROWS cq+64*0, 64*2, 1 - pxor m15, m15 - REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_32x16).pass1_end - vpbroadcastd m7, [o(pw_16384)] - call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round - lea r3, [tmp1q+32*48] - mova m15, [rsp] - mova [r3-32*4], m0 - mova [r3-32*3], m2 - mova [r3-32*2], m4 - mova [r3-32*1], m6 - mova [r3+32*0], m8 - mova [r3+32*1], m10 - mova [r3+32*2], m12 - mova [r3+32*3], m14 - add r3, 32*24 - mova [r3-32*4], m1 - mova [r3-32*3], m3 - mova [r3-32*2], m5 - mova [r3-32*1], m7 - mova [r3+32*0], m9 - mova [r3+32*1], m11 - mova [r3+32*2], m13 - mova [r3+32*3], m15 - vpbroadcastd m9, [o(pw_16384)] - pmulhrsw m0, m9, [tmp1q-32*4] - pmulhrsw m1, m9, [tmp1q-32*3] - pmulhrsw m2, m9, [tmp1q-32*2] - pmulhrsw m3, m9, [tmp1q-32*1] - pmulhrsw m4, m9, [tmp1q+32*0] - pmulhrsw m5, m9, [tmp1q+32*1] - pmulhrsw m6, m9, [tmp1q+32*2] - pmulhrsw m7, m9, [tmp1q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - mova [tmp1q-32*4], m0 - pmulhrsw m0, m9, [tmp2q-32*4] - mova [tmp2q-32*4], m1 - pmulhrsw m1, m9, [tmp2q-32*3] - mova [tmp1q-32*3], m2 - pmulhrsw m2, m9, [tmp2q-32*2] - mova [tmp2q-32*3], m3 - pmulhrsw m3, m9, [tmp2q-32*1] - mova [tmp1q-32*2], m4 - pmulhrsw m4, m9, [tmp2q+32*0] - mova [tmp2q-32*2], m5 - pmulhrsw m5, m9, [tmp2q+32*1] - mova [tmp1q-32*1], m6 - pmulhrsw m6, m9, [tmp2q+32*2] - mova [tmp2q-32*1], m7 - pmulhrsw m7, m9, [tmp2q+32*3] - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - mova [tmp1q+32*0], m0 - mova [tmp2q+32*0], m1 - mova [tmp1q+32*1], m2 - mova [tmp2q+32*1], m3 - mova [tmp1q+32*2], m4 - mova [tmp2q+32*2], m5 - mova [tmp1q+32*3], m6 - mova [tmp2q+32*3], m7 - add cq, 32 - add tmp1q, 32*8 - add r10d, 0x80000000 - jnc .pass1_loop - lea r2, [rsp+32*55] - lea r7, [r2+32*24] -.pass2_loop: - lea r3, [r2+32*8] - lea r8, [r7+32*8] - mova m0, [r2-32*4] - mova m1, [r2-32*2] - mova m2, [r2+32*0] - mova m3, [r2+32*2] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 - test r10b, r10b - jnz .fast2 - mova m4, [r3-32*4] - mova m5, [r3-32*2] - mova m6, [r3+32*0] - mova m7, [r3+32*2] -.fast2: - mova [rsp], m8 - lea tmp1q, [rsp+32*39] - call m(idct_16x16_internal).main - mova m1, [rsp+32*1] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m1 - mova [tmp1q-32*2], m2 - mova [tmp1q-32*1], m3 - mova [tmp1q+32*0], m4 - mova [tmp1q+32*1], m5 - mova [tmp1q+32*2], m6 - mova [tmp1q+32*3], m7 - add tmp1q, 32*8 - mova [tmp1q-32*4], m8 - mova [tmp1q-32*3], m9 - mova [tmp1q-32*2], m10 - mova [tmp1q-32*1], m11 - mova [tmp1q+32*0], m12 - mova [tmp1q+32*1], m13 - mova [tmp1q+32*2], m14 - mova [tmp1q+32*3], m15 - mova m0, [r2-32*3] - mova m1, [r2-32*1] - mova m2, [r2+32*1] - mova m3, [r2+32*3] - pxor m4, m4 - REPX {mova x, 
m4}, m5, m6, m7 - test r10b, r10b - jnz .fast3 - mova m4, [r3-32*3] - mova m5, [r3-32*1] - mova m6, [r3+32*1] - mova m7, [r3+32*3] -.fast3: - add tmp1q, 32*8 - lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - vpbroadcastd m15, [o(pd_2048)] - add tmp1q, 32*16 - add tmp2q, 32*32 - mova m0, [r7-32*4] - mova m3, [r7+32*3] - mova m4, [r7+32*0] - mova m7, [r7-32*1] - pxor m1, m1 - REPX {mova x, m1}, m2, m5, m6 - test r10b, r10b - jnz .fast4 - mova m1, [r8+32*3] - mova m2, [r8-32*4] - mova m5, [r8-32*1] - mova m6, [r8+32*0] -.fast4: - add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 - add rax, 8 - add tmp1q, 32*8 - sub tmp2q, 32*8 - mova m0, [r7-32*2] - mova m3, [r7+32*1] - mova m4, [r7+32*2] - mova m7, [r7-32*3] - pxor m1, m1 - REPX {mova x, m1}, m2, m5, m6 - test r10b, r10b - jnz .fast5 - mova m1, [r8+32*1] - mova m2, [r8-32*2] - mova m5, [r8-32*3] - mova m6, [r8+32*2] -.fast5: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 - add r10d, 0x80000000 - jc .ret - lea r2, [rsp+32*7] - lea r7, [r2+32*16] - sub dstq, r8 - lea dstq, [dstq+strideq*4+16] - jmp .pass2_loop -.ret: - RET - -cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_16384)] - mov [cq], eobd - pmulhrsw xm0, xm1 - mov r2d, 32 - jmp m(inv_txfm_add_dct_dct_64x16).dconly -.normal: - PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ - base, tmp3, tmp4 - lea tmp1q, [rsp+32*7] - lea tmp4d, [eobq-136] -.pass1_loop: - LOAD_8ROWS cq+64*0, 64*4, 1 - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 - mova [rsp], m8 - call m(idct_16x16_internal).main - mova m1, [rsp+32*1] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m1 - mova [tmp1q-32*2], m2 - mova [tmp1q-32*1], m3 - mova [tmp1q+32*0], m4 - mova [tmp1q+32*1], m5 - mova [tmp1q+32*2], m6 - mova [tmp1q+32*3], m7 - add tmp1q, 32*8 - mova [tmp1q-32*4], m8 - mova [tmp1q-32*3], m9 - mova [tmp1q-32*2], m10 - mova [tmp1q-32*1], m11 - mova [tmp1q+32*0], m12 - mova [tmp1q+32*1], m13 - mova [tmp1q+32*2], m14 - mova [tmp1q+32*3], m15 - LOAD_8ROWS cq+64*2, 64*4, 1 - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 - add tmp1q, 32*8 - lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - vpbroadcastd m15, [o(pd_2048)] - add tmp1q, 32*16 - add tmp2q, 32*32 - vpbroadcastd m7, [o(pw_2896x8)] - pmulhrsw m0, m7, [cq+64* 1] - pmulhrsw m1, m7, [cq+64*31] - pmulhrsw m2, m7, [cq+64*17] - pmulhrsw m3, m7, [cq+64*15] - pmulhrsw m4, m7, [cq+64* 9] - pmulhrsw m5, m7, [cq+64*23] - pmulhrsw m6, m7, [cq+64*25] - pmulhrsw m7, [cq+64* 7] - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 - add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 - vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] - add rax, 8 - add tmp1q, 32*8 - sub tmp2q, 32*8 - pmulhrsw m0, m7, [cq+64* 5] - pmulhrsw m1, m7, [cq+64*27] - pmulhrsw m2, m7, [cq+64*21] - pmulhrsw m3, m7, [cq+64*11] - pmulhrsw m4, m7, [cq+64*13] - pmulhrsw m5, m7, [cq+64*19] - pmulhrsw m6, m7, [cq+64*29] - pmulhrsw m7, [cq+64* 3] - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 - sub tmp1q, 32*44 - vpbroadcastd m10, [o(pw_16384)] - call 
m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave - add cq, 32 - add tmp4d, 0x80000000 - jnc .pass1_loop - lea tmp1q, [rsp+32*15] - imul r2, strideq, 19 - lea r3, [strideq*3] - add r2, dstq - mov tmp4b, 4 -.pass2_loop: - lea tmp2q, [tmp1q+32*64] - LOAD_8ROWS tmp1q-32*4, 32 - test tmp4d, 0x40000000 - jnz .fast - LOAD_8ROWS_H tmp2q-32*4, 32 - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf - lea tmp3q, [tmp2q-32*8] - LOAD_8ROWS_H tmp3q-32*4, 32 - mova [rsp], m15 - jmp .idct16 -.fast: - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - pxor m8, m8 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 - mova [rsp], m8 -.idct16: - lea tmp3q, [tmp1q-32*8] - LOAD_8ROWS tmp3q-32*4, 32 - call m(idct_16x16_internal).main - call m(inv_txfm_add_dct_dct_16x32).pass2_end - add tmp1q, 32*16 - sub dstq, r3 - lea r2, [r2+r3+16] - add dstq, 16 - dec tmp4b - jg .pass2_loop - RET -ALIGN function_align -.transpose_round_interleave: - mov tmp3d, 4 -.loop: - lea tmp2q, [tmp1q+32*8] - mova xm0, [tmp1q-32*4] - mova xm1, [tmp1q-32*3] - vinserti128 m0, [tmp2q-32*4], 1 - vinserti128 m1, [tmp2q-32*3], 1 - mova xm2, [tmp1q-32*2] - mova xm3, [tmp1q-32*1] - vinserti128 m2, [tmp2q-32*2], 1 - vinserti128 m3, [tmp2q-32*1], 1 - mova xm4, [tmp1q+32*0] - mova xm5, [tmp1q+32*1] - vinserti128 m4, [tmp2q+32*0], 1 - vinserti128 m5, [tmp2q+32*1], 1 - mova xm6, [tmp1q+32*2] - mova xm7, [tmp1q+32*3] - vinserti128 m6, [tmp2q+32*2], 1 - vinserti128 m7, [tmp2q+32*3], 1 - REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - mova xm8, [tmp1q-32*4+16] - mova xm9, [tmp1q-32*3+16] - vinserti128 m8, [tmp2q-32*4+16], 1 - vinserti128 m9, [tmp2q-32*3+16], 1 - mova [tmp1q-32*4], m0 - mova [tmp2q-32*4], m1 - mova [tmp1q-32*3], m2 - mova [tmp2q-32*3], m3 - mova xm2, [tmp1q-32*2+16] - mova xm3, [tmp1q-32*1+16] - vinserti128 m2, [tmp2q-32*2+16], 1 - vinserti128 m3, [tmp2q-32*1+16], 1 - mova [tmp1q-32*2], m4 - mova [tmp2q-32*2], m5 - mova [tmp1q-32*1], m6 - mova [tmp2q-32*1], m7 - mova xm4, [tmp1q+32*0+16] - mova xm5, [tmp1q+32*1+16] - vinserti128 m4, [tmp2q+32*0+16], 1 - vinserti128 m5, [tmp2q+32*1+16], 1 - mova xm6, [tmp1q+32*2+16] - mova xm7, [tmp1q+32*3+16] - vinserti128 m6, [tmp2q+32*2+16], 1 - vinserti128 m7, [tmp2q+32*3+16], 1 - pmulhrsw m0, m8, m10 - pmulhrsw m1, m9, m10 - REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 - call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - mova [tmp1q+32*0], m0 - mova [tmp2q+32*0], m1 - mova [tmp1q+32*1], m2 - mova [tmp2q+32*1], m3 - mova [tmp1q+32*2], m4 - mova [tmp2q+32*2], m5 - mova [tmp1q+32*3], m6 - mova [tmp2q+32*3], m7 - add tmp1q, 32*16 - dec tmp3d - jg .loop - ret - -cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob - lea rax, [o_base] - test eobd, eobd - jnz .normal - movd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1, [cq] - movd xm2, [o(pw_8192)] - mov [cq], eobd - mov r2d, 64 - jmp m(inv_txfm_add_dct_dct_64x16).dconly -.normal: - PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 - lea tmp1q, [rsp+32*71] - lea r10d, [eobq-136] -.pass1_loop: - LOAD_8ROWS cq+64*0, 64*4 - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 - REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 - mova [rsp], m8 - call m(idct_16x16_internal).main - mova m1, [rsp+32*1] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m1 - mova [tmp1q-32*2], m2 - mova [tmp1q-32*1], m3 - mova [tmp1q+32*0], m4 - mova [tmp1q+32*1], m5 - mova [tmp1q+32*2], m6 - mova [tmp1q+32*3], m7 - add tmp1q, 32*8 - mova [tmp1q-32*4], m8 - mova 
[tmp1q-32*3], m9 - mova [tmp1q-32*2], m10 - mova [tmp1q-32*1], m11 - mova [tmp1q+32*0], m12 - mova [tmp1q+32*1], m13 - mova [tmp1q+32*2], m14 - mova [tmp1q+32*3], m15 - LOAD_8ROWS cq+64*2, 64*4 - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 - add tmp1q, 32*8 - lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - vpbroadcastd m15, [o(pd_2048)] - add tmp1q, 32*16 - add tmp2q, 32*32 - mova m0, [cq+64* 1] - mova m1, [cq+64*31] - mova m2, [cq+64*17] - mova m3, [cq+64*15] - mova m4, [cq+64* 9] - mova m5, [cq+64*23] - mova m6, [cq+64*25] - mova m7, [cq+64* 7] - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 - add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 - add rax, 8 - add tmp1q, 32*8 - sub tmp2q, 32*8 - mova m0, [cq+64* 5] - mova m1, [cq+64*27] - mova m2, [cq+64*21] - mova m3, [cq+64*11] - mova m4, [cq+64*13] - mova m5, [cq+64*19] - mova m6, [cq+64*29] - mova m7, [cq+64* 3] - pxor m8, m8 - REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1 - sub tmp1q, 32*44 - vpbroadcastd m10, [o(pw_8192)] - call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave - add cq, 32 - add r10d, 0x80000000 - jnc .pass1_loop - lea tmp1q, [rsp+32*7] - mov r10b, 4 -.pass2_loop: - lea r2, [tmp1q+32*64] - mova m0, [r2-32*4] - mova m1, [r2-32*2] - mova m2, [r2+32*0] - mova m3, [r2+32*2] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 - mova [rsp], m4 - test r10d, 0x40000000 - jnz .fast - lea r3, [r2+32*64] - mova m4, [r3-32*4] - mova m5, [r3-32*2] - mova m6, [r3+32*0] - mova m7, [r3+32*2] -.fast: - call m(idct_16x16_internal).main - mova m1, [rsp+32*1] - mova [tmp1q-32*4], m0 - mova [tmp1q-32*3], m1 - mova [tmp1q-32*2], m2 - mova [tmp1q-32*1], m3 - mova [tmp1q+32*0], m4 - mova [tmp1q+32*1], m5 - mova [tmp1q+32*2], m6 - mova [tmp1q+32*3], m7 - add tmp1q, 32*8 - mova [tmp1q-32*4], m8 - mova [tmp1q-32*3], m9 - mova [tmp1q-32*2], m10 - mova [tmp1q-32*1], m11 - mova [tmp1q+32*0], m12 - mova [tmp1q+32*1], m13 - mova [tmp1q+32*2], m14 - mova [tmp1q+32*3], m15 - mova m0, [r2-32*3] - mova m1, [r2-32*1] - mova m2, [r2+32*1] - mova m3, [r2+32*3] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - test r10d, 0x40000000 - jnz .fast2 - mova m4, [r3-32*3] - mova m5, [r3-32*1] - mova m6, [r3+32*1] - mova m7, [r3+32*3] -.fast2: - add tmp1q, 32*8 - lea tmp2q, [tmp1q+32*8] - call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast - vpbroadcastd m15, [o(pd_2048)] - add r2, 32*8 - add r3, 32*8 - add tmp1q, 32*16 - add tmp2q, 32*32 - mova m0, [r2-32*4] ; 1 - mova m3, [r2+32*3] ; 15 - mova m4, [r2+32*0] ; 9 - mova m7, [r2-32*1] ; 7 - pxor m1, m1 - REPX {mova x, m1}, m2, m5, m6 - test r10d, 0x40000000 - jnz .fast3 - mova m1, [r3+32*3] ; 31 - mova m2, [r3-32*4] ; 17 - mova m5, [r3-32*1] ; 23 - mova m6, [r3+32*0] ; 25 -.fast3: - add rax, o_idct64_offset - call m(inv_txfm_add_dct_dct_16x64).main_part1 - add rax, 8 - add tmp1q, 32*8 - sub tmp2q, 32*8 - mova m0, [r2-32*2] ; 5 - mova m3, [r2+32*1] ; 11 - mova m4, [r2+32*2] ; 13 - mova m7, [r2-32*3] ; 3 - pxor m1, m1 - REPX {mova x, m1}, m2, m5, m6 - test r10d, 0x40000000 - jnz .fast4 - mova m1, [r3+32*1] ; 27 - mova m2, [r3-32*2] ; 21 - mova m5, [r3-32*3] ; 19 - mova m6, [r3+32*2] ; 29 -.fast4: - call m(inv_txfm_add_dct_dct_16x64).main_part1 - call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 - sub tmp1q, 32*28 - sub dstq, r8 - lea dstq, [dstq+strideq*4+16] - dec r10b 
- jg .pass2_loop - RET - -%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/itx_avx2.asm dav1d-0.9.1/src/x86/itx_avx2.asm --- dav1d-0.7.1/src/x86/itx_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/itx_avx2.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,5565 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +; Note: The order of (at least some of) those constants matters!
+ +const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +%macro COEF_PAIR 2 +pw_%1_%2: dw %1, %2 +pw_m%2_%1: dw -%2, %1 +%endmacro + +; ADST-only +pw_3803_1321: dw 3803, 1321 +pw_m1321_2482: dw -1321, 2482 +pw_2482_3344: dw 2482, 3344 +pw_m3344_3344: dw -3344, 3344 +pw_m3803_3344: dw -3803, 3344 +pw_m3803_m6688: dw -3803, -6688 +pw_2896_m2896: dw 2896, -2896 + +const pw_5, times 2 dw 5 +const pw_2048, times 2 dw 2048 +const pw_4096, times 2 dw 4096 +const pw_8192, times 2 dw 8192 +const pw_16384, times 2 dw 16384 +const pw_1697x16, times 2 dw 1697*16 +const pw_1697x8, times 2 dw 1697*8 +const pw_2896x8, times 2 dw 2896*8 +const pd_2048, dd 2048 + +const pw_2896_2896, dw 2896, 2896 +const pw_m2896_2896, dw -2896, 2896 +const pw_1567_3784, dw 1567, 3784 +const pw_m3784_1567, dw -3784, 1567 +COEF_PAIR 3784, 1567 +COEF_PAIR 201, 4091 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4052, 601 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +pw_m799_m4017: dw -799, -4017 +const pw_m1567_m3784, dw -1567, -3784 +pw_m3406_m2276: dw -3406, -2276 +pw_m401_m4076: dw -401, -4076 +pw_m3166_m2598: dw -3166, -2598 +pw_m1931_m3612: dw -1931, -3612 +pw_m3920_m1189: dw -3920, -1189 +COEF_PAIR 2276, 3406 +COEF_PAIR 4017, 799 + +%macro COEF_X8 1-* +%rep %0 + dw %1*8, %1*8 + %rotate 1 +%endrep +%endmacro + +pw_3703x8: COEF_X8 3703 +pw_1751x8: COEF_X8 1751 +pw_m1380x8: COEF_X8 -1380 +pw_3857x8: COEF_X8 3857 +pw_3973x8: COEF_X8 3973 +pw_995x8: COEF_X8 995 +pw_m2106x8: COEF_X8 -2106 +pw_3513x8: COEF_X8 3513 +pw_3290x8: COEF_X8 3290 +pw_2440x8: COEF_X8 2440 +pw_m601x8: COEF_X8 -601 +pw_4052x8: COEF_X8 4052 + +const idct64_mul +COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 +COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 +COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 +COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 + +pw_201_4091x8: dw 201*8, 4091*8 +pw_m601_4052x8: dw -601*8, 4052*8 +pw_995_3973x8: dw 995*8, 3973*8 +pw_m1380_3857x8: dw -1380*8, 3857*8 +pw_1751_3703x8: dw 1751*8, 3703*8 +pw_m2106_3513x8: dw -2106*8, 3513*8 +pw_2440_3290x8: dw 2440*8, 3290*8 +pw_m2751_3035x8: dw -2751*8, 3035*8 + +%define o_idct64_offset idct64_mul - (o_base) - 8 + +SECTION .text + +; Code size reduction trickery: Instead of using rip-relative loads with +; mandatory 4-byte offsets everywhere, we can set up a base pointer with a +; single rip-relative lea and then address things relative from that with +; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
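+; Illustration of the scheme above (editor's annotation, not part of the
+; upstream file): with
+;   %define o_base deint_shuf + 128
+;   %define o(x)   (rax - (o_base) + (x))
+; as defined just below, and "lea rax, [o_base]" executed on function entry,
+; a load such as
+;   vpbroadcastd m0, [o(pw_2048)]
+; assembles as [rax + (pw_2048 - deint_shuf - 128)]. Because pw_2048 lies
+; within +-128 bytes of deint_shuf+128, that displacement fits in a signed
+; byte (disp8) instead of the 4-byte offset a rip-relative load would need.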
+%define o_base deint_shuf + 128 +%define o(x) (rax - (o_base) + (x)) + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave, 4: coef_regs +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags +%if %7 & 4 + pmaddwd m%2, m%5, m%1 + pmaddwd m%1, m%6 +%else +%if %7 & 1 + vpbroadcastd m%2, [o(pw_%5_%6)] + vpbroadcastd m%3, [o(pw_m%6_%5)] +%else + vpbroadcastd m%2, [o(pw_m%6_%5)] + vpbroadcastd m%3, [o(pw_%5_%6)] +%endif + pmaddwd m%2, m%1 + pmaddwd m%1, m%3 +%endif + paddd m%2, m%4 + paddd m%1, m%4 +%if %7 & 2 + pslld m%2, 4 + psrld m%1, 12 + pblendw m%1, m%2, 0xaa +%else + psrad m%2, 12 + psrad m%1, 12 + packssdw m%1, m%2 +%endif +%endmacro + +; flags: 1 = swap, 2 = interleave, 4 = coef_regs +%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags +%if %10 & 1 + vpbroadcastd m%3, [o(pw_%8_%9)] + vpbroadcastd m%4, [o(pw_m%9_%8)] + vpbroadcastd xm%2, [o(pw_%6_%7)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_m%7_%6)] +%else + vpbroadcastd m%3, [o(pw_m%9_%8)] + vpbroadcastd m%4, [o(pw_%8_%9)] + vpbroadcastd xm%2, [o(pw_m%7_%6)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_%6_%7)] +%endif + vpblendd m%3, m%4, 0xf0 + ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 + punpckhwd m%3, m%2, m%1 + punpcklwd m%2, m%1 +%if %7 < 32 + pmaddwd m%1, m%7, m%2 + pmaddwd m%4, m%7, m%3 +%else + vpbroadcastd m%1, [o(pw_m%7_%6)] + pmaddwd m%4, m%3, m%1 + pmaddwd m%1, m%2 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 +%if %7 < 32 + pmaddwd m%3, m%6 + pmaddwd m%2, m%6 +%else + vpbroadcastd m%4, [o(pw_%6_%7)] + pmaddwd m%3, m%4 + pmaddwd m%2, m%4 +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %0 == 8 + packssdw m%8, m%2, m%3 +%else + packssdw m%2, m%3 +%endif +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 + psubsw m%3, m%1, m%2 + paddsw m%2, m%1 + paddsw m%1, m%4, m%5 + psubsw m%4, m%5 +%endmacro + +%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a + ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a + ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 + paddsw m%9, m%2, m%6 ; t4 + psubsw m%2, m%6 ; t5a + paddsw m%10, m%8, m%4 ; t7 + psubsw m%8, m%4 ; t6a + ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 + ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 + psubsw m%6, m%1, m%3 ; dct4 out2 + paddsw m%3, m%1 ; dct4 out1 + paddsw m%1, m%5, m%7 ; dct4 out0 + psubsw m%5, m%7 ; dct4 out3 + psubsw m%7, m%3, m%2 ; out6 + paddsw m%2, m%3 ; out1 + paddsw m%3, m%6, m%8 ; out2 + psubsw m%6, m%8 ; out5 + psubsw m%8, m%1, m%10 ; out7 + paddsw m%1, m%10 ; out0 + paddsw m%4, m%5, m%9 ; out3 + psubsw m%5, m%9 ; out4 +%endmacro + +; in1 = %1, in3 = %2, in5 = %3, in7 = %4 +; in9 = %5, in11 = %6, in13 = %7, in15 = %8 +%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a + ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2W %7, %2, 
%9, %10, %11, 3920, 1189 ; t11a, t12a + psubsw m%9, m%2, m%6 ; t13 + paddsw m%6, m%2 ; t12 + psubsw m%2, m%8, m%4 ; t14 + paddsw m%8, m%4 ; t15 + psubsw m%4, m%7, m%3 ; t10 + paddsw m%3, m%7 ; t11 + psubsw m%7, m%1, m%5 ; t9 + paddsw m%1, m%5 ; t8 + ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a + ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a + psubsw m%5, m%1, m%3 ; t11a + paddsw m%1, m%3 ; t8a + psubsw m%3, m%7, m%4 ; t13 + paddsw m%7, m%4 ; t14 + psubsw m%4, m%8, m%6 ; t12a + paddsw m%8, m%6 ; t15a + psubsw m%6, m%2, m%9 ; t10 + paddsw m%2, m%9 ; t9 + ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a + ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 +%endmacro + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ; in1 in3 + punpcklqdq m0, m1 ; in0 in2 + psubw m2, m0, m3 + paddw m0, m3 + punpckhqdq m2, m2 ; t2 t2 + punpcklqdq m0, m0 ; t0 t0 + psubw m1, m0, m2 + psraw m1, 1 + psubw m1, m3 ; t1 t3 + psubw m0, m1 ; ____ out0 + paddw m2, m1 ; out3 ____ +%endmacro + +INIT_XMM avx2 +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c + mova m0, [cq+16*0] + mova m1, [cq+16*1] + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x03 + ITX4_END 3, 0, 2, 1, 0 + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) + lea rax, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
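+ ; Example of the resulting dispatch (editor's annotation): for
+ ; inv_txfm_add_dct_adst_4x4_8bpc, tx2q below is pointed at
+ ; m(iadst_4x4_internal_8bpc).pass2, control then enters
+ ; m(idct_4x4_internal_8bpc) for the first pass, and that function ends
+ ; its pass 1 with "jmp tx2q", i.e. the indirect jump into the adst pass 2.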
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [cq], eobd ; 0 + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0 + vpbroadcastd m4, [o(pd_2048)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m2, m1, m0 + punpckhwd m3, m1, m0 + vpbroadcastd m5, [o(pw_m3344_3344)] + vpbroadcastd m0, [o(pw_3803_1321)] + vpbroadcastd m4, [o(pw_m1321_2482)] + pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 + psrld m5, 16 + pmaddwd m0, m2 + pmaddwd m2, m4 + pmaddwd m5, m3 ; 3344*in0 + paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 + vpbroadcastd m4, [o(pw_2482_3344)] + vpbroadcastd m5, [o(pw_m3803_3344)] + pmaddwd m4, m3 + pmaddwd m5, m3 + paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 + vpbroadcastd m0, [o(pw_m3803_m6688)] + pmaddwd m3, m0 + vpbroadcastd m0, [o(pd_2048)] + paddd m2, m0 + paddd m1, m0 + paddd m0, m4 + paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 + paddd m2, m4 + paddd m2, m3 + REPX {psrad x, 12}, m1, m2, m0, m5 + packssdw m0, m5 ; out0 out1 + packssdw m1, m2 ; out2 out3 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x4_internal_8bpc).main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd 
m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro WRITE_4X8 2 ; coefs[1-2] + movd xm4, [dstq+strideq*0] + pinsrd xm4, [dstq+strideq*1], 1 + movd xm5, [dstq+strideq*2] + pinsrd xm5, [dstq+r3 ], 1 + pinsrd xm4, [r2 +strideq*0], 2 + pinsrd xm4, [r2 +strideq*1], 3 + pinsrd xm5, [r2 +strideq*2], 2 + pinsrd xm5, [r2 +r3 ], 3 + pmovzxbw m4, xm4 + pmovzxbw m5, xm5 + paddw m4, m%1 + paddw m5, m%2 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+strideq*2], xm4, 2 + pextrd [dstq+r3 ], xm4, 3 + movd [r2 +strideq*0], xm5 + pextrd [r2 +strideq*1], xm5, 1 + pextrd [r2 +strideq*2], xm5, 2 + pextrd [r2 +r3 ], xm5, 3 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT8_1D_PACKED 0 + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 + ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 + vpbroadcastd m1, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 +%if mmsize > 16 + vbroadcasti128 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + shufps m2, m4, m0, q1032 ; t7 t6 + vpblendd m4, m0, 0xcc ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti128 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + pshuflw m1, m1, q2301 + pshufhw m1, m1, q2301 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + vpbroadcastd m5, [o(pw_m2896_2896)] + pmaddwd m2, m5, m3 + pmaddwd m5, m1 + paddd m2, m6 + paddd m5, m6 + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + vpbroadcastd m5, [o(pw_2896_2896)] + pmaddwd m3, m5 + pmaddwd m1, m5 + paddd m3, m6 + paddd m1, m6 + psrad m3, 12 + psrad m1, 12 + packssdw m1, m3 ; out2 -out3 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 +%else + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a 
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + vpbroadcastd m5, [o(pw_2896x8)] + vpblendd m3, m0, m4, 0x33 ; out6 -out7 + vpblendd m0, m4, 0xcc ; out0 -out1 + shufps m4, m2, m1, q1032 ; t3 t7 + vpblendd m1, m2, 0x33 ; t2 t6 + psubsw m2, m1, m4 ; t2-t3 t6-t7 + paddsw m1, m4 ; t2+t3 t6+t7 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx2 +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity + +cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT4_1D_PACKED + vbroadcasti128 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + vpblendd m4, m5, 0xcc +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + WIN64_RESTORE_XMM + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 +.end3: + lea r2, [dstq+strideq*4] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + RET +ALIGN function_align +.main_pass1: + WRAP_XMM IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8bpc).main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp m(iadst_4x8_internal_8bpc).end + +INV_TXFM_4X8_FN identity, dct 
+INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m2, [cq+32*0], q3120 + vpermq m0, [cq+32*1], q3120 + vpbroadcastd m3, [o(pw_2896x8)] + vpbroadcastd m4, [o(pw_1697x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + pmulhrsw m2, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m2 + paddsw m1, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + movd xm3, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm2 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp m(iadst_4x16_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT16_1D_PACKED 0 + vpbroadcastd m10, [o(pd_2048)] +.main2: + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a + ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a + ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a + ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m0, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 + ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a + vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 + ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a + psubsw m4, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 + paddsw m3, m2, m0 ; t9 t14 + psubsw m2, m0 ; t10 t13 +%if mmsize > 16 + vbroadcasti128 m0, [o(deint_shuf)] +%else + mova m0, [o(deint_shuf)] +%endif + pshufb m8, m0 + pshufb m7, m0 + pshufb m3, m0 + ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 + vpbroadcastd m5, [o(pw_2896_2896)] + ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + shufps m5, m4, m2, q1032 ; t12 t13a + vpblendd m4, m2, 0xcc ; t11 t10a + shufps m2, m7, m1, q1032 ; t7 t6 + vpblendd m7, m1, 0xcc ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity + +cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, 
[cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(idct_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m0, m4, m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + vextracti128 xm7, m3, 1 + call .main + vinserti128 m0, xm4, 1 + vinserti128 m1, xm5, 1 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m2, xm6, 1 + vinserti128 m3, xm7, 1 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x16_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m4, m2, m3, m0 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m1, m0, 0x33 + vpblendd m0, m2, 0x33 + vpblendd m2, m3, 0x33 + vpblendd m3, m1, 0x33 + vpermq m0, m0, q2031 + vpermq m1, m2, q1302 + vpermq m2, m3, q3120 + vpermq m3, m4, q0213 + psubw m6, m7, m5 +.end: + vpblendd m5, m6, 0xcc +.end2: + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + WIN64_RESTORE_XMM + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + lea r2, [dstq+strideq*8] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + lea dstq, [dstq+strideq*4] + lea r2, [r2 +strideq*4] + WRITE_4X8 2, 3 + RET +ALIGN function_align +.main: + vpblendd m4, m1, m0, 0xcc + vpblendd m1, m0, 0x33 + vpblendd m5, m2, m3, 0xcc + vpblendd m2, m3, 0x33 + vperm2i128 m3, m5, m2, 0x31 + vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 + vperm2i128 m4, m1, m4, 0x31 + vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 + pshufd m3, m3, q1032 ; in12 in15 in13 in14 + pshufd m2, m4, q1032 ; in11 in8 in9 in10 +cglobal_label .main2 + vpbroadcastd m8, [o(pd_2048)] + pxor m7, m7 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + punpckhwd m3, m2, m1 ; in8 in7 in10 in5 + punpcklwd m1, m2 ; in4 in11 in6 in9 + ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 + ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 + ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 + ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m1, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 + psubw m6, m7, m5 + ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 + vpbroadcastd m6, [o(pw_m3784_1567)] + vpbroadcastd m5, [o(pw_1567_3784)] + psubsw m4, m0, m1 ; t5 t4 t7 t6 + paddsw m0, m1 ; t1 t0 t3 t2 + psubsw m1, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + psubw m3, m7, m6 ; pw_3784_m1567 + vpblendd m6, m3, 0xf0 + ITX_MUL2X_PACK 4, 3, _, 8, 
6, 5, 4 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 + vbroadcasti128 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a + vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a + vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 + vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m1, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m4, m2 ; -out3 out12 out2 -out13 + psubsw m4, m2 ; t6 t7 t14a t15a + shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a + vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m5, [o(pw_m2896_2896)] + vpbroadcastd m6, [o(pw_2896_2896)] + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + pmaddwd m2, m5, m4 + pmaddwd m4, m6 + pmaddwd m5, m1 + pmaddwd m1, m6 + REPX {paddd x, m8}, m5, m1, m2, m4 + REPX {psrad x, 12}, m5, m2, m1, m4 + packssdw m2, m5 ; -out11 out8 out10 -out9 + packssdw m1, m4 ; -out7 out4 out6 -out5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpcklwd m4, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m3, m2 + punpckhwd m3, m2 + REPX {pmulhrsw x, m5}, m4, m1, m0, m3 + punpckldq m2, m3, m1 + punpckhdq m3, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8bpc).main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m6, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m0, m2, 0x33 + vpblendd m0, m1, 0xcc + vpblendd m1, m3, 0xcc + vpblendd m2, m3, 0x33 + vpermq m0, m0, q3120 + vpermq m1, m1, q0213 + vpermq m2, m2, q2031 + vpermq m3, m4, q1302 + psubw m5, m7, m6 + jmp m(iadst_4x16_internal_8bpc).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m3, [cq+32*0] + mova m2, [cq+32*1] + mova m4, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m8, [o(pw_1697x8)] + pcmpeqw m0, m0 ; -1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + pmulhrsw m8, m4 + pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is + pxor m1, m9 ; unsigned. as long as both signs are equal + pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the + pxor m2, m9 ; pmulhrsw result will become 0 which causes + pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless + pxor m3, m9 ; we explicitly deal with that case here. 
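+ ; Worked example of the -1 case handled here (editor's annotation): for an
+ ; input word of -1, pmulhrsw with pw_1697x8 rounds (-1*1697*8)/32768 to 0.
+ ; pavgw would then average 0xFFFF (65535) and 0 as unsigned values, giving
+ ; 0x8000, i.e. -32768, instead of the correct signed rounded average of 0.
+ ; After pcmpeqw/pxor flip the -1 input to 0, both pavgw operands are 0 and
+ ; the output is 0 as intended.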
+ pcmpeqw m0, m4 + pxor m4, m0 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 + pavgw m4, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + vpbroadcastd m5, [o(pw_2048)] + pmulhrsw m4, m8, m0 + pmulhrsw m6, m8, m1 + pmulhrsw m7, m8, m2 + pmulhrsw m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m6 + paddsw m2, m7 + paddsw m3, m8 + jmp m(iadst_4x16_internal_8bpc).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti128 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + pmulhrsw xm0, xm1 + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_8x4_internal_8bpc).end3 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(pw_2896x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct_4x8_internal_8bpc).main + vbroadcasti128 m4, [o(deint_shuf)] + vinserti128 m3, m1, xm3, 1 + vinserti128 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pxor m3, m3 + psubsw m3, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 +.end2: + vpbroadcastd m2, [o(pw_2048)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + WIN64_RESTORE_XMM +.end3: + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + RET +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + punpckhwd m1, m3, m2 + punpcklwd m3, m2 + pxor m0, m0 + psubsw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd 
m0, m3 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m2, m1 + vpermq m1, m0, q2031 + vpermq m0, m2, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm0, [cq+16*1] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m0, [cq+16*3], 1 + vpbroadcastd m3, [o(pw_2896x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + paddsw m0, m0 + paddsw m1, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm2 + psrlw xm2, 3 ; pw_2048 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 +.end: + mov r2d, 2 +.end2: + lea r3, [strideq*3] +.loop: + WRITE_8X4 0, 0, 1, 2 + lea dstq, [dstq+strideq*4] + dec r2d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m1, [cq+32*1], q3120 ; 2 3 + call .main + shufps m4, m0, m1, q0220 + shufps m5, m0, m1, q1331 + shufps m1, m2, m3, q0220 + shufps m3, m2, m3, q1331 + vbroadcasti128 m0, [o(deint_shuf)] + vpbroadcastd m2, [o(pw_16384)] + REPX {pshufb x, m0}, m4, m5, m1, m3 + REPX {pmulhrsw x, m2}, m4, m5, m1, m3 + vinserti128 m0, m4, xm1, 1 + vperm2i128 m2, m4, m1, 0x31 + vinserti128 m1, m5, xm3, 1 + vperm2i128 m3, m5, m3, 0x31 + jmp tx2q +.pass2: + call .main + vpbroadcastd m4, [o(pw_2048)] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + jmp m(iadst_8x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call .main_pass1 + vpbroadcastd m5, [o(pw_16384)] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + pxor m3, m3 + psubw m3, m5 ; negate odd elements during rounding + pmulhrsw m4, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m5 + pmulhrsw m2, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vperm2i128 m2, m3, m0, 0x31 + vinserti128 m0, m3, xm0, 1 + vperm2i128 m3, m4, m1, 0x31 + vinserti128 m1, m4, xm1, 1 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call .main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vpbroadcastd xm4, [o(pw_4096)] + psubw m4, m5 ; lower half = 2048, upper half = -2048 +.end: + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + pmulhrsw m2, m4 + pmulhrsw m3, m4 + WIN64_RESTORE_XMM +.end4: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + lea r3, 
[strideq*3] + WRITE_8X4 0, 1, 4, 5 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 5 + RET +ALIGN function_align +.main_pass1: + IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call m(iadst_8x8_internal_8bpc).main_pass1 + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + pxor m0, m0 + psubw m0, m5 + pmulhrsw m4, m0 + pmulhrsw m3, m5 + pmulhrsw m2, m0 + pmulhrsw m1, m5 + punpckhwd m0, m4, m3 + punpcklwd m4, m3 + punpckhwd m3, m2, m1 + punpcklwd m2, m1 + vinserti128 m1, m0, xm3, 1 + vperm2i128 m3, m0, m3, 0x31 + vinserti128 m0, m4, xm2, 1 + vperm2i128 m2, m4, m2, 0x31 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vpbroadcastd xm5, [o(pw_4096)] + psubw m4, m5 ; lower half = -2048, upper half = 2048 + vpermq m5, m3, q2031 + vpermq m3, m0, q2031 + vpermq m0, m2, q2031 + vpermq m2, m1, q2031 + pmulhrsw m1, m0, m4 + pmulhrsw m0, m5, m4 + jmp m(iadst_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti128 m3, [cq+16*4], 1 + vinserti128 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti128 m4, [cq+16*6], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_8x8_internal_8bpc).end + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + psrlw xm2, 3 ; pw_2048 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mov r2d, 4 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 +%endif +%endmacro + +%macro ITX_8X16_LOAD_COEFS 0 + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m0, m4, [cq+32*0] + add cq, 32*4 + pmulhrsw m7, m4, [cq+32*3] + pmulhrsw m1, m4, [cq-32*3] + pmulhrsw m6, m4, [cq+32*2] + pmulhrsw m2, m4, [cq-32*2] + pmulhrsw m5, m4, [cq+32*1] + pmulhrsw m3, m4, [cq-32*1] + pmulhrsw m4, [cq+32*0] +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity + +cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m10, [o(pw_16384)] +.pass1_end: + vperm2i128 m9, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vperm2i128 m8, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 +.pass1_end2: + punpckhwd m7, m5, m6 + punpcklwd m5, m6 + punpcklwd m6, m8, m9 + punpckhwd m8, m9 + 
REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpckldq m6, m7, m8 + punpckhdq m7, m8 + jmp tx2q +.pass2: + call .main + REPX {vpermq x, x, q3120}, m0, m2, m4, m6 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7 +.end: + vpbroadcastd m8, [o(pw_2048)] +.end2: + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +.end3: + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 8, 9 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 0, 1 + lea dstq, [dstq+strideq*4] + WRITE_8X4 4, 5, 0, 1 + lea dstq, [dstq+strideq*4] + WRITE_8X4 6, 7, 0, 1 + RET +ALIGN function_align +cglobal_label .main + IDCT16_1D_PACKED + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + vpbroadcastd m10, [o(pw_16384)] + pslld m9, m10, 17 + psubw m10, m9 ; 16384, -16384 + jmp m(idct_8x16_internal_8bpc).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + vpbroadcastd m9, [o(pw_2048)] + vpbroadcastd xm8, [o(pw_4096)] + psubw m8, m9 + REPX {vpermq x, x, q2031}, m0, m1, m2, m3 + REPX {vpermq x, x, q3120}, m4, m5, m6, m7 + jmp m(idct_8x16_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + REPX {pshufd x, x, q1032}, m7, m1, m5, m3 +.main2: + vpbroadcastd m10, [o(pd_2048)] + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 + ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 + ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 + ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 + ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 + ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 + ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 + psubsw m4, m0, m5 ; t9a t8a + paddsw m0, m5 ; t1a t0a + psubsw m5, m1, m6 ; t11a t10a + paddsw m1, m6 ; t3a t2a + psubsw m6, m2, m7 ; t13a t12a + paddsw m2, m7 ; t5a t4a + psubsw m7, m3, m8 ; t15a t14a + paddsw m3, m8 ; t7a t6a + vpbroadcastd m11, [o(pw_m4017_799)] + vpbroadcastd m12, [o(pw_799_4017)] + pxor m9, m9 + ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 + psubw m8, m9, m11 ; pw_4017_m799 + ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 + vpbroadcastd m11, [o(pw_m2276_3406)] + vpbroadcastd m12, [o(pw_3406_2276)] + ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 + psubw m8, m9, m11 ; pw_2276_m3406 + ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 + psubsw m8, m1, m3 ; t7 t6 + paddsw m1, m3 ; t3 t2 + psubsw m3, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m2, m5, m7 ; t14a t15a + paddsw m7, m5 ; t10a t11a + psubsw m5, m4, m6 ; t12a t13a + paddsw m4, m6 ; t8a t9a + vpbroadcastd m11, [o(pw_m3784_1567)] + vpbroadcastd m12, [o(pw_1567_3784)] + ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a + psubw m6, m9, m11 ; pw_3784_m1567 + ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a + vpbroadcastd m11, [o(pw_m1567_3784)] + vpbroadcastd m12, [o(pw_3784_1567)] + ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 + psubw m6, m9, 
m11 ; pw_1567_m3784 + ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 + vbroadcasti128 m12, [o(deint_shuf)] + paddsw m6, m4, m7 ; -out1 out14 + psubsw m4, m7 ; t10 t11 + psubsw m11, m3, m8 ; t7 t6 + paddsw m8, m3 ; out12 -out3 + psubsw m3, m0, m1 ; t3a t2a + paddsw m0, m1 ; -out15 out0 + paddsw m1, m2, m5 ; -out13 out2 + psubsw m5, m2 ; t15a t14a + pshufb m0, m12 + pshufb m6, m12 + pshufb m8, m12 + pshufb m1, m12 + shufps m7, m6, m0, q1032 ; out14 -out15 + vpblendd m0, m6, 0x33 ; -out1 out0 + punpcklqdq m6, m8, m1 ; out12 -out13 + punpckhqdq m1, m8, m1 ; -out3 out2 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + pmaddwd m9, m8, m11 ; -out11 + pmaddwd m2, m12, m5 ; -out5 + pmaddwd m5, m8 ; out10 + pmaddwd m11, m12 ; out4 + REPX {paddd x, m10}, m9, m5, m2, m11 + REPX {psrad x, 12 }, m9, m5, m2, m11 + packssdw m5, m9 ; out10 -out11 + packssdw m2, m11 ; -out5 out4 + pmaddwd m11, m8, m3 ; out8 + vpbroadcastd m8, [o(pw_2896_m2896)] + pmaddwd m3, m12 ; -out7 + pmaddwd m8, m4 ; -out9 + pmaddwd m4, m12 ; out6 + REPX {paddd x, m10}, m11, m3, m8, m4 + REPX {psrad x, 12 }, m11, m3, m8, m4 + packssdw m3, m4 ; -out7 out6 + packssdw m4, m11, m8 ; out8 -out9 + vpbroadcastd m10, [o(pw_16384)] + pxor m9, m9 + ret +ALIGN function_align +cglobal_label .main_pass2_end + vpbroadcastd m8, [o(pw_2896x8)] + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + punpcklqdq m11, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + vpblendd m3, m4, 0xcc ; t3a t11 + psubsw m4, m2, m3 ; out8 -out9 + paddsw m3, m2 ; -out7 out6 + paddsw m2, m5, m11 ; -out5 out4 + psubsw m5, m11 ; out10 -out11 + REPX {pmulhrsw x, m8}, m2, m3, m4, m5 + ret + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + vpbroadcastd m9, [o(pw_16384)] + pslld m10, m9, 17 + psubw m10, m9 ; -16384, 16384 + vperm2i128 m9, m4, m0, 0x31 + vinserti128 m0, m4, xm0, 1 + vperm2i128 m8, m5, m1, 0x31 + vinserti128 m4, m5, xm1, 1 + vperm2i128 m5, m7, m3, 0x31 + vinserti128 m3, m7, xm3, 1 + vinserti128 m1, m6, xm2, 1 + vperm2i128 m6, m6, m2, 0x31 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m3, m1 + punpckhwd m3, m1 + jmp m(idct_8x16_internal_8bpc).pass1_end2 +.pass2: + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end + vpbroadcastd m8, [o(pw_2048)] + vpbroadcastd xm9, [o(pw_4096)] + psubw m8, m9 + vpermq m9, m0, q3120 + vpermq m0, m7, q2031 + vpermq m7, m1, q3120 + vpermq m1, m6, q2031 + vpermq m6, m2, q3120 + vpermq m2, m5, q2031 + vpermq m5, m3, q3120 + vpermq m3, m4, q2031 + pmulhrsw m0, m8 + pmulhrsw m1, m8 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + pmulhrsw m4, m5, m8 + pmulhrsw m5, m6, m8 + pmulhrsw m6, m7, m8 + pmulhrsw m7, m9, m8 + jmp m(idct_8x16_internal_8bpc).end3 + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, 
[cq+16*2] + add cq, 16*8 + vinserti128 m3, [cq+16*0], 1 + vinserti128 m2, [cq+16*2], 1 + vpbroadcastd m9, [o(pw_2896x8)] + mova xm4, [cq-16*4] + mova xm5, [cq-16*2] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*6], 1 + mova xm7, [cq-16*7] + mova xm6, [cq-16*5] + vinserti128 m7, [cq+16*1], 1 + vinserti128 m6, [cq+16*3], 1 + mova xm8, [cq-16*3] + mova xm0, [cq-16*1] + vinserti128 m8, [cq+16*5], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + punpcklwd m5, m7, m6 + punpckhwd m7, m6 + punpcklwd m6, m8, m0 + punpckhwd m8, m0 + REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpckldq m6, m7, m8 + punpckhdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(idct_8x16_internal_8bpc).end + +%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] + pmovzxbw m%3, [dstq+%5] +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif + pmovzxbw m%4, [dstq+%6] +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vpermq m%3, m%3, q3120 + mova [dstq+%5], xm%3 + vextracti128 [dstq+%6], m%3, 1 +%endmacro + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + mov r2d, 2 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova xm1, [dstq] + vinserti128 m1, [dstq+strideq], 1 + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq], xm1 + vextracti128 [dstq+strideq], m1, 1 + lea dstq, [dstq+strideq*2] + dec r2d + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct_4x16_internal_8bpc).main + vinserti128 m6, m2, xm6, 1 + vinserti128 m2, m0, xm4, 1 + vinserti128 m0, m1, xm5, 1 + vinserti128 m1, m3, xm7, 1 + punpcklwd m3, m2, m6 + punpckhwd m2, m6 + vpbroadcastd m6, [o(pw_16384)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + mova m1, m6 + jmp m(iadst_16x4_internal_8bpc).pass1_end +.pass2: + call .main + jmp m(iadst_16x4_internal_8bpc).end +ALIGN function_align +cglobal_label .main + vpbroadcastd m6, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end + punpcklwd m4, m3, m1 + punpcklwd m5, m2, m0 + punpckhwd m0, m1 + punpckhwd m2, m3 + vpbroadcastd m1, [o(pw_16384)] + vinserti128 m3, m0, xm2, 1 + vperm2i128 m2, m0, m2, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw 
m6, m7, m1 +.pass1_end: + pmulhrsw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m4, m1 + pmulhrsw m0, m6 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m4, [o(pw_2048)] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + WIN64_RESTORE_XMM +.end2: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m6, [o(pw_m3344_3344)] + vpbroadcastd m7, [o(pw_3803_1321)] + vpbroadcastd m8, [o(pw_m1321_2482)] + vpbroadcastd m9, [o(pw_2482_3344)] + punpcklwd m4, m2, m0 ; in2 in0 l + punpckhwd m2, m0 ; in2 in0 h + psrld m5, m6, 16 + pmaddwd m10, m6, m4 ; t2:02 l + pmaddwd m6, m2 ; t2:02 h + pmaddwd m0, m7, m4 ; t0:02 l + pmaddwd m7, m2 ; t0:02 h + pmaddwd m4, m8 ; t1:02 l + pmaddwd m8, m2 ; t1:02 h + punpckhwd m2, m3, m1 ; in3 in1 h + punpcklwd m3, m1 ; in3 in1 l + pmaddwd m1, m5, m2 ; t2:3 h + pmaddwd m5, m3 ; t2:3 l + paddd m6, m1 + vpbroadcastd m1, [o(pd_2048)] + paddd m10, m5 + pmaddwd m5, m9, m3 + pmaddwd m9, m2 + paddd m0, m1 + paddd m7, m1 + paddd m0, m5 ; t0 + t3 + 2048 l + paddd m7, m9 ; t0 + t3 + 2048 h + vpbroadcastd m9, [o(pw_m3803_3344)] + pmaddwd m5, m9, m2 + pmaddwd m9, m3 + paddd m10, m1 ; t2 + 2048 l + paddd m6, m1 ; t2 + 2048 h + paddd m5, m1 ; t1:13 + 2048 h + paddd m1, m9 ; t1:13 + 2048 l + vpbroadcastd m9, [o(pw_m3803_m6688)] + pmaddwd m2, m9 + pmaddwd m3, m9 + paddd m5, m8 ; t1 + t3 + 2048 h + paddd m1, m4 ; t1 + t3 + 2048 l + paddd m8, m7 + paddd m4, m0 + paddd m2, m8 ; t0 + t1 - t3 + 2048 h + paddd m3, m4 ; t0 + t1 - t3 + 2048 l + REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 + packssdw m0, m7 + packssdw m1, m5 + packssdw m3, m2 + packssdw m2, m10, m6 + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end + punpckhwd m4, m3, m2 + punpckhwd m5, m1, m0 + punpcklwd m0, m2 + punpcklwd m1, m3 + vpbroadcastd m6, [o(pw_16384)] + vinserti128 m3, m0, xm1, 1 + vperm2i128 m2, m0, m1, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m1, m7, m6 + jmp m(iadst_16x4_internal_8bpc).pass1_end +ALIGN function_align +.pass2: + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m4, [o(pw_2048)] + REPX {pmulhrsw x, m4}, m3, m2, m1, m0 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 + RET + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm4, [cq+16*1] + vinserti128 m2, [cq+16*4], 1 + vinserti128 m4, [cq+16*5], 1 + mova xm0, [cq+16*2] + mova xm1, [cq+16*3] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m7, [o(pw_1697x16)] + 
vpbroadcastd m8, [o(pw_16384)] + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + pmulhrsw m0, m7, m1 + pmulhrsw m5, m7, m2 + pmulhrsw m6, m7, m3 + pmulhrsw m7, m4 + REPX {pmulhrsw x, m8}, m0, m5, m6, m7 + paddsw m1, m0 + paddsw m2, m5 + paddsw m3, m6 + paddsw m4, m7 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_16x4_internal_8bpc).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 4 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m0, [cq+32*0], q3120 + add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity + +cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 3120 + call m(idct_8x16_internal_8bpc).main + vpbroadcastd m10, [o(pw_16384)] + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + REPX {pmulhrsw x, m10}, m8, m1, m4, m6 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m2, m9, m5 + punpckhwd m3, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m9, m4 + punpckhwd m9, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m8 + punpckhdq m3, m8 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m9, m5 + punpckhdq m9, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m8, 0x31 + vinserti128 m2, xm8, 1 + vperm2i128 m7, m3, m9, 0x31 + vinserti128 m3, xm9, 1 + jmp tx2q +.pass2: + call .main + vpbroadcastd m8, [o(pw_2048)] +.end: + REPX {pmulhrsw x, m8}, m0, m2, m4, m6 +.end2: + REPX {pmulhrsw x, m8}, m1, m3, m5, m7 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 +.end3: + pxor m0, m0 + REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 +.end4: + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(pd_2048)] +.main2: + IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end + psubw m11, m9, m10 + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, 
m6 + punpckhwd m4, m6 + punpckhwd m6, m5, m7 + punpcklwd m5, m7 + REPX {pmulhrsw x, m11}, m8, m1, m4, m6 + jmp m(idct_16x8_internal_8bpc).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + pxor m8, m8 + psubw m8, m9 + REPX {pmulhrsw x, m9}, m0, m2, m4, m6 + jmp m(idct_16x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(pd_2048)] + ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a + ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a + ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a + psubsw m8, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m0, m4 ; t4 + paddsw m0, m4 ; t0 + psubsw m4, m5, m1 ; t7 + paddsw m5, m1 ; t3 + psubsw m1, m7, m3 ; t5 + paddsw m7, m3 ; t1 + ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a + psubsw m9, m6, m8 ; t7 + paddsw m6, m8 ; out6 + psubsw m3, m7, m5 ; t3 + paddsw m7, m5 ; -out7 + psubsw m5, m0, m2 ; t2 + paddsw m0, m2 ; out0 + psubsw m2, m1, m4 ; t6 + paddsw m1, m4 ; -out1 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + punpckhwd m4, m3, m5 + punpcklwd m3, m5 + pmaddwd m5, m11, m4 + pmaddwd m4, m12 + pmaddwd m8, m11, m3 + pmaddwd m3, m12 + REPX {paddd x, m10}, m5, m4, m8, m3 + REPX {psrad x, 12 }, m5, m8, m4, m3 + packssdw m3, m4 ; -out3 + packssdw m4, m8, m5 ; out4 + punpcklwd m5, m9, m2 + punpckhwd m9, m2 + pmaddwd m2, m12, m5 + pmaddwd m5, m11 + pmaddwd m12, m9 + pmaddwd m11, m9 + REPX {paddd x, m10}, m2, m5, m12, m11 + REPX {psrad x, 12 }, m2, m12, m5, m11 + packssdw m2, m12 ; out2 + packssdw m5, m11 ; -out5 + ret +ALIGN function_align +cglobal_label .main_pass2_end + vpbroadcastd m8, [o(pw_2896x8)] + psubsw m4, m5, m3 + paddsw m3, m5 + psubsw m5, m2, m9 + paddsw m2, m9 + pmulhrsw m2, m8 ; out2 + pmulhrsw m3, m8 ; -out3 + pmulhrsw m4, m8 ; out4 + pmulhrsw m5, m8 ; -out5 + vpbroadcastd m9, [o(pw_2048)] + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end + psubw m9, m10 + punpcklwd m8, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m7, m5 + punpckhwd m7, m5 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m2, m0 + punpcklwd m2, m0 + REPX {pmulhrsw x, m10}, m8, m4, m5, m1 + REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 + punpcklwd m0, m7, m4 + punpckhwd m7, m4 + punpckhwd m4, m6, m8 + punpcklwd m6, m8 + punpckhwd m8, m3, m5 + punpcklwd m3, m5 + punpcklwd m5, m2, m1 + punpckhwd m2, m1 + punpckhdq m1, m0, m6 + punpckldq m0, m6 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckhdq m4, m3, m5 + punpckldq m3, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m4, 0x31 + vinserti128 m1, xm4, 1 + vperm2i128 m4, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vinserti128 m3, m7, xm8, 1 + vperm2i128 m7, m8, 0x31 + jmp tx2q +.pass2: + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + pxor m8, m8 + psubw m8, m9 + pmulhrsw m10, m7, m8 + pmulhrsw m7, m0, m9 + pmulhrsw m0, m6, m9 + pmulhrsw m6, m1, m8 + pmulhrsw m1, m5, m8 + pmulhrsw m5, m2, m9 + pmulhrsw m2, m4, m9 + pmulhrsw m4, m3, m8 + lea r3, [strideq*3] + WRITE_16X2 10, 0, 8, 9, 
strideq*0, strideq*1 + WRITE_16X2 1, 2, 0, 1, strideq*2, r3 + jmp m(idct_16x8_internal_8bpc).end3 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm7, [cq+16*0] + mova xm2, [cq+16*1] + add cq, 16*8 + vpbroadcastd m3, [o(pw_2896x8)] + vinserti128 m7, [cq+16*0], 1 + vinserti128 m2, [cq+16*1], 1 + mova xm6, [cq-16*6] + mova xm4, [cq-16*5] + vinserti128 m6, [cq+16*2], 1 + vinserti128 m4, [cq+16*3], 1 + mova xm8, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m8, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm0, [cq-16*2] + mova xm1, [cq-16*1] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m11, [o(pw_16384)] + REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 + punpcklwd m3, m7, m2 + punpckhwd m7, m2 + punpcklwd m2, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m8, m5 + punpckhwd m8, m5 + punpcklwd m5, m0, m1 + punpckhwd m0, m1 + punpckldq m1, m3, m2 + punpckhdq m3, m2 + punpckldq m2, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m7, m6 + punpckhdq m7, m6 + punpckldq m6, m8, m0 + punpckhdq m8, m0 + REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m6 + punpckhqdq m5, m6 + punpcklqdq m6, m7, m8 + punpckhqdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_4096)] + jmp m(idct_16x8_internal_8bpc).end + +%define o_base pw_5 + 128 + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 8 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +%endif +%endmacro + +%macro ITX_16X16_LOAD_COEFS 0 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + add cq, 32*8 + mova m4, [cq-32*4] + mova m5, [cq-32*3] + mova m6, [cq-32*2] + mova m7, [cq-32*1] + mova m8, [cq+32*0] + mova m9, [cq+32*1] + mova m10, [cq+32*2] + mova m11, [cq+32*3] + mova m12, [cq+32*4] + mova m13, [cq+32*5] + mova m14, [cq+32*6] + mova m15, [cq+32*7] + mova [rsp], m15 +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity + +cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main +.pass1_end: + vpbroadcastd m1, [o(pw_8192)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 +.pass1_end2: + vextracti128 [rsp+16*4], m0, 1 + mova [rsp+16*0], xm0 + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + vperm2i128 m8, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vperm2i128 m9, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vperm2i128 m10, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + vperm2i128 m11, m4, m12, 0x31 + vinserti128 m4, xm12, 1 + vperm2i128 m12, m5, m13, 0x31 + vinserti128 m5, xm13, 1 + vperm2i128 m13, m6, m14, 0x31 + vinserti128 m6, xm14, 1 + vperm2i128 m14, m7, m15, 0x31 + vinserti128 m7, xm15, 1 + mova m15, [rsp+32*2] +.pass1_end3: + punpcklwd m0, m9, m10 + punpckhwd m9, m10 + punpcklwd m10, m15, m8 + punpckhwd m15, m8 + punpckhwd m8, m11, m12 + punpcklwd m11, m12 + punpckhwd m12, m13, m14 + punpcklwd m13, m14 + punpckhdq m14, m11, m13 + punpckldq m11, m13 + punpckldq m13, m15, m9 + punpckhdq m15, m9 + 
punpckldq m9, m10, m0 + punpckhdq m10, m0 + punpckhdq m0, m8, m12 + punpckldq m8, m12 + punpcklqdq m12, m13, m8 + punpckhqdq m13, m8 + punpcklqdq m8, m9, m11 + punpckhqdq m9, m11 + punpckhqdq m11, m10, m14 + punpcklqdq m10, m14 + punpcklqdq m14, m15, m0 + punpckhqdq m15, m0 + mova m0, [rsp] + mova [rsp], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m1, [o(pw_2048)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp], m6 +.end2: + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + lea r3, [strideq*3] + WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 +.end3: + pxor m2, m2 + REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 + lea dstq, [dstq+strideq*4] + WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 + WRITE_16X2 10, 11, 0, 1, strideq*2, r3 + REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 + lea dstq, [dstq+strideq*4] + WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 + WRITE_16X2 14, 15, 0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(pd_2048)] + mova [rsp+gprsize+32*1], m1 + mova [rsp+gprsize+32*2], m9 + IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 + mova m1, [rsp+gprsize+32*2] ; in9 + mova [rsp+gprsize+32*2], m14 ; tmp7 + mova m9, [rsp+gprsize+32*1] ; in1 + mova [rsp+gprsize+32*1], m10 ; tmp5 + mova m14, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m6 ; tmp3 + IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 + mova m6, [rsp+gprsize+32*1] ; tmp5 + psubsw m15, m0, m14 ; out15 + paddsw m0, m14 ; out0 + psubsw m14, m2, m13 ; out14 + paddsw m2, m13 ; out1 + mova [rsp+gprsize+32*1], m2 + psubsw m13, m4, m11 ; out13 + paddsw m2, m4, m11 ; out2 + psubsw m11, m8, m7 ; out11 + paddsw m4, m8, m7 ; out4 + mova m7, [rsp+gprsize+32*2] ; tmp7 + psubsw m10, m6, m5 ; out10 + paddsw m5, m6 ; out5 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; out7 + psubsw m9, m12, m3 ; out9 + paddsw m6, m12, m3 ; out6 + mova m3, [rsp+gprsize+32*0] ; tmp3 + psubsw m12, m3, m1 ; out12 + paddsw m3, m1 ; out3 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main + call .main_pass1_end + pmulhrsw m0, m1, [cq+32*0] + pmulhrsw m2, m1, [cq+32*1] + REPX {pmulhrsw x, m1}, m4, m6, m8, m10 + pmulhrsw m12, m1, [cq+32*2] + pmulhrsw m14, m1, [cq+32*3] + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 + pxor m8, m8 + psubw m1, m8, m1 + jmp m(idct_16x16_internal_8bpc).pass1_end2 +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp+32*0], m6 + pxor m6, m6 + psubw m1, m6, m1 + jmp m(idct_16x16_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(pd_2048)] + mova [rsp+gprsize+32*1], 
m0 + mova [rsp+gprsize+32*2], m4 + ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2 + ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 + ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 + ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 + psubsw m0, m2, m10 ; t10a + paddsw m2, m10 ; t2a + psubsw m10, m13, m5 ; t11a + paddsw m13, m5 ; t3a + psubsw m5, m6, m14 ; t14a + paddsw m6, m14 ; t6a + psubsw m14, m9, m1 ; t15a + paddsw m9, m1 ; t7a + ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 + ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15 + psubsw m1, m10, m14 ; t14a + paddsw m10, m14 ; t10a + psubsw m14, m0, m5 ; t15a + paddsw m0, m5 ; t11a + psubsw m5, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m13, m9 ; t7 + paddsw m13, m9 ; t3 + ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a + ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15 + mova m9, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m10 ; t10a + mova m4, [rsp+gprsize+32*1] ; in0 + mova [rsp+gprsize+32*1], m6 ; t6a + mova m6, [rsp+gprsize+32*2] ; in4 + mova [rsp+gprsize+32*2], m2 ; t2 + ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0 + ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4 + ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8 + ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12 + psubsw m10, m4, m8 ; t8a + paddsw m8, m4 ; t0a + psubsw m4, m9, m7 ; t9a + paddsw m9, m7 ; t1a + psubsw m7, m6, m12 ; t12a + paddsw m6, m12 ; t4a + psubsw m12, m11, m3 ; t13a + paddsw m11, m3 ; t5a + ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8 + ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13 + psubsw m3, m9, m11 ; t5 + paddsw m9, m11 ; t1 + psubsw m11, m4, m12 ; t12a + paddsw m4, m12 ; t8a + paddsw m12, m8, m6 ; t0 + psubsw m8, m6 ; t4 + paddsw m6, m10, m7 ; t9a + psubsw m10, m7 ; t13a + ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12 + mova m7, [rsp+gprsize+32*0] ; t10a + mova m2, [rsp+gprsize+32*1] ; t6a + paddsw m15, m9, m13 ; -out15 + psubsw m9, m13 ; t3a + paddsw m13, m11, m1 ; -out13 + psubsw m11, m1 ; t15a + psubsw m1, m4, m7 ; t10 + paddsw m7, m4 ; -out1 + psubsw m4, m3, m2 ; t6 + paddsw m3, m2 ; -out3 + paddsw m2, m10, m14 ; out2 + psubsw m10, m14 ; t14a + paddsw m14, m6, m0 ; out14 + psubsw m6, m0 ; t11 + mova m0, [rsp+gprsize+32*2] ; t2 + mova [rsp+gprsize+32*1], m7 + psubsw m7, m12, m0 ; t2a + paddsw m0, m12 ; out0 + paddsw m12, m8, m5 ; out12 + psubsw m8, m5 ; t7 + ret +ALIGN function_align +.main_pass1_end: + mova [cq+32*0], m0 + mova [cq+32*1], m2 + mova [cq+32*2], m12 + mova [cq+32*3], m14 + vpbroadcastd m14, [pw_m2896_2896] + vpbroadcastd m12, [pw_2896_2896] + vpbroadcastd m2, [pd_2048] + punpcklwd m5, m11, m10 + punpckhwd m11, m10 + pmaddwd m10, m14, m5 + pmaddwd m0, m14, m11 + pmaddwd m5, m12 + pmaddwd m11, m12 + REPX {paddd x, m2}, m10, m0, m5, m11 + REPX {psrad x, 12}, m10, m0, m5, m11 + packssdw m10, m0 ; out10 + packssdw m5, m11 ; -out5 + punpcklwd m11, m8, m4 + punpckhwd m8, m4 + pmaddwd m4, m12, m11 + pmaddwd m0, m12, m8 + pmaddwd m11, m14 + pmaddwd m8, m14 + REPX {paddd x, m2}, m4, m0, m11, m8 + REPX {psrad x, 12}, m4, m0, m11, m8 + packssdw m4, m0 ; out4 + packssdw m11, m8 ; -out11 + punpcklwd m8, m9, m7 + punpckhwd m9, m7 + pmaddwd m7, m12, m8 + pmaddwd m0, m12, m9 + pmaddwd m8, m14 + pmaddwd m9, m14 + REPX {paddd x, m2}, m7, m0, m8, m9 + REPX {psrad x, 12}, m7, m0, m8, m9 + packssdw m7, m0 ; -out7 + packssdw m8, m9 ; out8 + punpckhwd m0, m6, m1 + punpcklwd m6, m1 + pmaddwd m1, m14, m0 + pmaddwd m9, m14, m6 + 
pmaddwd m0, m12 + pmaddwd m6, m12 + REPX {paddd x, m2}, m1, m9, m0, m6 + REPX {psrad x, 12}, m1, m9, m0, m6 + packssdw m9, m1 ; -out7 + packssdw m6, m0 ; out8 + vpbroadcastd m1, [o(pw_8192)] + ret +ALIGN function_align +cglobal_label .main_pass2_end + ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to + ; 16-bit here will produce the same result as using 32-bit intermediates. + paddsw m5, m10, m11 ; -out5 + psubsw m10, m11 ; out10 + psubsw m11, m4, m8 ; -out11 + paddsw m4, m8 ; out4 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; -out7 + psubsw m9, m1, m6 ; -out9 + paddsw m6, m1 ; out6 + vpbroadcastd m1, [o(pw_2896x8)] + REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 + vpbroadcastd m1, [o(pw_2048)] + ret + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass1_end + pmulhrsw m6, m1 + pmulhrsw m2, m1, m8 + mova [rsp+32*2], m6 + pmulhrsw m6, m1, m4 + pmulhrsw m4, m1, m10 + pmulhrsw m8, m1, [cq+32*3] + pmulhrsw m10, m1, [cq+32*2] + pmulhrsw m12, m1, [cq+32*1] + pmulhrsw m14, m1, [cq+32*0] + pxor m0, m0 + psubw m0, m1 + REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 + pmulhrsw m1, m0, m9 + pmulhrsw m9, m0, m13 + pmulhrsw m0, [rsp+32*1] + mova [rsp+16*0], xm15 + mova [rsp+16*1], xm7 + vperm2i128 m15, m15, m7, 0x31 + vinserti128 m7, m2, xm14, 1 + vperm2i128 m14, m2, m14, 0x31 + vinserti128 m2, m9, xm5, 1 + vperm2i128 m9, m9, m5, 0x31 + vinserti128 m5, m4, xm12, 1 + vperm2i128 m12, m4, m12, 0x31 + vinserti128 m4, m11, xm3, 1 + vperm2i128 m11, m11, m3, 0x31 + vinserti128 m3, m10, xm6, 1 + vperm2i128 m10, m10, m6, 0x31 + vinserti128 m6, m1, xm0, 1 + vperm2i128 m13, m1, m0, 0x31 + vinserti128 m1, m8, [rsp+32*2], 1 + vperm2i128 m8, m8, [rsp+32*2], 0x31 + jmp m(idct_16x16_internal_8bpc).pass1_end3 +.pass2: + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end + pmulhrsw m0, m1 + pmulhrsw m8, m1 + mova [rsp+32*0], m0 + mova [rsp+32*2], m8 + pxor m0, m0 + psubw m0, m1 + pmulhrsw m8, m0, m7 + pmulhrsw m7, m0, m9 + pmulhrsw m9, m1, m6 + pmulhrsw m6, m1, m10 + pmulhrsw m10, m0, m5 + pmulhrsw m5, m0, m11 + pmulhrsw m11, m1, m4 + pmulhrsw m4, m1, m12 + pmulhrsw m12, m0, m3 + pmulhrsw m3, m0, m13 + pmulhrsw m13, m1, m2 + pmulhrsw m1, m14 + pmulhrsw m14, m0, [rsp+32*1] + pmulhrsw m0, m15 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 + mova m15, [rsp+32*0] + WRITE_16X2 3, 4, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 + WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 + jmp m(idct_16x16_internal_8bpc).end3 + +%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 ; signs are guaranteed to be equal +%endmacro + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + vpbroadcastd m7, [o(pw_1697x16)] + mova xm0, [cq+16* 0] + vinserti128 m0, [cq+16*16], 1 + mova xm15, [cq+16* 1] + vinserti128 m15, [cq+16*17], 1 + mova xm1, [cq+16* 2] + vinserti128 m1, [cq+16*18], 1 + mova xm8, [cq+16* 3] + vinserti128 m8, [cq+16*19], 1 + mova xm2, [cq+16* 4] + vinserti128 m2, [cq+16*20], 1 + mova xm9, [cq+16* 5] + vinserti128 m9, [cq+16*21], 1 + mova xm3, [cq+16* 6] + vinserti128 m3, [cq+16*22], 1 + mova xm10, 
[cq+16* 7] + add cq, 16*16 + vinserti128 m10, [cq+16* 7], 1 + mova xm4, [cq-16* 8] + vinserti128 m4, [cq+16* 8], 1 + mova xm11, [cq-16* 7] + vinserti128 m11, [cq+16* 9], 1 + mova xm5, [cq-16* 6] + vinserti128 m5, [cq+16*10], 1 + mova xm12, [cq-16* 5] + vinserti128 m12, [cq+16*11], 1 + mova xm13, [cq-16* 3] + vinserti128 m13, [cq+16*13], 1 + mova xm14, [cq-16* 1] + vinserti128 m14, [cq+16*15], 1 + REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ + 10, 4, 11, 5, 12, 13, 14 + mova xm6, [cq-16* 4] + vinserti128 m6, [cq+16*12], 1 + mova [rsp], m0 + IDTX16B 6, 0, 7 + mova xm0, [cq-16* 2] + vinserti128 m0, [cq+16*14], 1 + pmulhrsw m7, m0 + psraw m7, 1 + pavgw m7, m0 + jmp m(idct_16x16_internal_8bpc).pass1_end3 +ALIGN function_align +.pass2: + vpbroadcastd m15, [o(pw_1697x16)] + mova [rsp+32*1], m0 + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [rsp+32*1] + mova [rsp+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [rsp+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + jmp m(idct_16x16_internal_8bpc).end + +%define o_base deint_shuf + 128 + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [%1+%2*0] + pmulhrsw m1, m15, [%1+%2*1] + pmulhrsw m2, m15, [%1+%2*2] + pmulhrsw m3, m15, [%1+%2*3] + pmulhrsw m4, m15, [%1+%2*4] + pmulhrsw m5, m15, [%1+%2*5] + pmulhrsw m6, m15, [%1+%2*6] + pmulhrsw m7, m15, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 +%if %3 +%if %3 == 1 + vpbroadcastd m15, [o(pw_2896x8)] +%endif + pmulhrsw m8, m15, [%1+%2*0] + pmulhrsw m9, m15, [%1+%2*1] + pmulhrsw m10, m15, [%1+%2*2] + pmulhrsw m11, m15, [%1+%2*3] + pmulhrsw m12, m15, [%1+%2*4] + pmulhrsw m13, m15, [%1+%2*5] + pmulhrsw m14, m15, [%1+%2*6] + pmulhrsw m15, [%1+%2*7] +%else + mova m8, [%1+%2*0] + mova m9, [%1+%2*1] + mova m10, [%1+%2*2] + mova m11, [%1+%2*3] + mova m12, [%1+%2*4] + mova m13, [%1+%2*5] + mova m14, [%1+%2*6] + mova m15, [%1+%2*7] +%endif +%endmacro + +%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] + vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8] + punpcklwd m%1, m%2, m%2 + pmulhrsw m%1, m%3 + vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8] + punpckhwd m%2, m%2 + pmulhrsw m%2, m%3 +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + cmp eobd, 106 + jle .fast + LOAD_8ROWS cq+32*1, 32*2 + call m(idct_16x8_internal_8bpc).main + vperm2i128 m11, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m11, m4 + punpckhwd m11, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 + punpckhdq m5, m11, m4 + punpckldq m11, m4 + punpckldq m4, m7, m1 + punpckhdq m7, m1 + punpckhqdq m12, m6, m0 + punpcklqdq m0, m6 ; out4 + punpckhqdq m13, m7, m4 + punpcklqdq m4, m7 ; out5 + punpckhqdq m14, m3, m2 + punpcklqdq m2, m3 ; out6 + punpckhqdq m15, m5, m11 + punpcklqdq m11, m5 ; out7 + 
mova [rsp+32*0], m0 + mova [rsp+32*1], m4 + mova [rsp+32*2], m2 +.fast: + LOAD_8ROWS cq+32*0, 32*2 + call m(idct_16x8_internal_8bpc).main + vperm2i128 m8, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vpbroadcastd m9, [o(pw_8192)] + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m8, m4 + punpcklwd m8, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m8, m5 + punpckhdq m8, m5 + punpckhdq m5, m3, m4 + punpckldq m3, m4 + punpckhdq m4, m7, m1 + punpckldq m7, m1 + punpcklqdq m1, m7, m4 + punpckhqdq m7, m4 ; out9 + punpckhqdq m4, m2, m8 ; out10 + punpcklqdq m2, m8 + punpckhqdq m8, m3, m5 + punpcklqdq m3, m5 + punpckhqdq m5, m0, m6 ; out8 + punpcklqdq m0, m6 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 + cmp eobd, 106 + jg .full + mova [rsp+32*0], m5 + mova [rsp+32*1], m7 + mova [rsp+32*2], m4 + pmulhrsw m11, m9, m8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call .main_fast + jmp .pass2 +.dconly: + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + pmulhrsw xm0, xm2 + psrlw xm2, 2 ; pw_2048 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mov r2d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 +.full: + REPX {pmulhrsw x, m9}, m12, m13, m14, m15 + pmulhrsw m6, m9, [rsp+32*2] + mova [rsp+32*2], m4 + pmulhrsw m4, m9, [rsp+32*0] + mova [rsp+32*0], m5 + pmulhrsw m5, m9, [rsp+32*1] + mova [rsp+32*1], m7 + pmulhrsw m7, m9, m11 + pmulhrsw m11, m9, m8 + call .main +.pass2: + vpbroadcastd m12, [o(pw_2048)] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m13, m14, m15 + pmulhrsw m12, [rsp] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 + mova [rsp+32*0], m4 + mova [rsp+32*1], m6 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*0], 5, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*1], 7, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 8, 9, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 10, 11, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 12, 13, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 14, 15, 4, 6 + RET +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + call m(idct_8x16_internal_8bpc).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + lea r5, [rax-(o_base)+pw_201_4091x8] + ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + jmp .main2 +ALIGN function_align +cglobal_label .main + call m(idct_8x16_internal_8bpc).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + punpcklwd m1, m15, m8 ; in31 in1 + punpckhwd m8, m15 ; in3 in29 + punpcklwd m15, m14, m9 ; 
in27 in5 + punpckhwd m9, m14 ; in7 in25 + punpcklwd m14, m13, m0 ; in23 in9 + punpckhwd m0, m13 ; in11 in21 + punpcklwd m13, m12, m11 ; in19 in13 + punpckhwd m11, m12 ; in15 in17 + ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a + ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a + ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a + ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a + ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a + ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a + ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a + ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a +.main2: + psubsw m6, m1, m11 ; t17 t30 + paddsw m1, m11 ; t16 t31 + psubsw m11, m9, m14 ; t18 t29 + paddsw m9, m14 ; t19 t28 + psubsw m14, m15, m0 ; t21 t26 + paddsw m15, m0 ; t20 t27 + psubsw m0, m8, m13 ; t22 t25 + paddsw m8, m13 ; t23 t24 + ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a + ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a + ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a + ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a + psubsw m13, m1, m9 ; t19a t28a + paddsw m1, m9 ; t16a t31a + psubsw m9, m8, m15 ; t20a t27a + paddsw m8, m15 ; t23a t24a + psubsw m15, m6, m11 ; t18 t29 + paddsw m6, m11 ; t17 t30 + psubsw m11, m0, m14 ; t21 t26 + paddsw m0, m14 ; t22 t25 + ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a + ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28 + ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27 + ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a + vbroadcasti128 m12, [o(deint_shuf)] + psubsw m14, m1, m8 ; t23 t24 + paddsw m1, m8 ; t16 t31 + psubsw m8, m6, m0 ; t22a t25a + paddsw m6, m0 ; t17a t30a + psubsw m0, m15, m11 ; t21 t26 + paddsw m15, m11 ; t18 t29 + psubsw m11, m13, m9 ; t20a t27a + paddsw m13, m9 ; t19a t28a + REPX {pshufb x, m12}, m1, m6, m15, m13 + ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a + vpbroadcastd m9, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 + vpbroadcastd m12, [o(pw_2896_2896)] + ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a + vpbroadcastd m12, [o(pw_2896_2896)] + ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 + shufps m9, m14, m8, q1032 ; t23a t22 + vpblendd m14, m8, 0xcc ; t24a t25 + shufps m8, m11, m0, q1032 ; t20 t21a + vpblendd m11, m0, 0xcc ; t27 t26a + punpcklqdq m0, m1, m6 ; t16 t17a + punpckhqdq m1, m6 ; t31 t30a + psubsw m10, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m6, m3, m14 ; out24 out25 + paddsw m3, m14 ; out7 out6 + psubsw m8, m7, m0 ; out16 out17 + paddsw m7, m0 ; out15 out14 + mova m0, [rsp+gprsize+0*32] + punpcklqdq m12, m13, m15 ; t19a t18 + punpckhqdq m13, m15 ; t28a t29 + psubsw m15, m0, m1 ; out31 out30 + paddsw m0, m1 ; out0 out1 + mova m1, [rsp+gprsize+1*32] + mova [rsp+gprsize+0*32], m6 + mova m6, [rsp+gprsize+2*32] + psubsw m14, m1, m13 ; out28 out29 + paddsw m1, m13 ; out3 out2 + psubsw m13, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + psubsw m11, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m9, m6, m12 ; out19 out18 + paddsw m6, m12 ; out12 out13 + ret + +%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] + vbroadcasti128 m%1, [cq+16*%3] + vbroadcasti128 m%2, [cq+16*%4] + shufpd m%1, m%2, 0x0c +%endmacro + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 8 
+.dconly: + pmulhrsw xm0, xm2 + movd xm2, [pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova m1, [dstq] + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq], m1 + add dstq, strideq + dec r2d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*16 + LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 + LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 + LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 + LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 + mova [rsp+32*0], m4 + mova [rsp+32*1], m5 + mova [rsp+32*2], m6 + cmp eobd, 106 + jg .full + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .pass2 +.full: + LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 + LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 + LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 + LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*8 + LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 + LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 + LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 + LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main +.pass2: + vpbroadcastd m12, [o(pw_8192)] + REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 + mova [rsp+32*1], m9 + mova [rsp+32*2], m10 + punpckhwd m9, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m10, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpckhwd m3, m0, m9 + punpcklwd m0, m9 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m10, m4 + punpckhwd m10, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m10, m5 + punpckhdq m10, m5 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 + pmulhrsw m12, [rsp+32*0] + mova [rsp+32*0], m8 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m10, 0x31 + vinserti128 m3, xm10, 1 + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + mova m0, [rsp+32*0] + mova m1, [rsp+32*1] + mova m2, [rsp+32*2] + punpckhwd m7, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m11 + punpcklwd m1, m11 + punpckhwd m4, m12, m14 + punpcklwd m12, m14 + punpckhwd m5, m13, m15 + punpcklwd m13, m15 + punpckhwd m3, m0, m7 + punpcklwd m0, m7 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m12, m4 + punpckhwd m12, m4 + punpcklwd m4, m5, m13 + punpckhwd m5, m13 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m12, m5 + punpckhdq m12, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 
m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m12, 0x31 + vinserti128 m3, xm12, 1 + call m(idct_16x8_internal_8bpc).main2 + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + add r0, 16 + add r3, 16 + %define dstq r0 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + RET + +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob + vpbroadcastd m9, [pw_5] + lea r4, [strideq*3] + sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) +.loop: + mova xm0,[cq+16* 0] + mova xm1, [cq+16* 4] + vinserti128 m0, [cq+16* 1], 1 + vinserti128 m1, [cq+16* 5], 1 + pxor m8, m8 + mova [cq+32*0], m8 + mova [cq+32*2], m8 + add cq, 16*16 + mova xm2, [cq-16* 8] + mova xm3, [cq-16* 4] + vinserti128 m2, [cq-16* 7], 1 + vinserti128 m3, [cq-16* 3], 1 + mova xm4, [cq+16* 0] + mova xm5, [cq+16* 4] + vinserti128 m4, [cq+16* 1], 1 + vinserti128 m5, [cq+16* 5], 1 + mova xm6, [cq+16* 8] + mova xm7, [cq+16*12] + vinserti128 m6, [cq+16* 9], 1 + vinserti128 m7, [cq+16*13], 1 + REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 + REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose8x8 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + sub cq, 16*16-32 + lea dstq, [dstq+r4*4] + add eobd, 0x80000000 + jnc .loop + RET +ALIGN function_align +.transpose8x8: + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret + +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob + add cq, 16*8 + vpbroadcastd m9, [pw_4096] + lea r4, [strideq*3] + lea r5, [dstq+strideq*4] + sub eobd, 107 +.loop: + mova xm0, [cq-16*8] + mova xm1, [cq-16*7] + vinserti128 m0, [cq+16*0], 1 + vinserti128 m1, [cq+16*1], 1 + mova xm2, [cq-16*6] + mova xm3, [cq-16*5] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m3, [cq+16*3], 1 + mova xm4, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm6, [cq-16*2] + mova xm7, [cq-16*1] + vinserti128 m6, [cq+16*6], 1 + vinserti128 m7, [cq+16*7], 1 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + %define dstq r5 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + add cq, 16*16 + add r0, 16 + add r5, 16 + add eobd, 0x80000000 + jnc .loop + RET + +%define o_base pw_5 + 128 + +%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs +%if %3 + vpbroadcastd 
m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [%1+%2* 0] + pmulhrsw m1, m15, [%1+%2* 1] + pmulhrsw m2, m15, [%1+%2* 2] + pmulhrsw m3, m15, [%1+%2* 3] + pmulhrsw m4, m15, [%1+%2* 4] + pmulhrsw m5, m15, [%1+%2* 5] + pmulhrsw m6, m15, [%1+%2* 6] + pmulhrsw m7, m15, [%1+%2* 7] + pmulhrsw m8, m15, [%1+%2* 8] + pmulhrsw m9, m15, [%1+%2* 9] + pmulhrsw m10, m15, [%1+%2*10] + pmulhrsw m11, m15, [%1+%2*11] + pmulhrsw m12, m15, [%1+%2*12] + pmulhrsw m13, m15, [%1+%2*13] + pmulhrsw m14, m15, [%1+%2*14] + pmulhrsw m15, [%1+%2*15] +%else + mova m0, [%1+%2* 0] + mova m1, [%1+%2* 1] + mova m2, [%1+%2* 2] + mova m3, [%1+%2* 3] + mova m4, [%1+%2* 4] + mova m5, [%1+%2* 5] + mova m6, [%1+%2* 6] + mova m7, [%1+%2* 7] + mova m8, [%1+%2* 8] + mova m9, [%1+%2* 9] + mova m10, [%1+%2*10] + mova m11, [%1+%2*11] + mova m12, [%1+%2*12] + mova m13, [%1+%2*13] + mova m14, [%1+%2*14] + mova m15, [%1+%2*15] +%endif + mova [rsp], m15 +%if %4 + pxor m15, m15 + REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15 +%endif +%endmacro + +%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 + pmovzxbw m%4, [dstq+%6] + pmulhrsw m%3, m%5 + pmulhrsw m%1, m%5 + paddw m%3, m%4 + pmovzxbw m%4, [r2+%7] + paddw m%1, m%4 + packuswb m%3, m%1 + vpermq m%3, m%3, q3120 + mova [dstq+%6], xm%3 + vextracti128 [r2+%7], m%3, 1 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3 + %undef cmp + LOAD_16ROWS cq, 64, 1 + call m(idct_16x16_internal_8bpc).main + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + lea tmp3q, [tmp1q+32*16] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp3q-32*4+ 0], xm0 + vextracti128 [tmp3q+32*0+ 0], m0, 1 + mova [tmp3q-32*3+ 0], xm2 + vextracti128 [tmp3q+32*1+ 0], m2, 1 + mova [tmp3q-32*2+ 0], xm4 + vextracti128 [tmp3q+32*2+ 0], m4, 1 + mova [tmp3q-32*1+ 0], xm6 + vextracti128 [tmp3q+32*3+ 0], m6, 1 + mova [tmp3q-32*4+16], xm8 + vextracti128 [tmp3q+32*0+16], m8, 1 + mova [tmp3q-32*3+16], xm10 + vextracti128 [tmp3q+32*1+16], m10, 1 + mova [tmp3q-32*2+16], xm12 + vextracti128 [tmp3q+32*2+16], m12, 1 + mova [tmp3q-32*1+16], xm14 + vextracti128 [tmp3q+32*3+16], m14, 1 + cmp eobd, 150 + jg .full + vinserti128 m0, m1, xm9, 1 + vperm2i128 m4, m1, m9, 0x31 + vinserti128 m2, m5, xm13, 1 + vperm2i128 m6, m5, m13, 0x31 + vinserti128 m1, m3, xm11, 1 + vperm2i128 m5, m3, m11, 0x31 + vinserti128 m3, m7, xm15, 1 + vperm2i128 m7, m7, m15, 0x31 + call .main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct16 +.dconly: + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 16 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.full: + mova [tmp1q-32*4], m1 + mova [tmp1q-32*3], m3 + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m7 + mova [tmp1q+32*0], m9 + mova [tmp1q+32*1], m11 + mova [tmp1q+32*2], m13 + mova [tmp1q+32*3], m15 + LOAD_16ROWS cq+32, 64, 1 + call m(idct_16x16_internal_8bpc).main + lea r2, [tmp3q+32*8] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [r2-32*4+ 0], xm0 + vextracti128 [r2+32*0+ 0], m0, 1 + mova [r2-32*3+ 0], xm2 + vextracti128 
[r2+32*1+ 0], m2, 1 + mova [r2-32*2+ 0], xm4 + vextracti128 [r2+32*2+ 0], m4, 1 + mova [r2-32*1+ 0], xm6 + vextracti128 [r2+32*3+ 0], m6, 1 + mova [r2-32*4+16], xm8 + vextracti128 [r2+32*0+16], m8, 1 + mova [r2-32*3+16], xm10 + vextracti128 [r2+32*1+16], m10, 1 + mova [r2-32*2+16], xm12 + vextracti128 [r2+32*2+16], m12, 1 + mova [r2-32*1+16], xm14 + vextracti128 [r2+32*3+16], m14, 1 + vinserti128 m8, m1, xm9, 1 + vperm2i128 m12, m1, m9, 0x31 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp1q+32*0], 1 + vinserti128 m1, [tmp1q+32*1], 1 + vinserti128 m10, m5, xm13, 1 + vperm2i128 m14, m5, m13, 0x31 + mova xm4, [tmp1q-32*4+16] + mova xm5, [tmp1q-32*3+16] + vinserti128 m4, [tmp1q+32*0+16], 1 + vinserti128 m5, [tmp1q+32*1+16], 1 + vinserti128 m9, m3, xm11, 1 + vperm2i128 m13, m3, m11, 0x31 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp1q+32*2], 1 + vinserti128 m3, [tmp1q+32*3], 1 + vinserti128 m11, m7, xm15, 1 + vperm2i128 m15, m7, m15, 0x31 + mova xm6, [tmp1q-32*2+16] + mova xm7, [tmp1q-32*1+16] + vinserti128 m6, [tmp1q+32*2+16], 1 + vinserti128 m7, [tmp1q+32*3+16], 1 + call .main_oddhalf + LOAD_8ROWS_H r2-32*4, 32 +.idct16: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +cglobal_label .main_oddhalf_fast ; lower half is zero + mova [rsp+gprsize+32*1], m7 + pxor m7, m7 + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m7 + vpbroadcastd m11, [o(pw_3703x8)] + vpbroadcastd m7, [o(pw_1751x8)] + vpbroadcastd m12, [o(pw_m1380x8)] + vpbroadcastd m8, [o(pw_3857x8)] + vpbroadcastd m13, [o(pw_3973x8)] + vpbroadcastd m15, [o(pw_995x8)] + pmulhrsw m11, m4 ; t29a + pmulhrsw m4, m7 ; t18a + pmulhrsw m12, m3 ; t19a + pmulhrsw m3, m8 ; t28a + pmulhrsw m13, m2 ; t27a + pmulhrsw m2, m15 ; t20a + vpbroadcastd m10, [o(pw_m2106x8)] + vpbroadcastd m7, [o(pw_3513x8)] + vpbroadcastd m9, [o(pw_3290x8)] + vpbroadcastd m8, [o(pw_2440x8)] + vpbroadcastd m14, [o(pw_m601x8)] + vpbroadcastd m15, [o(pw_4052x8)] + pmulhrsw m10, m5 ; t21a + pmulhrsw m5, m7 ; t26a + pmulhrsw m9, m6 ; t25a + pmulhrsw m6, m8 ; t22a + pmulhrsw m14, m1 ; t23a + pmulhrsw m1, m15 ; t24a + vpbroadcastd m15, [o(pd_2048)] + jmp .main2 +ALIGN function_align +cglobal_label .main_oddhalf + mova [rsp+gprsize+32*0], m15 + mova [rsp+gprsize+32*1], m7 + mova [rsp+gprsize+32*2], m8 + vpbroadcastd m15, [o(pd_2048)] + ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a + ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a +.main2: + psubsw m7, m12, m4 ; t18 + paddsw m12, m4 ; t19 + psubsw m4, m2, m10 ; t21 + paddsw m2, m10 ; t20 + psubsw m10, m14, m6 ; t22 + paddsw m14, m6 ; t23 + psubsw m6, m1, m9 ; t25 + paddsw m1, m9 ; t24 + psubsw m9, m13, m5 ; t26 + paddsw m13, m5 ; t27 + psubsw m5, m3, m11 ; t29 + paddsw m3, m11 ; t28 + ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a + ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a + ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a + psubsw m8, m14, m2 ; t20a + paddsw m14, m2 ; t23a + psubsw m2, m1, m13 ; t27a + paddsw m1, m13 ; t24a + psubsw m13, m6, m9 ; t21 + paddsw m6, m9 ; t22 + psubsw m9, m10, m4 ; t26 + paddsw m10, m4 ; t25 + ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; 
t20, t27 + ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a + mova m4, [rsp+gprsize+32*0] ; in31 + mova [rsp+gprsize+32*0], m6 ; t22 + mova m6, [rsp+gprsize+32*1] ; in15 + mova [rsp+gprsize+32*1], m14 ; t23a + mova m14, [rsp+gprsize+32*2] ; in17 + mova [rsp+gprsize+32*2], m1 ; t24a + ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a + ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a + psubsw m1, m0, m14 ; t17 + paddsw m0, m14 ; t16 + psubsw m14, m4, m6 ; t30 + paddsw m4, m6 ; t31 + ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a + psubsw m6, m0, m12 ; t19a + paddsw m0, m12 ; t16a + psubsw m12, m4, m3 ; t28a + paddsw m4, m3 ; t31a + psubsw m3, m14, m5 ; t18 + paddsw m14, m5 ; t17 + psubsw m5, m1, m7 ; t29 + paddsw m1, m7 ; t30 + ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a + ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 + psubsw m7, m1, m10 ; t25a + paddsw m1, m10 ; t30a + psubsw m10, m5, m9 ; t21 + paddsw m5, m9 ; t18 + psubsw m9, m12, m2 ; t20a + paddsw m12, m2 ; t19a + psubsw m2, m3, m13 ; t26 + paddsw m3, m13 ; t29 + psubsw m13, m6, m8 ; t27a + paddsw m6, m8 ; t28a + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m12 + mova [tmp2q+32*0], m6 + mova [tmp2q+32*1], m3 + mova [tmp2q+32*2], m1 + mova m5, [rsp+gprsize+32*0] ; t22 + mova m6, [rsp+gprsize+32*1] ; t23 + mova m3, [rsp+gprsize+32*2] ; t24a + psubsw m1, m14, m5 ; t22a + paddsw m14, m5 ; t17a + psubsw m5, m0, m6 ; t23 + paddsw m0, m6 ; t16 + psubsw m6, m4, m3 ; t24 + paddsw m4, m3 ; t31 + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m3, [o(pw_2896_2896)] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m14 + mova [tmp2q+32*3], m4 + ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 + ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a + ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 + ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a + mova [tmp1q+32*0], m13 + mova [tmp1q+32*1], m2 + mova [tmp1q+32*2], m7 + mova [tmp1q+32*3], m6 + mova [tmp2q-32*4], m5 + mova [tmp2q-32*3], m1 + mova [tmp2q-32*2], m10 + mova [tmp2q-32*1], m9 + ret +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m6, m12, m13 + punpcklwd m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m6, m9 + punpckhdq m6, m9 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m6 + punpcklqdq m14, m6 + pmulhrsw m6, m7, [rsp+gprsize+32*0] + REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 + pmulhrsw m7, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m15 + vpbroadcastd m15, [o(pw_2048)] + IDCT32_PASS2_END 0, tmp2q+32*3, 
1, 7, 15, strideq*0, r3*4 + IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m7, [rsp+gprsize+32*0] + mova m1, [rsp+gprsize+32*2] + IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 + ret + +; Perform the final sumsub step and YMM lane shuffling +%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] + mova m%3, [tmp2q+32*( 3-%1)] + psubsw m%4, m%1, m%3 + paddsw m%1, m%3 + mova m%3, [tmp1q+32*(11-%2)] + mova [tmp1q+32*(11-%2)+16], xm%4 + vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 + paddsw m%4, m%2, m%3 + psubsw m%2, m%3 + mova [tmp1q+32*(11-%2)], xm%2 + vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 + vperm2i128 m%2, m%1, m%4, 0x31 + vinserti128 m%1, xm%4, 1 +%endmacro + +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 16 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly +.normal: + PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [cq+32* 1] + pmulhrsw m1, m15, [cq+32* 3] + pmulhrsw m2, m15, [cq+32* 5] + pmulhrsw m3, m15, [cq+32* 7] + pmulhrsw m4, m15, [cq+32* 9] + pmulhrsw m5, m15, [cq+32*11] + pmulhrsw m6, m15, [cq+32*13] + pmulhrsw m7, m15, [cq+32*15] + pmulhrsw m8, m15, [cq+32*17] + pmulhrsw m9, m15, [cq+32*19] + pmulhrsw m10, m15, [cq+32*21] + pmulhrsw m11, m15, [cq+32*23] + pmulhrsw m12, m15, [cq+32*25] + pmulhrsw m13, m15, [cq+32*27] + pmulhrsw m14, m15, [cq+32*29] + pmulhrsw m15, [cq+32*31] + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + LOAD_16ROWS cq+32*0, 32*2, 1, 0 + pxor m15, m15 + mov r3d, 8 +.zero_loop: + mova [cq+32*0], m15 + mova [cq+32*1], m15 + mova [cq+32*2], m15 + mova [cq+32*3], m15 + add cq, 32*4 + dec r3d + jg .zero_loop + call m(idct_16x16_internal_8bpc).main + call .pass1_end + lea r2, [strideq*3] + mov r3, dstq +.pass2: + vpbroadcastd m7, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main + mova [rsp+32*2], m15 + vpbroadcastd m15, [o(pw_2048)] + REPX {pmulhrsw x, m15}, m2, m3, m0 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m4, m5, m6, m7 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m8, m9, m10, m11 + 
WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m11, m12, m13, m14 + pmulhrsw m15, [rsp+32*2] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + test r3, r3 + jnz .right_half + RET +.right_half: + LOAD_8ROWS tmp1q-32*4, 32 + LOAD_8ROWS_H tmp2q-32*4, 32 + lea dstq, [r3+16] + xor r3d, r3d + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + jmp .pass2 +ALIGN function_align +.pass1_end: + mova [rsp+gprsize+32*0], m9 + IDCT32_PASS1_END 0, 8, 1, 9 + IDCT32_PASS1_END 2, 10, 1, 9 + IDCT32_PASS1_END 3, 11, 1, 9 + IDCT32_PASS1_END 4, 12, 1, 9 + IDCT32_PASS1_END 5, 13, 1, 9 + IDCT32_PASS1_END 6, 14, 1, 9 + IDCT32_PASS1_END 7, 15, 1, 9 + mova m1, [rsp+gprsize+32*1] + mova m9, [rsp+gprsize+32*0] + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*1], m7 + IDCT32_PASS1_END 1, 9, 6, 7 + ret + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob +%undef cmp + lea rax, [o_base] + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m12, [o(pw_8192)] + cmp eobd, 43 ; if (eob > 43) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg al ; iteration_count++ + add eobd, -279 ; if (eob > 278) + adc r4b, al ; iteration_count++ + lea r3, [strideq*3] + mov rax, cq + paddw m11, m12, m12 ; pw_16384 +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jge .loop + sub cq, 32 + pxor m0, m0 + mov r0d, 8 + cmp cq, rax + ja .zero_loop +.zero_loop_half: + mova [rax+64*0], m0 + mova [rax+64*1], m0 + add rax, 64*4 + mova [rax-64*2], m0 + mova [rax-64*1], m0 + sub r0d, 2 + jg .zero_loop_half + RET +.zero_loop: + mova [rax+32*0], m0 + mova [rax+32*1], m0 + mova [rax+32*2], m0 + mova [rax+32*3], m0 + add rax, 32*4 + dec r0d + jg .zero_loop + RET + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob +%undef cmp + lea rax, [o_base] + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m11, [o(pw_2048)] + cmp eobd, 35 ; if (eob > 35) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg r3b ; iteration_count += 2 + lea r4d, [r4+r3*2] + lea r3, [strideq*3] + mov r5, dstq + mov rax, cq +.loop: + mova xm0, [cq+32* 0] + mova xm1, [cq+32* 1] + vinserti128 m0, [cq+32* 8], 1 + vinserti128 m1, [cq+32* 9], 1 + mova xm2, [cq+32* 2] + mova xm3, [cq+32* 3] + vinserti128 m2, [cq+32*10], 1 + vinserti128 m3, [cq+32*11], 1 + mova xm4, [cq+32* 4] + mova xm5, [cq+32* 5] + vinserti128 m4, [cq+32*12], 1 + vinserti128 m5, [cq+32*13], 1 + mova xm6, [cq+32* 6] + mova xm7, [cq+32* 7] + vinserti128 m6, [cq+32*14], 1 
+ vinserti128 m7, [cq+32*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jl .ret + test r4b, 1 + jz .loop + add cq, 32*15 + lea dstq, [r5+16] + jmp .loop +.ret: + sub cd, eax + pxor m0, m0 + add cd, 384 +.zero_loop: + mova [rax+32*0], m0 + mova [rax+32*1], m0 + mova [rax+32*2], m0 + mova [rax+32*3], m0 + add rax, 32*4 + sub cd, 128 + jge .zero_loop + RET + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 32 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly +.normal: + PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + %undef cmp + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + sub eobd, 136 + mov tmp4d, eobd +.pass1_loop: + LOAD_8ROWS cq+64*1, 64*2 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test tmp4d, tmp4d + jl .fast + LOAD_8ROWS_H cq+64*17, 64*2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2 + pxor m0, m0 + REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + mova [rsp], m15 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + LOAD_8ROWS cq+64*0, 64*2 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + lea tmp3q, [tmp1q+32*32] + mova m15, [rsp] + mova [tmp3q-32*4], m0 + mova [tmp3q-32*3], m2 + mova [tmp3q-32*2], m4 + mova [tmp3q-32*1], m6 + mova [tmp3q+32*0], m8 + mova [tmp3q+32*1], m10 + mova [tmp3q+32*2], m12 + mova [tmp3q+32*3], m14 + add tmp3q, 32*8 + mova [tmp3q-32*4], m1 + mova [tmp3q-32*3], m3 + mova [tmp3q-32*2], m5 + mova [tmp3q-32*1], m7 + mova [tmp3q+32*0], m9 + mova [tmp3q+32*1], m11 + mova [tmp3q+32*2], m13 + mova [tmp3q+32*3], m15 + vpbroadcastd m9, [o(pw_8192)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + 
mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*16 + add tmp2q, 32*16 + add eobd, 0x80000000 + jnc .pass1_loop + add tmp1q, 32*24 + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + test tmp4d, tmp4d + jge .pass2_loop + add tmp1q, 32*16 + add tmp2q, 32*16 + add tmp3q, 32*16 +.pass2_loop: + LOAD_8ROWS tmp2q-32*4, 32 + test tmp4d, tmp4d + jl .fast2 + LOAD_8ROWS_H tmp3q-32*4, 32 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + sub tmp3q, 32*8 + LOAD_8ROWS_H tmp3q-32*4, 32 + sub tmp3q, 32*16 + jmp .pass2_loop_end +.fast2: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + sub tmp3q, 32*24 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 +.pass2_loop_end: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end + lea tmp3q, [tmp1q-32*32] + cmp tmp2q, tmp3q + jb .ret + sub tmp2q, 32*32 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + jmp .pass2_loop +.ret: + RET + +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob + %undef cmp + vpbroadcastd m9, [pw_8192] + sub eobd, 136 ; if (eob < 136) + shr eobd, 30 ; topleft 16x16 only + lea eobd, [eobq*2-8] + lea r4, [strideq*3] + mov r5, dstq + lea rax, [cq+32] +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + add cq, 16 + inc eobd + jz .ret + test eobd, 3 + jnz .loop + add cq, 64*15 + lea dstq, [r5+16] + jmp .loop +.ret: + pxor m0, m0 + mov r0d, 16 + cmp cq, rax + jne .zero_loop +.zero_loop_topleft: + mova [rax-32*1], m0 + mova [rax+32*1], m0 + mova [rax+32*3], m0 + mova [rax+32*5], m0 + add rax, 64*4 + sub r0d, 4 + jg .zero_loop_topleft + RET +.zero_loop: + mova [rax-32*1], m0 + mova [rax+32*0], m0 + mova [rax+32*1], m0 + mova [rax+32*2], m0 + add rax, 32*4 + dec r0d + jg .zero_loop + RET + +%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n + mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n +%else + mova m%5, [tmp1q-32*(45-%1)] + mova m%4, [tmp2q-32*(20+%1)] +%endif + psubsw m%6, m%5, m%4 ; idct32 out31-n + paddsw m%5, m%4 ; idct32 out 0+n + psubsw m%4, m%6, m%3 ; out32+n + paddsw m%6, m%3 ; out31-n + psubsw m%3, m%5, m%2 ; out63-n + paddsw m%5, m%2 ; out 0+n +%if %0 == 6 ; pass 1 +%if %1 & 1 + mova [tmp2q-32*(19-%1)], m%4 + mova [tmp1q-32*(14+%1)], m%6 + mova [tmp1q+32*(18-%1)], m%3 + mova [tmp2q-32*(51-%1)], m%5 +%else + mova [tmp1q-32*(13-%1)], m%4 + mova [tmp2q-32*(20+%1)], m%6 + mova [tmp2q+32*(12-%1)], m%3 + mova [tmp1q-32*(45-%1)], m%5 +%endif +%else ; pass 2 + REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + 
%define %%d1 r2 +%endif + pmovzxbw m%2, [%%d0+%9 ] + paddw m%2, m%4 + pmovzxbw m%4, [%%d1+%8 ] + paddw m%4, m%6 + pmovzxbw m%6, [%%d1+%10] + paddw m%3, m%6 + pmovzxbw m%6, [%%d0+%7 ] + paddw m%5, m%6 + packuswb m%2, m%4 + packuswb m%3, m%5 + vpermq m%2, m%2, q3120 + vpermq m%3, m%3, q3120 + mova [%%d0+%9 ], xm%2 + vextracti128 [%%d1+%8 ], m%2, 1 + mova [%%d1+%10], xm%3 + vextracti128 [%%d0+%7 ], m%3, 1 +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 32 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.normal: + PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + %undef cmp + lea tmp1q, [rsp+32*23] + lea tmp2q, [tmp1q+32*24] + sub eobd, 151 + mov r7d, eobd +.pass1_loop: + LOAD_16ROWS cq, 64 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m2 + mova [tmp1q-32*2], m4 + mova [tmp1q-32*1], m6 + mova [tmp1q+32*0], m8 + mova [tmp1q+32*1], m10 + mova [tmp1q+32*2], m12 + mova [tmp1q+32*3], m14 + mova [tmp2q-32*4], m1 + mova [tmp2q-32*3], m3 + mova [tmp2q-32*2], m5 + mova [tmp2q-32*1], m7 + mova [tmp2q+32*0], m9 + mova [tmp2q+32*1], m11 + mova [tmp2q+32*2], m13 + mova [tmp2q+32*3], m15 + add cq, 32 + add tmp1q, 32*8 + add tmp2q, 32*8 + add eobd, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*23] + mova xm0, [r2-32*4+ 0] + mova xm1, [r2-32*2+ 0] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m1, [r2+32*2+ 0], 1 + mova xm2, [r2-32*4+16] + mova xm3, [r2-32*2+16] + vinserti128 m2, [r2+32*0+16], 1 + vinserti128 m3, [r2+32*2+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r7d, r7d + jl .fast + lea r3, [r2+32*8] + mova xm4, [r3-32*4+ 0] + mova xm5, [r3-32*2+ 0] + vinserti128 m4, [r3+32*0+ 0], 1 + vinserti128 m5, [r3+32*2+ 0], 1 + mova xm6, [r3-32*4+16] + mova xm7, [r3-32*2+16] + vinserti128 m6, [r3+32*0+16], 1 + vinserti128 m7, [r3+32*2+16], 1 +.fast: + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova xm0, [r2-32*3+ 0] + mova xm1, [r2-32*1+ 0] + vinserti128 m0, [r2+32*1+ 0], 1 + vinserti128 m1, [r2+32*3+ 0], 1 + mova xm2, [r2-32*3+16] + mova xm3, [r2-32*1+16] + vinserti128 m2, [r2+32*1+16], 1 + vinserti128 m3, [r2+32*3+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r7d, r7d + jl .fast2 + mova xm4, [r3-32*3+ 0] + mova xm5, [r3-32*1+ 0] + vinserti128 m4, [r3+32*1+ 0], 1 + vinserti128 m5, [r3+32*3+ 0], 1 + mova xm6, [r3-32*3+16] + mova xm7, [r3-32*1+16] + vinserti128 m6, [r3+32*1+16], 1 + vinserti128 m7, [r3+32*3+16], 1 +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + add r2, 32*24 + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova xm0, [r2-32*4+ 0] + mova xm3, [r2-32*1+16] + 
vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m3, [r2+32*3+16], 1 + mova xm4, [r2-32*4+16] + mova xm7, [r2-32*1+ 0] + vinserti128 m4, [r2+32*0+16], 1 + vinserti128 m7, [r2+32*3+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast3 + add r3, 32*24 + mova xm1, [r3-32*1+16] + mova xm2, [r3-32*4+ 0] + vinserti128 m1, [r3+32*3+16], 1 + vinserti128 m2, [r3+32*0+ 0], 1 + mova xm5, [r3-32*1+ 0] + mova xm6, [r3-32*4+16] + vinserti128 m5, [r3+32*3+ 0], 1 + vinserti128 m6, [r3+32*0+16], 1 +.fast3: + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova xm0, [r2-32*2+ 0] + mova xm3, [r2-32*3+16] + vinserti128 m0, [r2+32*2+ 0], 1 + vinserti128 m3, [r2+32*1+16], 1 + mova xm4, [r2-32*2+16] + mova xm7, [r2-32*3+ 0] + vinserti128 m4, [r2+32*2+16], 1 + vinserti128 m7, [r2+32*1+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast4 + mova xm1, [r3-32*3+16] + mova xm2, [r3-32*2+ 0] + vinserti128 m1, [r3+32*1+16], 1 + vinserti128 m2, [r3+32*2+ 0], 1 + mova xm5, [r3-32*3+ 0] + mova xm6, [r3-32*2+16] + vinserti128 m5, [r3+32*1+ 0], 1 + vinserti128 m6, [r3+32*2+16], 1 +.fast4: + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 + RET +ALIGN function_align +%define o_base idct64_mul - 8 +cglobal_label .main_part1 + ; idct64 steps 1-5: + ; in1/31/17/15/ 9/23/25/ 7 -> + ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a + ; in5/27/21/11/13/19/29/ 3 -> + ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a + vpbroadcastd m11, [o(idct64_mul+4* 0)] + vpbroadcastd m13, [o(idct64_mul+4* 1)] + vpbroadcastd m10, [o(idct64_mul+4* 4)] + vpbroadcastd m12, [o(idct64_mul+4* 5)] + pmulhrsw m11, m0 ; t63a + pmulhrsw m0, m13 ; t32a + pmulhrsw m10, m1 ; t62a + pmulhrsw m1, m12 ; t33a + vpbroadcastd m9, [o(idct64_mul+4* 8)] + vpbroadcastd m13, [o(idct64_mul+4* 9)] + vpbroadcastd m8, [o(idct64_mul+4*12)] + vpbroadcastd m12, [o(idct64_mul+4*13)] + pmulhrsw m9, m2 ; t61a + pmulhrsw m2, m13 ; t34a + pmulhrsw m8, m3 ; t60a + pmulhrsw m3, m12 ; t35a + psubsw m12, m0, m1 ; t33 + paddsw m0, m1 ; t32 + psubsw m1, m3, m2 ; t34 + paddsw m3, m2 ; t35 + psubsw m2, m8, m9 ; t61 + paddsw m8, m9 ; t60 + psubsw m9, m11, m10 ; t62 + paddsw m11, m10 ; t63 + ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a + vpbroadcastd m14, [o(pw_401_4076)] + ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a + psubsw m10, m0, m3 ; t35a + paddsw m0, m3 ; t32a + psubsw m3, m11, m8 ; t60a + paddsw m11, m8 ; t63a + psubsw m8, m9, m2 ; t34 + paddsw m9, m2 ; t33 + psubsw m2, m12, m1 ; t61 + paddsw m12, m1 ; t62 + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m9 + mova [tmp2q+32*2], m12 + mova [tmp2q+32*3], m11 + vpbroadcastd m13, [o(pw_m4017_799)] + vpbroadcastd m14, [o(pw_799_4017)] + ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a + ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp2q+32*0], m10 + mova [tmp2q+32*1], m8 + vpbroadcastd m3, [o(idct64_mul+4*16)] + vpbroadcastd m11, [o(idct64_mul+4*17)] + vpbroadcastd m2, [o(idct64_mul+4*20)] + vpbroadcastd m10, [o(idct64_mul+4*21)] + vpbroadcastd m1, [o(idct64_mul+4*24)] + vpbroadcastd m9, [o(idct64_mul+4*25)] + vpbroadcastd m0, [o(idct64_mul+4*28)] + vpbroadcastd m8, [o(idct64_mul+4*29)] + pmulhrsw m3, m4 ; t59a + pmulhrsw m4, m11 ; t36a + pmulhrsw m2, m5 ; t58a + pmulhrsw m5, m10 ; t37a + pmulhrsw m1, m6 ; t57a + pmulhrsw m6, m9 ; t38a + pmulhrsw m0, m7 
; t56a + pmulhrsw m7, m8 ; t39a + psubsw m8, m4, m5 ; t37 + paddsw m4, m5 ; t36 + psubsw m5, m7, m6 ; t38 + paddsw m7, m6 ; t39 + psubsw m6, m0, m1 ; t57 + paddsw m0, m1 ; t56 + psubsw m1, m3, m2 ; t58 + paddsw m3, m2 ; t59 + ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a + vpbroadcastd m10, [o(pw_3166_2598)] + ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a + psubsw m2, m7, m4 ; t36a + paddsw m7, m4 ; t39a + psubsw m4, m0, m3 ; t59a + paddsw m0, m3 ; t56a + psubsw m3, m6, m1 ; t37 + paddsw m6, m1 ; t38 + psubsw m1, m5, m8 ; t58 + paddsw m5, m8 ; t57 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + mova [tmp2q-32*4], m0 + mova [tmp2q-32*3], m5 + vpbroadcastd m6, [o(pw_m799_m4017)] + vpbroadcastd m7, [o(pw_m4017_799)] + ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59 + ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m1 + mova [tmp2q-32*2], m3 + mova [tmp2q-32*1], m2 + ret +%define o_base pw_5 + 128 +.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub + sub rax, o_idct64_offset + 8 + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m13, [o(pw_2896_2896)] + vpbroadcastd m14, [o(pw_m2896_2896)] +.main_part2_pass1_loop: + call .main_part2_internal + IDCT64_PART2_END 0, 7, 0, 6, 9, 10 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7 + cmp tmp1q, tmp2q + jne .main_part2_pass1_loop + ret +cglobal_label .main_part2_internal + mova m0, [tmp1q-32*12] ; t32a + mova m6, [tmp2q-32*13] ; t39a + mova m1, [tmp1q-32* 4] ; t40a + mova m5, [tmp2q+32* 3] ; t55a + add tmp1q, 32 + sub tmp2q, 32 + mova m2, [tmp1q+32* 3] ; t48a + mova m4, [tmp2q-32* 4] ; t47a + mova m3, [tmp1q+32*11] ; t56a + mova m7, [tmp2q+32*12] ; t63a + psubsw m8, m0, m6 ; t39 + paddsw m0, m6 ; t32 + psubsw m6, m4, m1 ; t40 + paddsw m4, m1 ; t47 + psubsw m1, m2, m5 ; t55 + paddsw m2, m5 ; t48 + psubsw m5, m7, m3 ; t56 + paddsw m7, m3 ; t63 + ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a + vpbroadcastd m9, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a + psubsw m3, m0, m4 ; t47a + paddsw m0, m4 ; t32a + psubsw m4, m7, m2 ; t48a + paddsw m7, m2 ; t63a + psubsw m2, m5, m1 ; t40 + paddsw m5, m1 ; t39 + psubsw m1, m8, m6 ; t55 + paddsw m8, m6 ; t56 + ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48 + ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a + ret +.main_part2_pass2: + sub rax, o_idct64_offset + 8 + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m13, [o(pw_2896_2896)] + lea r9, [strideq*5] ; stride*5 + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + lea r8, [r3+strideq*2] ; stride*8 + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [o(pw_m2896_2896)] + call .main_part2_internal + vpbroadcastd m14, [o(pw_2048)] + IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp tmp1q, tmp2q + jne .main_part2_pass2_loop + ret + +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 16 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw 
xm0, xm2 + vpbroadcastw m0, xm0 + pxor m1, m1 +.dconly_loop: + mova m2, [dstq+32*0] + mova m3, [dstq+32*1] + punpckhbw m4, m2, m1 + punpcklbw m2, m1 + punpckhbw m5, m3, m1 + punpcklbw m3, m1 + paddw m4, m0 + paddw m2, m0 + paddw m5, m0 + paddw m3, m0 + packuswb m2, m4 + packuswb m3, m5 + mova [dstq+32*0], m2 + mova [dstq+32*1], m3 + add dstq, strideq + dec r2d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + LOAD_8ROWS cq+32*0, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+32*2, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+32* 1] + mova m1, [cq+32*31] + mova m2, [cq+32*17] + mova m3, [cq+32*15] + mova m4, [cq+32* 9] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32* 7] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+32* 5] + mova m1, [cq+32*27] + mova m2, [cq+32*21] + mova m3, [cq+32*11] + mova m4, [cq+32*13] + mova m5, [cq+32*19] + mova m6, [cq+32*29] + mova m7, [cq+32* 3] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 + sub tmp1q, 32*36 + lea r2, [strideq*3] + mov tmp2d, 4 +.pass2_loop: + lea r3, [tmp1q-32*8] + mova xm0, [r3 -32*4] + mova xm1, [r3 -32*3] + vinserti128 m0, [tmp1q-32*4], 1 + vinserti128 m1, [tmp1q-32*3], 1 + mova xm2, [r3 -32*2] + mova xm3, [r3 -32*1] + vinserti128 m2, [tmp1q-32*2], 1 + vinserti128 m3, [tmp1q-32*1], 1 + mova xm4, [r3 +32*0] + mova xm5, [r3 +32*1] + vinserti128 m4, [tmp1q+32*0], 1 + vinserti128 m5, [tmp1q+32*1], 1 + mova xm6, [r3 +32*2] + mova xm7, [r3 +32*3] + vinserti128 m6, [tmp1q+32*2], 1 + vinserti128 m7, [tmp1q+32*3], 1 + mova xm8, [r3 -32*4+16] + mova xm9, [r3 -32*3+16] + vinserti128 m8, [tmp1q-32*4+16], 1 + vinserti128 m9, [tmp1q-32*3+16], 1 + mova xm10, [r3 -32*2+16] + mova xm11, [r3 -32*1+16] + vinserti128 m10, [tmp1q-32*2+16], 1 + vinserti128 m11, [tmp1q-32*1+16], 1 + mova xm12, [r3 +32*0+16] + mova xm13, [r3 +32*1+16] + vinserti128 m12, [tmp1q+32*0+16], 1 + vinserti128 m13, [tmp1q+32*1+16], 1 + mova xm14, [r3 +32*2+16] + mova xm15, [r3 +32*3+16] + vinserti128 m14, [tmp1q+32*2+16], 1 + vinserti128 m15, [tmp1q+32*3+16], 1 + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main + mova [rsp+32*0], m15 + vpbroadcastd m15, [o(pw_2048)] + REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + 
WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 + lea r3, [r3+strideq*4] + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + pmulhrsw m15, [rsp+32*0] + lea r3, [r3+strideq*4] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + add tmp1q, 32*16 + add r0, 16 + dec tmp2d + jg .pass2_loop + RET + +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 64 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly +.normal: + PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*7] + lea r10d, [eobq-136] + sar r10d, 31 +.pass1_loop: + lea tmp2q, [tmp1q+32*16] + LOAD_8ROWS cq+64*1, 64*2, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test r10b, r10b + jnz .fast + LOAD_8ROWS_H cq+64*17, 64*2, 2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2, 1 + mova [rsp], m15 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + LOAD_8ROWS cq+64*0, 64*2, 1 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end + vpbroadcastd m7, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + lea r3, [tmp1q+32*48] + mova m15, [rsp] + mova [r3-32*4], m0 + mova [r3-32*3], m2 + mova [r3-32*2], m4 + mova [r3-32*1], m6 + mova [r3+32*0], m8 + mova [r3+32*1], m10 + mova [r3+32*2], m12 + mova [r3+32*3], m14 + add r3, 32*24 + mova [r3-32*4], m1 + mova [r3-32*3], m3 + mova [r3-32*2], m5 + mova [r3-32*1], m7 + mova [r3+32*0], m9 + mova [r3+32*1], m11 + mova [r3+32*2], m13 + mova [r3+32*3], m15 + vpbroadcastd m9, [o(pw_16384)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*8 + add r10d, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*55] + lea r7, [r2+32*24] +.pass2_loop: + lea r3, [r2+32*8] + lea r8, [r7+32*8] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, 
[r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r10b, r10b + jnz .fast2 + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast2: + mova [rsp], m8 + lea tmp1q, [rsp+32*39] + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10b, r10b + jnz .fast3 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast3: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r7-32*4] + mova m3, [r7+32*3] + mova m4, [r7+32*0] + mova m7, [r7-32*1] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast4 + mova m1, [r8+32*3] + mova m2, [r8-32*4] + mova m5, [r8-32*1] + mova m6, [r8+32*0] +.fast4: + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r7-32*2] + mova m3, [r7+32*1] + mova m4, [r7+32*2] + mova m7, [r7-32*3] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast5 + mova m1, [r8+32*1] + mova m2, [r8-32*2] + mova m5, [r8-32*3] + mova m6, [r8+32*2] +.fast5: + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 + add r10d, 0x80000000 + jc .ret + lea r2, [rsp+32*7] + lea r7, [r2+32*16] + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + jmp .pass2_loop +.ret: + RET + +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + mov r2d, 32 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly +.normal: + PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + lea tmp1q, [rsp+32*7] + lea tmp4d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + vpbroadcastd m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [cq+64* 1] + pmulhrsw m1, m7, [cq+64*31] + pmulhrsw 
m2, m7, [cq+64*17] + pmulhrsw m3, m7, [cq+64*15] + pmulhrsw m4, m7, [cq+64* 9] + pmulhrsw m5, m7, [cq+64*23] + pmulhrsw m6, m7, [cq+64*25] + pmulhrsw m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + pmulhrsw m0, m7, [cq+64* 5] + pmulhrsw m1, m7, [cq+64*27] + pmulhrsw m2, m7, [cq+64*21] + pmulhrsw m3, m7, [cq+64*11] + pmulhrsw m4, m7, [cq+64*13] + pmulhrsw m5, m7, [cq+64*19] + pmulhrsw m6, m7, [cq+64*29] + pmulhrsw m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave + add cq, 32 + add tmp4d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*15] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + mov tmp4b, 4 +.pass2_loop: + lea tmp2q, [tmp1q+32*64] + LOAD_8ROWS tmp1q-32*4, 32 + test tmp4d, 0x40000000 + jnz .fast + LOAD_8ROWS_H tmp2q-32*4, 32 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + lea tmp3q, [tmp2q-32*8] + LOAD_8ROWS_H tmp3q-32*4, 32 + mova [rsp], m15 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + lea tmp3q, [tmp1q-32*8] + LOAD_8ROWS tmp3q-32*4, 32 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end + add tmp1q, 32*16 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + dec tmp4b + jg .pass2_loop + RET +ALIGN function_align +.transpose_round_interleave: + mov tmp3d, 4 +.loop: + lea tmp2q, [tmp1q+32*8] + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp2q-32*4], 1 + vinserti128 m1, [tmp2q-32*3], 1 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp2q-32*2], 1 + vinserti128 m3, [tmp2q-32*1], 1 + mova xm4, [tmp1q+32*0] + mova xm5, [tmp1q+32*1] + vinserti128 m4, [tmp2q+32*0], 1 + vinserti128 m5, [tmp2q+32*1], 1 + mova xm6, [tmp1q+32*2] + mova xm7, [tmp1q+32*3] + vinserti128 m6, [tmp2q+32*2], 1 + vinserti128 m7, [tmp2q+32*3], 1 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova xm8, [tmp1q-32*4+16] + mova xm9, [tmp1q-32*3+16] + vinserti128 m8, [tmp2q-32*4+16], 1 + vinserti128 m9, [tmp2q-32*3+16], 1 + mova [tmp1q-32*4], m0 + mova [tmp2q-32*4], m1 + mova [tmp1q-32*3], m2 + mova [tmp2q-32*3], m3 + mova xm2, [tmp1q-32*2+16] + mova xm3, [tmp1q-32*1+16] + vinserti128 m2, [tmp2q-32*2+16], 1 + vinserti128 m3, [tmp2q-32*1+16], 1 + mova [tmp1q-32*2], m4 + mova [tmp2q-32*2], m5 + mova [tmp1q-32*1], m6 + mova [tmp2q-32*1], m7 + mova xm4, [tmp1q+32*0+16] + mova xm5, [tmp1q+32*1+16] + vinserti128 m4, [tmp2q+32*0+16], 1 + vinserti128 m5, [tmp2q+32*1+16], 1 + mova xm6, [tmp1q+32*2+16] + mova xm7, [tmp1q+32*3+16] + vinserti128 m6, [tmp2q+32*2+16], 1 + vinserti128 m7, [tmp2q+32*3+16], 1 + pmulhrsw m0, m8, m10 + pmulhrsw m1, m9, m10 + REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova 
[tmp2q+32*3], m7 + add tmp1q, 32*16 + dec tmp3d + jg .loop + ret + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob + lea rax, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + mov r2d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly +.normal: + PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*71] + lea r10d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + mova m4, [cq+64* 9] + mova m5, [cq+64*23] + mova m6, [cq+64*25] + mova m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + mova m4, [cq+64*13] + mova m5, [cq+64*19] + mova m6, [cq+64*29] + mova m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave + add cq, 32 + add r10d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*7] + mov r10b, 4 +.pass2_loop: + lea r2, [tmp1q+32*64] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + mova [rsp], m4 + test r10d, 0x40000000 + jnz .fast + lea r3, [r2+32*64] + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast: + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10d, 0x40000000 + jnz .fast2 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd 
m15, [o(pd_2048)] + add r2, 32*8 + add r3, 32*8 + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r2-32*4] ; 1 + mova m3, [r2+32*3] ; 15 + mova m4, [r2+32*0] ; 9 + mova m7, [r2-32*1] ; 7 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast3 + mova m1, [r3+32*3] ; 31 + mova m2, [r3-32*4] ; 17 + mova m5, [r3-32*1] ; 23 + mova m6, [r3+32*0] ; 25 +.fast3: + add rax, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add rax, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r2-32*2] ; 5 + mova m3, [r2+32*1] ; 11 + mova m4, [r2+32*2] ; 13 + mova m7, [r2-32*3] ; 3 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast4 + mova m1, [r3+32*1] ; 27 + mova m2, [r3-32*2] ; 21 + mova m5, [r3-32*3] ; 19 + mova m6, [r3+32*2] ; 29 +.fast4: + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 + sub tmp1q, 32*28 + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + dec r10b + jg .pass2_loop + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/itx_init_tmpl.c dav1d-0.9.1/src/x86/itx_init_tmpl.c --- dav1d-0.7.1/src/x86/itx_init_tmpl.c 2020-06-21 11:48:55.024126500 +0000 +++ dav1d-0.9.1/src/x86/itx_init_tmpl.c 2021-07-28 21:38:28.905852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. * @@ -29,79 +29,65 @@ #include "src/itx.h" #define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) #define decl_itx12_fns(w, h, opt) \ decl_itx2_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) #define decl_itx16_fns(w, h, opt) \ decl_itx12_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt) 
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) #define decl_itx17_fns(w, h, opt) \ decl_itx16_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) -decl_itx17_fns( 4, 4, avx2); -decl_itx16_fns( 4, 8, avx2); -decl_itx16_fns( 4, 16, avx2); -decl_itx16_fns( 8, 4, avx2); -decl_itx16_fns( 8, 8, avx2); -decl_itx16_fns( 8, 16, avx2); -decl_itx2_fns ( 8, 32, avx2); -decl_itx16_fns(16, 4, avx2); -decl_itx16_fns(16, 8, avx2); -decl_itx12_fns(16, 16, avx2); -decl_itx2_fns (16, 32, avx2); -decl_itx2_fns (32, 8, avx2); -decl_itx2_fns (32, 16, avx2); -decl_itx2_fns (32, 32, avx2); - -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_avx2); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_avx2); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2); - -decl_itx17_fns( 4, 4, ssse3); -decl_itx16_fns( 4, 8, ssse3); -decl_itx16_fns( 8, 4, ssse3); -decl_itx16_fns( 8, 8, ssse3); -decl_itx16_fns( 4, 16, ssse3); -decl_itx16_fns(16, 4, ssse3); -decl_itx16_fns( 8, 16, ssse3); -decl_itx16_fns(16, 8, ssse3); -decl_itx12_fns(16, 16, ssse3); -decl_itx2_fns ( 8, 32, ssse3); -decl_itx2_fns (32, 8, ssse3); -decl_itx2_fns (16, 32, ssse3); -decl_itx2_fns (32, 16, ssse3); -decl_itx2_fns (32, 32, ssse3); - -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3); - -COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { +#define decl_itx_fns(ext) \ +decl_itx17_fns( 4, 4, ext); \ +decl_itx16_fns( 4, 8, ext); \ +decl_itx16_fns( 4, 16, ext); \ +decl_itx16_fns( 8, 4, ext); \ +decl_itx16_fns( 8, 8, ext); \ +decl_itx16_fns( 8, 16, ext); \ +decl_itx2_fns ( 8, 32, ext); \ +decl_itx16_fns(16, 4, ext); \ +decl_itx16_fns(16, 8, ext); \ +decl_itx12_fns(16, 16, ext); \ +decl_itx2_fns (16, 32, ext); \ +decl_itx2_fns (32, 8, ext); \ +decl_itx2_fns (32, 16, ext); \ +decl_itx2_fns (32, 32, ext); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext)) + +decl_itx_fns(avx2); +decl_itx_fns(sse4); +decl_itx_fns(ssse3); +decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_sse2); + +COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, + const int bpc) +{ #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - dav1d_inv_txfm_add_##type##_##w##x##h##_##ext + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) #define assign_itx1_fn(pfx, w, h, ext) \ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) @@ -134,9 +120,14 @@ assign_itx16_fn(pfx, w, h, ext); \ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) - const unsigned flags = dav1d_get_cpu_flags(); + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + +#if BITDEPTH == 16 + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2); +#endif + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; 
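The declarations above now go through BF(), which pastes the bitdepth and ISA suffix onto the template name so that one init file can be compiled per BITDEPTH and still bind to symbols such as dav1d_inv_txfm_add_wht_wht_4x4_16bpc_sse2. A rough sketch of the expansion, assuming the usual shape of the macro (the real definition lives elsewhere in the dav1d headers and is not part of this hunk):

    /* Assumed shape of BF(), for illustration only. */
    #ifndef BITDEPTH
    #define BITDEPTH 8            /* picked per template instantiation */
    #endif
    #if BITDEPTH == 8
    #define BF(name, suffix) name##_8bpc_##suffix
    #else
    #define BF(name, suffix) name##_16bpc_##suffix
    #endif

    /* e.g. decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, avx2))
     * declares dav1d_inv_txfm_add_dct_dct_64x64_8bpc_avx2 in the 8-bit
     * build, matching the *_8bpc cglobal names in the asm files. */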
#if BITDEPTH == 8 @@ -161,9 +152,28 @@ assign_itx1_fn ( , 64, 64, ssse3); #endif + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + +#if BITDEPTH == 16 + if (bpc <= 10) { + assign_itx16_fn(, 4, 4, sse4); + assign_itx16_fn(R, 4, 8, sse4); + assign_itx16_fn(R, 4, 16, sse4); + assign_itx16_fn(R, 8, 4, sse4); + assign_itx16_fn(, 8, 8, sse4); + assign_itx16_fn(R, 8, 16, sse4); + } +#endif + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if BITDEPTH == 8 && ARCH_X86_64 +#if ARCH_X86_64 + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); +#endif + + if (bpc > 10) return; + +#if ARCH_X86_64 assign_itx17_fn( , 4, 4, avx2); assign_itx16_fn(R, 4, 8, avx2); assign_itx16_fn(R, 4, 16, avx2); diff -Nru dav1d-0.7.1/src/x86/itx_sse.asm dav1d-0.9.1/src/x86/itx_sse.asm --- dav1d-0.7.1/src/x86/itx_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/itx_sse.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,6560 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
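Most of the multiplies in the new file that follows are pmulhrsw operations against the pw_* constants defined in its read-only data section below. A scalar model of that primitive, for illustration only (not part of the patch):

    /* pmulhrsw: signed 16x16 multiply, round, arithmetic shift right by 15. */
    static inline int pmulhrsw_model(int a, int b)   /* a, b: signed 16-bit */
    {
        return (a * b + (1 << 14)) >> 15;            /* == (2*a*b + 0x8000) >> 16 */
    }

    /* Storing a 12-bit transform constant c premultiplied by 8 (the pw_*x8
     * entries, e.g. pw_2896x8 = 2896*8) turns one pmulhrsw into the usual
     * AV1 rounding:  pmulhrsw_model(a, c * 8) == (a * c + 2048) >> 12.    */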
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + + +SECTION_RODATA 16 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 + +%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 +pw_%1_m%2: times 4 dw %1, -%2 +%if %3 != 2 +pw_%2_%1: times 4 dw %2, %1 +%endif +%if %3 +pw_m%1_m%2: times 4 dw -%1, -%2 +%endif +%endmacro + +;adst4 +pw_1321_3803: times 4 dw 1321, 3803 +pw_2482_m1321: times 4 dw 2482, -1321 +pw_3344_2482: times 4 dw 3344, 2482 +pw_3344_m3803: times 4 dw 3344, -3803 +pw_3344_m3344: times 4 dw 3344, -3344 +pw_0_3344 times 4 dw 0, 3344 +pw_m6688_m3803: times 4 dw -6688, -3803 + +COEF_PAIR 2896, 2896 +COEF_PAIR 1567, 3784 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 3784, 1567, 1 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4017, 799, 1 +COEF_PAIR 201, 4091 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 4052, 601 +COEF_PAIR 2276, 3406, 1 +COEF_PAIR 4076, 401, 2 +COEF_PAIR 2598, 3166, 2 +COEF_PAIR 3612, 1931, 2 +COEF_PAIR 1189, 3920, 2 + +pd_2048: times 4 dd 2048 +pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 +pw_4096: times 8 dw 4096 +pw_16384: times 8 dw 16384 +pw_m16384: times 8 dw -16384 +pw_1697x16: times 8 dw 1697*16 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pw_3344x8: times 8 dw 3344*8 +pw_8192: times 8 dw 8192 +pw_m8192: times 8 dw -8192 +pw_5: times 8 dw 5 +pw_201x8: times 8 dw 201*8 +pw_4091x8: times 8 dw 4091*8 +pw_m2751x8: times 8 dw -2751*8 +pw_3035x8: times 8 dw 3035*8 +pw_1751x8: times 8 dw 1751*8 +pw_3703x8: times 8 dw 3703*8 +pw_m1380x8: times 8 dw -1380*8 +pw_3857x8: times 8 dw 3857*8 +pw_995x8: times 8 dw 995*8 +pw_3973x8: times 8 dw 3973*8 +pw_m2106x8: times 8 dw -2106*8 +pw_3513x8: times 8 dw 3513*8 +pw_2440x8: times 8 dw 2440*8 +pw_3290x8: times 8 dw 3290*8 +pw_m601x8: times 8 dw -601*8 +pw_4052x8: times 8 dw 4052*8 + +pw_4095x8: times 8 dw 4095*8 +pw_101x8: times 8 dw 101*8 +pw_2967x8: times 8 dw 2967*8 +pw_m2824x8: times 8 dw -2824*8 +pw_3745x8: times 8 dw 3745*8 +pw_1660x8: times 8 dw 1660*8 +pw_3822x8: times 8 dw 3822*8 +pw_m1474x8: times 8 dw -1474*8 +pw_3996x8: times 8 dw 3996*8 +pw_897x8: times 8 dw 897*8 +pw_3461x8: times 8 dw 3461*8 +pw_m2191x8: times 8 dw -2191*8 +pw_3349x8: times 8 dw 3349*8 +pw_2359x8: times 8 dw 2359*8 +pw_4036x8: times 8 dw 4036*8 +pw_m700x8: times 8 dw -700*8 +pw_4065x8: times 8 dw 4065*8 +pw_501x8: times 8 dw 501*8 +pw_3229x8: times 8 dw 3229*8 +pw_m2520x8: times 8 dw -2520*8 +pw_3564x8: times 8 dw 3564*8 +pw_2019x8: times 8 dw 2019*8 +pw_3948x8: times 8 dw 3948*8 +pw_m1092x8: times 8 dw -1092*8 +pw_3889x8: times 8 dw 3889*8 +pw_1285x8: times 8 dw 1285*8 +pw_3659x8: times 8 dw 3659*8 +pw_m1842x8: times 8 dw -1842*8 +pw_3102x8: times 8 dw 3102*8 +pw_2675x8: times 8 dw 2675*8 +pw_4085x8: times 8 dw 4085*8 +pw_m301x8: times 8 dw -301*8 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r5-$$+x ; PIC +%endif + +%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rotate 5 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, 
%%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + + movd m%3, [%%row_adr1] ;dst0 + movd m%5, [%%row_adr2] ;dst1 + punpckldq m%3, m%5 ;high: dst1 :low: dst0 + movd m%4, [%%row_adr3] ;dst2 + movd m%5, [%%row_adr4] ;dst3 + punpckldq m%4, m%5 ;high: dst3 :low: dst2 + + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word + + paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 + paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 + + packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 + + movd [%%row_adr1], m%3 ;store dst0 + out0 + pshuflw m%4, m%3, q1032 + movd [%%row_adr2], m%4 ;store dst1 + out1 + punpckhqdq m%3, m%3 + movd [%%row_adr3], m%3 ;store dst2 + out2 + psrlq m%3, 32 + movd [%%row_adr4], m%3 ;store dst3 + out3 +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + mova m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + + WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 + ret +%endmacro + +; flags: 1 = swap, 2: coef_regs, 4: no_pack +%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags +%if %6 & 2 + pmaddwd m%2, m%4, m%1 + pmaddwd m%1, m%5 +%elif %6 & 1 + pmaddwd m%2, m%1, [o(pw_%5_%4)] + pmaddwd m%1, [o(pw_%4_m%5)] +%else + pmaddwd m%2, m%1, [o(pw_%4_m%5)] + pmaddwd m%1, [o(pw_%5_%4)] +%endif + paddd m%2, m%3 + paddd m%1, m%3 + psrad m%2, 12 + psrad m%1, 12 +%if %6 & 4 == 0 + packssdw m%1, m%2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 + mova m3, [o(pd_2048)] + punpckhwd m2, m0, m1 ;unpacked in1 in3 + punpcklwd m0, m1 ;unpacked in0 in2 + ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 + ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 + psubsw m1, m0, m2 ;high: out2 ;low: out3 + paddsw m0, m2 ;high: out1 ;low: out0 +%endmacro + +%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) +%if ARCH_X86_32 + LEA r5, $$ +%endif +%if has_epilogue +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] + call %%p1 + RET +%%end: +%else + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4, 6 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd ;0 + pmulhrsw m0, m1 + mova m1, m0 + TAIL_CALL m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +INIT_XMM ssse3 +; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16. 
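The dct_dct branch of INV_TXFM_4X4_FN just above (instantiated right below) short-circuits eob == 0: with only the DC coefficient present, both passes collapse to a multiply by 2896 (≈ 4096/√2) and the whole block reduces to adding one constant to every pixel. A scalar sketch of that path, for illustration only; the larger sizes reach the same kind of shortcut through their shared .dconly loops, with pw_8192/pw_16384 supplying the size-dependent rounding:

    #include <stddef.h>
    #include <stdint.h>

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* eob == 0 path of the 4x4 dct_dct case: two (x*2896 + 2048) >> 12 scales,
     * the (x + 8) >> 4 rounding that ITX4_END applies via pw_2048, then a
     * saturating add into the destination. */
    static void dc_only_add_4x4(uint8_t *dst, ptrdiff_t stride, int16_t *coeff)
    {
        int dc = (coeff[0] * 2896 + 2048) >> 12;   /* pass 1 */
        dc     = (dc       * 2896 + 2048) >> 12;   /* pass 2 */
        dc     = (dc + 8) >> 4;                    /* output rounding */
        coeff[0] = 0;                              /* the asm stores 0 over the DC term */
        for (int y = 0; y < 4; y++, dst += stride)
            for (int x = 0; x < 4; x++)
                dst[x] = clip_u8(dst[x] + dc);
    }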
+ +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] ;high: in1 ;low: in0 + mova m1, [coeffq+16*1] ;high: in3 ;low in2 + + IDCT4_1D_PACKED + + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 ;high: in1 ;low: in0 + pshufb m1, m3, m2 ;high: in3 ;low :in2 + jmp tx2q + +.pass2: + IDCT4_1D_PACKED + + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); + + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call .main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call .main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 0, 1, 2, 3 + +ALIGN function_align +cglobal_label .main + punpcklwd m2, m0, m1 ;unpacked in0 in2 + punpckhwd m0, m1 ;unpacked in1 in3 + mova m3, m0 + pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 + pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 + paddd m1, m0 ;t2 + pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 + paddd m4, m0 ;t0 + t3 + pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m0, [o(pd_2048)] + paddd m1, m0 ;t2 + 2048 + paddd m2, m0 + paddd m0, m4 ;t0 + t3 + 2048 + paddd m5, m2 ;t1 + t3 + 2048 + paddd m2, m4 + paddd m2, m3 ;t0 + t1 - t3 + 2048 + REPX {psrad x, 12}, m1, m0, m5, m2 + packssdw m0, m5 ;high: out1 ;low: out0 + packssdw m1, m2 ;high: out3 ;low: out3 + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 ;high: in3 ;low :in2 + punpckhwd m1, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call m(iadst_4x4_internal_8bpc).main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m0, m3 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ;low: in1 high: in3 + punpcklqdq m0, m1 ;low: in0 high: in2 + psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 + paddw m0, m3 ;low: in0 + in1 high: in2 + in3 + punpckhqdq m2, m2 ;t2 t2 + punpcklqdq m0, m0 ;t0 t0 + 
psubw m1, m0, m2 + psraw m1, 1 ;t4 t4 + psubw m1, m3 ;low: t1/out2 high: t3/out1 + psubw m0, m1 ;high: out0 + paddw m2, m1 ;low: out3 +%endmacro + +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + + IWHT4_1D_PACKED + + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + + IWHT4_1D_PACKED + + shufpd m0, m2, 0x01 + ITX4_END 0, 3, 2, 1, 0 + + +%macro IDCT8_1D_PACKED 0 + mova m6, [o(pd_2048)] + punpckhwd m4, m0, m3 ;unpacked in1 in7 + punpcklwd m0, m2 ;unpacked in0 in4 + punpckhwd m2, m1 ;unpacked in5 in3 + punpcklwd m1, m3 ;unpacked in2 in6 + ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a + ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a + ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 + psubsw m3, m4, m2 ;low: t6a high: t5a + paddsw m4, m2 ;low: t7 high: t4 + pshufb m3, [o(deint_shuf1)] + ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 + ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 + psubsw m2, m0, m1 ;low: tmp3 high: tmp2 + paddsw m0, m1 ;low: tmp0 high: tmp1 + punpcklqdq m1, m4, m3 ;low: t7 high: t6 + punpckhqdq m4, m3 ;low: t4 high: t5 + psubsw m3, m0, m1 ;low: out7 high: out6 + paddsw m0, m1 ;low: out0 high: out1 + paddsw m1, m2, m4 ;low: out3 high: out2 + psubsw m2, m4 ;low: out4 high: out5 +%endmacro + +;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 + punpckhwd m%4, m%1, m%2 + punpcklwd m%1, m%2 +%if %7 < 8 + pmaddwd m%2, m%7, m%1 + pmaddwd m%3, m%7, m%4 +%else + mova m%2, [o(pw_%7_%6)] +%if %8 + pmaddwd m%3, m%1, m%2 + pmaddwd m%2, m%4 +%else + pmaddwd m%3, m%4, m%2 + pmaddwd m%2, m%1 +%endif +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %8 + packssdw m%3, m%2 +%else + packssdw m%2, m%3 ;dst2 +%endif +%if %7 < 8 + pmaddwd m%4, m%6 + pmaddwd m%1, m%6 +%elif %8 + mova m%2, [o(pw_%6_m%7)] + pmaddwd m%4, m%2 + pmaddwd m%1, m%2 +%else + mova m%3, [o(pw_%6_m%7)] + pmaddwd m%4, m%3 + pmaddwd m%1, m%3 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 ;dst1 +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 + psubsw m%3, m%1, m%2 ;out2 + paddsw m%2, m%1 ;out1 + paddsw m%1, m%5, m%4 ;out0 + psubsw m%4, m%5 ;out3 +%endmacro + +%macro WRITE_4X8 4 ;row[1-4] + WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 + lea dstq, [dstq+strideq*4] + WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 +%endmacro + +%macro INV_4X8 0 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ;low: in2 high: in3 + punpckldq m0, m2 ;low: in0 high: in1 + punpckldq m2, m3, m4 ;low: in4 high: in5 + punpckhdq m3, m4 ;low: in6 high: in7 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity + 
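The ITX_MULSUB_2W macro defined above is used throughout this file as the basic butterfly rotation; its own comment gives the formula. A scalar reference, for illustration only (not part of the patch):

    /* 12-bit fixed-point rotation of (src1, src2) by (coef1, coef2), rounded
     * with pd_2048 before the shift -- exactly the dst1/dst2 lines in the
     * comment above the ITX_MULSUB_2W macro. */
    static void itx_mulsub_2w_model(int *dst1, int *dst2,
                                    int src1, int src2, int coef1, int coef2)
    {
        *dst1 = (src1 * coef1 - src2 * coef2 + 2048) >> 12;
        *dst2 = (src1 * coef2 + src2 * coef1 + 2048) >> 12;
    }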
+cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(idct_8x4_internal_8bpc).main + jmp m(iadst_4x8_internal_8bpc).pass1_end + +.pass2: + call .main + shufps m1, m1, q1032 + shufps m3, m3, q1032 + mova m4, [o(pw_2048)] + jmp m(iadst_4x8_internal_8bpc).end2 + +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal_8bpc).main + +.pass1_end: + INV_4X8 + jmp tx2q + +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call .main + mova m4, [o(pw_2048)] + pxor m5, m5 + psubw m5, m4 + +.end: + punpcklqdq m4, m5 + +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pxor m5, m5 + mova [coeffq+16*0], m5 + mova [coeffq+16*1], m5 + mova [coeffq+16*2], m5 + mova [coeffq+16*3], m5 + +.end3: + WRITE_4X8 0, 1, 2, 3 + RET + +ALIGN function_align +cglobal_label .main + mova m6, [o(pd_2048)] + punpckhwd m4, m3, m0 ;unpacked in7 in0 + punpckhwd m5, m2, m1 ;unpacked in5 in2 + punpcklwd m1, m2 ;unpacked in3 in4 + punpcklwd m0, m3 ;unpacked in1 in6 + ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a + ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a + ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a + ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a + + psubsw m3, m4, m1 ;low: t4 high: t5 + paddsw m4, m1 ;low: t0 high: t1 + psubsw m2, m5, m0 ;low: t6 high: t7 + paddsw m5, m0 ;low: t2 high: t3 + + shufps m1, m3, m2, q1032 + punpckhwd m2, m1 + punpcklwd m3, m1 + ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a + ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a + + psubsw m1, m4, m5 ;low: t2 high: t3 + paddsw m4, m5 ;low: out0 high: -out7 + psubsw m5, m3, m2 ;low: t7 high: t6 + paddsw m3, m2 ;low: out6 high: -out1 + shufps m0, m4, m3, q3210 ;low: out0 high: -out1 + shufps m3, m4, q3210 ;low: out6 high: -out7 + + mova m2, [o(pw_2896_m2896)] + mova m7, [o(pw_2896_2896)] + shufps m4, m1, m5, q1032 ;low: t3 high: t7 + shufps m1, m5, q3210 ;low: t2 high: t6 + punpcklwd m5, m1, m4 + punpckhwd m1, m4 + pmaddwd m4, m2, m1 ;-out5 + pmaddwd m2, m5 ; out4 + pmaddwd m1, m7 ; out2 + pmaddwd m5, m7 ;-out3 + REPX {paddd x, m6}, m4, m2, m1, m5 + REPX {psrad x, 12}, m4, m2, m1, m5 + packssdw m1, m5 ;low: out2 high: -out3 + packssdw m2, m4 ;low: out4 high: -out5 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal_8bpc).main + + punpcklwd m4, m3, m2 + punpckhwd m3, m2 + punpcklwd m5, m1, m0 + punpckhwd m1, m0 + punpckldq m2, m3, m1 ;low: in4 high: in5 + punpckhdq m3, m1 ;low: in6 high: in7 + punpckldq m0, m4, m5 ;low: in0 high: in1 + punpckhdq m1, m4, m5 ;low: in2 high: in3 + jmp tx2q + +.pass2: + shufps m0, m0, q1032 + shufps 
m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m5, [o(pw_2048)] + pxor m4, m4 + psubw m4, m5 + jmp m(iadst_4x8_internal_8bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_4x8_internal_8bpc).pass1_end + +.pass2: + mova m4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + + +%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] + movq m%3, [dstq ] + movq m%4, [dstq+strideq] + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + movq [dstq ], m%3 + punpckhqdq m%3, m%3 + movq [dstq+strideq], m%3 +%endmacro + +%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] + WRITE_8X2 %1, %2, %5, %6, %7 + lea dstq, [dstq+strideq*2] + WRITE_8X2 %3, %4, %5, %6, %7 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + mova m2, [o(pw_2048)] + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_8x4_internal_8bpc).end2 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + call m(idct_4x8_internal_8bpc).main + + mova m4, [o(deint_shuf1)] + mova m5, [o(deint_shuf2)] + pshufb m0, m4 + pshufb m1, m5 + pshufb m2, m4 + pshufb m3, m5 + punpckhdq m4, m0, m1 + punpckldq m0, m1 + punpckhdq m5, m2, m3 + punpckldq m2, m3 + punpckhqdq m1, m0, m2 ;in1 + punpcklqdq m0, m2 ;in0 + punpckhqdq m3, m4, m5 ;in3 + punpcklqdq m2 ,m4, m5 ;in2 + jmp tx2q + +.pass2: + call .main + jmp m(iadst_8x4_internal_8bpc).end + +ALIGN function_align +cglobal_label .main + mova m6, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + pxor m5, m5 + psubsw m3, m5, m1 + psubsw m5, m4 + punpckhdq m4, m5, m3 + punpckldq m5, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m5 ;in1 + punpcklwd m0, m5 ;in0 + punpcklwd m2, m3, m4 ;in2 + punpckhwd m3, m4 ;in3 + jmp tx2q + +.pass2: + call .main + +.end: + mova 
m4, [o(pw_2048)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + +.end2: + pxor m6, m6 + mova [coeffq+16*0], m6 + mova [coeffq+16*1], m6 + mova [coeffq+16*2], m6 + mova [coeffq+16*3], m6 +.end3: + WRITE_8X4 0, 1, 2, 3, 4, 5, 6 + RET + +ALIGN function_align +cglobal_label .main + punpckhwd m6, m0, m2 ;unpacked in0 in2 + punpcklwd m0, m2 ;unpacked in0 in2 + punpckhwd m7, m1, m3 ;unpacked in1 in3 + punpcklwd m1, m3 ;unpacked in1 in3 + + mova m2, [o(pw_3344_m3344)] + mova m4, [o(pw_0_3344)] + pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 + pmaddwd m5, m4, m7 ;3344 * in3 + pmaddwd m2, m0 + pmaddwd m4, m1 + paddd m3, m5 + paddd m2, m4 + mova m4, [o(pd_2048)] + paddd m3, m4 ;t2 + 2048 + paddd m2, m4 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + + pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m3, m4 ;t0 + t3 + + pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m4, [o(pd_2048)] + paddd m0, m4 + paddd m4, m3 ;t0 + t3 + 2048 + paddd m5, m0 ;t1 + t3 + 2048 + paddd m3, m0 + paddd m3, m1 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m3, 12 ;out3 + packssdw m0, m4, m5 ;low: out0 high: out1 + + pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m1, m4 ;t0 + t3 + pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + + mova m4, [o(pd_2048)] + paddd m6, m4 + paddd m4, m1 ;t0 + t3 + 2048 + paddd m5, m6 ;t1 + t3 + 2048 + paddd m1, m6 + paddd m1, m7 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m1, 12 ;out3 + packssdw m3, m1 ;out3 + packssdw m4, m5 ;low: out0 high: out1 + + punpckhqdq m1, m0, m4 ;out1 + punpcklqdq m0, m4 ;out0 + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + punpckhwd m5, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + + pxor m0, m0 + psubsw m4, m0, m2 + psubsw m0, m5 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + punpckhwd m1, m0, m3 ;in1 + punpcklwd m0, m3 ;in0 + punpckhwd m3, m2, m4 ;in3 + punpcklwd m2, m4 ;in2 + jmp tx2q + +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m4, m0 + mova m5, m1 + mova m0, m3 + mova m1, m2 + mova m2, m5 + mova m3, m4 + jmp m(iadst_8x4_internal_8bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + paddsw m0, m0 + paddsw m1, m1 + paddsw m2, m2 + paddsw m3, m3 + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m5, 
m4, m1 + punpckldq m4, m1 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m4 ;in1 + punpcklwd m0, m4 ;in0 + punpcklwd m2, m3, m5 ;in2 + punpckhwd m3, m5 ;in3 + jmp tx2q + +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8, 8, 16*4 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 3 + pmulhrsw m0, m1 + pmulhrsw m0, m2 +.end: + mov r3d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] +.loop: + WRITE_8X4 0, 0, 0, 0, 1, 2, 3 + lea dstq, [dstq+strideq*2] + dec r3d + jg .loop + jmp tx2q +.end3: + RET +%endif +%endmacro + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [%1+%2*0] + pmulhrsw m1, m7, [%1+%2*1] + pmulhrsw m2, m7, [%1+%2*2] + pmulhrsw m3, m7, [%1+%2*3] + pmulhrsw m4, m7, [%1+%2*4] + pmulhrsw m5, m7, [%1+%2*5] + pmulhrsw m6, m7, [%1+%2*6] + pmulhrsw m7, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a + ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a + psubsw m%2, m%4, m%5 ;t6a + paddsw m%4, m%5 ;t7 + psubsw m%5, m%1, m%3 ;t5a + paddsw m%1, m%3 ;t4 + ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + +.pass1_end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + +cglobal_label .pass1_end3 + punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 + punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 + punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 + punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 + punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 + punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 + punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 + punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 + punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 + punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 + mova [rsp+gprsize+16*2], m6 + mova m6, [rsp+gprsize+16*1] + punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 + punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 + punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 + punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 + punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 + punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 + + punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 + punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 + mova [rsp+gprsize+16*0], m2 + punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 + punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 + punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 + punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 + mova m7, [rsp+gprsize+16*2] + punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 + punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 + mova m7, [rsp+gprsize+16*0] + jmp tx2q + +.pass2: + lea tx2q, 
[o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call .main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + +.end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*0], m7 + +.end3: + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 + jmp tx2q + +.end4: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], m3 + mova [rsp+gprsize*2+16*2], m1 + mova m7, [o(pd_2048)] + IDCT4_1D 0, 2, 4, 6, 1, 3, 7 + mova m3, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m2 + mova m2, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m4 + mova m4, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m6 + IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 + mova m6, [rsp+gprsize*2+16*0] + psubsw m7, m0, m4 ;out7 + paddsw m0, m4 ;out0 + mova [rsp+gprsize*2+16*0], m7 + mova m1, [rsp+gprsize*2+16*2] + psubsw m4, m6, m3 ;out4 + paddsw m3, m6 ;out3 + mova m7, [rsp+gprsize*2+16*1] + psubsw m6, m1, m5 ;out6 + paddsw m1, m5 ;out1 + psubsw m5, m7, m2 ;out5 + paddsw m2, m7 ;out2 + ret + + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + call .main_pass1_end + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end2 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call .main + call .main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal_8bpc).end2 + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], m3 + mova [rsp+gprsize*2+16*2], m4 + mova m7, [o(pd_2048)] + ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a + ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a + paddsw m3, m2, m6 ;t2 + psubsw m2, m6 ;t6 + paddsw m4, m5, m1 ;t3 + psubsw m5, m1 ;t7 + ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a + + mova m6, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m5 + mova m1, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m2 + mova m5, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m3 + ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a + ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a + psubsw m2, m0, m6 ;t4 + paddsw m0, m6 ;t0 + paddsw m3, m5, m1 ;t1 + psubsw m5, m1 ;t5 + ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a + + mova m7, [rsp+gprsize*2+16*0] + paddsw m1, m3, m4 ;-out7 + psubsw m3, m4 ;t3 + mova [rsp+gprsize*2+16*0], m1 + psubsw m4, m0, m7 ;t2 + paddsw m0, m7 ;out0 + mova m6, [rsp+gprsize*2+16*2] + mova m7, [rsp+gprsize*2+16*1] + paddsw m1, m5, m6 ;-out1 + psubsw m5, m6 ;t6 + paddsw m6, m2, m7 ;out6 + psubsw m2, m7 ;t7 + ret +ALIGN function_align +.main_pass1_end: + mova [rsp+gprsize*2+16*1], m1 + mova [rsp+gprsize*2+16*2], m6 + punpckhwd m1, m4, m3 + punpcklwd m4, m3 + punpckhwd m7, m5, m2 + punpcklwd m5, m2 + mova m2, [o(pw_2896_2896)] + mova m6, [o(pd_2048)] + pmaddwd m3, m2, m7 + pmaddwd m2, m5 
+ paddd m3, m6 + paddd m2, m6 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + mova m3, [o(pw_2896_m2896)] + pmaddwd m7, m3 + pmaddwd m5, m3 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m5, m7 ;-out5 + mova m3, [o(pw_2896_2896)] + pmaddwd m7, m3, m1 + pmaddwd m3, m4 + paddd m7, m6 + paddd m3, m6 + psrad m7, 12 + psrad m3, 12 + packssdw m3, m7 ;-out3 + mova m7, [o(pw_2896_m2896)] + pmaddwd m1, m7 + pmaddwd m4, m7 + paddd m1, m6 + paddd m4, m6 + psrad m1, 12 + psrad m4, 12 + packssdw m4, m1 ;-out5 + mova m1, [rsp+gprsize*2+16*1] + mova m6, [rsp+gprsize*2+16*2] + ret +ALIGN function_align +cglobal_label .main_pass2_end + paddsw m7, m4, m3 ;t2 + t3 + psubsw m4, m3 ;t2 - t3 + paddsw m3, m5, m2 ;t6 + t7 + psubsw m5, m2 ;t6 - t7 + mova m2, [o(pw_2896x8)] + pmulhrsw m4, m2 ;out4 + pmulhrsw m5, m2 ;-out5 + pmulhrsw m7, m2 ;-out3 + pmulhrsw m2, m3 ;out2 + mova m3, m7 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass1_end + +.pass1_end: + mova m7, [o(pw_m16384)] + +.pass1_end1: + pmulhrsw m1, m7 + mova [rsp+gprsize+16*1], m1 + mova m1, m6 + mova m6, m2 + pmulhrsw m2, m5, m7 + mova m5, m6 + mova m6, m4 + pmulhrsw m4, m3, m7 + mova m3, m6 + mova m6, m0 + mova m0, m7 + pxor m7, m7 + psubw m7, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*2], m2 + mova m2, m0 + pxor m0, m0 + psubw m0, m7 + mova m7, m2 + pmulhrsw m1, m0 + pmulhrsw m2, m5, m0 + mova [rsp+gprsize+16*1], m1 + mova m5, m4 + mova m1, m6 + pmulhrsw m4, m3, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + mova m3, m5 + mova [rsp+gprsize+16*0], m7 + jmp m(idct_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.end: + pmulhrsw m7, [o(pw_4096)] + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_4096)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).end3 + + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, [o(pw_16384)] + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] +.end: + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + RET +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity + 
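Each INV_TXFM_*_FN macro above also special-cases dct_dct: when only the DC coefficient is left, the inverse transform degenerates into scaling that single value and adding it to every pixel of the block, which the asm does by broadcasting it with pshuflw/punpcklwd and looping over the WRITE_4X4/WRITE_8X4 helpers. A conceptual C sketch of the 4x16 variant, assuming 8-bit pixels; the helper names are illustrative rather than dav1d's, and pmulhrsw_scalar models the SSSE3 instruction's rounded (a*b) >> 15:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of SSSE3 pmulhrsw: rounded (a*b) >> 15. */
    static int16_t pmulhrsw_scalar(int16_t a, int16_t b)
    {
        return (int16_t)((a * b + (1 << 14)) >> 15);
    }

    static uint8_t clip_pixel(int v)                /* what packuswb does */
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Hypothetical helper (not dav1d API): the dct_dct shortcut for a 4x16
     * block. The scaling chain mirrors the pw_2896x8 / pw_16384 / pw_2896x8 /
     * pw_2048 pmulhrsw sequence in INV_TXFM_4X16_FN; the add/clip loop is the
     * work done by the repeated WRITE_4X4 calls. */
    static void dc_only_4x16(uint8_t *dst, ptrdiff_t stride, int16_t dc_coeff)
    {
        int16_t dc = dc_coeff;
        dc = pmulhrsw_scalar(dc, 2896 * 8);         /* pw_2896x8 */
        dc = pmulhrsw_scalar(dc, 16384);            /* pw_16384  */
        dc = pmulhrsw_scalar(dc, 2896 * 8);         /* pw_2896x8 */
        dc = pmulhrsw_scalar(dc, 2048);             /* pw_2048   */
        for (int y = 0; y < 16; y++, dst += stride)
            for (int x = 0; x < 4; x++)
                dst[x] = clip_pixel(dst[x] + dc);
    }

Keeping the constant premultiplied by 8 (pw_2896x8) lets pmulhrsw act as a rounded 12-bit downshift instead of a 15-bit one, which is why no extra shift appears in the chain.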
+cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] + +.pass1: + mova m0, [coeffq+16*1] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + push tx2q + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] + jmp r3 + +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] + jmp r3 + +.pass1_end: + pop tx2q + + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*7], m7 + jmp tx2q + +.pass2: + call m(idct_16x4_internal_8bpc).main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + +.end1: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 3, 2 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, [dstq+strideq*4] + WRITE_4X8 0, 1, 3, 2 + +.end2: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 + +.pass2: + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end + + punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 + punpckhqdq m4, m5 ;low: out8 high: out10 + punpcklqdq m5, m7, m2 ;low: out4 high: out6 + punpckhqdq m2, m7 ;low: -out9 high: -out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 + punpcklqdq m0, m6 ;low: out0 high: out2 + punpckhqdq m6, m3, m2 ;low: out12 high: out14 + punpcklqdq m2, m3 ;low: -out1 high: -out3 + + mova m7, [o(pw_2048)] + +.end1: + REPX {pmulhrsw x, m7}, m0, m5, m4, m6 + pxor m3, m3 + psubw m3, m7 + mova m7, [coeffq+16*4] + REPX {pmulhrsw x, m3}, m2, m7, m1 + pmulhrsw m3, [coeffq+16*5] + mova [coeffq+16*7], m5 + + punpckhqdq m5, m4, m7 ;low: out10 high: out11 + punpcklqdq m4, m7 ;low: out8 high: out9 + punpckhqdq m7, m6, m1 ;low: out14 high: out15 + punpcklqdq m6, m1 ;low: out12 high: out13 + punpckhqdq m1, m0, m2 ;low: out2 high: out3 + punpcklqdq m0, m2 ;low: out0 high: out1 + mova [coeffq+16*4], m4 + mova m4, [coeffq+16*7] + punpcklqdq m2, m4, m3 ;low: out4 high: out5 + punpckhqdq m4, m3 ;low: out6 high: out7 + mova m3, m4 + +.end2: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 2, 3 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, [dstq+strideq*4] + WRITE_4X8 0, 1, 2, 3 + +.end3: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 + +.pass2: + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end 
+ + punpckhqdq m6, m5, m4 ;low: out5 high: out7 + punpcklqdq m4, m5 ;low: -out8 high: -out10 + punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 + punpcklqdq m2, m7 ;low: out9 high: out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpcklqdq m1, m6, m0 ;low: out13 high: out15 + punpckhqdq m0, m6 ;low: -out0 high: -out2 + punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 + punpckhqdq m2, m3 ;low: out1 high: out3 + + mova m7, [o(pw_m2048)] + jmp m(iadst_4x16_internal_8bpc).end1 + + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*1] + mova m6, [o(pw_1697x8)] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + pcmpeqw m7, m7 + mov r3, tx2q + lea tx2q, [o(.pass1_2)] +.pass1: + pmulhrsw m4, m6, m0 + pmulhrsw m5, m6, m1 + pavgw m4, m0 + pcmpeqw m0, m7 + pavgw m5, m1 + pcmpeqw m1, m7 + pandn m0, m4 + pmulhrsw m4, m6, m2 + pandn m1, m5 + pmulhrsw m5, m6, m3 + pavgw m4, m2 + pcmpeqw m2, m7 + pavgw m5, m3 + pcmpeqw m3, m7 + pandn m2, m4 + pandn m3, m5 + jmp m(iadst_4x8_internal_8bpc).pass1_end +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(.pass1_end)] + jmp .pass1 +.pass1_end: + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + jmp r3 +.pass2: + mova m7, [o(pw_1697x16)] + mova [coeffq+16*6], m6 + REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, [coeffq+16*7] + IDTX16 6, 7, 7 + mova [coeffq+16*7], m6 + mova m6, [coeffq+16*6] + pmulhrsw m7, m6, [o(pw_1697x16)] + paddsw m6, m6 + paddsw m6, m7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + jmp m(iadst_4x16_internal_8bpc).end2 + + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4, 8 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + mov r2d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] +.dconly: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 +.dconly_loop: + mova m1, [dstq] + mova m3, [dstq+strideq] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq], m1 + mova [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + dec r2d + jg .dconly_loop + jmp tx2q +.end: + RET +%endif +%endmacro + +%macro LOAD_7ROWS 2 ;src, stride + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] +%endmacro + +%macro SAVE_7ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 +%endmacro + +%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] + punpckhwd m%5, m%4, m%1 ;packed 
in13 in3 + punpcklwd m%1, m%4 ;packed in1 in15 + punpcklwd m%4, m%3, m%2 ;packed in9 in7 + punpckhwd m%2, m%3 ;packed in5 in11 + mova m%7, [o(pd_2048)] + ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a + ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a + ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a + psubsw m%6, m%1, m%4 ;low: t9 high: t14 + paddsw m%1, m%4 ;low: t8 high: t15 + psubsw m%4, m%5, m%2 ;low: t10 high: t13 + paddsw m%5, m%2 ;low: t11 high: t12 + mova m%2, [o(deint_shuf2)] + pshufb m%6, m%2 + pshufb m%4, m%2 + ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a + psubsw m%3, m%1, m%5 ;low: t11a high: t12a + paddsw m%1, m%5 ;low: t8a high: t15a + psubsw m%5, m%6, m%4 ;low: t10 high: t13 + paddsw m%6, m%4 ;low: t9 high: t14 + pshufb m%3, m%2 + pshufb m%5, m%2 + ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 + ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a + packssdw m%2, m%4 ;low: t11 high: t10a + packssdw m%3, m%5 ;low: t12 high: t13a + punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 + punpcklqdq m%1, m%6 ;low: t8a high: t9 +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + +.pass1_end: + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpcklwd m2, m1, m3 ;packed out3, out7 + punpckhwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpcklwd m6, m5, m7 ;packed out11, out15 + punpckhwd m5, m7 ;packed out10, out14 + +.pass1_end2: + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*6] + mova [coeffq+16*6], m7 + +.pass1_end3: + punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high + punpcklwd m3, m6 ;packed 9, 10, 13, 15 low + punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high + punpcklwd m4, m5 ;packed 8, 10, 12, 14 low + punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) + punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) + punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) + punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) + mova [coeffq+16*7], m3 + mova m3, [coeffq+16*6] + punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high + punpcklwd m3, m2 ;packed 1, 3, 5, 7 low + punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high + punpcklwd m0, m1 ;packed 0, 2, 4, 6 low + punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) + punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) + punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) + punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) + jmp tx2q + +.pass2: + lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] + +.pass2_end: + mova [coeffq+16*4], m4 + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + lea r3, [dstq+8] + call tx2q + + add coeffq, 16*4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mov dstq, r3 + jmp tx2q + +ALIGN function_align +cglobal_label .main + punpckhqdq m7, m0, m1 ;low:in1 high:in3 + punpcklqdq m0, m1 + punpcklqdq m1, m2, m3 + punpckhqdq m3, m2 ;low:in7 high:in5 + mova [coeffq+16*4], m7 + mova [coeffq+16*5], m3 + mova m7, [coeffq+16*7] + punpcklqdq m2, m4, m5 + punpckhqdq m4, m5 ;low:in9 high:in11 + punpcklqdq m3, m6, m7 + punpckhqdq m7, 
m6 ;low:in15 high:in13 + mova [coeffq+16*6], m4 + IDCT8_1D_PACKED + mova m6, [coeffq+16*4] + mova m4, [coeffq+16*5] + mova m5, [coeffq+16*6] + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*6], m3 + + IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 + + mova m1, [coeffq+16*4] + psubsw m3, m0, m7 ;low:out15 high:out14 + paddsw m0, m7 ;low:out0 high:out1 + psubsw m7, m1, m5 ;low:out12 high:out13 + paddsw m1, m5 ;low:out3 high:out2 + mova [coeffq+16*7], m3 + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*6] + psubsw m5, m2, m4 ;low:out11 high:out10 + paddsw m2, m4 ;low:out4 high:out5 + psubsw m4, m3, m6 ;low:out8 high:out9 + paddsw m3, m6 ;low:out7 high:out6 + mova m6, m7 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + call .main_pass1_end + + punpckhwd m6, m7, m0 ;packed -out11, -out15 + punpcklwd m0, m7 ;packed out0, out4 + punpcklwd m7, m3, m4 ;packed -out3, -out7 + punpckhwd m4, m3 ;packed out8, out12 + mova m1, [coeffq+16*6] + punpcklwd m3, m1, m5 ;packed -out1, -out5 + punpckhwd m5, m1 ;packed out10, out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpckhwd m3, m2, m1 ;packed -out9, -out13 + punpcklwd m1, m2 ;packed out2, out6 + + mova m7, [o(pw_16384)] + +.pass1_end: + REPX {pmulhrsw x, m7}, m0, m1, m4, m5 + pxor m2, m2 + psubw m2, m7 + mova m7, [coeffq+16*6] + REPX {pmulhrsw x, m2}, m7, m3, m6 + pmulhrsw m2, [coeffq+16*7] + mova [coeffq+16*6], m7 + jmp m(idct_16x4_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + +ALIGN function_align +cglobal_label .main + mova [coeffq+16*6], m0 + pshufd m0, m1, q1032 + pshufd m2, m2, q1032 + punpckhwd m1, m6, m0 ;packed in13, in2 + punpcklwd m0, m6 ;packed in3, in12 + punpckhwd m7, m5, m2 ;packed in11, in4 + punpcklwd m2, m5 ;packed in5, in10 + mova m6, [o(pd_2048)] + ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 + ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 + ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 + ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 + psubsw m5, m1, m2 ;low:t10a high:t11a + paddsw m1, m2 ;low:t2a high:t3a + psubsw m2, m7, m0 ;low:t12a high:t13a + paddsw m7, m0 ;low:t4a high:t5a + punpcklqdq m0, m5 + punpckhwd m0, m5 ;packed t10a, t11a + punpcklqdq m5, m2 + punpckhwd m2, m5 ;packed t13a, t12a + ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 + ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m7 + mova m1, [coeffq+16*6] + mova m7, [coeffq+16*7] + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + punpckhwd m5, m7, m1 ;packed in15, in0 + punpcklwd m1, m7 ;packed in1, in14 + punpckhwd m7, m4, m3 ;packed in9, in6 + punpcklwd m3, m4 ;packed in7, in8 + ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 + ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 + ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 + ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 + psubsw m4, m5, m3 ;low:t8a high:t9a + paddsw m5, m3 ;low:t0a high:t1a + psubsw m3, m7, m1 ;low:t14a high:t15a + paddsw m7, m1 ;low:t6a high:t7a + punpcklqdq m1, m4 + punpckhwd m1, m4 ;packed t8a, t9a + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t15a, t14a + ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 + ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15 + paddsw m4, m1, 
m2 ;low:t12a high:t13a + psubsw m1, m2 ;low:t8a high:t9a + psubsw m2, m0, m3 ;low:t14a high:t15a + paddsw m0, m3 ;low:t10a high:t11a + punpcklqdq m3, m1 + punpckhwd m3, m1 ;packed t12a, t13a + punpcklqdq m1, m2 + punpckhwd m2, m1 ;packed t15a, t14a + ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 + ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 + psubsw m1, m3, m2 ;low:t14a high:t15a + paddsw m3, m2 ;low:out2 high:-out13 + psubsw m2, m4, m0 ;low:t10 high:t11 + paddsw m0, m4 ;low:-out1 high:out14 + mova [coeffq+16*6], m0 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + psubsw m4, m5, m3 ;low:t4 high:t5 + paddsw m5, m3 ;low:t0 high:t1 + psubsw m3, m0, m7 ;low:t6 high:t7 + paddsw m0, m7 ;low:t2 high:t3 + punpcklqdq m7, m4 + punpckhwd m7, m4 ;packed t4, t5 + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t7, t6 + ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a + ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a + psubsw m4, m5, m0 ;low:t2a high:t3a + paddsw m0, m5 ;low:out0 high:-out15 + psubsw m5, m7, m3 ;low:t6 high:t7 + paddsw m3, m7 ;low:-out3 high:out12 + ret +ALIGN function_align +.main_pass1_end: + mova m7, [o(deint_shuf1)] + mova [coeffq+16*4], m0 + mova [coeffq+16*5], m3 + mova m0, [o(pw_2896_m2896)] + mova m3, [o(pw_2896_2896)] + pshufb m1, m7 ;t14a t15a + pshufb m2, m7 ;t10 t11 + pshufb m4, m7 ;t2a t3a + pshufb m5, m7 ;t6 t7 + pmaddwd m7, m0, m2 + pmaddwd m2, m3 + paddd m7, m6 + paddd m2, m6 + psrad m7, 12 + psrad m2, 12 + packssdw m2, m7 ;low:out6 high:-out9 + pmaddwd m7, m0, m4 + pmaddwd m4, m3 + paddd m7, m6 + paddd m4, m6 + psrad m7, 12 + psrad m4, 12 + packssdw m4, m7 ;low:-out7 high:out8 + pmaddwd m7, m3, m5 + pmaddwd m5, m0 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m7, m5 ;low:out4 high:-out11 + pmaddwd m5, m3, m1 + pmaddwd m1, m0 + paddd m5, m6 + paddd m1, m6 + psrad m5, 12 + psrad m1, 12 + packssdw m5, m1 ;low:-out5 high:out10 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + ret +ALIGN function_align +cglobal_label .main_pass2_end + mova m7, [o(pw_2896x8)] + punpckhqdq m6, m2, m1 ;low:t11 high:t15a + punpcklqdq m2, m1 ;low:t10 high:t14a + psubsw m1, m2, m6 + paddsw m2, m6 + punpckhqdq m6, m4, m5 ;low:t3a high:t7 + punpcklqdq m4, m5 ;low:t2a high:t6 + psubsw m5, m4, m6 + paddsw m4, m6 + pmulhrsw m1, m7 ;low:-out9 high:out10 + pmulhrsw m2, m7 ;low:out6 high:-out5 + pmulhrsw m5, m7 ;low:out8 high:-out11 + pmulhrsw m4, m7 ;low:-out7 high:out4 + punpckhqdq m7, m4, m5 ;low:out4 high:-out11 + punpcklqdq m4, m5 ;low:-out7 high:out8 + punpckhqdq m5, m2, m1 ;low:-out5 high:out10 + punpcklqdq m2, m1 ;low:out6 high:-out9 + ret + + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass1_end + + punpcklwd m6, m7, m0 ;packed out11, out15 + punpckhwd m0, m7 ;packed -out0, -out4 + punpckhwd m7, m3, m4 ;packed out3, out7 + punpcklwd m4, m3 ;packed -out8, -out12 + mova m1, [coeffq+16*6] + punpckhwd m3, m1, m5 ;packed out1, out5 + punpcklwd m5, m1 ;packed -out10, -out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpcklwd m3, m2, m1 ;packed out9, out13 + punpckhwd m1, m2 ;packed -out2, -out6 + + mova m7, [o(pw_m16384)] + jmp m(iadst_16x4_internal_8bpc).pass1_end + +.pass2: + lea tx2q, 
[o(m(iflipadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m1, [coeffq+16*6] + mova m0, [coeffq+16*5] + mova m2, [coeffq+16*7] + mova m6, [o(pw_1697x16)] + mova m7, [o(pw_16384)] + pmulhrsw m4, m6, m1 + pmulhrsw m3, m6, m0 + pmulhrsw m5, m6, m2 + pmulhrsw m4, m7 + pmulhrsw m3, m7 + pmulhrsw m5, m7 + paddsw m1, m4 + paddsw m0, m3 + paddsw m5, m2 + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mova m4, [coeffq+16*4] + mova [coeffq+16*6], m1 + mova [coeffq+16*5], m0 + mova [coeffq+16*7], m5 + pmulhrsw m0, m6, m2 + pmulhrsw m1, m6, m3 + pmulhrsw m5, m6, m4 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + pmulhrsw m5, m7 + paddsw m2, m0 + paddsw m3, m1 + paddsw m4, m5 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pmulhrsw m5, m6, m0 + pmulhrsw m6, m1 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + paddsw m0, m5 + paddsw m1, m6 + mova m6, [coeffq+16*6] + mova m5, [coeffq+16*5] + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpckhwd m2, m1, m3 ;packed out3, out7 + punpcklwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpckhwd m6, m5, m7 ;packed out11, out15 + punpcklwd m5, m7 ;packed out10, out14 + jmp m(idct_16x4_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + + +%macro SAVE_8ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 + mova [%1+%2*7], m7 +%endmacro + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16, 8, 16*16 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m2 + psrlw m2, 3 ; pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mov r3d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop +.end: + RET +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity + +cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] + +.pass1: + LOAD_8ROWS coeffq+16*1, 32, 1 + mov [rsp+gprsize+16*11], tx2q + lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] + jmp r3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, [rsp+gprsize+16*11] + jmp r3 + +.pass2: + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + +.pass2_pre: + mova [coeffq+16*2 ], m1 + mova [coeffq+16*6 ], m3 + mova [coeffq+16*10], m5 + mova [coeffq+16*14], m7 + mova m1, m2 + mova m2, m4 + mova m3, m6 + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + +.pass2_main: + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+16*2 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*10] + mova m3, [coeffq+16*14] + mova m4, [coeffq+16*3 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*15] + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea dstq, 
[dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 + +.pass2: + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + + mov r3, dstq + lea dstq, [dstq+strideq*8] + jmp m(iadst_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal_8bpc).end + + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 + +.pass2: + lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] + lea r3, [dstq+strideq*8] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + jmp m(iflipadst_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal_8bpc).end + + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 32, 1 + mov r3, tx2q + lea tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)] + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, r3 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)] + +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m6 + mova m7, [o(pw_1697x16)] + 
REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, [rsp+gprsize+16*1] + mova [rsp+gprsize+16*2], m5 + IDTX16 6, 5, 7 + mova m5, [rsp+gprsize+16*0] + IDTX16 5, 7, 7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+16*2] + mova [rsp+gprsize+16*0], m5 + mova [rsp+gprsize+16*1], m6 + mova [rsp+gprsize+16*2], m7 + jmp m(idct_8x8_internal_8bpc).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r2d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity + +cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*0, 32, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*1, 32, 1 + call .main + mov r3, tx2q + lea tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(m(idct_16x8_internal_8bpc).end)] + lea r3, [dstq+8] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).pass2_main + + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + mova [rsp+gprsize*2+32*5], m5 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a + ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a + psubsw m2, m0, m4 ;t9 + paddsw m0, m4 ;t8 + psubsw m4, m7, m3 ;t14 + paddsw m7, m3 ;t15 + ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a + mova m3, [rsp+gprsize*2+16*1] + mova m5, [rsp+gprsize*2+32*5] + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+32*5], m4 + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m7 + ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a + ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a + psubsw m4, m2, m3 ;t10 + paddsw m2, m3 ;t11 + psubsw m3, m1, m5 ;t13 + paddsw m1, m5 ;t12 + ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a + mova m7, [rsp+gprsize*2+32*5] + psubsw m6, m0, m2 ;t11a + paddsw m0, m2 ;t8a + paddsw m2, m7, m3 ;t9 + psubsw m7, m3 ;t10 + mova m5, [rsp+gprsize*2+16*0] + psubsw m3, m5, m0 ;out8 + paddsw m0, m5 ;out7 + mova [rsp+gprsize*2+32*5], m0 + mova m5, [rsp+gprsize*2+16*9] + psubsw m0, m5, m2 ;out9 + paddsw m2, m5 ;out6 + mova [rsp+gprsize*2+16*0], m0 + mova [rsp+gprsize*2+16*9], m2 + mova m0, [rsp+gprsize*2+16*1] + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*1], m3 + psubsw m5, m0, m4 ;t13 + paddsw m0, m4 ;t14 + mova m3, [o(pd_2048)] + psubsw m4, m2, m1 ;t12a + paddsw m1, m2 ;t15a + mova [rsp+gprsize*2+16*2], m1 + ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a + ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12 + mova m3, [rsp+gprsize*2+16*8] + psubsw m2, m3, m5 ;out10 + paddsw m3, m5 ;out5 + mova m5, [rsp+gprsize*2+16*7] + mova [rsp+gprsize*2+16*8], m3 + psubsw m3, m5, m4 
;out11 + paddsw m5, m4 ;out4 + mova m4, [rsp+gprsize*2+16*6] + mova [rsp+gprsize*2+16*7], m5 + paddsw m5, m4, m6 ;out3 + psubsw m4, m6 ;out12 + mova m6, [rsp+gprsize*2+16*5] + mova [rsp+gprsize*2+16*6], m5 + psubsw m5, m6, m7 ;out13 + paddsw m6, m7 ;out2 + mova m7, [rsp+gprsize*2+16*4] + mova [rsp+gprsize*2+16*5], m6 + psubsw m6, m7, m0 ;out14 + paddsw m7, m0 ;out1 + mova m1, [rsp+gprsize*2+16*2] + mova m0, [rsp+gprsize*2+16*3] + mova [rsp+gprsize*2+16*4], m7 + psubsw m7, m0, m1 ;out15 + paddsw m0, m1 ;out0 + mova [rsp+gprsize*2+16*3], m0 + mova m1, [rsp+gprsize*2+16*0] + mova m0, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*0], m7 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, [coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call .main + call .main_pass1_end + mov r3, tx2q + lea tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)] + jmp m(iadst_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iadst_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(m(iadst_16x8_internal_8bpc).end)] + lea r3, [dstq+8] + jmp m(iadst_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal_8bpc).pass2_main + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m1 + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 + ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 + psubsw m1, m0, m4 ;t10a + paddsw m0, m4 ;t2a + psubsw m4, m7, m3 ;t11a + paddsw m3, m7 ;t3a + ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 + mova m2, [rsp+gprsize*2+16*0] ;in3 + mova m7, [rsp+gprsize*2+16*1] ;in4 + mova [rsp+gprsize*2+16*0], m1 ;t11 + mova [rsp+gprsize*2+16*1], m4 ;t10 + mova m1, [rsp+gprsize*2+16*2] ;in12 + mova [rsp+gprsize*2+16*2], m0 ;t2a + ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 + ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 + psubsw m0, m7, m1 ;t12a + paddsw m1, m7 ;t4a + psubsw m4, m5, m2 ;t13a + paddsw m5, m2 ;t5a + ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13 + mova m2, [rsp+gprsize*2+16*8] ;in1 + mova m7, [rsp+gprsize*2+16*9] ;in14 + mova [rsp+gprsize*2+16*8], m4 ;t12 + mova [rsp+gprsize*2+16*9], m0 ;t13 + mova m4, [rsp+gprsize*2+16*4] ;in9 + mova m0, [rsp+gprsize*2+16*5] ;in6 + mova [rsp+gprsize*2+16*4], m1 ;t4a + mova [rsp+gprsize*2+16*5], m5 ;t5a + ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 + ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 + psubsw m1, m0, m7 ;t14a + 
paddsw m0, m7 ;t6a + psubsw m5, m4, m2 ;t15a + paddsw m4, m2 ;t7a + ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 + mova m2, [rsp+gprsize*2+16*2] ;t2a + mova [rsp+gprsize*2+16*2], m5 ;t14 + psubsw m7, m2, m0 ;t6 + paddsw m2, m0 ;t2 + psubsw m0, m3, m4 ;t7 + paddsw m3, m4 ;t3 + ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a + mova m4, [rsp+gprsize*2+16*7] ;in0 + mova m5, [rsp+gprsize*2+32*5] ;in15 + mova [rsp+gprsize*2+16*7], m3 ;t3 + mova [rsp+gprsize*2+32*5], m1 ;t15 + mova m1, [rsp+gprsize*2+16*6] ;in7 + mova m3, [rsp+gprsize*2+16*3] ;in8 + mova [rsp+gprsize*2+16*6], m7 ;t7a + mova [rsp+gprsize*2+16*3], m0 ;t6a + ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 + ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 + psubsw m0, m4, m3 ;t8a + paddsw m4, m3 ;t0a + psubsw m3, m5, m1 ;t9a + paddsw m5, m1 ;t1a + ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 + mova m1, [rsp+gprsize*2+16*4] ;t4a + mova m7, [rsp+gprsize*2+16*5] ;t5a + mova [rsp+gprsize*2+16*4], m3 ;t8 + mova [rsp+gprsize*2+16*5], m0 ;t9 + psubsw m0, m4, m1 ;t4 + paddsw m4, m1 ;t0 + psubsw m3, m5, m7 ;t5 + paddsw m5, m7 ;t1 + ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a + mova m7, [rsp+gprsize*2+16*3] ;t6a + psubsw m1, m4, m2 ;t2a + paddsw m4, m2 ;out0 + mova [rsp+gprsize*2+16*3], m4 ;out0 + mova m4, [rsp+gprsize*2+16*6] ;t7a + psubsw m2, m3, m7 ;t6 + paddsw m3, m7 ;-out3 + mova [rsp+gprsize*2+16*6], m3 ;-out3 + psubsw m3, m0, m4 ;t7 + paddsw m0, m4 ;out12 + mova [rsp+gprsize*2+16*12], m3 + mova m3, [rsp+gprsize*2+16*7] ;t3 + mova [rsp+gprsize*2+16* 7], m2 ;out4 + psubsw m2, m5, m3 ;t3a + paddsw m5, m3 ;-out15 + mova [rsp+gprsize*2+16*11], m2 + mova m2, [rsp+gprsize*2+32*5] ;t15 + mova [rsp+gprsize*2+16*10], m1 ;-out7 + mova m1, [rsp+gprsize*2+16*0] ;t11 + mova [rsp+gprsize*2+16*0 ], m5 ;-out15 + mova m3, [rsp+gprsize*2+16*1] ;t10 + mova [rsp+gprsize*2+16*1 ], m4 ;-out11 + mova m4, [rsp+gprsize*2+16*2] ;t14 + mova [rsp+gprsize*2+16*2 ], m0 ;out12 + psubsw m0, m3, m4 ;t14a + paddsw m3, m4 ;t10a + psubsw m5, m1, m2 ;t15a + paddsw m1, m2 ;t11a + ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 + mova m2, [rsp+gprsize*2+16*4] ;t8 + mova m4, [rsp+gprsize*2+16*5] ;t9 + mova [rsp+gprsize*2+16*4], m3 ;t10a + mova [rsp+gprsize*2+16*5], m1 ;t11a + mova m3, [rsp+gprsize*2+16*8] ;t12 + mova m1, [rsp+gprsize*2+16*9] ;t13 + mova [rsp+gprsize*2+16*8], m5 ;t14 + mova [rsp+gprsize*2+16*9], m0 ;t15 + psubsw m5, m2, m3 ;t12a + paddsw m2, m3 ;t8a + psubsw m0, m4, m1 ;t13a + paddsw m4, m1 ;t9a + ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 + mova m6, [rsp+gprsize*2+16*4] ;t10a + mova m1, [rsp+gprsize*2+16*5] ;t11a + psubsw m3, m2, m6 ;t10 + paddsw m2, m6 ;-out1 + paddsw m6, m4, m1 ;out14 + psubsw m4, m1 ;t11 + mova [rsp+gprsize*2+16*14], m4 + mova [rsp+gprsize*2+16* 4], m2 ;-out1 + mova m4, [rsp+gprsize*2+16*8] ;t14 + mova m2, [rsp+gprsize*2+16*9] ;t15 + mova [rsp+gprsize*2+16* 9], m3 ;out6 + psubsw m3, m0, m4 ;t14a + paddsw m0, m4 ;out2 + psubsw m4, m5, m2 ;t15a + paddsw m5, m2 ;-out13 + mova [rsp+gprsize*2+16* 5], m0 ;out2 + ret +ALIGN function_align +.main_pass1_end: + mova m0, [rsp+gprsize*2+16*14] + mova [rsp+gprsize*2+16*14], m5 + mova [rsp+gprsize*2+16*15], m6 + mova m5, [o(pw_2896_2896)] + mova m6, [o(pw_2896_m2896)] + mova m7, [o(pd_2048)] + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + pmaddwd m4, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m4, m2, m1, m3 + REPX {psrad x, 12}, m4, m1, m2, m3 + packssdw m4, m1 ;-out5 + packssdw m2, m3 ;out10 + mova [rsp+gprsize*2+16* 8], m4 + mova m3, 
[rsp+gprsize*2+16* 9] + punpcklwd m1, m3, m0 + punpckhwd m3, m0 + pmaddwd m0, m5, m1 + pmaddwd m1, m6 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m0, m1, m4, m3 + REPX {psrad x, 12}, m0, m4, m1, m3 + packssdw m0, m4 ;out6 + packssdw m1, m3 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + mova m0, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + punpcklwd m3, m0, m4 + punpckhwd m0, m4 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + pmaddwd m5, m0 + pmaddwd m0, m6 + REPX {paddd x, m7}, m4, m3, m5, m0 + REPX {psrad x, 12}, m4, m5, m3, m0 + packssdw m4, m5 ;out4 + packssdw m3, m0 ;-out11 + mova [rsp+gprsize*2+16* 7], m4 + mova m4, [rsp+gprsize*2+16*10] + mova m5, [rsp+gprsize*2+16*11] + punpcklwd m0, m4, m5 + punpckhwd m4, m5 + pmaddwd m5, m0, [o(pw_2896_2896)] + pmaddwd m0, m6 + pmaddwd m6, m4 + pmaddwd m4, [o(pw_2896_2896)] + REPX {paddd x, m7}, m5, m0, m6, m4 + REPX {psrad x, 12}, m0, m6, m5, m4 + packssdw m0, m6 ;out8 + packssdw m5, m4 ;-out7 + mova [rsp+gprsize*2+16*10], m5 + mova m4, [rsp+gprsize*2+16* 2] ;out12 + mova m5, [rsp+gprsize*2+16*14] ;-out13 + mova m6, [rsp+gprsize*2+16*15] ;out14 + ret +ALIGN function_align +cglobal_label .main_pass2_end + mova m7, [o(pw_2896x8)] + mova m1, [rsp+gprsize*2+16* 9] + mova m2, [rsp+gprsize*2+16*14] + paddsw m0, m1, m2 + psubsw m1, m2 + pmulhrsw m0, m7 ;out6 + pmulhrsw m1, m7 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + psubsw m2, m3, m4 + paddsw m3, m4 + pmulhrsw m2, m7 ;out10 + pmulhrsw m3, m7 ;-out5 + mova [rsp+gprsize*2+16* 8], m3 + mova m3, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + paddsw m0, m3, m4 + psubsw m3, m4 + pmulhrsw m0, m7 ;out4 + pmulhrsw m3, m7 ;-out11 + mova [rsp+gprsize*2+16* 7], m0 + mova m0, [rsp+gprsize*2+16*10] + paddsw m4, m0, [rsp+gprsize*2+16*11] + psubsw m0, [rsp+gprsize*2+16*11] + pmulhrsw m4, m7 ;-out7 + pmulhrsw m0, m7 ;out8 + mova [rsp+gprsize*2+16*10], m4 + mova m4, [rsp+gprsize*2+16*2 ] ;out12 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, [coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, tx2q + lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)] + lea r3, [dstq+8] + jmp 
m(iflipadst_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal_8bpc).pass2_main + + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16*16 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mov r3, tx2q + lea tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)] + +.pass1: + mova m0, [o(pw_2896x8)] + mova m2, [o(pw_1697x16)] + mova m3, [o(pw_16384)] + sub coeffq, 8*16 + REPX {pmulhrsw x, m0}, m4, m5, m6, m7 + pmulhrsw m1, m2, m4 + pmulhrsw m1, m3 + paddsw m1, m4 ; 1 + pmulhrsw m4, m2, m5 + pmulhrsw m4, m3 + paddsw m4, m5 ; 3 + pmulhrsw m5, m2, m6 + pmulhrsw m5, m3 + paddsw m5, m6 ; 5 + pmulhrsw m6, m2, m7 + pmulhrsw m6, m3 + paddsw m7, m6 ; 7 + pmulhrsw m6, m0, [coeffq+16*6] + mova [rsp+gprsize+16*0], m4 + pmulhrsw m4, m2, m6 + pmulhrsw m4, m3 + paddsw m6, m4 ; 6 + pmulhrsw m4, m0, [coeffq+16*4] + mova [rsp+gprsize+16*1], m6 + pmulhrsw m6, m2, m4 + pmulhrsw m6, m3 + paddsw m4, m6 ; 4 + pmulhrsw m6, m0, [coeffq+16*2] + pmulhrsw m0, [coeffq+16*0] + pmulhrsw m2, m6 + pmulhrsw m2, m3 + paddsw m2, m6 ; 2 + pmulhrsw m6, m0, [o(pw_1697x16)] + pmulhrsw m6, m3 + mova m3, [rsp+gprsize+16*0] + paddsw m0, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + mova [coeffq+16*1], m4 + mova [coeffq+16*3], m5 + mova [coeffq+16*5], m6 + mova [coeffq+16*7], m7 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mova [coeffq-16*7], m0 + mova [coeffq-16*5], m1 + mova [coeffq-16*3], m2 + mova [coeffq-16*1], m3 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea tx2q, [o(m(iidentity_16x8_internal_8bpc).end)] + lea r3, [dstq+8] + jmp m(iidentity_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iidentity_8x8_internal_8bpc).end + + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r2d, 8 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity + +cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_16x8_internal_8bpc).main + mov r3, tx2q + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + 
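;---------------------------------------------------------------------------
; Annotation (descriptive sketch, not part of the upstream hunk): the 16x16
; first pass above is done as two 16x8 halves, each built from the shared
; idct_8x8_internal/idct_16x8_internal row kernels; each half then jumps to
; m(idct_8x8_internal_8bpc).pass1_end1 with the pw_8192 scale constant
; preloaded in m7 before the rows are stored back to coeffq for pass 2.
; For reference, pmulhrsw a, b evaluates ((a*b >> 14) + 1) >> 1, i.e.
; a*b/32768 rounded to nearest, so scaling by pw_8192 amounts to a rounded
; divide by 4.
;---------------------------------------------------------------------------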
+.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(m(idct_16x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x16_internal_8bpc).end1)] + mov dstq, r3 + lea r3, [dstq+8] + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*4 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*12] + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_main + + +%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 + mova m0, [coeffq+16*1 ] + mova m1, [coeffq+16*3 ] + mova m2, [coeffq+16*29] + mova m3, [coeffq+16*31] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*13] + mova m1, [coeffq+16*15] + mova m2, [coeffq+16*17] + mova m3, [coeffq+16*19] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*5 ] + mova m1, [coeffq+16*7 ] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*11] + mova m4, [coeffq+16*21] + mova m5, [coeffq+16*23] + mova m6, [coeffq+16*25] + mova m7, [coeffq+16*27] +%endmacro + +%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*2 ] + mova m2, [coeffq+16*28] + mova m3, [coeffq+16*30] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*12] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*16] + mova m3, [coeffq+16*18] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m4, [coeffq+16*20] + mova m5, [coeffq+16*22] + mova m6, [coeffq+16*24] + mova m7, [coeffq+16*26] +%endmacro + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + lea tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, 
[o(m(iadst_16x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iadst_16x16_internal_8bpc).end1)] + mov dstq, r3 + lea r3, [dstq+8] + jmp m(iadst_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, [coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_main + + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*17, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS coeffq+16* 0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)] + lea r3, [dstq+8] + jmp m(iflipadst_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, [coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + + lea tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)] + mov dstq, r3 + jmp m(iflipadst_8x16_internal_8bpc).pass2_main + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal_8bpc).end + + +%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 +%endmacro + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add 
coeffq, 16*17 + mov r3, tx2q + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)] + +.pass1: + mova m6, [o(pw_1697x16)] + mova m7, [coeffq+32*6] + mova m0, [coeffq+32*0] + mova m1, [coeffq+32*1] + mova m2, [coeffq+32*2] + mova m3, [coeffq+32*3] + mova m4, [coeffq+32*4] + REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 + mova m5, [coeffq+32*5] + mova [rsp+gprsize+16*1], m7 + IDTX16B 5, 7, 6 + mova m7, [coeffq+32*7] + IDTX16B 7, 6, 6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)] + jmp .pass1 + +.pass1_end1: + SAVE_8ROWS coeffq, 32 + sub coeffq, 15*16 + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)] + jmp .pass1 + +.pass1_end2: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea r3, [dstq+8] + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)] + +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m4 + mova m7, [o(pw_1697x16)] + REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 + mova m4, [o(pw_2048)] + pmulhrsw m5, m4 + pmulhrsw m6, m4 + mova [rsp+gprsize+16*2], m5 + mova m5, [rsp+gprsize+16*1] + mova [rsp+gprsize+16*1], m6 + IDTX16 5, 6, 7 + mova m6, [rsp+gprsize+16*0] + IDTX16 6, 7, 7 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 + pmulhrsw m4, m5 + mova [rsp+gprsize+16*0], m6 + jmp m(idct_8x8_internal_8bpc).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)] + lea dstq, [dstq+strideq*2] + jmp .end + +.end2: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + LOAD_8ROWS coeffq, 32 + lea tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)] + mov dstq, r3 + jmp .end + +.end3: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_8x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 2 ;pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + mov r3d, 8 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop + +.end: + RET + + + +cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + cmp eobd, 106 + jle .fast + + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1: + mova [rsp+gprsize+16*9 ], m0 ;in24 + mova [rsp+gprsize+16*10], m4 ;in28 + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_1: + mova [rsp+gprsize+16*7 ], m0 ;in16 + mova [rsp+gprsize+16*8 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 
;in23 + +.fast: + LOAD_8ROWS coeffq+16*1, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + mova [rsp+gprsize+16*5 ], m0 ;in8 + mova [rsp+gprsize+16*6 ], m4 ;in12 + mova [rsp+gprsize+16*13], m2 ;in10 + mova [rsp+gprsize+16*14], m6 ;in14 + mova [rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova [rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + mova m1, m4 ;in4 + mova m2, [rsp+gprsize+16*5 ] ;in8 + mova m3, [rsp+gprsize+16*6 ] ;in12 + + cmp eobd, 106 + jg .full + + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + mova m0, [rsp+gprsize+16*11] + mova m1, [rsp+gprsize+16*12] + mova m2, [rsp+gprsize+16*13] + mova m3, [rsp+gprsize+16*14] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call .main_fast + jmp .pass2 + +.full: + mova m4, [rsp+gprsize+16*7 ] ;in16 + mova m5, [rsp+gprsize+16*8 ] ;in20 + mova m6, [rsp+gprsize+16*9 ] ;in24 + mova m7, [rsp+gprsize+16*10] ;in28 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + call .main + +.pass2: + lea r3, [o(m(idct_8x32_internal_8bpc).end6)] + +.end: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_8x32_internal_8bpc).end2)] + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + + jmp tx2q + +.end2: + lea tx2q, [o(m(idct_8x32_internal_8bpc).end3)] + jmp m(idct_8x8_internal_8bpc).end + +.end3: + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(m(idct_8x32_internal_8bpc).end4)] + jmp m(idct_8x8_internal_8bpc).end + +.end4: + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(m(idct_8x32_internal_8bpc).end5)] + jmp m(idct_8x8_internal_8bpc).end + +.end5: + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end6: + ret + +ALIGN function_align +.main_veryfast: + mova m0, [rsp+gprsize*2+16*19] ;in1 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 + pmulhrsw m0, [o(pw_201x8)] ;t16,t17 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*34], m3 ;t31 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*20], m3 ;t17a + mova [rsp+gprsize*2+16*33], m0 ;t30a + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 + pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 + mova [rsp+gprsize*2+16*22], m1 ;t19 + mova [rsp+gprsize*2+16*31], m2 ;t28 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m2 ;t18a + mova 
[rsp+gprsize*2+16*32], m1 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 + pmulhrsw m0, [o(pw_995x8)] ;t20, t21 + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*30], m3 ;t27 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m3 ;t21a + mova [rsp+gprsize*2+16*29], m0 ;t26a + mova m2, [rsp+gprsize*2+16*26] ;in3 + pxor m0, m0 + mova m3, m0 + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +.main_fast: ;bottom half is zero + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a + pmulhrsw m0, [o(pw_201x8)] ;t16a + pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a + pmulhrsw m1, [o(pw_m2751x8)] ;t17a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t17 + paddsw m0, m1 ;t16 + psubsw m5, m3, m2 ;t30 + paddsw m3, m2 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m3, m0, [o(pw_3703x8)] + pmulhrsw m0, [o(pw_1751x8)] + pmulhrsw m2, m1, [o(pw_3857x8)] + pmulhrsw m1, [o(pw_m1380x8)] + psubsw m4, m1, m0 ;t18 + paddsw m0, m1 ;t19 + psubsw m5, m2, m3 ;t29 + paddsw m3, m2 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + pmulhrsw m3, m0, [o(pw_3973x8)] + pmulhrsw m0, [o(pw_995x8)] + pmulhrsw m2, m1, [o(pw_3513x8)] + pmulhrsw m1, [o(pw_m2106x8)] + psubsw m4, m0, m1 ;t21 + paddsw m0, m1 ;t20 + psubsw m5, m3, m2 ;t26 + paddsw m3, m2 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m2, [rsp+gprsize*2+16*26] ;in3 + pmulhrsw m3, m0, [o(pw_3290x8)] + pmulhrsw m0, [o(pw_2440x8)] + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +.main: + mova m7, [o(pd_2048)] + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + mova m2, [rsp+gprsize*2+16*33] ;in17 + mova m3, [rsp+gprsize*2+16*34] ;in31 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a + psubsw m4, m0, m2 ;t17 + paddsw m0, m2 ;t16 + psubsw m5, m3, m1 ;t30 + paddsw m3, m1 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + mova m2, [rsp+gprsize*2+16*31] ;in25 + mova m3, [rsp+gprsize*2+16*32] ;in23 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a + psubsw m4, m2, m0 ;t18 + paddsw m0, m2 ;t19 + psubsw m5, m1, m3 ;t29 + paddsw m3, m1 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] 
;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + mova m2, [rsp+gprsize*2+16*29] ;in21 + mova m3, [rsp+gprsize*2+16*30] ;in27 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a + psubsw m4, m0, m2 ;t21 + paddsw m0, m2 ;t20 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m1, [rsp+gprsize*2+16*26] ;in3 + mova m2, [rsp+gprsize*2+16*27] ;in29 + mova m3, [rsp+gprsize*2+16*28] ;in19 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a + +.main2: + psubsw m4, m2, m0 ;t22 + paddsw m0, m2 ;t23 + psubsw m5, m1, m3 ;t25 + paddsw m3, m1 ;t24 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a + mova m2, [rsp+gprsize*2+16*24] ;t21a + psubsw m1, m5, m2 ;t21 + paddsw m5, m2 ;t22 + mova [rsp+gprsize*2+16*25], m5 ;t22 + mova m2, [rsp+gprsize*2+16*29] ;t26a + psubsw m5, m4, m2 ;t26 + paddsw m4, m2 ;t25 + mova [rsp+gprsize*2+16*28], m4 ;t25 + ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m1 ;t26a + + mova m1, [rsp+gprsize*2+16*23] ;t20 + mova m5, [rsp+gprsize*2+16*30] ;t27 + psubsw m2, m0, m1 ;t20a + paddsw m0, m1 ;t23a + psubsw m6, m3, m5 ;t27a + paddsw m3, m5 ;t24a + ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 + mova [rsp+gprsize*2+16*26], m0 ;t23a + mova [rsp+gprsize*2+16*27], m3 ;t24a + mova [rsp+gprsize*2+16*30], m2 ;t27 + + mova m0, [rsp+gprsize*2+16*20] ;t17a + mova m1, [rsp+gprsize*2+16*21] ;t18a + mova m2, [rsp+gprsize*2+16*32] ;t29a + mova m3, [rsp+gprsize*2+16*33] ;t30a + psubsw m4, m0, m1 ;t18 + paddsw m0, m1 ;t17 + psubsw m5, m3, m2 ;t29 + paddsw m3, m2 ;t30 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a + mova [rsp+gprsize*2+16*20], m0 ;t17 + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova [rsp+gprsize*2+16*33], m3 ;t30 + mova m0, [rsp+gprsize*2+16*19] ;t16 + mova m1, [rsp+gprsize*2+16*22] ;t19 + mova m2, [rsp+gprsize*2+16*31] ;t28 + mova m3, [rsp+gprsize*2+16*34] ;t31 + psubsw m4, m0, m1 ;t19a + paddsw m0, m1 ;t16a + psubsw m5, m3, m2 ;t28a + paddsw m3, m2 ;t31a + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 + mova m2, [rsp+gprsize*2+16*15] ;tmp12 + psubsw m1, m5, m6 ;t20a + paddsw m5, m6 ;t19a + psubsw m6, m2, m5 ;out19 + paddsw m2, m5 ;out12 + mova m5, [rsp+gprsize*2+16*30] ;t27 + mova [rsp+gprsize*2+16*22], m6 ;out19 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m6, m4, m5 ;t27a + paddsw m4, m5 ;t28a + ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 + mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 + psubsw m5, m2, m4 ;out28 + paddsw m2, m4 ;out3 + mova m4, [rsp+gprsize*2+16*14] ;tmp11 + mova [rsp+gprsize*2+16*31], m5 ;out28 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m5, m4, m6 ;out20 + paddsw m4, m6 ;out11 + mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 + mova [rsp+gprsize*2+16*23], m5 ;out20 + mova [rsp+gprsize*2+16*14], m4 ;out11 + psubsw m5, m2, m1 ;out27 + paddsw m2, m1 ;out4 + mova m1, [rsp+gprsize*2+16*26] ;t23a + mova m4, [rsp+gprsize*2+16*27] ;t24a + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*7 ], m2 ;out4 + psubsw m5, m0, m1 ;t23 + paddsw m0, m1 ;t16 + psubsw m2, m3, m4 ;t24 + paddsw m3, m4 ;t31 + ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a + mova m6, [rsp+gprsize*2+16*18] ;tmp15 + 
psubsw m4, m6, m0 ;out16 + paddsw m6, m0 ;out15 + mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 + mova m1, [rsp+gprsize*2+16*11] ;tmp8 + mova [rsp+gprsize*2+16*18], m6 ;out15 + mova [rsp+gprsize*2+16*19], m4 ;out16 + psubsw m6, m0, m3 ;out31 + paddsw m0, m3 ;out0 + psubsw m4, m1, m2 ;out23 + paddsw m1, m2 ;out8 + mova m3, [rsp+gprsize*2+16*10] ;tmp7 + mova [rsp+gprsize*2+16*34], m6 ;out31 + mova [rsp+gprsize*2+16*11], m1 ;out8 + mova [rsp+gprsize*2+16*26], m4 ;out23 + paddsw m6, m3, m5 ;out7 + psubsw m3, m5 ;out24 + mova m1, [rsp+gprsize*2+16*20] ;t17 + mova m5, [rsp+gprsize*2+16*25] ;t22 + mova m2, [rsp+gprsize*2+16*17] ;tmp14 + mova [rsp+gprsize*2+16*27], m3 ;out24 + psubsw m4, m1, m5 ;t22a + paddsw m1, m5 ;t17a + psubsw m3, m2, m1 ;out17 + paddsw m2, m1 ;out14 + mova m5, [rsp+gprsize*2+16*28] ;t25 + mova m1, [rsp+gprsize*2+16*33] ;t30 + mova [rsp+gprsize*2+16*17], m2 ;out14 + mova [rsp+gprsize*2+16*20], m3 ;out17 + psubsw m2, m1, m5 ;t25a + paddsw m1, m5 ;t30a + ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 + mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 + psubsw m3, m5, m1 ;out30 + paddsw m5, m1 ;out1 + mova m1, [rsp+gprsize*2+16*12] ;tmp9 + mova [rsp+gprsize*2+16*33], m3 ;out30 + mova [rsp+gprsize*2+16*4 ], m5 ;out1 + psubsw m3, m1, m2 ;out22 + paddsw m1, m2 ;out9 + mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 + mova [rsp+gprsize*2+16*25], m3 ;out22 + mova [rsp+gprsize*2+16*12], m1 ;out9 + psubsw m3, m5, m4 ;out25 + paddsw m5, m4 ;out6 + mova m4, [rsp+gprsize*2+16*21] ;t18a + mova m1, [rsp+gprsize*2+16*24] ;t21a + mova m2, [rsp+gprsize*2+16*16] ;tmp13 + mova [rsp+gprsize*2+16*28], m3 ;out25 + mova [rsp+gprsize*2+16*9 ], m5 ;out6 + paddsw m3, m4, m1 ;t18 + psubsw m4, m1 ;t21 + psubsw m5, m2, m3 ;out18 + paddsw m2, m3 ;out13 + mova m1, [rsp+gprsize*2+16*29] ;t26a + mova m3, [rsp+gprsize*2+16*32] ;t29a + mova [rsp+gprsize*2+16*21], m5 ;out18 + mova [rsp+gprsize*2+16*16], m2 ;out13 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t29 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a + mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 + psubsw m1, m2, m3 ;out29 + paddsw m2, m3 ;out2 + mova m3, [rsp+gprsize*2+16*13] ;tmp10 + mova [rsp+gprsize*2+16*32], m1 ;out29 + psubsw m7, m3, m5 ;out21 + paddsw m3, m5 ;out10 + mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 + mova [rsp+gprsize*2+16*24], m7 ;out21 + mova [rsp+gprsize*2+16*13], m3 ;out10 + psubsw m1, m5, m4 ;out26 + paddsw m5, m4 ;out5 + mova m7, m6 ;out7 + mova m3, [rsp+gprsize*2+16*6 ] ;out3 + mova m4, [rsp+gprsize*2+16*7 ] ;out4 + mova [rsp+gprsize*2+16*29], m1 ;out26 + mova m6, [rsp+gprsize*2+16*9 ] ;out6 + mova m1, [rsp+gprsize*2+16*4 ] ;out1 + ret + + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_32x8_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 8 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, 
stride, coeff, eob, tx2 + %undef cmp + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*1, 32 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + cmp eobd, 106 + jg .full + call m(idct_8x32_internal_8bpc).main_fast + jmp .pass2 + +.full: + LOAD_8ROWS coeffq+16*17, 32 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal_8bpc).main + +.pass2: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end)] + jmp m(idct_8x32_internal_8bpc).end1 + +.end: + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal_8bpc).end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end1: + lea r3, [dstq+8] + lea tx2q, [o(m(idct_32x8_internal_8bpc).end2)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end2: + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal_8bpc).end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end3: + mov dstq, r3 + add r3, 8 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end4)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end4: + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal_8bpc).end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end5: + mov dstq, r3 + add r3, 8 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end6)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end6: + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x8_internal_8bpc).end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end7: + mov dstq, r3 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end8: + ret + + +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov tx2d, 2 + cmp eobd, 107 + cmovns tx2d, r5d + mov r3d, tx2d +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] +.loop: + LOAD_8ROWS coeffq+16*0, 64 + paddsw m6, [o(pw_5)] + mova [rsp+16*1], m6 + mova m6, [o(pw_5)] + REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + call m(idct_8x8_internal_8bpc).pass1_end3 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [rsp+16*2], m5 + mova [rsp+16*1], m6 + mova [rsp+16*0], m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + pxor m7, m7 + REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + add coeffq, 16 + dec r3d + jg .loop + RET + +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov tx2d, 2 + cmp eobd, 107 + cmovns tx2d, r5d + mov r3d, tx2d +%if ARCH_X86_32 + LEA r5, $$ +%endif + +.loop: + LOAD_8ROWS coeffq+16*0, 16 + pmulhrsw m6, [o(pw_4096)] + mova [rsp+16*1], m6 + mova m6, [o(pw_4096)] + REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + lea tx2q, 
[o(m(idct_32x8_internal_8bpc).end8)] + call m(idct_8x8_internal_8bpc).pass1_end3 + + mov [rsp+16*3], dstq + mova [rsp+16*2], m5 + mova [rsp+16*1], m6 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + call m(idct_8x8_internal_8bpc).end3 + + add coeffq, 16*8 + mov dstq, [rsp+16*3] + lea dstq, [dstq+8] + dec r3d + jg .loop + jnc .loop + RET + + +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_16x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r2d, 16 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly + +.end: + RET + +cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + LOAD_8ROWS coeffq+16*1, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*5, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + mova [coeffq+16*1 ], m0 ;in8 + mova [coeffq+16*5 ], m4 ;in12 + mova [rsp+gprsize+16*13], m2 ;in10 + mova [rsp+gprsize+16*14], m6 ;in14 + mova [rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova [rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + + cmp eobd, 150 + jg .full + + mova m1, m4 ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [rsp+gprsize+16*11] ;in2 + mova m1, [rsp+gprsize+16*12] ;in6 + mova m2, [rsp+gprsize+16*13] ;in10 + mova m3, [rsp+gprsize+16*14] ;in14 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp .pass2 + +.full: + mova [coeffq+16*0 ], m0 ;in0 + mova [coeffq+16*4 ], m4 ;in4 + + LOAD_8ROWS coeffq+16*2, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*6, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)] + jmp 
m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end5: + mova [coeffq+16*2 ], m0 ;in16 + mova [coeffq+16*6 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 ;in23 + + LOAD_8ROWS coeffq+16*3, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*7, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end6: + SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end7: + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + mova m6, m0 ;in24 + mova m7, m4 ;in28 + mova m0, [coeffq+16*0 ] ;in0 + mova m1, [coeffq+16*4 ] ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + mova m4, [coeffq+16*2 ] ;in16 + mova m5, [coeffq+16*6 ] ;in20 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main + +.pass2: + mov [rsp+gprsize*1+16*35], eobd + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 + lea r3, [o(m(idct_16x32_internal_8bpc).end)] + jmp m(idct_8x32_internal_8bpc).end + +.end: + mov dstq, [rsp+gprsize*2+16*35] + mov eobd, [rsp+gprsize*1+16*35] + add coeffq, 16*32 + + mova m0, [coeffq+16*4 ] ;in1 + mova m1, [coeffq+16*12] ;in3 + mova m2, [coeffq+16*20] ;in5 + mova m3, [coeffq+16*28] ;in7 + mova m4, [coeffq+16*5 ] ;in9 + mova m5, [coeffq+16*13] ;in11 + mova m6, [coeffq+16*21] ;in13 + mova m7, [coeffq+16*29] ;in15 + + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mova m0, [coeffq+16*0 ] ;in0 + mova m1, [coeffq+16*16] ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*17] ;in12 + + cmp eobd, 150 + jg .full1 + + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] ;in2 + mova m1, [coeffq+16*24] ;in6 + mova m2, [coeffq+16*9 ] ;in10 + mova m3, [coeffq+16*25] ;in14 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp .end1 + +.full1: + mova m4, [coeffq+16*2 ] ;in16 + mova m5, [coeffq+16*18] ;in20 + mova m6, [coeffq+16*3 ] ;in24 + mova m7, [coeffq+16*19] ;in26 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] ;in2 + mova m1, [coeffq+16*24] ;in6 + mova m2, [coeffq+16*9 ] ;in10 + mova m3, [coeffq+16*25] ;in14 + mova m4, [coeffq+16*10] ;in18 + mova m5, [coeffq+16*26] ;in22 + mova m6, [coeffq+16*11] ;in26 + mova m7, [coeffq+16*27] ;in30 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS 
rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*6 ] ;in17 + mova m1, [coeffq+16*14] ;in19 + mova m2, [coeffq+16*22] ;in21 + mova m3, [coeffq+16*30] ;in23 + mova m4, [coeffq+16*7 ] ;in25 + mova m5, [coeffq+16*15] ;in27 + mova m6, [coeffq+16*23] ;in29 + mova m7, [coeffq+16*31] ;in31 + + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + +.end1: + jmp m(idct_8x32_internal_8bpc).pass2 + + + +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_32x16_internal_8bpc) + call m(idct_8x16_internal_8bpc).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*11, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*19, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*27, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r3d, 16 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + add coeffq, 16 + lea r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)] +.pass1: + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*2, 64, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + LOAD_8ROWS coeffq+16*34, 64, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal_8bpc).main + +.pass1_end: + mova [rsp+gprsize+16*0 ], m7 + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+16*32, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova 
[rsp+gprsize+16*0 ], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+16*48, 32 + + sub coeffq, 16 + lea r3, [o(m(idct_32x16_internal_8bpc).end)] + jmp .pass1 + +.end: + ret + + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, eobd + cmp eobd, 43 ;if (eob > 43) + sbb r3d, r3d ; iteration_count++ + cmp r4d, 150 ;if (eob > 150) + sbb r3d, 0 ; iteration_count++ + cmp r4d, 278 ;if (eob > 278) + sbb r3d, -4 ; iteration_count++ + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 + mov [rsp+gprsize+16*3], r3d + mov [rsp+gprsize*2+16*3], coeffq + +.loop: + LOAD_8ROWS coeffq, 64, 1 + mova [rsp+16*1], m6 + pxor m6, m6 + REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + mova [rsp+16*0], m2 + mova [rsp+16*1], m3 + mova [rsp+16*2], m4 + mova m3, [o(pw_1697x16)] + mova m4, [o(pw_16384)] + REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 + mova m2, [o(pw_8192)] + REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1 + mova m2, [rsp+16*0] + mova [rsp+16*0], m7 + IDTX16 2, 7, 3, 4 + mova m7, [rsp+16*2] + mova [rsp+16*2], m5 + IDTX16 7, 5, 3, 4 + mova m5, [rsp+16*1] + mova [rsp+16*1], m6 + pmulhrsw m3, m5 + pmulhrsw m3, m4 + psrlw m4, 1 ; pw_8192 + paddsw m3, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pmulhrsw m4, m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + add coeffq, 16 + dec r3d + jg .loop + mov coeffq, [rsp+gprsize*2+16*3] + add coeffq, 64*8 + mov r3d, [rsp+gprsize+16*3] + xor dstq, dstq + mov [rsp+gprsize+16*3], dstq + mov dstq, [rsp+16*3] + test r3d, r3d + jnz .loop + RET + + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 12 ;0100b + mov r5d, 136 ;1000 1000b + cmp eobd, 44 ;if (eob > 43) + cmovns r4d, r5d ; iteration_count+2 + cmp eobd, 151 ;if (eob > 150) + mov r3d, 34952 ;1000 1000 1000 1000b + cmovs r3d, r4d ; iteration_count += 4 + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 + +.loop: + LOAD_8ROWS coeffq, 32, 1 + REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + mova [rsp+16*1], m5 + mova [rsp+16*2], m6 + mova m6, [o(pw_1697x16)] + REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 + pmulhrsw m7, [o(pw_2048)] + mova m5, [rsp+16*1] + mova [rsp+16*0], m7 + IDTX16 5, 7, 6 + mova m7, [rsp+16*2] + IDTX16 7, 6, 6 + mova m6, [o(pw_2048)] + REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + mova [rsp+16*2], m5 + mova [rsp+16*1], m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + pxor m7, m7 + REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + +.loop_end: + add coeffq, 16 + shr r3d, 2 + jz .ret + test r3d, 2 + jnz .loop + mov r4d, r3d + and r4d, 1 + lea coeffq, [coeffq+r4*8+32*7] + mov dstq, [rsp+16*3] + lea r4, [dstq+8] + mov [rsp+16*3], r4 + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_32x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp 
m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*35], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*35], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*35] + test tx2d, tx2d + jl .fast + +.full: + LOAD_8ROWS coeffq+64*0, 64*4 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*17, 64*2 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp .pass1_end + +.fast: + mova m0, [coeffq+256*0] + mova m1, [coeffq+256*1] + mova m2, [coeffq+256*2] + mova m3, [coeffq+256*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+128*1] + mova m1, [coeffq+128*3] + mova m2, [coeffq+128*5] + mova m3, [coeffq+128*7] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + + +.pass2: + mov coeffq, [rsp+gprsize*2+16*35] + mov r3d, 4 + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + +.pass2_loop: + mov [rsp+gprsize*3+16*35], r3d + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 + + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*12] + mova m2, [coeffq+16*20] + mova m3, [coeffq+16*28] + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*13] + mova m6, [coeffq+16*21] + mova m7, [coeffq+16*29] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + 
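;---------------------------------------------------------------------------
; Annotation (descriptive sketch, not part of the upstream hunk): the value
; read from [rsp+gprsize*1+16*35] below is eob-136, stored at the top of
; idct_32x32_internal_8bpc.  A negative value (eob < 136) means only
; low-frequency coefficients are present, so the .fast1 path zeroes m4-m7
; and skips the in16..in31 loads, while .full1 runs the complete 32-point
; stages.  A rough C sketch of the same dispatch (illustrative names,
; assuming eob indexes the last nonzero coefficient in scan order):
;     int pass1_iters = (eob - 136 < 0) ? 2 : 4;  /* mov r3d,4; cmovs r3d,r4d */
;     int use_fast    = (eob - 136 < 0);          /* test ...; jl .fast1      */
;---------------------------------------------------------------------------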
+ mov eobd, [rsp+gprsize*1+16*35] + test eobd, eobd + jl .fast1 + +.full1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + mova m4, [coeffq+16*2 ] + mova m5, [coeffq+16*18] + mova m6, [coeffq+16*3 ] + mova m7, [coeffq+16*19] + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*6 ] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*22] + mova m3, [coeffq+16*30] + mova m4, [coeffq+16*7 ] + mova m5, [coeffq+16*15] + mova m6, [coeffq+16*23] + mova m7, [coeffq+16*31] + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp tx2q + +.fast1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp tx2q + +.pass2_end: + lea r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end + +.pass2_end1: + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] + mov r3d, [rsp+gprsize*3+16*35] + dec r3d + jg .pass2_loop + + ret + + +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + cmp eobd, 136 + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + lea r4, [dstq+8] + mov [rsp+gprsize*0+16*3], r4 + mov [rsp+gprsize*1+16*3], r3d + mov [rsp+gprsize*2+16*3], r3d + mov [rsp+gprsize*3+16*3], coeffq + +.loop: + LOAD_8ROWS coeffq, 64 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + pmulhrsw m7, [o(pw_8192)] + mova [rsp+16*0], m7 + mova m7, [o(pw_8192)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+16*1], m6 + mova [rsp+16*2], m5 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + + pxor m7, m7 + REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + + add coeffq, 16 + dec r3d + jg .loop + + mov r4d, [rsp+gprsize*2+16*3] + dec r4d + jle .ret + + mov dstq, [rsp+gprsize*0+16*3] + mov coeffq, [rsp+gprsize*3+16*3] + mov [rsp+gprsize*2+16*3], r4 + lea r3, [dstq+8] + add coeffq, 64*8 + mov [rsp+gprsize*0+16*3], r3 + mov r3d, [rsp+gprsize*1+16*3] + mov [rsp+gprsize*3+16*3], coeffq + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_16x64_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov 
[coeffq], eobd + mov r2d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly + +.end: + RET + + +cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + sub eobd, 151 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*0, 64*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*1, 64*2 + call m(idct_16x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 2 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + +.pass2_loop: + mov [rsp+gprsize*3+16*67], r3d + mov eobd, [rsp+gprsize*1+16*67] + + mova m0, [coeffq+16*4 ] ;in1 + mova m1, [coeffq+16*12] ;in3 + mova m2, [coeffq+16*20] ;in5 + mova m3, [coeffq+16*28] ;in7 + mova m4, [coeffq+16*5 ] ;in9 + mova m5, [coeffq+16*13] ;in11 + mova m6, [coeffq+16*21] ;in13 + mova m7, [coeffq+16*29] ;in15 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + pxor m4, m4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + + test eobd, eobd + jl .fast + +.full: + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + mova m2, [coeffq+16*18] + mova m3, [coeffq+16*19] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*6 ] ;in17 + mova m1, [coeffq+16*14] ;in19 + mova m2, [coeffq+16*22] ;in21 + mova m3, [coeffq+16*30] ;in23 + mova m4, [coeffq+16*7 ] ;in25 + mova m5, [coeffq+16*15] ;in27 + mova m6, [coeffq+16*23] ;in29 + mova m7, [coeffq+16*31] ;in31 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call .main + jmp .end + +.fast: + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + 
SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + + call m(idct_8x32_internal_8bpc).main_veryfast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + call .main_fast + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, r4 + jmp m(idct_8x32_internal_8bpc).end2 + +.end1: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + add rsp, 16*32 + lea r3, [o(m(idct_16x64_internal_8bpc).end2)] + jmp m(idct_8x32_internal_8bpc).end + +.end2: + add coeffq, 16*32 + sub rsp, 16*32 + + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + + dec r3d + jg .pass2_loop + ret + + +ALIGN function_align +.main_fast: + mova m0, [rsp+gprsize*2+16*35] ;in1 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 + pmulhrsw m0, [o(pw_101x8)] ;t32,t33 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*66], m3 ;t63 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*36], m3 ;t33a + mova [rsp+gprsize*2+16*65], m0 ;t62a + + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 + pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 + mova [rsp+gprsize*2+16*38], m1 ;t35 + mova [rsp+gprsize*2+16*63], m2 ;t60 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m2 ;t34a + mova [rsp+gprsize*2+16*64], m1 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 + pmulhrsw m0, [o(pw_897x8)] ;t36,t37 + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*62], m3 ;t59 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*40], m3 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 + pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 + mova [rsp+gprsize*2+16*42], m1 ;t39 + mova [rsp+gprsize*2+16*59], m2 ;t56 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m2 ;t38a + mova [rsp+gprsize*2+16*60], m1 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 + pmulhrsw m0, [o(pw_501x8)] ;t40,t41 + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*58], m3 ;t55 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*44], m3 ;t41a + mova [rsp+gprsize*2+16*57], m0 ;t54a + + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 + pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 + mova [rsp+gprsize*2+16*46], m1 ;t43 + mova [rsp+gprsize*2+16*55], m2 ;t52 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m2 ;t42a + mova [rsp+gprsize*2+16*56], m1 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51 + pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 + mova m6, m0 + mova [rsp+gprsize*2+16*54], m3 ;t51 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m3 ;t45a + mova [rsp+gprsize*2+16*53], m0 ;t50a + + mova m0, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, 
m0, [o(pw_4085x8)] ;t48,t49 + pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 + mova m4, m3 + mova m5, m0 + + jmp .main2 + +ALIGN function_align +.main: + mova m0, [rsp+gprsize*2+16*35] ;in1 + mova m1, [rsp+gprsize*2+16*65] ;in31 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a + pmulhrsw m0, [o(pw_101x8)] ;t32a + pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a + pmulhrsw m1, [o(pw_m2824x8)] ;t33a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t33 + paddsw m0, m1 ;t32 + psubsw m5, m3, m2 ;t62 + paddsw m3, m2 ;t63 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*36], m5 ;t33a + mova [rsp+gprsize*2+16*65], m4 ;t62a + mova [rsp+gprsize*2+16*66], m3 ;t63 + + mova m0, [rsp+gprsize*2+16*63] ;in17 + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a + pmulhrsw m0, [o(pw_1660x8)] ;t34a + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a + pmulhrsw m1, [o(pw_m1474x8)] ;t35a + psubsw m4, m1, m0 ;t34 + paddsw m0, m1 ;t35 + psubsw m5, m2, m3 ;t61 + paddsw m3, m2 ;t60 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m5 ;t34a + mova [rsp+gprsize*2+16*38], m0 ;t35 + mova [rsp+gprsize*2+16*63], m3 ;t60 + mova [rsp+gprsize*2+16*64], m4 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + mova m1, [rsp+gprsize*2+16*61] ;in23 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a + pmulhrsw m0, [o(pw_897x8)] ;t36a + pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a + pmulhrsw m1, [o(pw_m2191x8)] ;t37a + psubsw m4, m0, m1 ;t37 + paddsw m0, m1 ;t36 + psubsw m5, m3, m2 ;t58 + paddsw m3, m2 ;t59 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a + mova [rsp+gprsize*2+16*62], m3 ;t59 + + mova m0, [rsp+gprsize*2+16*59] ;in25 + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a + pmulhrsw m0, [o(pw_2359x8)] ;t38a + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a + pmulhrsw m1, [o(pw_m700x8)] ;t39a + psubsw m4, m1, m0 ;t38 + paddsw m0, m1 ;t39 + psubsw m5, m2, m3 ;t57 + paddsw m3, m2 ;t56 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m5 ;t38a + mova [rsp+gprsize*2+16*42], m0 ;t39 + mova [rsp+gprsize*2+16*59], m3 ;t56 + mova [rsp+gprsize*2+16*60], m4 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + mova m1, [rsp+gprsize*2+16*57] ;in27 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a + pmulhrsw m0, [o(pw_501x8)] ;t40a + pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a + pmulhrsw m1, [o(pw_m2520x8)] ;t41a + psubsw m4, m0, m1 ;t41 + paddsw m0, m1 ;t40 + psubsw m5, m3, m2 ;t54 + paddsw m3, m2 ;t55 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*44], m5 ;t41a + mova [rsp+gprsize*2+16*57], m4 ;t54a + mova [rsp+gprsize*2+16*58], m3 ;t55 + + mova m0, [rsp+gprsize*2+16*55] ;in21 + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a + pmulhrsw m0, [o(pw_2019x8)] ;t42a + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a + pmulhrsw m1, [o(pw_m1092x8)] ;t43a + psubsw m4, m1, m0 ;t42 + paddsw m0, m1 ;t43 + psubsw m5, m2, m3 ;t53 + paddsw m3, m2 ;t52 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*46], m0 ;t43 + mova [rsp+gprsize*2+16*55], m3 ;t52 + mova [rsp+gprsize*2+16*56], m4 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + mova m1, [rsp+gprsize*2+16*53] ;in19 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a + pmulhrsw m0, [o(pw_1285x8)] ;t44a + pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a + 
pmulhrsw m1, [o(pw_m1842x8)] ;t45a + psubsw m4, m0, m1 ;t45 + paddsw m0, m1 ;t44 + psubsw m5, m3, m2 ;t50 + paddsw m3, m2 ;t51 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova m6, m0 + mova [rsp+gprsize*2+16*48], m5 ;t45a + mova [rsp+gprsize*2+16*53], m4 ;t50a + mova [rsp+gprsize*2+16*54], m3 ;t51 + + mova m0, [rsp+gprsize*2+16*51] ;in29 + mova m1, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a + pmulhrsw m0, [o(pw_2675x8)] ;t46a + pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a + pmulhrsw m1, [o(pw_m301x8)] ;t47a + psubsw m5, m1, m0 ;t46 + paddsw m0, m1 ;t47 + psubsw m4, m2, m3 ;t49 + paddsw m3, m2 ;t48 + +ALIGN function_align +.main2: + ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m2, m0, m6 ;t44a + paddsw m0, m6 ;t47a + psubsw m6, m3, m1 ;t51a + paddsw m3, m1 ;t48a + mova [rsp+gprsize*2+16*50], m0 ;t47a + mova [rsp+gprsize*2+16*51], m3 ;t48a + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 + mova [rsp+gprsize*2+16*47], m6 ;t44 + mova [rsp+gprsize*2+16*54], m2 ;t51 + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m3, [rsp+gprsize*2+16*53] ;t50a + psubsw m2, m4, m0 ;t45 + paddsw m4, m0 ;t46 + psubsw m6, m5, m3 ;t50 + paddsw m5, m3 ;t49 + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m6 ;t45a + mova [rsp+gprsize*2+16*49], m4 ;t46 + mova [rsp+gprsize*2+16*52], m5 ;t49 + mova [rsp+gprsize*2+16*53], m2 ;t50a + + mova m0, [rsp+gprsize*2+16*43] ;t40 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*58] ;t55 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t40a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t55a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52 + mova [rsp+gprsize*2+16*43], m0 ;t40a + mova [rsp+gprsize*2+16*46], m5 ;t43 + mova [rsp+gprsize*2+16*55], m4 ;t52 + mova [rsp+gprsize*2+16*58], m1 ;t55a + + mova m0, [rsp+gprsize*2+16*44] ;t41a + mova m2, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*57] ;t54a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t41 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t54 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a + mova [rsp+gprsize*2+16*44], m0 ;t41 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*57], m1 ;t54 + + mova m0, [rsp+gprsize*2+16*41] ;t38a + mova m2, [rsp+gprsize*2+16*40] ;t37a + mova m3, [rsp+gprsize*2+16*61] ;t58a + mova m1, [rsp+gprsize*2+16*60] ;t57a + psubsw m4, m0, m2 ;t37 + paddsw m0, m2 ;t38 + psubsw m5, m1, m3 ;t58 + paddsw m1, m3 ;t57 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a + mova [rsp+gprsize*2+16*41], m0 ;t38 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a + mova [rsp+gprsize*2+16*60], m1 ;t57 + + mova m0, [rsp+gprsize*2+16*42] ;t39 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*59] ;t56 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t39a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t56a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 + mova [rsp+gprsize*2+16*42], m0 ;t39a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*59], m1 ;t56a + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m2, [rsp+gprsize*2+16*38] ;t35 + mova m3, [rsp+gprsize*2+16*63] ;t60 + mova m1, [rsp+gprsize*2+16*66] ;t63 + psubsw m4, m0, m2 ;t35a + paddsw m0, m2 ;t32a + psubsw m5, m1, m3 ;t60a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 
799, 4017 ;t35, t60 + mova [rsp+gprsize*2+16*35], m0 ;t32a + mova [rsp+gprsize*2+16*38], m5 ;t35 + mova [rsp+gprsize*2+16*63], m4 ;t60 + mova [rsp+gprsize*2+16*66], m1 ;t63a + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m2, [rsp+gprsize*2+16*37] ;t34a + mova m3, [rsp+gprsize*2+16*64] ;t61a + mova m1, [rsp+gprsize*2+16*65] ;t62a + psubsw m4, m0, m2 ;t34 + paddsw m0, m2 ;t33 + psubsw m5, m1, m3 ;t61 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a + + mova m2, [rsp+gprsize*2+16*41] ;t38 + mova m3, [rsp+gprsize*2+16*60] ;t57 + psubsw m6, m0, m2 ;t38a + paddsw m0, m2 ;t33a + psubsw m2, m1, m3 ;t57a + paddsw m1, m3 ;t62a + mova [rsp+gprsize*2+16*36], m0 ;t33a + mova [rsp+gprsize*2+16*65], m1 ;t62a + ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 + mova [rsp+gprsize*2+16*41], m2 ;t38 + mova [rsp+gprsize*2+16*60], m6 ;t57 + + mova m2, [rsp+gprsize*2+16*40] ;t37 + mova m3, [rsp+gprsize*2+16*61] ;t58 + psubsw m0, m5, m2 ;t37 + paddsw m5, m2 ;t34 + psubsw m1, m4, m3 ;t58 + paddsw m4, m3 ;t61 + ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a + mova [rsp+gprsize*2+16*37], m5 ;t34 + mova [rsp+gprsize*2+16*64], m4 ;t61 + mova [rsp+gprsize*2+16*40], m1 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m0, [rsp+gprsize*2+16*38] ;t35 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*63] ;t60 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t35a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t60a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 + mova [rsp+gprsize*2+16*38], m0 ;t35a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*63], m1 ;t60a + + mova m0, [rsp+gprsize*2+16*35] ;t32a + mova m2, [rsp+gprsize*2+16*42] ;t39a + mova m3, [rsp+gprsize*2+16*59] ;t56a + mova m1, [rsp+gprsize*2+16*66] ;t63a + psubsw m4, m0, m2 ;t39 + paddsw m0, m2 ;t32 + psubsw m5, m1, m3 ;t56 + paddsw m1, m3 ;t63 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*42], m5 ;t39a + mova [rsp+gprsize*2+16*59], m4 ;t56a + mova [rsp+gprsize*2+16*66], m1 ;t63 + + mova m0, [rsp+gprsize*2+16*50] ;t47a + mova m2, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*51] ;t48a + psubsw m4, m0, m2 ;t40 + paddsw m0, m2 ;t47 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 ;t48 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a + mova [rsp+gprsize*2+16*50], m0 ;t47 + mova [rsp+gprsize*2+16*43], m5 ;t40a + mova [rsp+gprsize*2+16*58], m4 ;t55a + mova [rsp+gprsize*2+16*51], m1 ;t48 + + mova m0, [rsp+gprsize*2+16*49] ;t46 + mova m2, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*52] ;t49 + psubsw m4, m0, m2 ;t41a + paddsw m0, m2 ;t46a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t49a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 + mova [rsp+gprsize*2+16*49], m0 ;t46a + mova [rsp+gprsize*2+16*44], m5 ;t41 + mova [rsp+gprsize*2+16*57], m4 ;t54 + mova [rsp+gprsize*2+16*52], m1 ;t49a + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m2, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*53] ;t50a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t45 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t50 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a + mova [rsp+gprsize*2+16*48], m0 ;t45 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*53], m1 ;t50 + + mova m0, 
[rsp+gprsize*2+16*47] ;t44 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t44a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t51a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 + + mova m2, [rsp+gprsize*2+16*38] ;t35a + mova m3, [rsp+gprsize*2+16*31] ;tmp[28] + psubsw m6, m2, m0 ;t44 + paddsw m2, m0 ;t35 + psubsw m0, m3, m2 ;out35 + paddsw m2, m3 ;out28 + mova m3, [rsp+gprsize*2+16*63] ;t60a + mova [rsp+gprsize*2+16*38], m0 ;out35 + mova [rsp+gprsize*2+16*31], m2 ;out28 + psubsw m0, m3, m1 ;t51 + paddsw m3, m1 ;t60 + ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a + mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] + psubsw m1, m2, m3 ;out60 + paddsw m2, m3 ;out3 + mova m3, [rsp+gprsize*2+16*22] ;tmp[19] + mova [rsp+gprsize*2+16*63], m1 ;out60 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m1, m3, m0 ;out44 + paddsw m3, m0 ;out19 + mova m2, [rsp+gprsize*2+16*15] ;tmp[12] + + mova m0, [rsp+gprsize*2+16*39] ;t36 + mova [rsp+gprsize*2+16*47], m1 ;out44 + mova [rsp+gprsize*2+16*22], m3 ;out19 + mova m1, [rsp+gprsize*2+16*62] ;t59 + psubsw m3, m2, m6 ;out51 + paddsw m2, m6 ;out12 + mova [rsp+gprsize*2+16*54], m3 ;out51 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m2, m0, m5 ;t43a + paddsw m0, m5 ;t36a + mova m5, [rsp+gprsize*2+16*30] ;tmp[27] + psubsw m3, m1, m4 ;t52a + paddsw m1, m4 ;t59a + ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 + mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] + psubsw m6, m5, m0 ;out36 + paddsw m5, m0 ;out27 + psubsw m0, m4, m1 ;out59 + paddsw m4, m1 ;out4 + mova [rsp+gprsize*2+16*39], m6 ;out36 + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*62], m0 ;out59 + mova [rsp+gprsize*2+16*7 ], m4 ;out4 + mova m0, [rsp+gprsize*2+16*23] ;tmp[20] + mova m5, [rsp+gprsize*2+16*14] ;tmp[11] + psubsw m4, m0, m3 ;out43 + paddsw m0, m3 ;out20 + psubsw m6, m5, m2 ;out52 + paddsw m5, m2 ;out11 + mova [rsp+gprsize*2+16*46], m4 ;out43 + mova [rsp+gprsize*2+16*23], m0 ;out20 + mova [rsp+gprsize*2+16*55], m6 ;out52 + mova [rsp+gprsize*2+16*14], m5 ;out11 + + mova m0, [rsp+gprsize*2+16*40] ;t37a + mova m5, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*61] ;t58a + mova m2, [rsp+gprsize*2+16*29] ;tmp[26] + psubsw m4, m0, m5 ;t42 + paddsw m0, m5 ;t37 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t58 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 + mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] + psubsw m6, m2, m0 ;out37 + paddsw m2, m0 ;out26 + psubsw m0, m3, m1 ;out58 + paddsw m3, m1 ;out5 + mova [rsp+gprsize*2+16*40], m6 ;out37 + mova [rsp+gprsize*2+16*29], m2 ;out26 + mova [rsp+gprsize*2+16*61], m0 ;out58 + mova [rsp+gprsize*2+16*8 ], m3 ;out5 + mova m0, [rsp+gprsize*2+16*24] ;tmp[21] + mova m1, [rsp+gprsize*2+16*13] ;tmp[10] + psubsw m2, m0, m5 ;out42 + paddsw m0, m5 ;out21 + psubsw m3, m1, m4 ;out53 + paddsw m1, m4 ;out10 + mova [rsp+gprsize*2+16*45], m2 ;out42 + mova [rsp+gprsize*2+16*24], m0 ;out21 + mova [rsp+gprsize*2+16*56], m3 ;out53 + mova [rsp+gprsize*2+16*13], m1 ;out10 + + mova m0, [rsp+gprsize*2+16*41] ;t38 + mova m5, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*60] ;t57 + mova m2, [rsp+gprsize*2+16*28] ;tmp[25] + psubsw m4, m0, m5 ;t41a + paddsw m0, m5 ;t38a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t57a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a + mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] + psubsw m6, m2, m0 ;out38 + paddsw m2, m0 ;out25 + psubsw m0, 
m3, m1 ;out57 + paddsw m3, m1 ;out6 + mova [rsp+gprsize*2+16*41], m6 ;out38 + mova [rsp+gprsize*2+16*28], m2 ;out25 + mova [rsp+gprsize*2+16*60], m0 ;out57 + mova [rsp+gprsize*2+16*9 ], m3 ;out6 + mova m0, [rsp+gprsize*2+16*25] ;tmp[22] + mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] + psubsw m2, m0, m5 ;out41 + paddsw m0, m5 ;out22 + psubsw m3, m1, m4 ;out54 + paddsw m1, m4 ;out9 + mova [rsp+gprsize*2+16*44], m2 ;out41 + mova [rsp+gprsize*2+16*25], m0 ;out22 + mova [rsp+gprsize*2+16*57], m3 ;out54 + mova [rsp+gprsize*2+16*12], m1 ;out9 + + mova m0, [rsp+gprsize*2+16*42] ;t39a + mova m5, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*59] ;t56a + mova m2, [rsp+gprsize*2+16*27] ;tmp[24] + psubsw m4, m0, m5 ;t40 + paddsw m0, m5 ;t39 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 ;t56 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a + mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] + psubsw m6, m2, m0 ;out39 + paddsw m2, m0 ;out24 + psubsw m0, m3, m1 ;out56 + paddsw m3, m1 ;out7 + mova [rsp+gprsize*2+16*42], m6 ;out39 + mova [rsp+gprsize*2+16*27], m2 ;out24 + mova [rsp+gprsize*2+16*59], m0 ;out56 + mova [rsp+gprsize*2+16*10], m3 ;out7 + mova m0, [rsp+gprsize*2+16*26] ;tmp[23] + mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] + psubsw m2, m0, m5 ;out40 + paddsw m0, m5 ;out23 + psubsw m3, m1, m4 ;out55 + paddsw m1, m4 ;out8 + mova [rsp+gprsize*2+16*43], m2 ;out40 + mova [rsp+gprsize*2+16*26], m0 ;out23 + mova [rsp+gprsize*2+16*58], m3 ;out55 + mova [rsp+gprsize*2+16*11], m1 ;out8 + + mova m0, [rsp+gprsize*2+16*37] ;t34 + mova m5, [rsp+gprsize*2+16*48] ;t45 + mova m3, [rsp+gprsize*2+16*53] ;t50 + mova m1, [rsp+gprsize*2+16*64] ;t61 + mova m2, [rsp+gprsize*2+16*32] ;tmp[29] + psubsw m4, m0, m5 ;t45a + paddsw m0, m5 ;t34a + psubsw m5, m1, m3 ;t50a + paddsw m1, m3 ;t61a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] + psubsw m6, m2, m0 ;out34 + paddsw m2, m0 ;out29 + psubsw m0, m3, m1 ;out61 + paddsw m3, m1 ;out2 + mova [rsp+gprsize*2+16*37], m6 ;out34 + mova [rsp+gprsize*2+16*32], m2 ;out29 + mova [rsp+gprsize*2+16*64], m0 ;out61 + mova [rsp+gprsize*2+16*5 ], m3 ;out2 + mova m0, [rsp+gprsize*2+16*21] ;tmp[18] + mova m1, [rsp+gprsize*2+16*16] ;tmp[13] + psubsw m2, m0, m5 ;out45 + paddsw m0, m5 ;out18 + psubsw m3, m1, m4 ;out50 + paddsw m1, m4 ;out13 + mova [rsp+gprsize*2+16*48], m2 ;out45 + mova [rsp+gprsize*2+16*21], m0 ;out18 + mova [rsp+gprsize*2+16*53], m3 ;out50 + mova [rsp+gprsize*2+16*16], m1 ;out13 + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m5, [rsp+gprsize*2+16*49] ;t46a + mova m3, [rsp+gprsize*2+16*52] ;t49a + mova m1, [rsp+gprsize*2+16*65] ;t62a + mova m2, [rsp+gprsize*2+16*33] ;tmp[30] + psubsw m4, m0, m5 ;t46 + paddsw m0, m5 ;t33 + psubsw m5, m1, m3 ;t49 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] + psubsw m6, m2, m0 ;out33 + paddsw m2, m0 ;out30 + psubsw m0, m3, m1 ;out62 + paddsw m3, m1 ;out1 + mova [rsp+gprsize*2+16*36], m6 ;out33 + mova [rsp+gprsize*2+16*33], m2 ;out30 + mova [rsp+gprsize*2+16*65], m0 ;out62 + mova [rsp+gprsize*2+16*4 ], m3 ;out1 + mova m0, [rsp+gprsize*2+16*20] ;tmp[17] + mova m1, [rsp+gprsize*2+16*17] ;tmp[14] + psubsw m2, m0, m5 ;out46 + paddsw m0, m5 ;out17 + psubsw m3, m1, m4 ;out49 + paddsw m1, m4 ;out14 + mova [rsp+gprsize*2+16*49], m2 ;out46 + mova [rsp+gprsize*2+16*20], m0 ;out17 + mova [rsp+gprsize*2+16*52], m3 ;out49 + mova [rsp+gprsize*2+16*17], m1 ;out14 + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m5, 
[rsp+gprsize*2+16*50] ;t47 + mova m3, [rsp+gprsize*2+16*51] ;t48 + mova m1, [rsp+gprsize*2+16*66] ;t63 + mova m2, [rsp+gprsize*2+16*34] ;tmp[31] + psubsw m4, m0, m5 ;t47a + paddsw m0, m5 ;t32a + psubsw m5, m1, m3 ;t48a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48 + mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ] + psubsw m6, m2, m0 ;out32 + paddsw m2, m0 ;out31 + psubsw m0, m3, m1 ;out63 + paddsw m3, m1 ;out0 + mova [rsp+gprsize*2+16*35], m6 ;out32 + mova [rsp+gprsize*2+16*34], m2 ;out31 + mova [rsp+gprsize*2+16*66], m0 ;out63 + mova [rsp+gprsize*2+16*3 ], m3 ;out0 + mova m0, [rsp+gprsize*2+16*19] ;tmp[16] + mova m1, [rsp+gprsize*2+16*18] ;tmp[15] + psubsw m2, m0, m5 ;out47 + paddsw m0, m5 ;out16 + psubsw m3, m1, m4 ;out48 + paddsw m1, m4 ;out15 + mova [rsp+gprsize*2+16*50], m2 ;out47 + mova [rsp+gprsize*2+16*19], m0 ;out16 + mova [rsp+gprsize*2+16*51], m3 ;out48 + mova [rsp+gprsize*2+16*18], m1 ;out15 + ret + + +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x16_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 16 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m7, m7 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + mova m5, [dstq+16*2] + mova m6, [dstq+16*3] + punpckhbw m2, m1, m7 + punpcklbw m1, m7 + punpckhbw m4, m3, m7 + punpcklbw m3, m7 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + punpckhbw m2, m5, m7 + punpcklbw m5, m7 + punpckhbw m4, m6, m7 + punpcklbw m6, m7 + paddw m2, m0 + paddw m5, m0 + paddw m4, m0 + paddw m6, m0 + packuswb m5, m2 + packuswb m6, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + mova [dstq+16*2], m5 + mova [dstq+16*3], m6 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 + +%if %3 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [%1+%2*0] + pmulhrsw m1, m3, [%1+%2*1] + pmulhrsw m2, m3, [%1+%2*2] + pmulhrsw m3, [%1+%2*3] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] +%endif +%endmacro + +%macro LOAD_4ROWS_H 2 ;src, stride + mova m4, [%1+%2*0] + mova m5, [%1+%2*1] + mova m6, [%1+%2*2] + mova m7, [%1+%2*3] +%endmacro + +cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r3d, 2 + mov [rsp+gprsize*2+16*67], dstq + lea dstq, [rsp+gprsize+16*68] + +.pass1_loop: + LOAD_4ROWS coeffq+32*0, 32*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+32*4, 32*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+32*2, 32*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+32*1, 32*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova 
[rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+32*17, 32*2 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+32*24, 32 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+32*24, 32 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*2+16*67] + sub coeffq, 32 + mov r3d, 4 + +.pass2_loop: + mov [rsp+gprsize*1+16*67], r3d + + LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea tx2q, [o(m(idct_64x16_internal_8bpc).end)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop + + mov r3d, 4 + lea coeffq, [rsp+gprsize+16*68] +.pass2_loop2: + mov [rsp+gprsize*1+16*67], r3d + + 
LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea tx2q, [o(m(idct_64x16_internal_8bpc).end2)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x16_internal_8bpc).end3)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end3: + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop2 + ret + + +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_32x64_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r3d, 64 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + +.end: + RET + + +cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*67] + test tx2d, tx2d + jl .fast + +.full: + LOAD_8ROWS coeffq+64*0, 64*4, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4, 1 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp .pass1_end + +.fast: + LOAD_4ROWS coeffq, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+128*1, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova 
[rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + pmulhrsw m0, m1 + mov [coeffq], eobd + mov r3d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body + +.end: + RET + +cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*4+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8, 1 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4, 1 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova 
[rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*3+16*67] + mov eobd, [rsp+gprsize*1+16*67] + lea dstq, [dstq+32] + mov [rsp+gprsize*1+16*35], eobd + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal_8bpc).pass2_loop + +.pass2_end: + mova [rsp+gprsize+16*0], m7 + lea r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 + +.pass2_end1: + lea tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)] + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] + mov r3d, [rsp+gprsize*3+16*35] + dec r3d + jg m(idct_32x32_internal_8bpc).pass2_loop + +.pass2_end2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal_8bpc).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x64_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 64 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body + +cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + %undef cmp + + mov r5d, 4 + mov r4d, 2 + sub eobd, 136 + cmovns r4d, r5d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*1+16*67], eobd + mov r3d, r4d + mov [rsp+gprsize*4+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*2+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+64*17, 64*2 
+ mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea dstq, [dstq+32] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + jmp m(idct_16x64_internal_8bpc).pass2_loop + +.pass2_end: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + add rsp, 16*32 + mova [rsp+gprsize+16*0], m7 + lea r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 + +.pass2_end1: + add coeffq, 16*32 + sub rsp, 16*32 + + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_64x64_internal_8bpc).pass2_end)] + + dec r3d + jg m(idct_16x64_internal_8bpc).pass2_loop + +.pass2_end2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, 4 + sub dstq, 72 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop diff -Nru dav1d-0.7.1/src/x86/itx_ssse3.asm dav1d-0.9.1/src/x86/itx_ssse3.asm --- dav1d-0.7.1/src/x86/itx_ssse3.asm 2020-06-21 11:48:55.028126500 +0000 +++ dav1d-0.9.1/src/x86/itx_ssse3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,6558 +0,0 @@ -; Copyright © 2018, 
VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "ext/x86/x86inc.asm" - - -SECTION_RODATA 16 - -deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 - -deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 - -%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 -pw_%1_m%2: times 4 dw %1, -%2 -%if %3 != 2 -pw_%2_%1: times 4 dw %2, %1 -%endif -%if %3 -pw_m%1_m%2: times 4 dw -%1, -%2 -%endif -%endmacro - -;adst4 -pw_1321_3803: times 4 dw 1321, 3803 -pw_2482_m1321: times 4 dw 2482, -1321 -pw_3344_2482: times 4 dw 3344, 2482 -pw_3344_m3803: times 4 dw 3344, -3803 -pw_3344_m3344: times 4 dw 3344, -3344 -pw_0_3344 times 4 dw 0, 3344 -pw_m6688_m3803: times 4 dw -6688, -3803 - -COEF_PAIR 2896, 2896 -COEF_PAIR 1567, 3784 -COEF_PAIR 799, 4017 -COEF_PAIR 3406, 2276 -COEF_PAIR 401, 4076 -COEF_PAIR 1931, 3612 -COEF_PAIR 3166, 2598 -COEF_PAIR 3920, 1189 -COEF_PAIR 3784, 1567, 1 -COEF_PAIR 995, 3973 -COEF_PAIR 1751, 3703 -COEF_PAIR 3513, 2106 -COEF_PAIR 3857, 1380 -COEF_PAIR 4017, 799, 1 -COEF_PAIR 201, 4091 -COEF_PAIR 2440, 3290 -COEF_PAIR 3035, 2751 -COEF_PAIR 4052, 601 -COEF_PAIR 2276, 3406, 1 -COEF_PAIR 4076, 401, 2 -COEF_PAIR 2598, 3166, 2 -COEF_PAIR 3612, 1931, 2 -COEF_PAIR 1189, 3920, 2 - -pd_2048: times 4 dd 2048 -pw_2048: times 8 dw 2048 -pw_m2048: times 8 dw -2048 -pw_4096: times 8 dw 4096 -pw_16384: times 8 dw 16384 -pw_m16384: times 8 dw -16384 -pw_1697x16: times 8 dw 1697*16 -pw_1697x8: times 8 dw 1697*8 -pw_2896x8: times 8 dw 2896*8 -pw_3344x8: times 8 dw 3344*8 -pw_8192: times 8 dw 8192 -pw_m8192: times 8 dw -8192 -pw_5: times 8 dw 5 -pw_201x8: times 8 dw 201*8 -pw_4091x8: times 8 dw 4091*8 -pw_m2751x8: times 8 dw -2751*8 -pw_3035x8: times 8 dw 3035*8 -pw_1751x8: times 8 dw 1751*8 -pw_3703x8: times 8 dw 3703*8 -pw_m1380x8: times 8 dw -1380*8 -pw_3857x8: times 8 dw 3857*8 -pw_995x8: times 8 dw 995*8 -pw_3973x8: times 8 dw 3973*8 -pw_m2106x8: times 8 dw -2106*8 -pw_3513x8: times 8 dw 3513*8 -pw_2440x8: times 8 dw 2440*8 -pw_3290x8: times 8 dw 3290*8 -pw_m601x8: times 8 dw -601*8 -pw_4052x8: times 8 dw 4052*8 - -pw_4095x8: times 8 dw 4095*8 -pw_101x8: times 8 dw 101*8 
-pw_2967x8: times 8 dw 2967*8 -pw_m2824x8: times 8 dw -2824*8 -pw_3745x8: times 8 dw 3745*8 -pw_1660x8: times 8 dw 1660*8 -pw_3822x8: times 8 dw 3822*8 -pw_m1474x8: times 8 dw -1474*8 -pw_3996x8: times 8 dw 3996*8 -pw_897x8: times 8 dw 897*8 -pw_3461x8: times 8 dw 3461*8 -pw_m2191x8: times 8 dw -2191*8 -pw_3349x8: times 8 dw 3349*8 -pw_2359x8: times 8 dw 2359*8 -pw_4036x8: times 8 dw 4036*8 -pw_m700x8: times 8 dw -700*8 -pw_4065x8: times 8 dw 4065*8 -pw_501x8: times 8 dw 501*8 -pw_3229x8: times 8 dw 3229*8 -pw_m2520x8: times 8 dw -2520*8 -pw_3564x8: times 8 dw 3564*8 -pw_2019x8: times 8 dw 2019*8 -pw_3948x8: times 8 dw 3948*8 -pw_m1092x8: times 8 dw -1092*8 -pw_3889x8: times 8 dw 3889*8 -pw_1285x8: times 8 dw 1285*8 -pw_3659x8: times 8 dw 3659*8 -pw_m1842x8: times 8 dw -1842*8 -pw_3102x8: times 8 dw 3102*8 -pw_2675x8: times 8 dw 2675*8 -pw_4085x8: times 8 dw 4085*8 -pw_m301x8: times 8 dw -301*8 - -SECTION .text - -%macro REPX 2-* - %xdefine %%f(x) %1 -%rep %0 - 1 - %rotate 1 - %%f(%1) -%endrep -%endmacro - -%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) - -%if ARCH_X86_64 -%define o(x) x -%else -%define o(x) r5-$$+x ; PIC -%endif - -%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] - lea r2, [dstq+strideq*2] -%assign %%i 1 -%rotate 5 -%rep 4 - %if %1 & 2 - CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) - %else - CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) - %endif - %assign %%i %%i + 1 - %rotate 1 -%endrep - - movd m%3, [%%row_adr1] ;dst0 - movd m%5, [%%row_adr2] ;dst1 - punpckldq m%3, m%5 ;high: dst1 :low: dst0 - movd m%4, [%%row_adr3] ;dst2 - movd m%5, [%%row_adr4] ;dst3 - punpckldq m%4, m%5 ;high: dst3 :low: dst2 - - pxor m%5, m%5 - punpcklbw m%3, m%5 ;extend byte to word - punpcklbw m%4, m%5 ;extend byte to word - - paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 - paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 - - packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 - - movd [%%row_adr1], m%3 ;store dst0 + out0 - pshuflw m%4, m%3, q1032 - movd [%%row_adr2], m%4 ;store dst1 + out1 - punpckhqdq m%3, m%3 - movd [%%row_adr3], m%3 ;store dst2 + out2 - psrlq m%3, 32 - movd [%%row_adr4], m%3 ;store dst3 + out3 -%endmacro - -%macro ITX4_END 4-5 2048 ; row[1-4], rnd -%if %5 - mova m2, [o(pw_%5)] - pmulhrsw m0, m2 - pmulhrsw m1, m2 -%endif - - WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 - ret -%endmacro - -; flags: 1 = swap, 2: coef_regs, 4: no_pack -%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags -%if %6 & 2 - pmaddwd m%2, m%4, m%1 - pmaddwd m%1, m%5 -%elif %6 & 1 - pmaddwd m%2, m%1, [o(pw_%5_%4)] - pmaddwd m%1, [o(pw_%4_m%5)] -%else - pmaddwd m%2, m%1, [o(pw_%4_m%5)] - pmaddwd m%1, [o(pw_%5_%4)] -%endif - paddd m%2, m%3 - paddd m%1, m%3 - psrad m%2, 12 - psrad m%1, 12 -%if %6 & 4 == 0 - packssdw m%1, m%2 -%endif -%endmacro - -%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 - mova m3, [o(pd_2048)] - punpckhwd m2, m0, m1 ;unpacked in1 in3 - punpcklwd m0, m1 ;unpacked in0 in2 - ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 - ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 - psubsw m1, m0, m2 ;high: out2 ;low: out3 - paddsw m0, m2 ;high: out1 ;low: out0 -%endmacro - -%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack -cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2 - %define %%p1 m(i%1_%3_internal) -%if ARCH_X86_32 - LEA r5, $$ -%endif -%if has_epilogue -%ifidn %1_%2, dct_dct - test eobd, eobd - jz %%end -%endif - lea tx2q, [o(m(i%2_%3_internal).pass2)] - call %%p1 - RET -%%end: -%else - lea tx2q, [o(m(i%2_%3_internal).pass2)] -%ifidn 
%1_%2, dct_dct - test eobd, eobd - jnz %%p1 -%else - times ((%%end - %%p1) >> 31) & 1 jmp %%p1 -ALIGN function_align -%%end: -%endif -%endif -%endmacro - -%macro INV_TXFM_4X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 4x4, 6 -%ifidn %1_%2, dct_dct - pshuflw m0, [coeffq], q0000 - punpcklqdq m0, m0 - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1 - mov [coeffq], eobd ;0 - pmulhrsw m0, m1 - mova m1, m0 - TAIL_CALL m(iadst_4x4_internal).end2 -%endif -%endmacro - -INIT_XMM ssse3 - -INV_TXFM_4X4_FN dct, dct -INV_TXFM_4X4_FN dct, adst -INV_TXFM_4X4_FN dct, flipadst -INV_TXFM_4X4_FN dct, identity - -cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m0, [coeffq+16*0] ;high: in1 ;low: in0 - mova m1, [coeffq+16*1] ;high: in3 ;low in2 - - IDCT4_1D_PACKED - - mova m2, [o(deint_shuf)] - shufps m3, m0, m1, q1331 - shufps m0, m1, q0220 - pshufb m0, m2 ;high: in1 ;low: in0 - pshufb m1, m3, m2 ;high: in3 ;low :in2 - jmp tx2q - -.pass2: - IDCT4_1D_PACKED - - pxor m2, m2 - mova [coeffq+16*0], m2 - mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); - - ITX4_END 0, 1, 3, 2 - -INV_TXFM_4X4_FN adst, dct -INV_TXFM_4X4_FN adst, adst -INV_TXFM_4X4_FN adst, flipadst -INV_TXFM_4X4_FN adst, identity - -cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - call .main - punpckhwd m2, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m0, m2 ;high: in3 ;low :in2 - punpcklwd m0, m2 ;high: in1 ;low: in0 - jmp tx2q - -.pass2: - call .main - -.end: - pxor m2, m2 - mova [coeffq+16*0], m2 - mova [coeffq+16*1], m2 - -.end2: - ITX4_END 0, 1, 2, 3 - -ALIGN function_align -.main: - punpcklwd m2, m0, m1 ;unpacked in0 in2 - punpckhwd m0, m1 ;unpacked in1 in3 - mova m3, m0 - pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 - pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 - paddd m1, m0 ;t2 - pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 - pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 - pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 - pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 - paddd m4, m0 ;t0 + t3 - pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 - mova m0, [o(pd_2048)] - paddd m1, m0 ;t2 + 2048 - paddd m2, m0 - paddd m0, m4 ;t0 + t3 + 2048 - paddd m5, m2 ;t1 + t3 + 2048 - paddd m2, m4 - paddd m2, m3 ;t0 + t1 - t3 + 2048 - REPX {psrad x, 12}, m1, m0, m5, m2 - packssdw m0, m5 ;high: out1 ;low: out0 - packssdw m1, m2 ;high: out3 ;low: out3 - ret - -INV_TXFM_4X4_FN flipadst, dct -INV_TXFM_4X4_FN flipadst, adst -INV_TXFM_4X4_FN flipadst, flipadst -INV_TXFM_4X4_FN flipadst, identity - -cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - call m(iadst_4x4_internal).main - punpcklwd m2, m1, m0 - punpckhwd m1, m0 - punpcklwd m0, m1, m2 ;high: in3 ;low :in2 - punpckhwd m1, m2 ;high: in1 ;low: in0 - jmp tx2q - -.pass2: - call m(iadst_4x4_internal).main - -.end: - pxor m2, m2 - mova [coeffq+16*0], m2 - mova [coeffq+16*1], m2 - -.end2: - ITX4_END 3, 2, 1, 0 - -INV_TXFM_4X4_FN identity, dct -INV_TXFM_4X4_FN identity, adst -INV_TXFM_4X4_FN identity, flipadst -INV_TXFM_4X4_FN identity, identity - -cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - mova m3, [o(pw_1697x8)] - pmulhrsw m2, m0, m3 - pmulhrsw m3, m1 - paddsw m0, m2 - paddsw m1, m3 - punpckhwd m2, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m0, m2 ;high: in3 ;low :in2 - punpcklwd m0, m2 ;high: in1 
;low: in0 - jmp tx2q - -.pass2: - mova m3, [o(pw_1697x8)] - pmulhrsw m2, m3, m0 - pmulhrsw m3, m1 - paddsw m0, m2 - paddsw m1, m3 - jmp m(iadst_4x4_internal).end - -%macro IWHT4_1D_PACKED 0 - punpckhqdq m3, m0, m1 ;low: in1 high: in3 - punpcklqdq m0, m1 ;low: in0 high: in2 - psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 - paddw m0, m3 ;low: in0 + in1 high: in2 + in3 - punpckhqdq m2, m2 ;t2 t2 - punpcklqdq m0, m0 ;t0 t0 - psubw m1, m0, m2 - psraw m1, 1 ;t4 t4 - psubw m1, m3 ;low: t1/out2 high: t3/out1 - psubw m0, m1 ;high: out0 - paddw m2, m1 ;low: out3 -%endmacro - -cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - pxor m2, m2 - mova [coeffq+16*0], m2 - mova [coeffq+16*1], m2 - psraw m0, 2 - psraw m1, 2 - - IWHT4_1D_PACKED - - punpckhwd m0, m1 - punpcklwd m3, m1, m2 - punpckhdq m1, m0, m3 - punpckldq m0, m3 - - IWHT4_1D_PACKED - - shufpd m0, m2, 0x01 - ITX4_END 0, 3, 2, 1, 0 - - -%macro IDCT8_1D_PACKED 0 - mova m6, [o(pd_2048)] - punpckhwd m4, m0, m3 ;unpacked in1 in7 - punpcklwd m0, m2 ;unpacked in0 in4 - punpckhwd m2, m1 ;unpacked in5 in3 - punpcklwd m1, m3 ;unpacked in2 in6 - ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a - ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a - ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 - psubsw m3, m4, m2 ;low: t6a high: t5a - paddsw m4, m2 ;low: t7 high: t4 - pshufb m3, [o(deint_shuf1)] - ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 - ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 - psubsw m2, m0, m1 ;low: tmp3 high: tmp2 - paddsw m0, m1 ;low: tmp0 high: tmp1 - punpcklqdq m1, m4, m3 ;low: t7 high: t6 - punpckhqdq m4, m3 ;low: t4 high: t5 - psubsw m3, m0, m1 ;low: out7 high: out6 - paddsw m0, m1 ;low: out0 high: out1 - paddsw m1, m2, m4 ;low: out3 high: out2 - psubsw m2, m4 ;low: out4 high: out5 -%endmacro - -;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 -;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 -%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 - punpckhwd m%4, m%1, m%2 - punpcklwd m%1, m%2 -%if %7 < 8 - pmaddwd m%2, m%7, m%1 - pmaddwd m%3, m%7, m%4 -%else - mova m%2, [o(pw_%7_%6)] -%if %8 - pmaddwd m%3, m%1, m%2 - pmaddwd m%2, m%4 -%else - pmaddwd m%3, m%4, m%2 - pmaddwd m%2, m%1 -%endif -%endif - paddd m%3, m%5 - paddd m%2, m%5 - psrad m%3, 12 - psrad m%2, 12 -%if %8 - packssdw m%3, m%2 -%else - packssdw m%2, m%3 ;dst2 -%endif -%if %7 < 8 - pmaddwd m%4, m%6 - pmaddwd m%1, m%6 -%elif %8 - mova m%2, [o(pw_%6_m%7)] - pmaddwd m%4, m%2 - pmaddwd m%1, m%2 -%else - mova m%3, [o(pw_%6_m%7)] - pmaddwd m%4, m%3 - pmaddwd m%1, m%3 -%endif - paddd m%4, m%5 - paddd m%1, m%5 - psrad m%4, 12 - psrad m%1, 12 - packssdw m%1, m%4 ;dst1 -%endmacro - -%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 - ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 - ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 - psubsw m%3, m%1, m%2 ;out2 - paddsw m%2, m%1 ;out1 - paddsw m%1, m%5, m%4 ;out0 - psubsw m%4, m%5 ;out3 -%endmacro - -%macro WRITE_4X8 4 ;row[1-4] - WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 - lea dstq, [dstq+strideq*4] - WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 -%endmacro - -%macro INV_4X8 0 - punpckhwd m4, m2, m3 - punpcklwd m2, m3 - punpckhwd m3, m0, m1 - punpcklwd m0, m1 - punpckhdq m1, m0, m2 ;low: in2 high: in3 - punpckldq m0, m2 ;low: in0 high: in1 - punpckldq m2, m3, m4 ;low: in4 high: in5 - punpckhdq m3, m4 ;low: in6 high: in7 -%endmacro - -%macro INV_TXFM_4X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 4x8, 8 
-%ifidn %1_%2, dct_dct - pshuflw m0, [coeffq], q0000 - punpcklqdq m0, m0 - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1 - mov [coeffq], eobd - pmulhrsw m0, m1 - pmulhrsw m0, m1 - pmulhrsw m0, [o(pw_2048)] - mova m1, m0 - mova m2, m0 - mova m3, m0 - TAIL_CALL m(iadst_4x8_internal).end3 -%endif -%endmacro - -INV_TXFM_4X8_FN dct, dct -INV_TXFM_4X8_FN dct, adst -INV_TXFM_4X8_FN dct, flipadst -INV_TXFM_4X8_FN dct, identity - -cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, [coeffq+16*3] - -.pass1: - call m(idct_8x4_internal).main - jmp m(iadst_4x8_internal).pass1_end - -.pass2: - call .main - shufps m1, m1, q1032 - shufps m3, m3, q1032 - mova m4, [o(pw_2048)] - jmp m(iadst_4x8_internal).end2 - -ALIGN function_align -.main: - IDCT8_1D_PACKED - ret - - -INV_TXFM_4X8_FN adst, dct -INV_TXFM_4X8_FN adst, adst -INV_TXFM_4X8_FN adst, flipadst -INV_TXFM_4X8_FN adst, identity - -cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, [coeffq+16*3] - -.pass1: - call m(iadst_8x4_internal).main - -.pass1_end: - INV_4X8 - jmp tx2q - -.pass2: - shufps m0, m0, q1032 - shufps m1, m1, q1032 - call .main - mova m4, [o(pw_2048)] - pxor m5, m5 - psubw m5, m4 - -.end: - punpcklqdq m4, m5 - -.end2: - pmulhrsw m0, m4 - pmulhrsw m1, m4 - pmulhrsw m2, m4 - pmulhrsw m3, m4 - pxor m5, m5 - mova [coeffq+16*0], m5 - mova [coeffq+16*1], m5 - mova [coeffq+16*2], m5 - mova [coeffq+16*3], m5 - -.end3: - WRITE_4X8 0, 1, 2, 3 - RET - -ALIGN function_align -.main: - mova m6, [o(pd_2048)] - punpckhwd m4, m3, m0 ;unpacked in7 in0 - punpckhwd m5, m2, m1 ;unpacked in5 in2 - punpcklwd m1, m2 ;unpacked in3 in4 - punpcklwd m0, m3 ;unpacked in1 in6 - ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a - ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a - ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a - ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a - - psubsw m3, m4, m1 ;low: t4 high: t5 - paddsw m4, m1 ;low: t0 high: t1 - psubsw m2, m5, m0 ;low: t6 high: t7 - paddsw m5, m0 ;low: t2 high: t3 - - shufps m1, m3, m2, q1032 - punpckhwd m2, m1 - punpcklwd m3, m1 - ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a - ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a - - psubsw m1, m4, m5 ;low: t2 high: t3 - paddsw m4, m5 ;low: out0 high: -out7 - psubsw m5, m3, m2 ;low: t7 high: t6 - paddsw m3, m2 ;low: out6 high: -out1 - shufps m0, m4, m3, q3210 ;low: out0 high: -out1 - shufps m3, m4, q3210 ;low: out6 high: -out7 - - mova m2, [o(pw_2896_m2896)] - mova m7, [o(pw_2896_2896)] - shufps m4, m1, m5, q1032 ;low: t3 high: t7 - shufps m1, m5, q3210 ;low: t2 high: t6 - punpcklwd m5, m1, m4 - punpckhwd m1, m4 - pmaddwd m4, m2, m1 ;-out5 - pmaddwd m2, m5 ; out4 - pmaddwd m1, m7 ; out2 - pmaddwd m5, m7 ;-out3 - REPX {paddd x, m6}, m4, m2, m1, m5 - REPX {psrad x, 12}, m4, m2, m1, m5 - packssdw m1, m5 ;low: out2 high: -out3 - packssdw m2, m4 ;low: out4 high: -out5 - ret - -INV_TXFM_4X8_FN flipadst, dct -INV_TXFM_4X8_FN flipadst, adst -INV_TXFM_4X8_FN flipadst, flipadst -INV_TXFM_4X8_FN flipadst, identity - -cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, 
[coeffq+16*3] - -.pass1: - call m(iadst_8x4_internal).main - - punpcklwd m4, m3, m2 - punpckhwd m3, m2 - punpcklwd m5, m1, m0 - punpckhwd m1, m0 - punpckldq m2, m3, m1 ;low: in4 high: in5 - punpckhdq m3, m1 ;low: in6 high: in7 - punpckldq m0, m4, m5 ;low: in0 high: in1 - punpckhdq m1, m4, m5 ;low: in2 high: in3 - jmp tx2q - -.pass2: - shufps m0, m0, q1032 - shufps m1, m1, q1032 - call m(iadst_4x8_internal).main - - mova m4, m0 - mova m5, m1 - pshufd m0, m3, q1032 - pshufd m1, m2, q1032 - pshufd m2, m5, q1032 - pshufd m3, m4, q1032 - mova m5, [o(pw_2048)] - pxor m4, m4 - psubw m4, m5 - jmp m(iadst_4x8_internal).end - -INV_TXFM_4X8_FN identity, dct -INV_TXFM_4X8_FN identity, adst -INV_TXFM_4X8_FN identity, flipadst -INV_TXFM_4X8_FN identity, identity - -cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, [coeffq+16*3] - -.pass1: - mova m7, [o(pw_1697x8)] - pmulhrsw m4, m7, m0 - pmulhrsw m5, m7, m1 - pmulhrsw m6, m7, m2 - pmulhrsw m7, m3 - paddsw m0, m4 - paddsw m1, m5 - paddsw m2, m6 - paddsw m3, m7 - jmp m(iadst_4x8_internal).pass1_end - -.pass2: - mova m4, [o(pw_4096)] - jmp m(iadst_4x8_internal).end2 - - -%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] - movq m%3, [dstq ] - movq m%4, [dstq+strideq] - pxor m%5, m%5 - punpcklbw m%3, m%5 ;extend byte to word - punpcklbw m%4, m%5 ;extend byte to word -%ifnum %1 - paddw m%3, m%1 -%else - paddw m%3, %1 -%endif -%ifnum %2 - paddw m%4, m%2 -%else - paddw m%4, %2 -%endif - packuswb m%3, m%4 - movq [dstq ], m%3 - punpckhqdq m%3, m%3 - movq [dstq+strideq], m%3 -%endmacro - -%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] - WRITE_8X2 %1, %2, %5, %6, %7 - lea dstq, [dstq+strideq*2] - WRITE_8X2 %3, %4, %5, %6, %7 -%endmacro - -%macro INV_TXFM_8X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 8x4, 8 -%ifidn %1_%2, dct_dct - pshuflw m0, [coeffq], q0000 - punpcklqdq m0, m0 - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1 - pmulhrsw m0, m1 - mova m2, [o(pw_2048)] - pmulhrsw m0, m1 - pmulhrsw m0, m2 - mova m1, m0 - mova m2, m0 - mova m3, m0 - TAIL_CALL m(iadst_8x4_internal).end2 -%endif -%endmacro - -INV_TXFM_8X4_FN dct, dct -INV_TXFM_8X4_FN dct, adst -INV_TXFM_8X4_FN dct, flipadst -INV_TXFM_8X4_FN dct, identity - -cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, [coeffq+16*3] - - call m(idct_4x8_internal).main - - mova m4, [o(deint_shuf1)] - mova m5, [o(deint_shuf2)] - pshufb m0, m4 - pshufb m1, m5 - pshufb m2, m4 - pshufb m3, m5 - punpckhdq m4, m0, m1 - punpckldq m0, m1 - punpckhdq m5, m2, m3 - punpckldq m2, m3 - punpckhqdq m1, m0, m2 ;in1 - punpcklqdq m0, m2 ;in0 - punpckhqdq m3, m4, m5 ;in3 - punpcklqdq m2 ,m4, m5 ;in2 - jmp tx2q - -.pass2: - call .main - jmp m(iadst_8x4_internal).end - -ALIGN function_align -.main: - mova m6, [o(pd_2048)] - IDCT4_1D 0, 1, 2, 3, 4, 5, 6 - ret - -INV_TXFM_8X4_FN adst, dct -INV_TXFM_8X4_FN adst, adst -INV_TXFM_8X4_FN adst, flipadst -INV_TXFM_8X4_FN adst, identity - -cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, [coeffq+16*3] - - shufps m0, m0, q1032 - shufps m1, m1, q1032 - call m(iadst_4x8_internal).main - - punpckhwd m4, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m2, m3 - 
punpcklwd m2, m3 - pxor m5, m5 - psubsw m3, m5, m1 - psubsw m5, m4 - punpckhdq m4, m5, m3 - punpckldq m5, m3 - punpckhdq m3, m0, m2 - punpckldq m0, m2 - punpckhwd m1, m0, m5 ;in1 - punpcklwd m0, m5 ;in0 - punpcklwd m2, m3, m4 ;in2 - punpckhwd m3, m4 ;in3 - jmp tx2q - -.pass2: - call .main - -.end: - mova m4, [o(pw_2048)] - pmulhrsw m0, m4 - pmulhrsw m1, m4 - pmulhrsw m2, m4 - pmulhrsw m3, m4 - -.end2: - pxor m6, m6 - mova [coeffq+16*0], m6 - mova [coeffq+16*1], m6 - mova [coeffq+16*2], m6 - mova [coeffq+16*3], m6 -.end3: - WRITE_8X4 0, 1, 2, 3, 4, 5, 6 - RET - -ALIGN function_align -.main: - punpckhwd m6, m0, m2 ;unpacked in0 in2 - punpcklwd m0, m2 ;unpacked in0 in2 - punpckhwd m7, m1, m3 ;unpacked in1 in3 - punpcklwd m1, m3 ;unpacked in1 in3 - - mova m2, [o(pw_3344_m3344)] - mova m4, [o(pw_0_3344)] - pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 - pmaddwd m5, m4, m7 ;3344 * in3 - pmaddwd m2, m0 - pmaddwd m4, m1 - paddd m3, m5 - paddd m2, m4 - mova m4, [o(pd_2048)] - paddd m3, m4 ;t2 + 2048 - paddd m2, m4 - psrad m3, 12 - psrad m2, 12 - packssdw m2, m3 ;out2 - - pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 - pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 - pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 - pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 - paddd m3, m4 ;t0 + t3 - - pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 - mova m4, [o(pd_2048)] - paddd m0, m4 - paddd m4, m3 ;t0 + t3 + 2048 - paddd m5, m0 ;t1 + t3 + 2048 - paddd m3, m0 - paddd m3, m1 ;t0 + t1 - t3 + 2048 - - psrad m4, 12 ;out0 - psrad m5, 12 ;out1 - psrad m3, 12 ;out3 - packssdw m0, m4, m5 ;low: out0 high: out1 - - pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 - pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 - pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 - pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 - paddd m1, m4 ;t0 + t3 - pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 - - mova m4, [o(pd_2048)] - paddd m6, m4 - paddd m4, m1 ;t0 + t3 + 2048 - paddd m5, m6 ;t1 + t3 + 2048 - paddd m1, m6 - paddd m1, m7 ;t0 + t1 - t3 + 2048 - - psrad m4, 12 ;out0 - psrad m5, 12 ;out1 - psrad m1, 12 ;out3 - packssdw m3, m1 ;out3 - packssdw m4, m5 ;low: out0 high: out1 - - punpckhqdq m1, m0, m4 ;out1 - punpcklqdq m0, m4 ;out0 - ret - -INV_TXFM_8X4_FN flipadst, dct -INV_TXFM_8X4_FN flipadst, adst -INV_TXFM_8X4_FN flipadst, flipadst -INV_TXFM_8X4_FN flipadst, identity - -cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, [coeffq+16*3] - - shufps m0, m0, q1032 - shufps m1, m1, q1032 - call m(iadst_4x8_internal).main - - punpckhwd m5, m3, m2 - punpcklwd m3, m2 - punpckhwd m2, m1, m0 - punpcklwd m1, m0 - - pxor m0, m0 - psubsw m4, m0, m2 - psubsw m0, m5 - punpckhdq m2, m0, m4 - punpckldq m0, m4 - punpckhdq m4, m3, m1 - punpckldq m3, m1 - punpckhwd m1, m0, m3 ;in1 - punpcklwd m0, m3 ;in0 - punpckhwd m3, m2, m4 ;in3 - punpcklwd m2, m4 ;in2 - jmp tx2q - -.pass2: - call m(iadst_8x4_internal).main - mova m4, m0 - mova m5, m1 - mova m0, m3 - mova m1, m2 - mova m2, m5 - mova m3, m4 - jmp m(iadst_8x4_internal).end - -INV_TXFM_8X4_FN identity, dct -INV_TXFM_8X4_FN identity, adst -INV_TXFM_8X4_FN identity, flipadst -INV_TXFM_8X4_FN identity, identity - -cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, 
[coeffq+16*0] - pmulhrsw m1, m3, [coeffq+16*1] - pmulhrsw m2, m3, [coeffq+16*2] - pmulhrsw m3, [coeffq+16*3] - paddsw m0, m0 - paddsw m1, m1 - paddsw m2, m2 - paddsw m3, m3 - - punpckhwd m4, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m2, m3 - punpcklwd m2, m3 - punpckhdq m5, m4, m1 - punpckldq m4, m1 - punpckhdq m3, m0, m2 - punpckldq m0, m2 - punpckhwd m1, m0, m4 ;in1 - punpcklwd m0, m4 ;in0 - punpcklwd m2, m3, m5 ;in2 - punpckhwd m3, m5 ;in3 - jmp tx2q - -.pass2: - mova m7, [o(pw_1697x8)] - pmulhrsw m4, m7, m0 - pmulhrsw m5, m7, m1 - pmulhrsw m6, m7, m2 - pmulhrsw m7, m3 - paddsw m0, m4 - paddsw m1, m5 - paddsw m2, m6 - paddsw m3, m7 - jmp m(iadst_8x4_internal).end - -%macro INV_TXFM_8X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 8x8, 8, 16*4 -%ifidn %1_%2, dct_dct - pshuflw m0, [coeffq], q0000 - punpcklwd m0, m0 - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1 - mova m2, [o(pw_16384)] - mov [coeffq], eobd - pmulhrsw m0, m2 - psrlw m2, 3 - pmulhrsw m0, m1 - pmulhrsw m0, m2 -.end: - mov r3d, 2 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)] -.loop: - WRITE_8X4 0, 0, 0, 0, 1, 2, 3 - lea dstq, [dstq+strideq*2] - dec r3d - jg .loop - jmp tx2q -.end3: - RET -%endif -%endmacro - -%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 -%if %3 - mova m7, [o(pw_2896x8)] - pmulhrsw m0, m7, [%1+%2*0] - pmulhrsw m1, m7, [%1+%2*1] - pmulhrsw m2, m7, [%1+%2*2] - pmulhrsw m3, m7, [%1+%2*3] - pmulhrsw m4, m7, [%1+%2*4] - pmulhrsw m5, m7, [%1+%2*5] - pmulhrsw m6, m7, [%1+%2*6] - pmulhrsw m7, [%1+%2*7] -%else - mova m0, [%1+%2*0] - mova m1, [%1+%2*1] - mova m2, [%1+%2*2] - mova m3, [%1+%2*3] - mova m4, [%1+%2*4] - mova m5, [%1+%2*5] - mova m6, [%1+%2*6] - mova m7, [%1+%2*7] -%endif -%endmacro - -%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 - ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a - ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a - psubsw m%2, m%4, m%5 ;t6a - paddsw m%4, m%5 ;t7 - psubsw m%5, m%1, m%3 ;t5a - paddsw m%1, m%3 ;t4 - ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 -%endmacro - -INV_TXFM_8X8_FN dct, dct -INV_TXFM_8X8_FN dct, adst -INV_TXFM_8X8_FN dct, flipadst -INV_TXFM_8X8_FN dct, identity - -cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_8ROWS coeffq, 16 - -.pass1: - call .main - -.pass1_end: - mova m7, [o(pw_16384)] - -.pass1_end1: - REPX {pmulhrsw x, m7}, m0, m2, m4, m6 - mova [rsp+gprsize+16*1], m6 - -.pass1_end2: - REPX {pmulhrsw x, m7}, m1, m3, m5 - pmulhrsw m7, [rsp+gprsize+16*0] - -.pass1_end3: - punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 - punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 - punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 - punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 - punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 - punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 - punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 - punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 - punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 - punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 - mova [rsp+gprsize+16*2], m6 - mova m6, [rsp+gprsize+16*1] - punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 - punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 - punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 - punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 - punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 - punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 - - punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 - punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 - mova [rsp+gprsize+16*0], m2 - punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 - punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 - punpcklwd m4, m5, m1 ;04 14 24 34 44 54 
64 74 - punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 - mova m7, [rsp+gprsize+16*2] - punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 - punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 - mova m7, [rsp+gprsize+16*0] - jmp tx2q - -.pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] - -.pass2_main: - call .main - -.end: - mova m7, [o(pw_2048)] - REPX {pmulhrsw x, m7}, m0, m2, m4, m6 - mova [rsp+gprsize+16*1], m6 - -.end2: - REPX {pmulhrsw x, m7}, m1, m3, m5 - pmulhrsw m7, [rsp+gprsize+16*0] - mova [rsp+gprsize+16*2], m5 - mova [rsp+gprsize+16*0], m7 - -.end3: - WRITE_8X4 0, 1, 2, 3, 5, 6, 7 - lea dstq, [dstq+strideq*2] - WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 - jmp tx2q - -.end4: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 - ret - -ALIGN function_align -.main: - mova [rsp+gprsize*2+16*0], m7 - mova [rsp+gprsize*2+16*1], m3 - mova [rsp+gprsize*2+16*2], m1 - mova m7, [o(pd_2048)] - IDCT4_1D 0, 2, 4, 6, 1, 3, 7 - mova m3, [rsp+gprsize*2+16*2] - mova [rsp+gprsize*2+16*2], m2 - mova m2, [rsp+gprsize*2+16*1] - mova [rsp+gprsize*2+16*1], m4 - mova m4, [rsp+gprsize*2+16*0] - mova [rsp+gprsize*2+16*0], m6 - IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 - mova m6, [rsp+gprsize*2+16*0] - psubsw m7, m0, m4 ;out7 - paddsw m0, m4 ;out0 - mova [rsp+gprsize*2+16*0], m7 - mova m1, [rsp+gprsize*2+16*2] - psubsw m4, m6, m3 ;out4 - paddsw m3, m6 ;out3 - mova m7, [rsp+gprsize*2+16*1] - psubsw m6, m1, m5 ;out6 - paddsw m1, m5 ;out1 - psubsw m5, m7, m2 ;out5 - paddsw m2, m7 ;out2 - ret - - -INV_TXFM_8X8_FN adst, dct -INV_TXFM_8X8_FN adst, adst -INV_TXFM_8X8_FN adst, flipadst -INV_TXFM_8X8_FN adst, identity - -cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_8ROWS coeffq, 16 - -.pass1: - call .main - call .main_pass1_end - -.pass1_end: - mova m7, [o(pw_16384)] - -.pass1_end1: - REPX {pmulhrsw x, m7}, m0, m2, m4, m6 - mova [rsp+gprsize+16*1], m6 - pxor m6, m6 - psubw m6, m7 - mova m7, m6 - jmp m(idct_8x8_internal).pass1_end2 - -ALIGN function_align -.pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] - -.pass2_main: - call .main - call .main_pass2_end - -.end: - mova m7, [o(pw_2048)] - REPX {pmulhrsw x, m7}, m0, m2, m4, m6 - mova [rsp+gprsize+16*1], m6 - pxor m6, m6 - psubw m6, m7 - mova m7, m6 - jmp m(idct_8x8_internal).end2 - -ALIGN function_align -.main: - mova [rsp+gprsize*2+16*0], m7 - mova [rsp+gprsize*2+16*1], m3 - mova [rsp+gprsize*2+16*2], m4 - mova m7, [o(pd_2048)] - ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a - ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a - paddsw m3, m2, m6 ;t2 - psubsw m2, m6 ;t6 - paddsw m4, m5, m1 ;t3 - psubsw m5, m1 ;t7 - ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a - - mova m6, [rsp+gprsize*2+16*2] - mova [rsp+gprsize*2+16*2], m5 - mova m1, [rsp+gprsize*2+16*1] - mova [rsp+gprsize*2+16*1], m2 - mova m5, [rsp+gprsize*2+16*0] - mova [rsp+gprsize*2+16*0], m3 - ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a - ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a - psubsw m2, m0, m6 ;t4 - paddsw m0, m6 ;t0 - paddsw m3, m5, m1 ;t1 - psubsw m5, m1 ;t5 - ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a - - mova m7, [rsp+gprsize*2+16*0] - paddsw m1, m3, m4 ;-out7 - psubsw m3, m4 ;t3 - mova [rsp+gprsize*2+16*0], m1 - psubsw m4, m0, m7 ;t2 - paddsw m0, m7 ;out0 - mova m6, [rsp+gprsize*2+16*2] - mova m7, [rsp+gprsize*2+16*1] - paddsw m1, m5, m6 ;-out1 - psubsw m5, m6 ;t6 - paddsw m6, m2, m7 ;out6 - psubsw m2, m7 ;t7 - ret -ALIGN function_align -.main_pass1_end: - mova [rsp+gprsize*2+16*1], m1 - mova 
[rsp+gprsize*2+16*2], m6 - punpckhwd m1, m4, m3 - punpcklwd m4, m3 - punpckhwd m7, m5, m2 - punpcklwd m5, m2 - mova m2, [o(pw_2896_2896)] - mova m6, [o(pd_2048)] - pmaddwd m3, m2, m7 - pmaddwd m2, m5 - paddd m3, m6 - paddd m2, m6 - psrad m3, 12 - psrad m2, 12 - packssdw m2, m3 ;out2 - mova m3, [o(pw_2896_m2896)] - pmaddwd m7, m3 - pmaddwd m5, m3 - paddd m7, m6 - paddd m5, m6 - psrad m7, 12 - psrad m5, 12 - packssdw m5, m7 ;-out5 - mova m3, [o(pw_2896_2896)] - pmaddwd m7, m3, m1 - pmaddwd m3, m4 - paddd m7, m6 - paddd m3, m6 - psrad m7, 12 - psrad m3, 12 - packssdw m3, m7 ;-out3 - mova m7, [o(pw_2896_m2896)] - pmaddwd m1, m7 - pmaddwd m4, m7 - paddd m1, m6 - paddd m4, m6 - psrad m1, 12 - psrad m4, 12 - packssdw m4, m1 ;-out5 - mova m1, [rsp+gprsize*2+16*1] - mova m6, [rsp+gprsize*2+16*2] - ret -ALIGN function_align -.main_pass2_end: - paddsw m7, m4, m3 ;t2 + t3 - psubsw m4, m3 ;t2 - t3 - paddsw m3, m5, m2 ;t6 + t7 - psubsw m5, m2 ;t6 - t7 - mova m2, [o(pw_2896x8)] - pmulhrsw m4, m2 ;out4 - pmulhrsw m5, m2 ;-out5 - pmulhrsw m7, m2 ;-out3 - pmulhrsw m2, m3 ;out2 - mova m3, m7 - ret - -INV_TXFM_8X8_FN flipadst, dct -INV_TXFM_8X8_FN flipadst, adst -INV_TXFM_8X8_FN flipadst, flipadst -INV_TXFM_8X8_FN flipadst, identity - -cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_8ROWS coeffq, 16 - -.pass1: - call m(iadst_8x8_internal).main - call m(iadst_8x8_internal).main_pass1_end - -.pass1_end: - mova m7, [o(pw_m16384)] - -.pass1_end1: - pmulhrsw m1, m7 - mova [rsp+gprsize+16*1], m1 - mova m1, m6 - mova m6, m2 - pmulhrsw m2, m5, m7 - mova m5, m6 - mova m6, m4 - pmulhrsw m4, m3, m7 - mova m3, m6 - mova m6, m0 - mova m0, m7 - pxor m7, m7 - psubw m7, m0 - pmulhrsw m0, [rsp+gprsize+16*0] - REPX {pmulhrsw x, m7}, m1, m3, m5 - pmulhrsw m7, m6 - jmp m(idct_8x8_internal).pass1_end3 - -ALIGN function_align -.pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] - -.pass2_main: - call m(iadst_8x8_internal).main - call m(iadst_8x8_internal).main_pass2_end - -.end: - mova m7, [o(pw_2048)] - REPX {pmulhrsw x, m7}, m0, m2, m4, m6 - mova [rsp+gprsize+16*2], m2 - mova m2, m0 - pxor m0, m0 - psubw m0, m7 - mova m7, m2 - pmulhrsw m1, m0 - pmulhrsw m2, m5, m0 - mova [rsp+gprsize+16*1], m1 - mova m5, m4 - mova m1, m6 - pmulhrsw m4, m3, m0 - pmulhrsw m0, [rsp+gprsize+16*0] - mova m3, m5 - mova [rsp+gprsize+16*0], m7 - jmp m(idct_8x8_internal).end3 - -INV_TXFM_8X8_FN identity, dct -INV_TXFM_8X8_FN identity, adst -INV_TXFM_8X8_FN identity, flipadst -INV_TXFM_8X8_FN identity, identity - -cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_8ROWS coeffq, 16 - mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 - -ALIGN function_align -.pass2: - lea tx2q, [o(m(idct_8x8_internal).end4)] - -.end: - pmulhrsw m7, [o(pw_4096)] - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_4096)] - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 - mova [rsp+gprsize+16*2], m5 - mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).end3 - - -%macro INV_TXFM_4X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 4x16, 8 -%ifidn %1_%2, dct_dct - pshuflw m0, [coeffq], q0000 - punpcklwd m0, m0 - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1 - mov [coeffq], eobd - pmulhrsw m0, [o(pw_16384)] - pmulhrsw m0, m1 - pmulhrsw m0, [o(pw_2048)] -.end: - WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 - lea dstq, [dstq+strideq*4] - WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 - lea dstq, [dstq+strideq*4] - WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 - lea dstq, [dstq+strideq*4] - WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 - RET -%endif -%endmacro - 
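
The pmaddwd constant pairs and comments in iadst_8x4_internal's .main above spell out the 4-point inverse ADST directly. Rewritten as scalar C (a sketch that follows those comments; 1321, 2482, 3344 and 3803 are the AV1 ADST4 constants at 12-bit precision, and the function name is ad hoc):

    #include <stdint.h>

    static void iadst4_1d(const int16_t in[4], int16_t out[4]) {
        const int32_t a = 1321 * in[0] + 3803 * in[2];
        const int32_t b = 2482 * in[0] - 1321 * in[2];
        const int32_t c = 3344 * in[1];

        /* The asm packs these with signed saturation (packssdw). */
        out[0] = (int16_t)((a + c + 2482 * in[3] + 2048) >> 12);
        out[1] = (int16_t)((b + c - 3803 * in[3] + 2048) >> 12);
        out[2] = (int16_t)((3344 * (in[0] - in[2] + in[3]) + 2048) >> 12);
        out[3] = (int16_t)((a + b - c - 1321 * in[3] + 2048) >> 12);
    }

The SIMD version computes two of these transforms at once by interleaving coefficient pairs before the pmaddwd multiplies, but the per-lane arithmetic is the same.
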
-INV_TXFM_4X16_FN dct, dct -INV_TXFM_4X16_FN dct, adst -INV_TXFM_4X16_FN dct, flipadst -INV_TXFM_4X16_FN dct, identity - -cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(idct_4x8_internal).pass1)] - -.pass1: - mova m0, [coeffq+16*1] - mova m1, [coeffq+16*3] - mova m2, [coeffq+16*5] - mova m3, [coeffq+16*7] - push tx2q - lea tx2q, [o(m(idct_4x16_internal).pass1_2)] - jmp r3 - -.pass1_2: - mova [coeffq+16*1], m0 - mova [coeffq+16*3], m1 - mova [coeffq+16*5], m2 - mova [coeffq+16*7], m3 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*2] - mova m2, [coeffq+16*4] - mova m3, [coeffq+16*6] - lea tx2q, [o(m(idct_4x16_internal).pass1_end)] - jmp r3 - -.pass1_end: - pop tx2q - - mova m4, [coeffq+16*1] - mova m5, [coeffq+16*3] - mova m6, [coeffq+16*5] - mova m7, [o(pw_16384)] - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 - - pmulhrsw m7, [coeffq+16*7] - mova [coeffq+16*7], m7 - jmp tx2q - -.pass2: - call m(idct_16x4_internal).main - -.end: - mova m7, [o(pw_2048)] - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 - pmulhrsw m7, [coeffq+16*7] - mova [coeffq+16*4], m4 - -.end1: - mova [coeffq+16*5], m5 - mova [coeffq+16*6], m6 - mov r3, coeffq - WRITE_4X8 0, 1, 3, 2 - - mova m0, [r3+16*4] - mova m1, [r3+16*5] - mova m2, [r3+16*6] - mova m3, m7 - lea dstq, [dstq+strideq*4] - WRITE_4X8 0, 1, 3, 2 - -.end2: - pxor m7, m7 - REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 - ret - -INV_TXFM_4X16_FN adst, dct -INV_TXFM_4X16_FN adst, adst -INV_TXFM_4X16_FN adst, flipadst -INV_TXFM_4X16_FN adst, identity - -cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iadst_4x8_internal).pass1)] - jmp m(idct_4x16_internal).pass1 - -.pass2: - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass2_end - - punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 - punpckhqdq m4, m5 ;low: out8 high: out10 - punpcklqdq m5, m7, m2 ;low: out4 high: out6 - punpckhqdq m2, m7 ;low: -out9 high: -out11 - mova [coeffq+16*4], m2 - mova [coeffq+16*5], m6 - mova m2, [coeffq+16*6] - mova m6, [coeffq+16*7] - punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 - punpcklqdq m0, m6 ;low: out0 high: out2 - punpckhqdq m6, m3, m2 ;low: out12 high: out14 - punpcklqdq m2, m3 ;low: -out1 high: -out3 - - mova m7, [o(pw_2048)] - -.end1: - REPX {pmulhrsw x, m7}, m0, m5, m4, m6 - pxor m3, m3 - psubw m3, m7 - mova m7, [coeffq+16*4] - REPX {pmulhrsw x, m3}, m2, m7, m1 - pmulhrsw m3, [coeffq+16*5] - mova [coeffq+16*7], m5 - - punpckhqdq m5, m4, m7 ;low: out10 high: out11 - punpcklqdq m4, m7 ;low: out8 high: out9 - punpckhqdq m7, m6, m1 ;low: out14 high: out15 - punpcklqdq m6, m1 ;low: out12 high: out13 - punpckhqdq m1, m0, m2 ;low: out2 high: out3 - punpcklqdq m0, m2 ;low: out0 high: out1 - mova [coeffq+16*4], m4 - mova m4, [coeffq+16*7] - punpcklqdq m2, m4, m3 ;low: out4 high: out5 - punpckhqdq m4, m3 ;low: out6 high: out7 - mova m3, m4 - -.end2: - mova [coeffq+16*5], m5 - mova [coeffq+16*6], m6 - mov r3, coeffq - WRITE_4X8 0, 1, 2, 3 - - mova m0, [r3+16*4] - mova m1, [r3+16*5] - mova m2, [r3+16*6] - mova m3, m7 - lea dstq, [dstq+strideq*4] - WRITE_4X8 0, 1, 2, 3 - -.end3: - pxor m7, m7 - REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 - ret - - -INV_TXFM_4X16_FN flipadst, dct -INV_TXFM_4X16_FN flipadst, adst -INV_TXFM_4X16_FN flipadst, flipadst -INV_TXFM_4X16_FN flipadst, identity - -cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iflipadst_4x8_internal).pass1)] - jmp m(idct_4x16_internal).pass1 - -.pass2: - call 
m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass2_end - - punpckhqdq m6, m5, m4 ;low: out5 high: out7 - punpcklqdq m4, m5 ;low: -out8 high: -out10 - punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 - punpcklqdq m2, m7 ;low: out9 high: out11 - mova [coeffq+16*4], m2 - mova [coeffq+16*5], m6 - mova m2, [coeffq+16*6] - mova m6, [coeffq+16*7] - punpcklqdq m1, m6, m0 ;low: out13 high: out15 - punpckhqdq m0, m6 ;low: -out0 high: -out2 - punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 - punpckhqdq m2, m3 ;low: out1 high: out3 - - mova m7, [o(pw_m2048)] - jmp m(iadst_4x16_internal).end1 - - -INV_TXFM_4X16_FN identity, dct -INV_TXFM_4X16_FN identity, adst -INV_TXFM_4X16_FN identity, flipadst -INV_TXFM_4X16_FN identity, identity - -%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] - pmulhrsw m%2, m%3, m%1 -%if %0 == 4 ; if downshifting by 1 - pmulhrsw m%2, m%4 -%else - paddsw m%1, m%1 -%endif - paddsw m%1, m%2 -%endmacro - -cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m0, [coeffq+16*1] - mova m6, [o(pw_1697x8)] - mova m1, [coeffq+16*3] - mova m2, [coeffq+16*5] - mova m3, [coeffq+16*7] - pcmpeqw m7, m7 - mov r3, tx2q - lea tx2q, [o(.pass1_2)] -.pass1: - pmulhrsw m4, m6, m0 - pmulhrsw m5, m6, m1 - pavgw m4, m0 - pcmpeqw m0, m7 - pavgw m5, m1 - pcmpeqw m1, m7 - pandn m0, m4 - pmulhrsw m4, m6, m2 - pandn m1, m5 - pmulhrsw m5, m6, m3 - pavgw m4, m2 - pcmpeqw m2, m7 - pavgw m5, m3 - pcmpeqw m3, m7 - pandn m2, m4 - pandn m3, m5 - jmp m(iadst_4x8_internal).pass1_end -.pass1_2: - mova [coeffq+16*1], m0 - mova [coeffq+16*3], m1 - mova [coeffq+16*5], m2 - mova [coeffq+16*7], m3 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*2] - mova m2, [coeffq+16*4] - mova m3, [coeffq+16*6] - lea tx2q, [o(.pass1_end)] - jmp .pass1 -.pass1_end: - mova m4, [coeffq+16*1] - mova m5, [coeffq+16*3] - mova m6, [coeffq+16*5] - jmp r3 -.pass2: - mova m7, [o(pw_1697x16)] - mova [coeffq+16*6], m6 - REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 - mova m6, [coeffq+16*7] - IDTX16 6, 7, 7 - mova [coeffq+16*7], m6 - mova m6, [coeffq+16*6] - pmulhrsw m7, m6, [o(pw_1697x16)] - paddsw m6, m6 - paddsw m6, m7 - mova m7, [o(pw_2048)] - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 - pmulhrsw m7, [coeffq+16*7] - mova [coeffq+16*4], m4 - jmp m(iadst_4x16_internal).end2 - - -%macro INV_TXFM_16X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 16x4, 8 -%ifidn %1_%2, dct_dct - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_16384)] - mov [coeffq], eobd - mov r2d, 2 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)] -.dconly: - pmulhrsw m0, m2 - movd m2, [o(pw_2048)] ;intentionally rip-relative - pmulhrsw m0, m1 - pmulhrsw m0, m2 - pshuflw m0, m0, q0000 - punpcklwd m0, m0 - pxor m5, m5 -.dconly_loop: - mova m1, [dstq] - mova m3, [dstq+strideq] - punpckhbw m2, m1, m5 - punpcklbw m1, m5 - punpckhbw m4, m3, m5 - punpcklbw m3, m5 - paddw m2, m0 - paddw m1, m0 - paddw m4, m0 - paddw m3, m0 - packuswb m1, m2 - packuswb m3, m4 - mova [dstq], m1 - mova [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - dec r2d - jg .dconly_loop - jmp tx2q -.end: - RET -%endif -%endmacro - -%macro LOAD_7ROWS 2 ;src, stride - mova m0, [%1+%2*0] - mova m1, [%1+%2*1] - mova m2, [%1+%2*2] - mova m3, [%1+%2*3] - mova m4, [%1+%2*4] - mova m5, [%1+%2*5] - mova m6, [%1+%2*6] -%endmacro - -%macro SAVE_7ROWS 2 ;src, stride - mova [%1+%2*0], m0 - mova [%1+%2*1], m1 - mova [%1+%2*2], m2 - mova [%1+%2*3], m3 - mova [%1+%2*4], m4 - mova [%1+%2*5], m5 - mova [%1+%2*6], m6 -%endmacro - -%macro IDCT16_1D_PACKED_ODDHALF 7 
;src[1-4], tmp[1-3] - punpckhwd m%5, m%4, m%1 ;packed in13 in3 - punpcklwd m%1, m%4 ;packed in1 in15 - punpcklwd m%4, m%3, m%2 ;packed in9 in7 - punpckhwd m%2, m%3 ;packed in5 in11 - mova m%7, [o(pd_2048)] - ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a - ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a - ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a - ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a - psubsw m%6, m%1, m%4 ;low: t9 high: t14 - paddsw m%1, m%4 ;low: t8 high: t15 - psubsw m%4, m%5, m%2 ;low: t10 high: t13 - paddsw m%5, m%2 ;low: t11 high: t12 - mova m%2, [o(deint_shuf2)] - pshufb m%6, m%2 - pshufb m%4, m%2 - ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a - ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a - psubsw m%3, m%1, m%5 ;low: t11a high: t12a - paddsw m%1, m%5 ;low: t8a high: t15a - psubsw m%5, m%6, m%4 ;low: t10 high: t13 - paddsw m%6, m%4 ;low: t9 high: t14 - pshufb m%3, m%2 - pshufb m%5, m%2 - ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 - ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a - packssdw m%2, m%4 ;low: t11 high: t10a - packssdw m%3, m%5 ;low: t12 high: t13a - punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 - punpcklqdq m%1, m%6 ;low: t8a high: t9 -%endmacro - -INV_TXFM_16X4_FN dct, dct -INV_TXFM_16X4_FN dct, adst -INV_TXFM_16X4_FN dct, flipadst -INV_TXFM_16X4_FN dct, identity - -cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_7ROWS coeffq, 16 - call .main - -.pass1_end: - punpckhwd m7, m0, m2 ;packed out1, out5 - punpcklwd m0, m2 ;packed out0, out4 - punpcklwd m2, m1, m3 ;packed out3, out7 - punpckhwd m1, m3 ;packed out2, out6 - mova [coeffq+16*6], m7 - mova m7, [coeffq+16*7] - punpckhwd m3, m4, m6 ;packed out9, out13 - punpcklwd m4, m6 ;packed out8, out12 - punpcklwd m6, m5, m7 ;packed out11, out15 - punpckhwd m5, m7 ;packed out10, out14 - -.pass1_end2: - mova m7, [o(pw_16384)] - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 - pmulhrsw m7, [coeffq+16*6] - mova [coeffq+16*6], m7 - -.pass1_end3: - punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high - punpcklwd m3, m6 ;packed 9, 10, 13, 15 low - punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high - punpcklwd m4, m5 ;packed 8, 10, 12, 14 low - punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) - punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) - punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) - punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) - mova [coeffq+16*7], m3 - mova m3, [coeffq+16*6] - punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high - punpcklwd m3, m2 ;packed 1, 3, 5, 7 low - punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high - punpcklwd m0, m1 ;packed 0, 2, 4, 6 low - punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) - punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) - punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) - punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) - jmp tx2q - -.pass2: - lea tx2q, [o(m(idct_8x4_internal).pass2)] - -.pass2_end: - mova [coeffq+16*4], m4 - mova [coeffq+16*5], m5 - mova [coeffq+16*6], m6 - lea r3, [dstq+8] - call tx2q - - add coeffq, 16*4 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - mova m2, [coeffq+16*2] - mova m3, [coeffq+16*3] - mov dstq, r3 - jmp tx2q - -ALIGN function_align -.main: - punpckhqdq m7, m0, m1 ;low:in1 high:in3 - punpcklqdq m0, m1 - punpcklqdq m1, m2, m3 - punpckhqdq m3, m2 ;low:in7 high:in5 - mova [coeffq+16*4], m7 - mova [coeffq+16*5], m3 - mova m7, [coeffq+16*7] - punpcklqdq m2, m4, m5 - punpckhqdq m4, m5 ;low:in9 high:in11 - 
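
The .dconly/.dconly_loop path above is the shortcut taken when the eob check shows only the DC coefficient is nonzero: the whole inverse transform degenerates into adding one rounded constant to the predictor, and the punpcklbw/paddw/packuswb sequence is just a saturating byte add, one register's worth of pixels at a time. A rough scalar equivalent (helper names invented for this sketch):

    #include <stdint.h>
    #include <stddef.h>

    static uint8_t clip_pixel(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* Add a single DC term to a w x h predictor block, clipping to 8 bits.
     * dc is the DC coefficient taken through the same pmulhrsw scaling chain
     * as the full transform would apply. */
    static void dc_only_add(uint8_t *dst, ptrdiff_t stride, int w, int h, int dc) {
        for (int y = 0; y < h; y++, dst += stride)
            for (int x = 0; x < w; x++)
                dst[x] = clip_pixel(dst[x] + dc);
    }
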
punpcklqdq m3, m6, m7 - punpckhqdq m7, m6 ;low:in15 high:in13 - mova [coeffq+16*6], m4 - IDCT8_1D_PACKED - mova m6, [coeffq+16*4] - mova m4, [coeffq+16*5] - mova m5, [coeffq+16*6] - mova [coeffq+16*4], m1 - mova [coeffq+16*5], m2 - mova [coeffq+16*6], m3 - - IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 - - mova m1, [coeffq+16*4] - psubsw m3, m0, m7 ;low:out15 high:out14 - paddsw m0, m7 ;low:out0 high:out1 - psubsw m7, m1, m5 ;low:out12 high:out13 - paddsw m1, m5 ;low:out3 high:out2 - mova [coeffq+16*7], m3 - mova m2, [coeffq+16*5] - mova m3, [coeffq+16*6] - psubsw m5, m2, m4 ;low:out11 high:out10 - paddsw m2, m4 ;low:out4 high:out5 - psubsw m4, m3, m6 ;low:out8 high:out9 - paddsw m3, m6 ;low:out7 high:out6 - mova m6, m7 - ret - -INV_TXFM_16X4_FN adst, dct -INV_TXFM_16X4_FN adst, adst -INV_TXFM_16X4_FN adst, flipadst -INV_TXFM_16X4_FN adst, identity - -cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_7ROWS coeffq, 16 - call .main - call .main_pass1_end - - punpckhwd m6, m7, m0 ;packed -out11, -out15 - punpcklwd m0, m7 ;packed out0, out4 - punpcklwd m7, m3, m4 ;packed -out3, -out7 - punpckhwd m4, m3 ;packed out8, out12 - mova m1, [coeffq+16*6] - punpcklwd m3, m1, m5 ;packed -out1, -out5 - punpckhwd m5, m1 ;packed out10, out14 - mova m1, [coeffq+16*7] - mova [coeffq+16*6], m3 - mova [coeffq+16*7], m7 - punpckhwd m3, m2, m1 ;packed -out9, -out13 - punpcklwd m1, m2 ;packed out2, out6 - - mova m7, [o(pw_16384)] - -.pass1_end: - REPX {pmulhrsw x, m7}, m0, m1, m4, m5 - pxor m2, m2 - psubw m2, m7 - mova m7, [coeffq+16*6] - REPX {pmulhrsw x, m2}, m7, m3, m6 - pmulhrsw m2, [coeffq+16*7] - mova [coeffq+16*6], m7 - jmp m(idct_16x4_internal).pass1_end3 - -.pass2: - lea tx2q, [o(m(iadst_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end - -ALIGN function_align -.main: - mova [coeffq+16*6], m0 - pshufd m0, m1, q1032 - pshufd m2, m2, q1032 - punpckhwd m1, m6, m0 ;packed in13, in2 - punpcklwd m0, m6 ;packed in3, in12 - punpckhwd m7, m5, m2 ;packed in11, in4 - punpcklwd m2, m5 ;packed in5, in10 - mova m6, [o(pd_2048)] - ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 - ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 - ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 - ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 - psubsw m5, m1, m2 ;low:t10a high:t11a - paddsw m1, m2 ;low:t2a high:t3a - psubsw m2, m7, m0 ;low:t12a high:t13a - paddsw m7, m0 ;low:t4a high:t5a - punpcklqdq m0, m5 - punpckhwd m0, m5 ;packed t10a, t11a - punpcklqdq m5, m2 - punpckhwd m2, m5 ;packed t13a, t12a - ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 - ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 - mova [coeffq+16*4], m1 - mova [coeffq+16*5], m7 - mova m1, [coeffq+16*6] - mova m7, [coeffq+16*7] - pshufd m1, m1, q1032 - pshufd m3, m3, q1032 - punpckhwd m5, m7, m1 ;packed in15, in0 - punpcklwd m1, m7 ;packed in1, in14 - punpckhwd m7, m4, m3 ;packed in9, in6 - punpcklwd m3, m4 ;packed in7, in8 - ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 - ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 - ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 - ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 - psubsw m4, m5, m3 ;low:t8a high:t9a - paddsw m5, m3 ;low:t0a high:t1a - psubsw m3, m7, m1 ;low:t14a high:t15a - paddsw m7, m1 ;low:t6a high:t7a - punpcklqdq m1, m4 - punpckhwd m1, m4 ;packed t8a, t9a - punpcklqdq m4, m3 - punpckhwd m3, m4 ;packed t15a, t14a - ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 - ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15 - paddsw m4, 
m1, m2 ;low:t12a high:t13a - psubsw m1, m2 ;low:t8a high:t9a - psubsw m2, m0, m3 ;low:t14a high:t15a - paddsw m0, m3 ;low:t10a high:t11a - punpcklqdq m3, m1 - punpckhwd m3, m1 ;packed t12a, t13a - punpcklqdq m1, m2 - punpckhwd m2, m1 ;packed t15a, t14a - ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 - ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 - psubsw m1, m3, m2 ;low:t14a high:t15a - paddsw m3, m2 ;low:out2 high:-out13 - psubsw m2, m4, m0 ;low:t10 high:t11 - paddsw m0, m4 ;low:-out1 high:out14 - mova [coeffq+16*6], m0 - mova [coeffq+16*7], m3 - mova m0, [coeffq+16*4] - mova m3, [coeffq+16*5] - psubsw m4, m5, m3 ;low:t4 high:t5 - paddsw m5, m3 ;low:t0 high:t1 - psubsw m3, m0, m7 ;low:t6 high:t7 - paddsw m0, m7 ;low:t2 high:t3 - punpcklqdq m7, m4 - punpckhwd m7, m4 ;packed t4, t5 - punpcklqdq m4, m3 - punpckhwd m3, m4 ;packed t7, t6 - ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a - ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a - psubsw m4, m5, m0 ;low:t2a high:t3a - paddsw m0, m5 ;low:out0 high:-out15 - psubsw m5, m7, m3 ;low:t6 high:t7 - paddsw m3, m7 ;low:-out3 high:out12 - ret -ALIGN function_align -.main_pass1_end: - mova m7, [o(deint_shuf1)] - mova [coeffq+16*4], m0 - mova [coeffq+16*5], m3 - mova m0, [o(pw_2896_m2896)] - mova m3, [o(pw_2896_2896)] - pshufb m1, m7 ;t14a t15a - pshufb m2, m7 ;t10 t11 - pshufb m4, m7 ;t2a t3a - pshufb m5, m7 ;t6 t7 - pmaddwd m7, m0, m2 - pmaddwd m2, m3 - paddd m7, m6 - paddd m2, m6 - psrad m7, 12 - psrad m2, 12 - packssdw m2, m7 ;low:out6 high:-out9 - pmaddwd m7, m0, m4 - pmaddwd m4, m3 - paddd m7, m6 - paddd m4, m6 - psrad m7, 12 - psrad m4, 12 - packssdw m4, m7 ;low:-out7 high:out8 - pmaddwd m7, m3, m5 - pmaddwd m5, m0 - paddd m7, m6 - paddd m5, m6 - psrad m7, 12 - psrad m5, 12 - packssdw m7, m5 ;low:out4 high:-out11 - pmaddwd m5, m3, m1 - pmaddwd m1, m0 - paddd m5, m6 - paddd m1, m6 - psrad m5, 12 - psrad m1, 12 - packssdw m5, m1 ;low:-out5 high:out10 - mova m0, [coeffq+16*4] - mova m3, [coeffq+16*5] - ret -ALIGN function_align -.main_pass2_end: - mova m7, [o(pw_2896x8)] - punpckhqdq m6, m2, m1 ;low:t11 high:t15a - punpcklqdq m2, m1 ;low:t10 high:t14a - psubsw m1, m2, m6 - paddsw m2, m6 - punpckhqdq m6, m4, m5 ;low:t3a high:t7 - punpcklqdq m4, m5 ;low:t2a high:t6 - psubsw m5, m4, m6 - paddsw m4, m6 - pmulhrsw m1, m7 ;low:-out9 high:out10 - pmulhrsw m2, m7 ;low:out6 high:-out5 - pmulhrsw m5, m7 ;low:out8 high:-out11 - pmulhrsw m4, m7 ;low:-out7 high:out4 - punpckhqdq m7, m4, m5 ;low:out4 high:-out11 - punpcklqdq m4, m5 ;low:-out7 high:out8 - punpckhqdq m5, m2, m1 ;low:-out5 high:out10 - punpcklqdq m2, m1 ;low:out6 high:-out9 - ret - - -INV_TXFM_16X4_FN flipadst, dct -INV_TXFM_16X4_FN flipadst, adst -INV_TXFM_16X4_FN flipadst, flipadst -INV_TXFM_16X4_FN flipadst, identity - -cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_7ROWS coeffq, 16 - call m(iadst_16x4_internal).main - call m(iadst_16x4_internal).main_pass1_end - - punpcklwd m6, m7, m0 ;packed out11, out15 - punpckhwd m0, m7 ;packed -out0, -out4 - punpckhwd m7, m3, m4 ;packed out3, out7 - punpcklwd m4, m3 ;packed -out8, -out12 - mova m1, [coeffq+16*6] - punpckhwd m3, m1, m5 ;packed out1, out5 - punpcklwd m5, m1 ;packed -out10, -out14 - mova m1, [coeffq+16*7] - mova [coeffq+16*6], m3 - mova [coeffq+16*7], m7 - punpcklwd m3, m2, m1 ;packed out9, out13 - punpckhwd m1, m2 ;packed -out2, -out6 - - mova m7, [o(pw_m16384)] - jmp m(iadst_16x4_internal).pass1_end - -.pass2: - lea tx2q, [o(m(iflipadst_8x4_internal).pass2)] - jmp 
m(idct_16x4_internal).pass2_end - - -INV_TXFM_16X4_FN identity, dct -INV_TXFM_16X4_FN identity, adst -INV_TXFM_16X4_FN identity, flipadst -INV_TXFM_16X4_FN identity, identity - -cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m1, [coeffq+16*6] - mova m0, [coeffq+16*5] - mova m2, [coeffq+16*7] - mova m6, [o(pw_1697x16)] - mova m7, [o(pw_16384)] - pmulhrsw m4, m6, m1 - pmulhrsw m3, m6, m0 - pmulhrsw m5, m6, m2 - pmulhrsw m4, m7 - pmulhrsw m3, m7 - pmulhrsw m5, m7 - paddsw m1, m4 - paddsw m0, m3 - paddsw m5, m2 - mova m2, [coeffq+16*2] - mova m3, [coeffq+16*3] - mova m4, [coeffq+16*4] - mova [coeffq+16*6], m1 - mova [coeffq+16*5], m0 - mova [coeffq+16*7], m5 - pmulhrsw m0, m6, m2 - pmulhrsw m1, m6, m3 - pmulhrsw m5, m6, m4 - pmulhrsw m0, m7 - pmulhrsw m1, m7 - pmulhrsw m5, m7 - paddsw m2, m0 - paddsw m3, m1 - paddsw m4, m5 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - pmulhrsw m5, m6, m0 - pmulhrsw m6, m1 - pmulhrsw m5, m7 - pmulhrsw m6, m7 - paddsw m0, m5 - paddsw m1, m6 - mova m6, [coeffq+16*6] - mova m5, [coeffq+16*5] - punpckhwd m7, m0, m2 ;packed out1, out5 - punpcklwd m0, m2 ;packed out0, out4 - punpckhwd m2, m1, m3 ;packed out3, out7 - punpcklwd m1, m3 ;packed out2, out6 - mova [coeffq+16*6], m7 - mova m7, [coeffq+16*7] - punpckhwd m3, m4, m6 ;packed out9, out13 - punpcklwd m4, m6 ;packed out8, out12 - punpckhwd m6, m5, m7 ;packed out11, out15 - punpcklwd m5, m7 ;packed out10, out14 - jmp m(idct_16x4_internal).pass1_end3 - -.pass2: - lea tx2q, [o(m(iidentity_8x4_internal).pass2)] - jmp m(idct_16x4_internal).pass2_end - - -%macro SAVE_8ROWS 2 ;src, stride - mova [%1+%2*0], m0 - mova [%1+%2*1], m1 - mova [%1+%2*2], m2 - mova [%1+%2*3], m3 - mova [%1+%2*4], m4 - mova [%1+%2*5], m5 - mova [%1+%2*6], m6 - mova [%1+%2*7], m7 -%endmacro - -%macro INV_TXFM_8X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 8x16, 8, 16*16 -%ifidn %1_%2, dct_dct - pshuflw m0, [coeffq], q0000 - punpcklwd m0, m0 - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1 - mova m2, [o(pw_16384)] - mov [coeffq], eobd - pmulhrsw m0, m1 - pmulhrsw m0, m2 - psrlw m2, 3 ; pw_2048 - pmulhrsw m0, m1 - pmulhrsw m0, m2 - mov r3d, 4 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)] - jmp m(inv_txfm_add_dct_dct_8x8).loop -.end: - RET -%endif -%endmacro - -INV_TXFM_8X16_FN dct, dct -INV_TXFM_8X16_FN dct, adst -INV_TXFM_8X16_FN dct, flipadst -INV_TXFM_8X16_FN dct, identity - -cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(idct_8x8_internal).pass1)] - -.pass1: - LOAD_8ROWS coeffq+16*1, 32, 1 - mov [rsp+gprsize+16*11], tx2q - lea tx2q, [o(m(idct_8x16_internal).pass1_end)] - jmp r3 - -.pass1_end: - SAVE_8ROWS coeffq+16*1, 32 - LOAD_8ROWS coeffq+16*0, 32, 1 - mov tx2q, [rsp+gprsize+16*11] - jmp r3 - -.pass2: - lea tx2q, [o(m(idct_8x16_internal).end)] - -.pass2_pre: - mova [coeffq+16*2 ], m1 - mova [coeffq+16*6 ], m3 - mova [coeffq+16*10], m5 - mova [coeffq+16*14], m7 - mova m1, m2 - mova m2, m4 - mova m3, m6 - mova m4, [coeffq+16*1 ] - mova m5, [coeffq+16*5 ] - mova m6, [coeffq+16*9 ] - mova m7, [coeffq+16*13] - -.pass2_main: - call m(idct_8x8_internal).main - - SAVE_7ROWS rsp+gprsize+16*3, 16 - mova m0, [coeffq+16*2 ] - mova m1, [coeffq+16*6 ] - mova m2, [coeffq+16*10] - mova m3, [coeffq+16*14] - mova m4, [coeffq+16*3 ] - mova m5, [coeffq+16*7 ] - mova m6, [coeffq+16*11] - mova m7, [coeffq+16*15] - call m(idct_16x8_internal).main - - mov r3, dstq - lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - 
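
The identity paths in these routines (the IDTX16 macro further up, and the pw_1697x8/pw_1697x16 blocks in the iidentity kernels above) never mix coefficients; they only rescale them. 1697 is the key constant: 1697/4096 ~ sqrt(2)-1 and 1697/2048 ~ 2*sqrt(2)-2, which is why the code adds a pmulhrsw-by-1697 term to x or to 2*x. A scalar sketch of the three per-size identity scales used in this file (illustrative only; the AV1 identity transform multiplies by sqrt(2), 2 and 2*sqrt(2) for N = 4, 8 and 16):

    #include <stdint.h>

    /* x*sqrt(2): x plus a pmulhrsw by pw_1697x8, i.e. (x*1697 + 2048) >> 12. */
    static int32_t identity4_scale(int32_t x)  { return x + ((x * 1697 + 2048) >> 12); }

    /* x*2: just paddsw x, x in the asm. */
    static int32_t identity8_scale(int32_t x)  { return 2 * x; }

    /* x*2*sqrt(2): 2*x plus a pmulhrsw by pw_1697x16, i.e. (x*1697 + 1024) >> 11. */
    static int32_t identity16_scale(int32_t x) { return 2 * x + ((x * 1697 + 1024) >> 11); }
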
lea tx2q, [o(m(idct_8x16_internal).end1)] - mov dstq, r3 - jmp m(idct_8x8_internal).end - -.end1: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ret - -INV_TXFM_8X16_FN adst, dct -INV_TXFM_8X16_FN adst, adst -INV_TXFM_8X16_FN adst, flipadst -INV_TXFM_8X16_FN adst, identity - -cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iadst_8x8_internal).pass1)] - jmp m(idct_8x16_internal).pass1 - -.pass2: - lea tx2q, [o(m(iadst_8x16_internal).end)] - -.pass2_pre: - mova [rsp+gprsize+16*7], m0 - mova [rsp+gprsize+16*8], m1 - mova [rsp+gprsize+16*5], m6 - mova [rsp+gprsize+16*6], m7 - mova m0, m2 - mova m1, m3 - mova m2, m4 - mova m3, m5 - -.pass2_main: - mova m4, [coeffq+16*1 ] - mova m5, [coeffq+16*3 ] - mova m6, [coeffq+16*13] - mova m7, [coeffq+16*15] - mova [rsp+gprsize+16*3], m4 - mova [rsp+gprsize+16*4], m5 - mova [rsp+gprsize+16*9], m6 - mova [rsp+gprsize+32*5], m7 - mova m4, [coeffq+16*5 ] - mova m5, [coeffq+16*7 ] - mova m6, [coeffq+16*9 ] - mova m7, [coeffq+16*11] - - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end - - mov r3, dstq - lea dstq, [dstq+strideq*8] - jmp m(iadst_8x8_internal).end - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] - mov dstq, r3 - jmp m(iadst_8x8_internal).end - - -INV_TXFM_8X16_FN flipadst, dct -INV_TXFM_8X16_FN flipadst, adst -INV_TXFM_8X16_FN flipadst, flipadst -INV_TXFM_8X16_FN flipadst, identity - -cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - lea r3, [o(m(iflipadst_8x8_internal).pass1)] - jmp m(idct_8x16_internal).pass1 - -.pass2: - lea tx2q, [o(m(iflipadst_8x16_internal).end)] - lea r3, [dstq+strideq*8] - -.pass2_pre: - mova [rsp+gprsize+16*7], m0 - mova [rsp+gprsize+16*8], m1 - mova [rsp+gprsize+16*5], m6 - mova [rsp+gprsize+16*6], m7 - mova m0, m2 - mova m1, m3 - mova m2, m4 - mova m3, m5 - -.pass2_main: - mova m4, [coeffq+16*1 ] - mova m5, [coeffq+16*3 ] - mova m6, [coeffq+16*13] - mova m7, [coeffq+16*15] - mova [rsp+gprsize+16*3], m4 - mova [rsp+gprsize+16*4], m5 - mova [rsp+gprsize+16*9], m6 - mova [rsp+gprsize+32*5], m7 - mova m4, [coeffq+16*5 ] - mova m5, [coeffq+16*7 ] - mova m6, [coeffq+16*9 ] - mova m7, [coeffq+16*11] - - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass2_end - jmp m(iflipadst_8x8_internal).end - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] - mov dstq, r3 - jmp m(iflipadst_8x8_internal).end - - -INV_TXFM_8X16_FN identity, dct -INV_TXFM_8X16_FN identity, adst -INV_TXFM_8X16_FN identity, flipadst -INV_TXFM_8X16_FN identity, identity - -cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_8ROWS coeffq+16*1, 32, 1 - mov r3, tx2q - lea tx2q, [o(m(iidentity_8x16_internal).pass1_end)] - mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 - -.pass1_end: - SAVE_8ROWS coeffq+16*1, 32 - LOAD_8ROWS coeffq+16*0, 32, 1 - mov tx2q, r3 - mova [rsp+gprsize+16*1], m6 - jmp m(idct_8x8_internal).pass1_end3 - -.pass2: - lea tx2q, [o(m(iidentity_8x16_internal).end1)] - -.end: - mova [rsp+gprsize+16*0], m7 - mova [rsp+gprsize+16*1], m6 - mova m7, [o(pw_1697x16)] - REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 - mova m6, [rsp+gprsize+16*1] - mova [rsp+gprsize+16*2], m5 - IDTX16 6, 5, 7 - mova m5, [rsp+gprsize+16*0] - IDTX16 5, 7, 7 - mova m7, [o(pw_2048)] - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 - pmulhrsw m7, 
[rsp+gprsize+16*2] - mova [rsp+gprsize+16*0], m5 - mova [rsp+gprsize+16*1], m6 - mova [rsp+gprsize+16*2], m7 - jmp m(idct_8x8_internal).end3 - -.end1: - LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] - lea dstq, [dstq+strideq*2] - jmp .end - - -%macro INV_TXFM_16X8_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 16x8, 8, 16*16 -%ifidn %1_%2, dct_dct - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_16384)] - mov [coeffq], eobd - pmulhrsw m0, m1 - mov r2d, 4 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly -.end: - RET -%endif -%endmacro - -INV_TXFM_16X8_FN dct, dct -INV_TXFM_16X8_FN dct, adst -INV_TXFM_16X8_FN dct, flipadst -INV_TXFM_16X8_FN dct, identity - -cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_8ROWS coeffq+16*0, 32, 1 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - LOAD_8ROWS coeffq+16*1, 32, 1 - call .main - mov r3, tx2q - lea tx2q, [o(m(idct_16x8_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end: - SAVE_8ROWS coeffq+16*1, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mov tx2q, r3 - jmp m(idct_8x8_internal).pass1_end - -.pass2: - lea tx2q, [o(m(idct_16x8_internal).end)] - lea r3, [dstq+8] - jmp m(idct_8x8_internal).pass2_main - -.end: - LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] - mov dstq, r3 - jmp m(idct_8x8_internal).pass2_main - - -ALIGN function_align -.main: - mova [rsp+gprsize*2+16*1], m2 - mova [rsp+gprsize*2+16*2], m6 - mova [rsp+gprsize*2+32*5], m5 - - mova m6, [o(pd_2048)] - ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a - ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a - psubsw m2, m0, m4 ;t9 - paddsw m0, m4 ;t8 - psubsw m4, m7, m3 ;t14 - paddsw m7, m3 ;t15 - ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a - mova m3, [rsp+gprsize*2+16*1] - mova m5, [rsp+gprsize*2+32*5] - mova [rsp+gprsize*2+16*1], m2 - mova [rsp+gprsize*2+32*5], m4 - mova m2, [rsp+gprsize*2+16*2] - mova [rsp+gprsize*2+16*2], m7 - ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a - ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a - psubsw m4, m2, m3 ;t10 - paddsw m2, m3 ;t11 - psubsw m3, m1, m5 ;t13 - paddsw m1, m5 ;t12 - ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a - mova m7, [rsp+gprsize*2+32*5] - psubsw m6, m0, m2 ;t11a - paddsw m0, m2 ;t8a - paddsw m2, m7, m3 ;t9 - psubsw m7, m3 ;t10 - mova m5, [rsp+gprsize*2+16*0] - psubsw m3, m5, m0 ;out8 - paddsw m0, m5 ;out7 - mova [rsp+gprsize*2+32*5], m0 - mova m5, [rsp+gprsize*2+16*9] - psubsw m0, m5, m2 ;out9 - paddsw m2, m5 ;out6 - mova [rsp+gprsize*2+16*0], m0 - mova [rsp+gprsize*2+16*9], m2 - mova m0, [rsp+gprsize*2+16*1] - mova m2, [rsp+gprsize*2+16*2] - mova [rsp+gprsize*2+16*1], m3 - psubsw m5, m0, m4 ;t13 - paddsw m0, m4 ;t14 - mova m3, [o(pd_2048)] - psubsw m4, m2, m1 ;t12a - paddsw m1, m2 ;t15a - mova [rsp+gprsize*2+16*2], m1 - ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a - ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12 - mova m3, [rsp+gprsize*2+16*8] - psubsw m2, m3, m5 ;out10 - paddsw m3, m5 ;out5 - mova m5, [rsp+gprsize*2+16*7] - mova [rsp+gprsize*2+16*8], m3 - psubsw m3, m5, m4 ;out11 - paddsw m5, m4 ;out4 - mova m4, [rsp+gprsize*2+16*6] - mova [rsp+gprsize*2+16*7], m5 - paddsw m5, m4, m6 ;out3 - psubsw m4, m6 ;out12 - mova m6, [rsp+gprsize*2+16*5] - mova [rsp+gprsize*2+16*6], m5 - psubsw m5, m6, m7 ;out13 - paddsw m6, m7 ;out2 - mova m7, [rsp+gprsize*2+16*4] - mova [rsp+gprsize*2+16*5], m6 - psubsw m6, m7, m0 
;out14 - paddsw m7, m0 ;out1 - mova m1, [rsp+gprsize*2+16*2] - mova m0, [rsp+gprsize*2+16*3] - mova [rsp+gprsize*2+16*4], m7 - psubsw m7, m0, m1 ;out15 - paddsw m0, m1 ;out0 - mova [rsp+gprsize*2+16*3], m0 - mova m1, [rsp+gprsize*2+16*0] - mova m0, [rsp+gprsize*2+16*1] - mova [rsp+gprsize*2+16*0], m7 - ret - -INV_TXFM_16X8_FN adst, dct -INV_TXFM_16X8_FN adst, adst -INV_TXFM_16X8_FN adst, flipadst -INV_TXFM_16X8_FN adst, identity - -cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m7, [o(pw_2896x8)] - pmulhrsw m0, m7, [coeffq+16*0 ] - pmulhrsw m1, m7, [coeffq+16*1 ] - pmulhrsw m2, m7, [coeffq+16*14] - pmulhrsw m3, m7, [coeffq+16*15] - mova [rsp+gprsize+16*7], m0 - mova [rsp+gprsize+16*8], m1 - mova [rsp+gprsize+16*9], m2 - mova [rsp+gprsize+32*5], m3 - pmulhrsw m0, m7, [coeffq+16*6 ] - pmulhrsw m1, m7, [coeffq+16*7 ] - pmulhrsw m2, m7, [coeffq+16*8 ] - pmulhrsw m3, m7, [coeffq+16*9 ] - mova [rsp+gprsize+16*3], m2 - mova [rsp+gprsize+16*4], m3 - mova [rsp+gprsize+16*5], m0 - mova [rsp+gprsize+16*6], m1 - pmulhrsw m0, m7, [coeffq+16*2 ] - pmulhrsw m1, m7, [coeffq+16*3 ] - pmulhrsw m2, m7, [coeffq+16*4 ] - pmulhrsw m3, m7, [coeffq+16*5 ] - pmulhrsw m4, m7, [coeffq+16*10] - pmulhrsw m5, m7, [coeffq+16*11] - pmulhrsw m6, m7, [coeffq+16*12] - pmulhrsw m7, [coeffq+16*13] - - call .main - call .main_pass1_end - mov r3, tx2q - lea tx2q, [o(m(iadst_16x8_internal).pass1_end)] - jmp m(iadst_8x8_internal).pass1_end - -.pass1_end: - SAVE_8ROWS coeffq+16*1, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mov tx2q, r3 - jmp m(iadst_8x8_internal).pass1_end - -.pass2: - lea tx2q, [o(m(iadst_16x8_internal).end)] - lea r3, [dstq+8] - jmp m(iadst_8x8_internal).pass2_main - -.end: - LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] - mov dstq, r3 - jmp m(iadst_8x8_internal).pass2_main - -ALIGN function_align -.main: - mova [rsp+gprsize*2+16*0], m1 - mova [rsp+gprsize*2+16*1], m2 - mova [rsp+gprsize*2+16*2], m6 - - mova m6, [o(pd_2048)] - ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 - ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 - psubsw m1, m0, m4 ;t10a - paddsw m0, m4 ;t2a - psubsw m4, m7, m3 ;t11a - paddsw m3, m7 ;t3a - ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 - mova m2, [rsp+gprsize*2+16*0] ;in3 - mova m7, [rsp+gprsize*2+16*1] ;in4 - mova [rsp+gprsize*2+16*0], m1 ;t11 - mova [rsp+gprsize*2+16*1], m4 ;t10 - mova m1, [rsp+gprsize*2+16*2] ;in12 - mova [rsp+gprsize*2+16*2], m0 ;t2a - ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 - ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 - psubsw m0, m7, m1 ;t12a - paddsw m1, m7 ;t4a - psubsw m4, m5, m2 ;t13a - paddsw m5, m2 ;t5a - ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13 - mova m2, [rsp+gprsize*2+16*8] ;in1 - mova m7, [rsp+gprsize*2+16*9] ;in14 - mova [rsp+gprsize*2+16*8], m4 ;t12 - mova [rsp+gprsize*2+16*9], m0 ;t13 - mova m4, [rsp+gprsize*2+16*4] ;in9 - mova m0, [rsp+gprsize*2+16*5] ;in6 - mova [rsp+gprsize*2+16*4], m1 ;t4a - mova [rsp+gprsize*2+16*5], m5 ;t5a - ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 - ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 - psubsw m1, m0, m7 ;t14a - paddsw m0, m7 ;t6a - psubsw m5, m4, m2 ;t15a - paddsw m4, m2 ;t7a - ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 - mova m2, [rsp+gprsize*2+16*2] ;t2a - mova [rsp+gprsize*2+16*2], m5 ;t14 - psubsw m7, m2, m0 ;t6 - paddsw m2, m0 ;t2 - psubsw m0, m3, m4 ;t7 - paddsw m3, m4 ;t3 - ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a - mova m4, [rsp+gprsize*2+16*7] ;in0 - mova m5, 
[rsp+gprsize*2+32*5] ;in15 - mova [rsp+gprsize*2+16*7], m3 ;t3 - mova [rsp+gprsize*2+32*5], m1 ;t15 - mova m1, [rsp+gprsize*2+16*6] ;in7 - mova m3, [rsp+gprsize*2+16*3] ;in8 - mova [rsp+gprsize*2+16*6], m7 ;t7a - mova [rsp+gprsize*2+16*3], m0 ;t6a - ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 - ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 - psubsw m0, m4, m3 ;t8a - paddsw m4, m3 ;t0a - psubsw m3, m5, m1 ;t9a - paddsw m5, m1 ;t1a - ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 - mova m1, [rsp+gprsize*2+16*4] ;t4a - mova m7, [rsp+gprsize*2+16*5] ;t5a - mova [rsp+gprsize*2+16*4], m3 ;t8 - mova [rsp+gprsize*2+16*5], m0 ;t9 - psubsw m0, m4, m1 ;t4 - paddsw m4, m1 ;t0 - psubsw m3, m5, m7 ;t5 - paddsw m5, m7 ;t1 - ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a - mova m7, [rsp+gprsize*2+16*3] ;t6a - psubsw m1, m4, m2 ;t2a - paddsw m4, m2 ;out0 - mova [rsp+gprsize*2+16*3], m4 ;out0 - mova m4, [rsp+gprsize*2+16*6] ;t7a - psubsw m2, m3, m7 ;t6 - paddsw m3, m7 ;-out3 - mova [rsp+gprsize*2+16*6], m3 ;-out3 - psubsw m3, m0, m4 ;t7 - paddsw m0, m4 ;out12 - mova [rsp+gprsize*2+16*12], m3 - mova m3, [rsp+gprsize*2+16*7] ;t3 - mova [rsp+gprsize*2+16* 7], m2 ;out4 - psubsw m2, m5, m3 ;t3a - paddsw m5, m3 ;-out15 - mova [rsp+gprsize*2+16*11], m2 - mova m2, [rsp+gprsize*2+32*5] ;t15 - mova [rsp+gprsize*2+16*10], m1 ;-out7 - mova m1, [rsp+gprsize*2+16*0] ;t11 - mova [rsp+gprsize*2+16*0 ], m5 ;-out15 - mova m3, [rsp+gprsize*2+16*1] ;t10 - mova [rsp+gprsize*2+16*1 ], m4 ;-out11 - mova m4, [rsp+gprsize*2+16*2] ;t14 - mova [rsp+gprsize*2+16*2 ], m0 ;out12 - psubsw m0, m3, m4 ;t14a - paddsw m3, m4 ;t10a - psubsw m5, m1, m2 ;t15a - paddsw m1, m2 ;t11a - ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 - mova m2, [rsp+gprsize*2+16*4] ;t8 - mova m4, [rsp+gprsize*2+16*5] ;t9 - mova [rsp+gprsize*2+16*4], m3 ;t10a - mova [rsp+gprsize*2+16*5], m1 ;t11a - mova m3, [rsp+gprsize*2+16*8] ;t12 - mova m1, [rsp+gprsize*2+16*9] ;t13 - mova [rsp+gprsize*2+16*8], m5 ;t14 - mova [rsp+gprsize*2+16*9], m0 ;t15 - psubsw m5, m2, m3 ;t12a - paddsw m2, m3 ;t8a - psubsw m0, m4, m1 ;t13a - paddsw m4, m1 ;t9a - ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 - mova m6, [rsp+gprsize*2+16*4] ;t10a - mova m1, [rsp+gprsize*2+16*5] ;t11a - psubsw m3, m2, m6 ;t10 - paddsw m2, m6 ;-out1 - paddsw m6, m4, m1 ;out14 - psubsw m4, m1 ;t11 - mova [rsp+gprsize*2+16*14], m4 - mova [rsp+gprsize*2+16* 4], m2 ;-out1 - mova m4, [rsp+gprsize*2+16*8] ;t14 - mova m2, [rsp+gprsize*2+16*9] ;t15 - mova [rsp+gprsize*2+16* 9], m3 ;out6 - psubsw m3, m0, m4 ;t14a - paddsw m0, m4 ;out2 - psubsw m4, m5, m2 ;t15a - paddsw m5, m2 ;-out13 - mova [rsp+gprsize*2+16* 5], m0 ;out2 - ret -ALIGN function_align -.main_pass1_end: - mova m0, [rsp+gprsize*2+16*14] - mova [rsp+gprsize*2+16*14], m5 - mova [rsp+gprsize*2+16*15], m6 - mova m5, [o(pw_2896_2896)] - mova m6, [o(pw_2896_m2896)] - mova m7, [o(pd_2048)] - punpcklwd m2, m3, m4 - punpckhwd m3, m4 - pmaddwd m4, m5, m2 - pmaddwd m2, m6 - pmaddwd m1, m5, m3 - pmaddwd m3, m6 - REPX {paddd x, m7}, m4, m2, m1, m3 - REPX {psrad x, 12}, m4, m1, m2, m3 - packssdw m4, m1 ;-out5 - packssdw m2, m3 ;out10 - mova [rsp+gprsize*2+16* 8], m4 - mova m3, [rsp+gprsize*2+16* 9] - punpcklwd m1, m3, m0 - punpckhwd m3, m0 - pmaddwd m0, m5, m1 - pmaddwd m1, m6 - pmaddwd m4, m5, m3 - pmaddwd m3, m6 - REPX {paddd x, m7}, m0, m1, m4, m3 - REPX {psrad x, 12}, m0, m4, m1, m3 - packssdw m0, m4 ;out6 - packssdw m1, m3 ;-out9 - mova [rsp+gprsize*2+16* 9], m0 - mova m0, [rsp+gprsize*2+16* 7] - mova m4, [rsp+gprsize*2+16*12] - punpcklwd m3, m0, m4 - 
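
The tail of .main_pass1_end above is the exact form of the final 1/sqrt(2) butterflies: 16-bit value pairs are interleaved with punpcklwd/punpckhwd, multiplied against the packed pairs pw_2896_2896 / pw_2896_m2896 with pmaddwd, rounded with pd_2048, shifted right by 12 and repacked with signed saturation. Per lane that is simply the following (a sketch; names invented):

    #include <stdint.h>

    /* One lane of the pmaddwd/pd_2048/psrad-12 pattern: with the constant
     * pairs (2896, 2896) and (2896, -2896) it computes (a +/- b)/sqrt(2)
     * while keeping the full 32-bit intermediate; packssdw then saturates
     * the result back to 16 bits. */
    static int32_t butterfly_add(int16_t a, int16_t b) {
        return (a * 2896 + b * 2896 + 2048) >> 12;
    }
    static int32_t butterfly_sub(int16_t a, int16_t b) {
        return (a * 2896 - b * 2896 + 2048) >> 12;
    }
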
punpckhwd m0, m4 - pmaddwd m4, m5, m3 - pmaddwd m3, m6 - pmaddwd m5, m0 - pmaddwd m0, m6 - REPX {paddd x, m7}, m4, m3, m5, m0 - REPX {psrad x, 12}, m4, m5, m3, m0 - packssdw m4, m5 ;out4 - packssdw m3, m0 ;-out11 - mova [rsp+gprsize*2+16* 7], m4 - mova m4, [rsp+gprsize*2+16*10] - mova m5, [rsp+gprsize*2+16*11] - punpcklwd m0, m4, m5 - punpckhwd m4, m5 - pmaddwd m5, m0, [o(pw_2896_2896)] - pmaddwd m0, m6 - pmaddwd m6, m4 - pmaddwd m4, [o(pw_2896_2896)] - REPX {paddd x, m7}, m5, m0, m6, m4 - REPX {psrad x, 12}, m0, m6, m5, m4 - packssdw m0, m6 ;out8 - packssdw m5, m4 ;-out7 - mova [rsp+gprsize*2+16*10], m5 - mova m4, [rsp+gprsize*2+16* 2] ;out12 - mova m5, [rsp+gprsize*2+16*14] ;-out13 - mova m6, [rsp+gprsize*2+16*15] ;out14 - ret -ALIGN function_align -.main_pass2_end: - mova m7, [o(pw_2896x8)] - mova m1, [rsp+gprsize*2+16* 9] - mova m2, [rsp+gprsize*2+16*14] - paddsw m0, m1, m2 - psubsw m1, m2 - pmulhrsw m0, m7 ;out6 - pmulhrsw m1, m7 ;-out9 - mova [rsp+gprsize*2+16* 9], m0 - psubsw m2, m3, m4 - paddsw m3, m4 - pmulhrsw m2, m7 ;out10 - pmulhrsw m3, m7 ;-out5 - mova [rsp+gprsize*2+16* 8], m3 - mova m3, [rsp+gprsize*2+16* 7] - mova m4, [rsp+gprsize*2+16*12] - paddsw m0, m3, m4 - psubsw m3, m4 - pmulhrsw m0, m7 ;out4 - pmulhrsw m3, m7 ;-out11 - mova [rsp+gprsize*2+16* 7], m0 - mova m0, [rsp+gprsize*2+16*10] - paddsw m4, m0, [rsp+gprsize*2+16*11] - psubsw m0, [rsp+gprsize*2+16*11] - pmulhrsw m4, m7 ;-out7 - pmulhrsw m0, m7 ;out8 - mova [rsp+gprsize*2+16*10], m4 - mova m4, [rsp+gprsize*2+16*2 ] ;out12 - ret - -INV_TXFM_16X8_FN flipadst, dct -INV_TXFM_16X8_FN flipadst, adst -INV_TXFM_16X8_FN flipadst, flipadst -INV_TXFM_16X8_FN flipadst, identity - -cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mova m7, [o(pw_2896x8)] - pmulhrsw m0, m7, [coeffq+16*0 ] - pmulhrsw m1, m7, [coeffq+16*1 ] - pmulhrsw m2, m7, [coeffq+16*14] - pmulhrsw m3, m7, [coeffq+16*15] - mova [rsp+gprsize+16*7], m0 - mova [rsp+gprsize+16*8], m1 - mova [rsp+gprsize+16*9], m2 - mova [rsp+gprsize+32*5], m3 - pmulhrsw m0, m7, [coeffq+16*6 ] - pmulhrsw m1, m7, [coeffq+16*7 ] - pmulhrsw m2, m7, [coeffq+16*8 ] - pmulhrsw m3, m7, [coeffq+16*9 ] - mova [rsp+gprsize+16*3], m2 - mova [rsp+gprsize+16*4], m3 - mova [rsp+gprsize+16*5], m0 - mova [rsp+gprsize+16*6], m1 - pmulhrsw m0, m7, [coeffq+16*2 ] - pmulhrsw m1, m7, [coeffq+16*3 ] - pmulhrsw m2, m7, [coeffq+16*4 ] - pmulhrsw m3, m7, [coeffq+16*5 ] - pmulhrsw m4, m7, [coeffq+16*10] - pmulhrsw m5, m7, [coeffq+16*11] - pmulhrsw m6, m7, [coeffq+16*12] - pmulhrsw m7, [coeffq+16*13] - - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end - - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS coeffq+16*0, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)] - jmp m(iflipadst_8x8_internal).pass1_end - -.pass1_end: - SAVE_8ROWS coeffq+16*1, 32 - LOAD_8ROWS coeffq+16*0, 32 - mova [rsp+gprsize+16*0], m7 - mov tx2q, r3 - jmp m(iflipadst_8x8_internal).pass1_end - -.pass2: - lea tx2q, [o(m(iflipadst_16x8_internal).end)] - lea r3, [dstq+8] - jmp m(iflipadst_8x8_internal).pass2_main - -.end: - LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] - mov dstq, r3 - jmp m(iflipadst_8x8_internal).pass2_main - - -INV_TXFM_16X8_FN identity, dct -INV_TXFM_16X8_FN identity, adst -INV_TXFM_16X8_FN identity, flipadst -INV_TXFM_16X8_FN identity, identity - -cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - add coeffq, 16*16 - mova m4, [coeffq-16*7] - 
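
The reason both .main_pass1_end and .main_pass2_end exist above is precision versus cost: pass 2 replaces the exact 32-bit pmaddwd form with a 16-bit paddsw/psubsw followed by a single pmulhrsw with pw_2896x8. Both compute round((a +/- b) * 2896 / 4096); the pass-2 form just saturates a +/- b first. A minimal sketch of the cheaper variant (names invented):

    #include <stdint.h>

    static int16_t sat16(int v) { return v > 32767 ? 32767 : v < -32768 ? -32768 : (int16_t)v; }

    /* paddsw + pmulhrsw(pw_2896x8): matches the exact butterfly whenever
     * a + b does not saturate. */
    static int16_t butterfly_add_fast(int16_t a, int16_t b) {
        const int16_t s = sat16(a + b);
        return (int16_t)((s * (2896 * 8) + 16384) >> 15);
    }

The iflipadst_16x8 routine above reuses these adst kernels unchanged; the flip only appears afterwards, in the reversed output order and the negated rounding constants (pw_m16384 and friends) used by the shared flipadst end paths.
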
mova m5, [coeffq-16*5] - mova m6, [coeffq-16*3] - mova m7, [coeffq-16*1] - mov r3, tx2q - lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)] - -.pass1: - mova m0, [o(pw_2896x8)] - mova m2, [o(pw_1697x16)] - mova m3, [o(pw_16384)] - sub coeffq, 8*16 - REPX {pmulhrsw x, m0}, m4, m5, m6, m7 - pmulhrsw m1, m2, m4 - pmulhrsw m1, m3 - paddsw m1, m4 ; 1 - pmulhrsw m4, m2, m5 - pmulhrsw m4, m3 - paddsw m4, m5 ; 3 - pmulhrsw m5, m2, m6 - pmulhrsw m5, m3 - paddsw m5, m6 ; 5 - pmulhrsw m6, m2, m7 - pmulhrsw m6, m3 - paddsw m7, m6 ; 7 - pmulhrsw m6, m0, [coeffq+16*6] - mova [rsp+gprsize+16*0], m4 - pmulhrsw m4, m2, m6 - pmulhrsw m4, m3 - paddsw m6, m4 ; 6 - pmulhrsw m4, m0, [coeffq+16*4] - mova [rsp+gprsize+16*1], m6 - pmulhrsw m6, m2, m4 - pmulhrsw m6, m3 - paddsw m4, m6 ; 4 - pmulhrsw m6, m0, [coeffq+16*2] - pmulhrsw m0, [coeffq+16*0] - pmulhrsw m2, m6 - pmulhrsw m2, m3 - paddsw m2, m6 ; 2 - pmulhrsw m6, m0, [o(pw_1697x16)] - pmulhrsw m6, m3 - mova m3, [rsp+gprsize+16*0] - paddsw m0, m6 - jmp m(idct_8x8_internal).pass1_end3 - -.pass1_end: - mova [coeffq+16*1], m4 - mova [coeffq+16*3], m5 - mova [coeffq+16*5], m6 - mova [coeffq+16*7], m7 - mova m4, [coeffq-16*7] - mova m5, [coeffq-16*5] - mova m6, [coeffq-16*3] - mova m7, [coeffq-16*1] - mova [coeffq-16*7], m0 - mova [coeffq-16*5], m1 - mova [coeffq-16*3], m2 - mova [coeffq-16*1], m3 - mov tx2q, r3 - jmp .pass1 - -.pass2: - lea tx2q, [o(m(iidentity_16x8_internal).end)] - lea r3, [dstq+8] - jmp m(iidentity_8x8_internal).end - -.end: - LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] - mov dstq, r3 - jmp m(iidentity_8x8_internal).end - - -%macro INV_TXFM_16X16_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 16x16, 8, 16*16 -%ifidn %1_%2, dct_dct - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_8192)] - mov [coeffq], eobd - mov r2d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly -.end: - RET -%endif -%endmacro - -INV_TXFM_16X16_FN dct, dct -INV_TXFM_16X16_FN dct, adst -INV_TXFM_16X16_FN dct, flipadst -INV_TXFM_16X16_FN dct, identity - -cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - LOAD_8ROWS coeffq+16*1, 64 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+16*3, 64 - call m(idct_16x8_internal).main - mov r3, tx2q - lea tx2q, [o(m(idct_16x16_internal).pass1_end)] - mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end: - SAVE_8ROWS coeffq+16*17, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal).pass1_end1)] - mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end1: - SAVE_8ROWS coeffq+16*1, 32 - LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+16*2, 64 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x16_internal).pass1_end2)] - mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end2: - SAVE_8ROWS coeffq+16*16, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mov tx2q, r3 - mova m7, [o(pw_8192)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass2: - lea tx2q, [o(m(idct_16x16_internal).end)] - jmp m(idct_8x16_internal).pass2_pre - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x16_internal).end1)] - mov dstq, r3 - lea r3, [dstq+8] - jmp m(idct_8x8_internal).end - -.end1: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14, 15 - - add coeffq, 32*8 - mov dstq, r3 - - mova m0, [coeffq+16*0 ] - mova m1, [coeffq+16*4 ] - mova m2, [coeffq+16*8 ] - mova m3, [coeffq+16*12] - mova m4, [coeffq+16*1 ] - mova m5, [coeffq+16*5 ] - mova m6, [coeffq+16*9 ] - mova m7, [coeffq+16*13] - lea tx2q, [o(m(idct_8x16_internal).end)] - jmp m(idct_8x16_internal).pass2_main - - -%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 - mova m0, [coeffq+16*1 ] - mova m1, [coeffq+16*3 ] - mova m2, [coeffq+16*29] - mova m3, [coeffq+16*31] - mova [rsp+gprsize+16*7], m0 - mova [rsp+gprsize+16*8], m1 - mova [rsp+gprsize+16*9], m2 - mova [rsp+gprsize+32*5], m3 - mova m0, [coeffq+16*13] - mova m1, [coeffq+16*15] - mova m2, [coeffq+16*17] - mova m3, [coeffq+16*19] - mova [rsp+gprsize+16*3], m2 - mova [rsp+gprsize+16*4], m3 - mova [rsp+gprsize+16*5], m0 - mova [rsp+gprsize+16*6], m1 - mova m0, [coeffq+16*5 ] - mova m1, [coeffq+16*7 ] - mova m2, [coeffq+16*9 ] - mova m3, [coeffq+16*11] - mova m4, [coeffq+16*21] - mova m5, [coeffq+16*23] - mova m6, [coeffq+16*25] - mova m7, [coeffq+16*27] -%endmacro - -%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 - mova m0, [coeffq+16*0 ] - mova m1, [coeffq+16*2 ] - mova m2, [coeffq+16*28] - mova m3, [coeffq+16*30] - mova [rsp+gprsize+16*7], m0 - mova [rsp+gprsize+16*8], m1 - mova [rsp+gprsize+16*9], m2 - mova [rsp+gprsize+32*5], m3 - mova m0, [coeffq+16*12] - mova m1, [coeffq+16*14] - mova m2, [coeffq+16*16] - mova m3, [coeffq+16*18] - mova [rsp+gprsize+16*3], m2 - mova [rsp+gprsize+16*4], m3 - mova [rsp+gprsize+16*5], m0 - mova [rsp+gprsize+16*6], m1 - mova m0, [coeffq+16*4 ] - mova m1, [coeffq+16*6 ] - mova m2, [coeffq+16*8 ] - mova m3, [coeffq+16*10] - mova m4, [coeffq+16*20] - mova m5, [coeffq+16*22] - mova m6, [coeffq+16*24] - mova m7, [coeffq+16*26] -%endmacro - -INV_TXFM_16X16_FN adst, dct -INV_TXFM_16X16_FN adst, adst -INV_TXFM_16X16_FN adst, flipadst - -cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - ITX_16X16_ADST_LOAD_ODD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end - - mov r3, tx2q - lea tx2q, [o(m(iadst_16x16_internal).pass1_end)] - mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 - -.pass1_end: - SAVE_8ROWS coeffq+16*17, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal).pass1_end1)] - mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 - -.pass1_end1: - SAVE_8ROWS coeffq+16*1, 32 - ITX_16X16_ADST_LOAD_EVEN_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end - - lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)] - mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 - -.pass1_end2: - SAVE_8ROWS coeffq+16*16, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mov tx2q, r3 - mova m7, [o(pw_8192)] - jmp m(iadst_8x8_internal).pass1_end1 - -.pass2: - lea tx2q, [o(m(iadst_16x16_internal).end)] - jmp m(iadst_8x16_internal).pass2_pre - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iadst_16x16_internal).end1)] - mov dstq, r3 - lea r3, [dstq+8] - jmp m(iadst_8x8_internal).end - -.end1: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - - add coeffq, 32*8 - mov dstq, r3 - - mova m4, [coeffq+16*0 ] - mova m5, [coeffq+16*2 ] - mova m0, [coeffq+16*4 ] - mova m1, [coeffq+16*6 ] - mova m2, [coeffq+16*8 ] - mova m3, [coeffq+16*10] - mova m6, [coeffq+16*12] - mova m7, [coeffq+16*14] - mova [rsp+gprsize+16*7], m4 - mova [rsp+gprsize+16*8], m5 - mova 
[rsp+gprsize+16*5], m6 - mova [rsp+gprsize+16*6], m7 - lea tx2q, [o(m(iadst_8x16_internal).end)] - jmp m(iadst_8x16_internal).pass2_main - - -INV_TXFM_16X16_FN flipadst, dct -INV_TXFM_16X16_FN flipadst, adst -INV_TXFM_16X16_FN flipadst, flipadst - -cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - ITX_16X16_ADST_LOAD_ODD_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end - - mov r3, tx2q - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)] - mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 - -.pass1_end: - SAVE_8ROWS coeffq+16*1, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)] - mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 - -.pass1_end1: - SAVE_8ROWS coeffq+16*17, 32 - ITX_16X16_ADST_LOAD_EVEN_COEFS - call m(iadst_16x8_internal).main - call m(iadst_16x8_internal).main_pass1_end - - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS coeffq+16*0, 32 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)] - mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 - -.pass1_end2: - SAVE_8ROWS coeffq+16*16, 32 - LOAD_8ROWS coeffq+16* 0, 32 - mova [rsp+gprsize+16*0], m7 - mov tx2q, r3 - mova m7, [o(pw_m8192)] - jmp m(iflipadst_8x8_internal).pass1_end1 - -.pass2: - lea tx2q, [o(m(iflipadst_16x16_internal).end)] - lea r3, [dstq+8] - jmp m(iflipadst_8x16_internal).pass2_pre - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(iflipadst_16x16_internal).end1)] - lea dstq, [dstq+strideq*2] - jmp m(iflipadst_8x8_internal).end - -.end1: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - - add coeffq, 32*8 - - mova m4, [coeffq+16*0 ] - mova m5, [coeffq+16*2 ] - mova m0, [coeffq+16*4 ] - mova m1, [coeffq+16*6 ] - mova m2, [coeffq+16*8 ] - mova m3, [coeffq+16*10] - mova m6, [coeffq+16*12] - mova m7, [coeffq+16*14] - mova [rsp+gprsize+16*7], m4 - mova [rsp+gprsize+16*8], m5 - mova [rsp+gprsize+16*5], m6 - mova [rsp+gprsize+16*6], m7 - - lea tx2q, [o(m(iflipadst_16x16_internal).end2)] - mov dstq, r3 - jmp m(iflipadst_8x16_internal).pass2_main - -.end2: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_8x16_internal).end1)] - lea dstq, [dstq+strideq*2] - jmp m(iflipadst_8x8_internal).end - - -%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 - pmulhrsw m%2, m%3, m%1 - psraw m%2, 1 - pavgw m%1, m%2 -%endmacro - -INV_TXFM_16X16_FN identity, dct -INV_TXFM_16X16_FN identity, identity - -cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - add coeffq, 16*17 - mov r3, tx2q - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)] - -.pass1: - mova m6, [o(pw_1697x16)] - mova m7, [coeffq+32*6] - mova m0, [coeffq+32*0] - mova m1, [coeffq+32*1] - mova m2, [coeffq+32*2] - mova m3, [coeffq+32*3] - mova m4, [coeffq+32*4] - REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 - mova m5, [coeffq+32*5] - mova [rsp+gprsize+16*1], m7 - IDTX16B 5, 7, 6 - mova m7, [coeffq+32*7] - IDTX16B 7, 6, 6 - jmp m(idct_8x8_internal).pass1_end3 - -.pass1_end: - SAVE_8ROWS coeffq, 32 - sub coeffq, 16 - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end1)] - jmp .pass1 - -.pass1_end1: - SAVE_8ROWS coeffq, 32 - sub coeffq, 15*16 - lea tx2q, [o(m(iidentity_16x16_internal).pass1_end2)] - jmp .pass1 - -.pass1_end2: - SAVE_8ROWS coeffq, 32 - sub coeffq, 16 - mov tx2q, r3 - jmp .pass1 - 
-.pass2: - lea r3, [dstq+8] - lea tx2q, [o(m(iidentity_16x16_internal).end1)] - -.end: - mova [rsp+gprsize+16*0], m7 - mova [rsp+gprsize+16*1], m4 - mova m7, [o(pw_1697x16)] - REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 - mova m4, [o(pw_2048)] - pmulhrsw m5, m4 - pmulhrsw m6, m4 - mova [rsp+gprsize+16*2], m5 - mova m5, [rsp+gprsize+16*1] - mova [rsp+gprsize+16*1], m6 - IDTX16 5, 6, 7 - mova m6, [rsp+gprsize+16*0] - IDTX16 6, 7, 7 - REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 - pmulhrsw m4, m5 - mova [rsp+gprsize+16*0], m6 - jmp m(idct_8x8_internal).end3 - -.end1: - LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(iidentity_16x16_internal).end2)] - lea dstq, [dstq+strideq*2] - jmp .end - -.end2: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - - add coeffq, 32*8 - LOAD_8ROWS coeffq, 32 - lea tx2q, [o(m(iidentity_16x16_internal).end3)] - mov dstq, r3 - jmp .end - -.end3: - LOAD_8ROWS coeffq+16*1, 32 - lea tx2q, [o(m(idct_8x16_internal).end1)] - lea dstq, [dstq+strideq*2] - jmp .end - - -cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - call m(idct_8x32_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_8192)] - mov [coeffq], eobd - pmulhrsw m0, m2 - psrlw m2, 2 ;pw_2048 - pmulhrsw m0, m1 - pmulhrsw m0, m2 - pshuflw m0, m0, q0000 - punpcklwd m0, m0 - mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)] - jmp m(inv_txfm_add_dct_dct_8x8).loop - -.end: - RET - - - -cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - cmp eobd, 106 - jle .fast - - LOAD_8ROWS coeffq+16*3, 64 - call m(idct_8x8_internal).main - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1: - mova [rsp+gprsize+16*9 ], m0 ;in24 - mova [rsp+gprsize+16*10], m4 ;in28 - mova [rsp+gprsize+16*17], m2 ;in26 - mova [rsp+gprsize+16*18], m6 ;in30 - mova [rsp+gprsize+16*31], m1 ;in25 - mova [rsp+gprsize+16*30], m3 ;in27 - mova [rsp+gprsize+16*27], m5 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - LOAD_8ROWS coeffq+16*2, 64 - call m(idct_8x8_internal).main - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_1)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_1: - mova [rsp+gprsize+16*7 ], m0 ;in16 - mova [rsp+gprsize+16*8 ], m4 ;in20 - mova [rsp+gprsize+16*15], m2 ;in18 - mova [rsp+gprsize+16*16], m6 ;in22 - mova [rsp+gprsize+16*33], m1 ;in17 - mova [rsp+gprsize+16*28], m3 ;in19 - mova [rsp+gprsize+16*29], m5 ;in21 - mova [rsp+gprsize+16*32], m7 ;in23 - -.fast: - LOAD_8ROWS coeffq+16*1, 64 - call m(idct_8x8_internal).main - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end: - mova [rsp+gprsize+16*5 ], m0 ;in8 - mova [rsp+gprsize+16*6 ], m4 ;in12 - mova [rsp+gprsize+16*13], m2 ;in10 - mova [rsp+gprsize+16*14], m6 ;in14 - mova [rsp+gprsize+16*21], m1 ;in9 - mova [rsp+gprsize+16*24], m3 ;in11 - mova [rsp+gprsize+16*25], m5 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_8x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end1: - mova [rsp+gprsize+16*11], m2 ;in2 - mova [rsp+gprsize+16*12], m6 ;in6 - mova [rsp+gprsize+16*19], m1 ;in1 - mova [rsp+gprsize+16*26], m3 ;in3 - mova [rsp+gprsize+16*23], m5 ;in5 - mova [rsp+gprsize+16*22], m7 ;in7 - mova m1, 
m4 ;in4 - mova m2, [rsp+gprsize+16*5 ] ;in8 - mova m3, [rsp+gprsize+16*6 ] ;in12 - - cmp eobd, 106 - jg .full - - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3 , 16 - mova m0, [rsp+gprsize+16*11] - mova m1, [rsp+gprsize+16*12] - mova m2, [rsp+gprsize+16*13] - mova m3, [rsp+gprsize+16*14] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - call .main_fast - jmp .pass2 - -.full: - mova m4, [rsp+gprsize+16*7 ] ;in16 - mova m5, [rsp+gprsize+16*8 ] ;in20 - mova m6, [rsp+gprsize+16*9 ] ;in24 - mova m7, [rsp+gprsize+16*10] ;in28 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3 , 16 - LOAD_8ROWS rsp+gprsize+16*11, 16 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - call .main - -.pass2: - lea r3, [o(m(idct_8x32_internal).end6)] - -.end: - mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_8x32_internal).end2)] - -.end1: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ - 8, 9, 10, 11, 12, 13, 14, 15, \ - 16, 17, 18, 19, 20, 21, 22, 23, \ - 24, 25, 26, 27, 28, 29, 30, 31 - - jmp tx2q - -.end2: - lea tx2q, [o(m(idct_8x32_internal).end3)] - jmp m(idct_8x8_internal).end - -.end3: - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0 ], m7 - lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal).end4)] - jmp m(idct_8x8_internal).end - -.end4: - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0 ], m7 - lea dstq, [dstq+strideq*2] - lea tx2q, [o(m(idct_8x32_internal).end5)] - jmp m(idct_8x8_internal).end - -.end5: - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0 ], m7 - lea dstq, [dstq+strideq*2] - mov tx2q, r3 - jmp m(idct_8x8_internal).end - -.end6: - ret - -ALIGN function_align -.main_veryfast: - mova m0, [rsp+gprsize*2+16*19] ;in1 - pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 - pmulhrsw m0, [o(pw_201x8)] ;t16,t17 - mova m7, [o(pd_2048)] - mova [rsp+gprsize*2+16*19], m0 ;t16 - mova [rsp+gprsize*2+16*34], m3 ;t31 - ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a - mova [rsp+gprsize*2+16*20], m3 ;t17a - mova [rsp+gprsize*2+16*33], m0 ;t30a - mova m1, [rsp+gprsize*2+16*22] ;in7 - pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 - pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 - mova [rsp+gprsize*2+16*22], m1 ;t19 - mova [rsp+gprsize*2+16*31], m2 ;t28 - ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a - mova [rsp+gprsize*2+16*21], m2 ;t18a - mova [rsp+gprsize*2+16*32], m1 ;t29a - mova m0, [rsp+gprsize*2+16*23] ;in5 - pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 - pmulhrsw m0, [o(pw_995x8)] ;t20, t21 - mova [rsp+gprsize*2+16*23], m0 ;t20 - mova [rsp+gprsize*2+16*30], m3 ;t27 - ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a - mova [rsp+gprsize*2+16*24], m3 ;t21a - mova [rsp+gprsize*2+16*29], m0 ;t26a - mova m2, [rsp+gprsize*2+16*26] ;in3 - pxor m0, m0 - mova m3, m0 - pmulhrsw m1, m2, [o(pw_4052x8)] - pmulhrsw m2, [o(pw_m601x8)] - jmp .main2 - -ALIGN function_align -.main_fast: ;bottom half is zero - mova m0, [rsp+gprsize*2+16*19] ;in1 - mova m1, [rsp+gprsize*2+16*20] ;in15 - pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a - pmulhrsw m0, [o(pw_201x8)] ;t16a - pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a - pmulhrsw m1, [o(pw_m2751x8)] ;t17a - mova m7, [o(pd_2048)] - psubsw m4, m0, m1 ;t17 - paddsw m0, m1 ;t16 - psubsw m5, m3, m2 ;t30 - paddsw m3, m2 ;t31 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a - mova [rsp+gprsize*2+16*19], m0 ;t16 - mova 
[rsp+gprsize*2+16*20], m5 ;t17a - mova [rsp+gprsize*2+16*33], m4 ;t30a - mova [rsp+gprsize*2+16*34], m3 ;t31 - mova m0, [rsp+gprsize*2+16*21] ;in9 - mova m1, [rsp+gprsize*2+16*22] ;in7 - pmulhrsw m3, m0, [o(pw_3703x8)] - pmulhrsw m0, [o(pw_1751x8)] - pmulhrsw m2, m1, [o(pw_3857x8)] - pmulhrsw m1, [o(pw_m1380x8)] - psubsw m4, m1, m0 ;t18 - paddsw m0, m1 ;t19 - psubsw m5, m2, m3 ;t29 - paddsw m3, m2 ;t28 - ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a - mova [rsp+gprsize*2+16*21], m5 ;t18a - mova [rsp+gprsize*2+16*22], m0 ;t19 - mova [rsp+gprsize*2+16*31], m3 ;t28 - mova [rsp+gprsize*2+16*32], m4 ;t29a - mova m0, [rsp+gprsize*2+16*23] ;in5 - mova m1, [rsp+gprsize*2+16*24] ;in11 - pmulhrsw m3, m0, [o(pw_3973x8)] - pmulhrsw m0, [o(pw_995x8)] - pmulhrsw m2, m1, [o(pw_3513x8)] - pmulhrsw m1, [o(pw_m2106x8)] - psubsw m4, m0, m1 ;t21 - paddsw m0, m1 ;t20 - psubsw m5, m3, m2 ;t26 - paddsw m3, m2 ;t27 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a - mova [rsp+gprsize*2+16*23], m0 ;t20 - mova [rsp+gprsize*2+16*24], m5 ;t21a - mova [rsp+gprsize*2+16*29], m4 ;t26a - mova [rsp+gprsize*2+16*30], m3 ;t27 - mova m0, [rsp+gprsize*2+16*25] ;in13 - mova m2, [rsp+gprsize*2+16*26] ;in3 - pmulhrsw m3, m0, [o(pw_3290x8)] - pmulhrsw m0, [o(pw_2440x8)] - pmulhrsw m1, m2, [o(pw_4052x8)] - pmulhrsw m2, [o(pw_m601x8)] - jmp .main2 - -ALIGN function_align -.main: - mova m7, [o(pd_2048)] - mova m0, [rsp+gprsize*2+16*19] ;in1 - mova m1, [rsp+gprsize*2+16*20] ;in15 - mova m2, [rsp+gprsize*2+16*33] ;in17 - mova m3, [rsp+gprsize*2+16*34] ;in31 - ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a - ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a - psubsw m4, m0, m2 ;t17 - paddsw m0, m2 ;t16 - psubsw m5, m3, m1 ;t30 - paddsw m3, m1 ;t31 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a - mova [rsp+gprsize*2+16*19], m0 ;t16 - mova [rsp+gprsize*2+16*20], m5 ;t17a - mova [rsp+gprsize*2+16*33], m4 ;t30a - mova [rsp+gprsize*2+16*34], m3 ;t31 - mova m0, [rsp+gprsize*2+16*21] ;in9 - mova m1, [rsp+gprsize*2+16*22] ;in7 - mova m2, [rsp+gprsize*2+16*31] ;in25 - mova m3, [rsp+gprsize*2+16*32] ;in23 - ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a - ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a - psubsw m4, m2, m0 ;t18 - paddsw m0, m2 ;t19 - psubsw m5, m1, m3 ;t29 - paddsw m3, m1 ;t28 - ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a - mova [rsp+gprsize*2+16*21], m5 ;t18a - mova [rsp+gprsize*2+16*22], m0 ;t19 - mova [rsp+gprsize*2+16*31], m3 ;t28 - mova [rsp+gprsize*2+16*32], m4 ;t29a - mova m0, [rsp+gprsize*2+16*23] ;in5 - mova m1, [rsp+gprsize*2+16*24] ;in11 - mova m2, [rsp+gprsize*2+16*29] ;in21 - mova m3, [rsp+gprsize*2+16*30] ;in27 - ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a - ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a - psubsw m4, m0, m2 ;t21 - paddsw m0, m2 ;t20 - psubsw m5, m3, m1 ;t26 - paddsw m3, m1 ;t27 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a - mova [rsp+gprsize*2+16*23], m0 ;t20 - mova [rsp+gprsize*2+16*24], m5 ;t21a - mova [rsp+gprsize*2+16*29], m4 ;t26a - mova [rsp+gprsize*2+16*30], m3 ;t27 - mova m0, [rsp+gprsize*2+16*25] ;in13 - mova m1, [rsp+gprsize*2+16*26] ;in3 - mova m2, [rsp+gprsize*2+16*27] ;in29 - mova m3, [rsp+gprsize*2+16*28] ;in19 - ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a - ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a - -.main2: - psubsw m4, m2, m0 ;t22 - paddsw m0, m2 ;t23 - psubsw m5, m1, m3 ;t25 - paddsw m3, m1 ;t24 - ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a - mova m2, [rsp+gprsize*2+16*24] ;t21a - psubsw m1, m5, 
m2 ;t21 - paddsw m5, m2 ;t22 - mova [rsp+gprsize*2+16*25], m5 ;t22 - mova m2, [rsp+gprsize*2+16*29] ;t26a - psubsw m5, m4, m2 ;t26 - paddsw m4, m2 ;t25 - mova [rsp+gprsize*2+16*28], m4 ;t25 - ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a - mova [rsp+gprsize*2+16*24], m5 ;t21a - mova [rsp+gprsize*2+16*29], m1 ;t26a - - mova m1, [rsp+gprsize*2+16*23] ;t20 - mova m5, [rsp+gprsize*2+16*30] ;t27 - psubsw m2, m0, m1 ;t20a - paddsw m0, m1 ;t23a - psubsw m6, m3, m5 ;t27a - paddsw m3, m5 ;t24a - ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 - mova [rsp+gprsize*2+16*26], m0 ;t23a - mova [rsp+gprsize*2+16*27], m3 ;t24a - mova [rsp+gprsize*2+16*30], m2 ;t27 - - mova m0, [rsp+gprsize*2+16*20] ;t17a - mova m1, [rsp+gprsize*2+16*21] ;t18a - mova m2, [rsp+gprsize*2+16*32] ;t29a - mova m3, [rsp+gprsize*2+16*33] ;t30a - psubsw m4, m0, m1 ;t18 - paddsw m0, m1 ;t17 - psubsw m5, m3, m2 ;t29 - paddsw m3, m2 ;t30 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a - mova [rsp+gprsize*2+16*20], m0 ;t17 - mova [rsp+gprsize*2+16*21], m5 ;t18a - mova [rsp+gprsize*2+16*32], m4 ;t29a - mova [rsp+gprsize*2+16*33], m3 ;t30 - mova m0, [rsp+gprsize*2+16*19] ;t16 - mova m1, [rsp+gprsize*2+16*22] ;t19 - mova m2, [rsp+gprsize*2+16*31] ;t28 - mova m3, [rsp+gprsize*2+16*34] ;t31 - psubsw m4, m0, m1 ;t19a - paddsw m0, m1 ;t16a - psubsw m5, m3, m2 ;t28a - paddsw m3, m2 ;t31a - ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 - mova m2, [rsp+gprsize*2+16*15] ;tmp12 - psubsw m1, m5, m6 ;t20a - paddsw m5, m6 ;t19a - psubsw m6, m2, m5 ;out19 - paddsw m2, m5 ;out12 - mova m5, [rsp+gprsize*2+16*30] ;t27 - mova [rsp+gprsize*2+16*22], m6 ;out19 - mova [rsp+gprsize*2+16*15], m2 ;out12 - psubsw m6, m4, m5 ;t27a - paddsw m4, m5 ;t28a - ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 - mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 - psubsw m5, m2, m4 ;out28 - paddsw m2, m4 ;out3 - mova m4, [rsp+gprsize*2+16*14] ;tmp11 - mova [rsp+gprsize*2+16*31], m5 ;out28 - mova [rsp+gprsize*2+16*6 ], m2 ;out3 - psubsw m5, m4, m6 ;out20 - paddsw m4, m6 ;out11 - mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 - mova [rsp+gprsize*2+16*23], m5 ;out20 - mova [rsp+gprsize*2+16*14], m4 ;out11 - psubsw m5, m2, m1 ;out27 - paddsw m2, m1 ;out4 - mova m1, [rsp+gprsize*2+16*26] ;t23a - mova m4, [rsp+gprsize*2+16*27] ;t24a - mova [rsp+gprsize*2+16*30], m5 ;out27 - mova [rsp+gprsize*2+16*7 ], m2 ;out4 - psubsw m5, m0, m1 ;t23 - paddsw m0, m1 ;t16 - psubsw m2, m3, m4 ;t24 - paddsw m3, m4 ;t31 - ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a - mova m6, [rsp+gprsize*2+16*18] ;tmp15 - psubsw m4, m6, m0 ;out16 - paddsw m6, m0 ;out15 - mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 - mova m1, [rsp+gprsize*2+16*11] ;tmp8 - mova [rsp+gprsize*2+16*18], m6 ;out15 - mova [rsp+gprsize*2+16*19], m4 ;out16 - psubsw m6, m0, m3 ;out31 - paddsw m0, m3 ;out0 - psubsw m4, m1, m2 ;out23 - paddsw m1, m2 ;out8 - mova m3, [rsp+gprsize*2+16*10] ;tmp7 - mova [rsp+gprsize*2+16*34], m6 ;out31 - mova [rsp+gprsize*2+16*11], m1 ;out8 - mova [rsp+gprsize*2+16*26], m4 ;out23 - paddsw m6, m3, m5 ;out7 - psubsw m3, m5 ;out24 - mova m1, [rsp+gprsize*2+16*20] ;t17 - mova m5, [rsp+gprsize*2+16*25] ;t22 - mova m2, [rsp+gprsize*2+16*17] ;tmp14 - mova [rsp+gprsize*2+16*27], m3 ;out24 - psubsw m4, m1, m5 ;t22a - paddsw m1, m5 ;t17a - psubsw m3, m2, m1 ;out17 - paddsw m2, m1 ;out14 - mova m5, [rsp+gprsize*2+16*28] ;t25 - mova m1, [rsp+gprsize*2+16*33] ;t30 - mova [rsp+gprsize*2+16*17], m2 ;out14 - mova [rsp+gprsize*2+16*20], m3 ;out17 - psubsw m2, m1, m5 ;t25a - paddsw m1, m5 ;t30a - ITX_MULSUB_2W 2, 4, 3, 5, 7, 
2896, 2896 ;t22, t25 - mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 - psubsw m3, m5, m1 ;out30 - paddsw m5, m1 ;out1 - mova m1, [rsp+gprsize*2+16*12] ;tmp9 - mova [rsp+gprsize*2+16*33], m3 ;out30 - mova [rsp+gprsize*2+16*4 ], m5 ;out1 - psubsw m3, m1, m2 ;out22 - paddsw m1, m2 ;out9 - mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 - mova [rsp+gprsize*2+16*25], m3 ;out22 - mova [rsp+gprsize*2+16*12], m1 ;out9 - psubsw m3, m5, m4 ;out25 - paddsw m5, m4 ;out6 - mova m4, [rsp+gprsize*2+16*21] ;t18a - mova m1, [rsp+gprsize*2+16*24] ;t21a - mova m2, [rsp+gprsize*2+16*16] ;tmp13 - mova [rsp+gprsize*2+16*28], m3 ;out25 - mova [rsp+gprsize*2+16*9 ], m5 ;out6 - paddsw m3, m4, m1 ;t18 - psubsw m4, m1 ;t21 - psubsw m5, m2, m3 ;out18 - paddsw m2, m3 ;out13 - mova m1, [rsp+gprsize*2+16*29] ;t26a - mova m3, [rsp+gprsize*2+16*32] ;t29a - mova [rsp+gprsize*2+16*21], m5 ;out18 - mova [rsp+gprsize*2+16*16], m2 ;out13 - psubsw m5, m3, m1 ;t26 - paddsw m3, m1 ;t29 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a - mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 - psubsw m1, m2, m3 ;out29 - paddsw m2, m3 ;out2 - mova m3, [rsp+gprsize*2+16*13] ;tmp10 - mova [rsp+gprsize*2+16*32], m1 ;out29 - psubsw m7, m3, m5 ;out21 - paddsw m3, m5 ;out10 - mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 - mova [rsp+gprsize*2+16*24], m7 ;out21 - mova [rsp+gprsize*2+16*13], m3 ;out10 - psubsw m1, m5, m4 ;out26 - paddsw m5, m4 ;out5 - mova m7, m6 ;out7 - mova m3, [rsp+gprsize*2+16*6 ] ;out3 - mova m4, [rsp+gprsize*2+16*7 ] ;out4 - mova [rsp+gprsize*2+16*29], m1 ;out26 - mova m6, [rsp+gprsize*2+16*9 ] ;out6 - mova m1, [rsp+gprsize*2+16*4 ] ;out1 - ret - - -cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - call m(idct_32x8_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_8192)] - mov [coeffq], eobd - mov r3d, 8 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] - -.body: - pmulhrsw m0, m2 - movd m2, [o(pw_2048)] ;intentionally rip-relative - pmulhrsw m0, m1 - pmulhrsw m0, m2 - pshuflw m0, m0, q0000 - punpcklwd m0, m0 - pxor m5, m5 - -.loop: - mova m1, [dstq+16*0] - mova m3, [dstq+16*1] - punpckhbw m2, m1, m5 - punpcklbw m1, m5 - punpckhbw m4, m3, m5 - punpcklbw m3, m5 - paddw m2, m0 - paddw m1, m0 - paddw m4, m0 - paddw m3, m0 - packuswb m1, m2 - packuswb m3, m4 - mova [dstq+16*0], m1 - mova [dstq+16*1], m3 - add dstq, strideq - dec r3d - jg .loop - jmp tx2q - -.end: - RET - - -cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - LOAD_8ROWS coeffq+16*0, 64 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - LOAD_8ROWS coeffq+16*2, 64 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - LOAD_8ROWS coeffq+16*1, 32 - mova [rsp+gprsize+16*19], m0 ;in1 - mova [rsp+gprsize+16*26], m1 ;in3 - mova [rsp+gprsize+16*23], m2 ;in5 - mova [rsp+gprsize+16*22], m3 ;in7 - mova [rsp+gprsize+16*21], m4 ;in9 - mova [rsp+gprsize+16*24], m5 ;in11 - mova [rsp+gprsize+16*25], m6 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - - cmp eobd, 106 - jg .full - call m(idct_8x32_internal).main_fast - jmp .pass2 - -.full: - LOAD_8ROWS coeffq+16*17, 32 - mova [rsp+gprsize+16*33], m0 ;in17 - mova [rsp+gprsize+16*28], m1 ;in19 - mova [rsp+gprsize+16*29], m2 ;in21 - mova [rsp+gprsize+16*32], m3 ;in23 - mova [rsp+gprsize+16*31], m4 ;in25 - mova [rsp+gprsize+16*30], m5 ;in27 - mova [rsp+gprsize+16*27], m6 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - call 
m(idct_8x32_internal).main - -.pass2: - mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x8_internal).end)] - jmp m(idct_8x32_internal).end1 - -.end: - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end1)] - jmp m(idct_8x8_internal).pass1_end1 - -.end1: - lea r3, [dstq+8] - lea tx2q, [o(m(idct_32x8_internal).end2)] - jmp m(idct_8x8_internal).pass2_main - -.end2: - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0 ], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end3)] - jmp m(idct_8x8_internal).pass1_end1 - -.end3: - mov dstq, r3 - add r3, 8 - lea tx2q, [o(m(idct_32x8_internal).end4)] - jmp m(idct_8x8_internal).pass2_main - -.end4: - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0 ], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end5)] - jmp m(idct_8x8_internal).pass1_end1 - -.end5: - mov dstq, r3 - add r3, 8 - lea tx2q, [o(m(idct_32x8_internal).end6)] - jmp m(idct_8x8_internal).pass2_main - -.end6: - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0 ], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x8_internal).end7)] - jmp m(idct_8x8_internal).pass1_end1 - -.end7: - mov dstq, r3 - lea tx2q, [o(m(idct_32x8_internal).end8)] - jmp m(idct_8x8_internal).pass2_main - -.end8: - ret - - -cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - mov r5d, 4 - mov tx2d, 2 - cmp eobd, 107 - cmovns tx2d, r5d - mov r3d, tx2d -%if ARCH_X86_32 - LEA r5, $$ -%endif - lea tx2q, [o(m(idct_32x8_internal).end8)] -.loop: - LOAD_8ROWS coeffq+16*0, 64 - paddsw m6, [o(pw_5)] - mova [rsp+16*1], m6 - mova m6, [o(pw_5)] - REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 - call m(idct_8x8_internal).pass1_end3 - REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 - mova [rsp+16*2], m5 - mova [rsp+16*1], m6 - mova [rsp+16*0], m7 - call m(idct_8x8_internal).end3 - lea dstq, [dstq+strideq*2] - pxor m7, m7 - REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 - add coeffq, 16 - dec r3d - jg .loop - RET - -cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - mov r5d, 4 - mov tx2d, 2 - cmp eobd, 107 - cmovns tx2d, r5d - mov r3d, tx2d -%if ARCH_X86_32 - LEA r5, $$ -%endif - -.loop: - LOAD_8ROWS coeffq+16*0, 16 - pmulhrsw m6, [o(pw_4096)] - mova [rsp+16*1], m6 - mova m6, [o(pw_4096)] - REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 - lea tx2q, [o(m(idct_32x8_internal).end8)] - call m(idct_8x8_internal).pass1_end3 - - mov [rsp+16*3], dstq - mova [rsp+16*2], m5 - mova [rsp+16*1], m6 - mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_8x8_internal).end4)] - call m(idct_8x8_internal).end3 - - add coeffq, 16*8 - mov dstq, [rsp+16*3] - lea dstq, [dstq+8] - dec r3d - jg .loop - jnc .loop - RET - - -cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - call m(idct_16x32_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_16384)] - mov [coeffq], eobd - pmulhrsw m0, m1 - mov r2d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly - -.end: - RET - -cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - - LOAD_8ROWS coeffq+16*1, 128, 1 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+16*5, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end: - 
SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end1: - mova [coeffq+16*1 ], m0 ;in8 - mova [coeffq+16*5 ], m4 ;in12 - mova [rsp+gprsize+16*13], m2 ;in10 - mova [rsp+gprsize+16*14], m6 ;in14 - mova [rsp+gprsize+16*21], m1 ;in9 - mova [rsp+gprsize+16*24], m3 ;in11 - mova [rsp+gprsize+16*25], m5 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - LOAD_8ROWS coeffq+16*0, 128, 1 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+16*4, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end2: - SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end3: - mova [rsp+gprsize+16*11], m2 ;in2 - mova [rsp+gprsize+16*12], m6 ;in6 - mova [rsp+gprsize+16*19], m1 ;in1 - mova [rsp+gprsize+16*26], m3 ;in3 - mova [rsp+gprsize+16*23], m5 ;in5 - mova [rsp+gprsize+16*22], m7 ;in7 - - cmp eobd, 150 - jg .full - - mova m1, m4 ;in4 - mova m2, [coeffq+16*1 ] ;in8 - mova m3, [coeffq+16*5 ] ;in12 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - mova m0, [rsp+gprsize+16*11] ;in2 - mova m1, [rsp+gprsize+16*12] ;in6 - mova m2, [rsp+gprsize+16*13] ;in10 - mova m3, [rsp+gprsize+16*14] ;in14 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - call m(idct_8x32_internal).main_fast - jmp .pass2 - -.full: - mova [coeffq+16*0 ], m0 ;in0 - mova [coeffq+16*4 ], m4 ;in4 - - LOAD_8ROWS coeffq+16*2, 128, 1 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+16*6, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end4: - SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end5: - mova [coeffq+16*2 ], m0 ;in16 - mova [coeffq+16*6 ], m4 ;in20 - mova [rsp+gprsize+16*15], m2 ;in18 - mova [rsp+gprsize+16*16], m6 ;in22 - mova [rsp+gprsize+16*33], m1 ;in17 - mova [rsp+gprsize+16*28], m3 ;in19 - mova [rsp+gprsize+16*29], m5 ;in21 - mova [rsp+gprsize+16*32], m7 ;in23 - - LOAD_8ROWS coeffq+16*3, 128, 1 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+16*7, 128, 1 - call m(idct_16x8_internal).main - lea tx2q, [o(m(idct_16x32_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end6: - SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_16x32_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end7: - mova [rsp+gprsize+16*17], m2 ;in26 - mova [rsp+gprsize+16*18], m6 ;in30 - mova [rsp+gprsize+16*31], m1 ;in25 - mova [rsp+gprsize+16*30], m3 ;in27 - mova [rsp+gprsize+16*27], m5 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - - mova m6, m0 ;in24 - mova m7, m4 ;in28 - mova m0, [coeffq+16*0 ] ;in0 - mova m1, [coeffq+16*4 ] ;in4 - mova m2, [coeffq+16*1 ] ;in8 - mova m3, [coeffq+16*5 ] ;in12 - mova m4, [coeffq+16*2 ] ;in16 - mova m5, [coeffq+16*6 ] ;in20 - call 
m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3 , 16 - LOAD_8ROWS rsp+gprsize+16*11, 16 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - call m(idct_8x32_internal).main - -.pass2: - mov [rsp+gprsize*1+16*35], eobd - lea r3, [dstq+8] - mov [rsp+gprsize*2+16*35], r3 - lea r3, [o(m(idct_16x32_internal).end)] - jmp m(idct_8x32_internal).end - -.end: - mov dstq, [rsp+gprsize*2+16*35] - mov eobd, [rsp+gprsize*1+16*35] - add coeffq, 16*32 - - mova m0, [coeffq+16*4 ] ;in1 - mova m1, [coeffq+16*12] ;in3 - mova m2, [coeffq+16*20] ;in5 - mova m3, [coeffq+16*28] ;in7 - mova m4, [coeffq+16*5 ] ;in9 - mova m5, [coeffq+16*13] ;in11 - mova m6, [coeffq+16*21] ;in13 - mova m7, [coeffq+16*29] ;in15 - - mova [rsp+gprsize+16*19], m0 ;in1 - mova [rsp+gprsize+16*26], m1 ;in3 - mova [rsp+gprsize+16*23], m2 ;in5 - mova [rsp+gprsize+16*22], m3 ;in7 - mova [rsp+gprsize+16*21], m4 ;in9 - mova [rsp+gprsize+16*24], m5 ;in11 - mova [rsp+gprsize+16*25], m6 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - - mova m0, [coeffq+16*0 ] ;in0 - mova m1, [coeffq+16*16] ;in4 - mova m2, [coeffq+16*1 ] ;in8 - mova m3, [coeffq+16*17] ;in12 - - cmp eobd, 150 - jg .full1 - - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - mova m0, [coeffq+16*8 ] ;in2 - mova m1, [coeffq+16*24] ;in6 - mova m2, [coeffq+16*9 ] ;in10 - mova m3, [coeffq+16*25] ;in14 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - call m(idct_8x32_internal).main_fast - jmp .end1 - -.full1: - mova m4, [coeffq+16*2 ] ;in16 - mova m5, [coeffq+16*18] ;in20 - mova m6, [coeffq+16*3 ] ;in24 - mova m7, [coeffq+16*19] ;in26 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - mova m0, [coeffq+16*8 ] ;in2 - mova m1, [coeffq+16*24] ;in6 - mova m2, [coeffq+16*9 ] ;in10 - mova m3, [coeffq+16*25] ;in14 - mova m4, [coeffq+16*10] ;in18 - mova m5, [coeffq+16*26] ;in22 - mova m6, [coeffq+16*11] ;in26 - mova m7, [coeffq+16*27] ;in30 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - mova m0, [coeffq+16*6 ] ;in17 - mova m1, [coeffq+16*14] ;in19 - mova m2, [coeffq+16*22] ;in21 - mova m3, [coeffq+16*30] ;in23 - mova m4, [coeffq+16*7 ] ;in25 - mova m5, [coeffq+16*15] ;in27 - mova m6, [coeffq+16*23] ;in29 - mova m7, [coeffq+16*31] ;in31 - - mova [rsp+gprsize+16*33], m0 ;in17 - mova [rsp+gprsize+16*28], m1 ;in19 - mova [rsp+gprsize+16*29], m2 ;in21 - mova [rsp+gprsize+16*32], m3 ;in23 - mova [rsp+gprsize+16*31], m4 ;in25 - mova [rsp+gprsize+16*30], m5 ;in27 - mova [rsp+gprsize+16*27], m6 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - - call m(idct_8x32_internal).main - -.end1: - jmp m(idct_8x32_internal).pass2 - - - -cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - - call m(idct_32x16_internal) - call m(idct_8x16_internal).pass2 - - add coeffq, 16*16 - lea dstq, [r3+8] - LOAD_8ROWS rsp+16*11, 16 - mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 - - add coeffq, 16*16 - lea dstq, [r3+8] - LOAD_8ROWS rsp+16*19, 16 - mova [rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 - - add coeffq, 16*16 - lea dstq, [r3+8] - LOAD_8ROWS rsp+16*27, 16 - mova 
[rsp+16*0], m7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end - call m(idct_8x16_internal).pass2 - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_16384)] - mov [coeffq], eobd - pmulhrsw m0, m1 - mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body - - -cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - - add coeffq, 16 - lea r3, [o(m(idct_32x16_internal).pass1_end1)] -.pass1: - LOAD_8ROWS coeffq+16*0, 128, 1 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - LOAD_8ROWS coeffq+16*4, 128, 1 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - LOAD_8ROWS coeffq+16*2, 64, 1 - mova [rsp+gprsize+16*19], m0 ;in1 - mova [rsp+gprsize+16*26], m1 ;in3 - mova [rsp+gprsize+16*23], m2 ;in5 - mova [rsp+gprsize+16*22], m3 ;in7 - mova [rsp+gprsize+16*21], m4 ;in9 - mova [rsp+gprsize+16*24], m5 ;in11 - mova [rsp+gprsize+16*25], m6 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - - LOAD_8ROWS coeffq+16*34, 64, 1 - mova [rsp+gprsize+16*33], m0 ;in17 - mova [rsp+gprsize+16*28], m1 ;in19 - mova [rsp+gprsize+16*29], m2 ;in21 - mova [rsp+gprsize+16*32], m3 ;in23 - mova [rsp+gprsize+16*31], m4 ;in25 - mova [rsp+gprsize+16*30], m5 ;in27 - mova [rsp+gprsize+16*27], m6 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - call m(idct_8x32_internal).main - -.pass1_end: - mova [rsp+gprsize+16*0 ], m7 - mov tx2q, r3 - jmp m(idct_8x8_internal).pass1_end - -.pass1_end1: - SAVE_8ROWS coeffq+16*0, 32 - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end2: - SAVE_8ROWS coeffq+16*16, 32 - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end3: - SAVE_8ROWS coeffq+16*32, 32 - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0 ], m7 - lea tx2q, [o(m(idct_32x16_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end4: - SAVE_8ROWS coeffq+16*48, 32 - - sub coeffq, 16 - lea r3, [o(m(idct_32x16_internal).end)] - jmp .pass1 - -.end: - ret - - -cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r4d, eobd - cmp eobd, 43 ;if (eob > 43) - sbb r3d, r3d ; iteration_count++ - cmp r4d, 150 ;if (eob > 150) - sbb r3d, 0 ; iteration_count++ - cmp r4d, 278 ;if (eob > 278) - sbb r3d, -4 ; iteration_count++ - -%if ARCH_X86_32 - LEA r5, $$ -%endif - lea r4, [dstq+8] - mov [rsp+16*3], r4 - mov [rsp+gprsize+16*3], r3d - mov [rsp+gprsize*2+16*3], coeffq - -.loop: - LOAD_8ROWS coeffq, 64, 1 - mova [rsp+16*1], m6 - pxor m6, m6 - REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 - mova [rsp+16*0], m2 - mova [rsp+16*1], m3 - mova [rsp+16*2], m4 - mova m3, [o(pw_1697x16)] - mova m4, [o(pw_16384)] - REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 - mova m2, [o(pw_8192)] - REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1 - mova m2, [rsp+16*0] - mova [rsp+16*0], m7 - IDTX16 2, 7, 3, 4 - mova m7, [rsp+16*2] - mova [rsp+16*2], m5 - IDTX16 7, 5, 3, 4 - mova m5, [rsp+16*1] - mova [rsp+16*1], m6 - pmulhrsw m3, m5 - pmulhrsw m3, m4 - psrlw m4, 1 ; pw_8192 - paddsw m3, m5 - pmulhrsw m2, m4 - pmulhrsw m3, m4 - pmulhrsw m4, m7 - call m(idct_8x8_internal).end3 - lea dstq, 
[dstq+strideq*2] - add coeffq, 16 - dec r3d - jg .loop - mov coeffq, [rsp+gprsize*2+16*3] - add coeffq, 64*8 - mov r3d, [rsp+gprsize+16*3] - xor dstq, dstq - mov [rsp+gprsize+16*3], dstq - mov dstq, [rsp+16*3] - test r3d, r3d - jnz .loop - RET - - -cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r4d, 12 ;0100b - mov r5d, 136 ;1000 1000b - cmp eobd, 44 ;if (eob > 43) - cmovns r4d, r5d ; iteration_count+2 - cmp eobd, 151 ;if (eob > 150) - mov r3d, 34952 ;1000 1000 1000 1000b - cmovs r3d, r4d ; iteration_count += 4 - -%if ARCH_X86_32 - LEA r5, $$ -%endif - lea r4, [dstq+8] - mov [rsp+16*3], r4 - -.loop: - LOAD_8ROWS coeffq, 32, 1 - REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 - mova [rsp+16*1], m6 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 - mova [rsp+16*1], m5 - mova [rsp+16*2], m6 - mova m6, [o(pw_1697x16)] - REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 - pmulhrsw m7, [o(pw_2048)] - mova m5, [rsp+16*1] - mova [rsp+16*0], m7 - IDTX16 5, 7, 6 - mova m7, [rsp+16*2] - IDTX16 7, 6, 6 - mova m6, [o(pw_2048)] - REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 - mova [rsp+16*2], m5 - mova [rsp+16*1], m7 - call m(idct_8x8_internal).end3 - lea dstq, [dstq+strideq*2] - pxor m7, m7 - REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 - -.loop_end: - add coeffq, 16 - shr r3d, 2 - jz .ret - test r3d, 2 - jnz .loop - mov r4d, r3d - and r4d, 1 - lea coeffq, [coeffq+r4*8+32*7] - mov dstq, [rsp+16*3] - lea r4, [dstq+8] - mov [rsp+16*3], r4 - jmp .loop - -.ret: - RET - - -cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - - call m(idct_32x32_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_8192)] - mov [coeffq], eobd - mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body - - -cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r4d, 2 - sub eobd, 136 - mov [rsp+gprsize*1+16*35], eobd - mov r3d, 4 - cmovs r3d, r4d - -%if ARCH_X86_32 - LEA r5, $$ -%endif - - mov [rsp+gprsize*2+16*35], coeffq - -.pass1_loop: - LOAD_8ROWS coeffq+64*1, 64*2 - mova [rsp+gprsize+16*19], m0 ;in1 - mova [rsp+gprsize+16*26], m1 ;in3 - mova [rsp+gprsize+16*23], m2 ;in5 - mova [rsp+gprsize+16*22], m3 ;in7 - mova [rsp+gprsize+16*21], m4 ;in9 - mova [rsp+gprsize+16*24], m5 ;in11 - mova [rsp+gprsize+16*25], m6 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - - mov tx2d, [rsp+gprsize*1+16*35] - test tx2d, tx2d - jl .fast - -.full: - LOAD_8ROWS coeffq+64*0, 64*4 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+64*2, 64*4 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - LOAD_8ROWS coeffq+64*17, 64*2 - mova [rsp+gprsize+16*33], m0 ;in17 - mova [rsp+gprsize+16*28], m1 ;in19 - mova [rsp+gprsize+16*29], m2 ;in21 - mova [rsp+gprsize+16*32], m3 ;in23 - mova [rsp+gprsize+16*31], m4 ;in25 - mova [rsp+gprsize+16*30], m5 ;in27 - mova [rsp+gprsize+16*27], m6 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - - call m(idct_8x32_internal).main - jmp .pass1_end - -.fast: - mova m0, [coeffq+256*0] - mova m1, [coeffq+256*1] - mova m2, [coeffq+256*2] - mova m3, [coeffq+256*3] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - - SAVE_7ROWS rsp+gprsize+16*3, 16 - mova m0, [coeffq+128*1] - mova m1, [coeffq+128*3] - 
mova m2, [coeffq+128*5] - mova m3, [coeffq+128*7] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - call m(idct_8x32_internal).main_fast - -.pass1_end: - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end1: - SAVE_8ROWS coeffq+64*0, 64 - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end2: - SAVE_8ROWS coeffq+64*8, 64 - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end3: - SAVE_8ROWS coeffq+64*16, 64 - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_32x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end4: - SAVE_8ROWS coeffq+64*24, 64 - - add coeffq, 16 - dec r3d - jg .pass1_loop - - -.pass2: - mov coeffq, [rsp+gprsize*2+16*35] - mov r3d, 4 - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] - -.pass2_loop: - mov [rsp+gprsize*3+16*35], r3d - lea r3, [dstq+8] - mov [rsp+gprsize*2+16*35], r3 - - mova m0, [coeffq+16*4 ] - mova m1, [coeffq+16*12] - mova m2, [coeffq+16*20] - mova m3, [coeffq+16*28] - mova m4, [coeffq+16*5 ] - mova m5, [coeffq+16*13] - mova m6, [coeffq+16*21] - mova m7, [coeffq+16*29] - mova [rsp+gprsize+16*19], m0 ;in1 - mova [rsp+gprsize+16*26], m1 ;in3 - mova [rsp+gprsize+16*23], m2 ;in5 - mova [rsp+gprsize+16*22], m3 ;in7 - mova [rsp+gprsize+16*21], m4 ;in9 - mova [rsp+gprsize+16*24], m5 ;in11 - mova [rsp+gprsize+16*25], m6 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - - mov eobd, [rsp+gprsize*1+16*35] - test eobd, eobd - jl .fast1 - -.full1: - mova m0, [coeffq+16*0 ] - mova m1, [coeffq+16*16] - mova m2, [coeffq+16*1 ] - mova m3, [coeffq+16*17] - mova m4, [coeffq+16*2 ] - mova m5, [coeffq+16*18] - mova m6, [coeffq+16*3 ] - mova m7, [coeffq+16*19] - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - mova m0, [coeffq+16*8 ] - mova m1, [coeffq+16*24] - mova m2, [coeffq+16*9 ] - mova m3, [coeffq+16*25] - mova m4, [coeffq+16*10] - mova m5, [coeffq+16*26] - mova m6, [coeffq+16*11] - mova m7, [coeffq+16*27] - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - mova m0, [coeffq+16*6 ] - mova m1, [coeffq+16*14] - mova m2, [coeffq+16*22] - mova m3, [coeffq+16*30] - mova m4, [coeffq+16*7 ] - mova m5, [coeffq+16*15] - mova m6, [coeffq+16*23] - mova m7, [coeffq+16*31] - mova [rsp+gprsize+16*33], m0 ;in17 - mova [rsp+gprsize+16*28], m1 ;in19 - mova [rsp+gprsize+16*29], m2 ;in21 - mova [rsp+gprsize+16*32], m3 ;in23 - mova [rsp+gprsize+16*31], m4 ;in25 - mova [rsp+gprsize+16*30], m5 ;in27 - mova [rsp+gprsize+16*27], m6 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - - call m(idct_8x32_internal).main - jmp tx2q - -.fast1: - mova m0, [coeffq+16*0 ] - mova m1, [coeffq+16*16] - mova m2, [coeffq+16*1 ] - mova m3, [coeffq+16*17] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - mova m0, [coeffq+16*8 ] - mova m1, [coeffq+16*24] - mova m2, [coeffq+16*9 ] - mova m3, [coeffq+16*25] - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS 
rsp+gprsize+16*11, 16 - - call m(idct_8x32_internal).main_fast - jmp tx2q - -.pass2_end: - lea r3, [o(m(idct_32x32_internal).pass2_end1)] - jmp m(idct_8x32_internal).end - -.pass2_end1: - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] - add coeffq, 16*32 - mov dstq, [rsp+gprsize*2+16*35] - mov r3d, [rsp+gprsize*3+16*35] - dec r3d - jg .pass2_loop - - ret - - -cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r4d, 2 - cmp eobd, 136 - mov r3d, 4 - cmovs r3d, r4d - -%if ARCH_X86_32 - LEA r5, $$ -%endif - - lea r4, [dstq+8] - mov [rsp+gprsize*0+16*3], r4 - mov [rsp+gprsize*1+16*3], r3d - mov [rsp+gprsize*2+16*3], r3d - mov [rsp+gprsize*3+16*3], coeffq - -.loop: - LOAD_8ROWS coeffq, 64 - mova [rsp+16*1], m6 - lea tx2q, [o(m(idct_32x16_internal).end)] - call m(idct_8x8_internal).pass1_end3 - pmulhrsw m7, [o(pw_8192)] - mova [rsp+16*0], m7 - mova m7, [o(pw_8192)] - REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 - mova [rsp+16*1], m6 - mova [rsp+16*2], m5 - call m(idct_8x8_internal).end3 - lea dstq, [dstq+strideq*2] - - pxor m7, m7 - REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 - - add coeffq, 16 - dec r3d - jg .loop - - mov r4d, [rsp+gprsize*2+16*3] - dec r4d - jle .ret - - mov dstq, [rsp+gprsize*0+16*3] - mov coeffq, [rsp+gprsize*3+16*3] - mov [rsp+gprsize*2+16*3], r4 - lea r3, [dstq+8] - add coeffq, 64*8 - mov [rsp+gprsize*0+16*3], r3 - mov r3d, [rsp+gprsize*1+16*3] - mov [rsp+gprsize*3+16*3], coeffq - jmp .loop - -.ret: - RET - - -cglobal inv_txfm_add_dct_dct_16x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - - call m(idct_16x64_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_8192)] - mov [coeffq], eobd - mov r2d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_16x64).end)] - jmp m(inv_txfm_add_dct_dct_16x4).dconly - -.end: - RET - - -cglobal idct_16x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r4d, 2 - sub eobd, 151 - mov [rsp+gprsize*1+16*67], eobd - mov r3d, 4 - cmovs r3d, r4d - -%if ARCH_X86_32 - LEA r5, $$ -%endif - - mov [rsp+gprsize*2+16*67], coeffq - -.pass1_loop: - LOAD_8ROWS coeffq+64*0, 64*2 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+64*1, 64*2 - call m(idct_16x8_internal).main - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end: - SAVE_8ROWS coeffq+64*8, 64 - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_16x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end1: - SAVE_8ROWS coeffq+64*0, 64 - - add coeffq, 16 - dec r3d - jg .pass1_loop - - mov coeffq, [rsp+gprsize*2+16*67] - mov r3d, 2 - lea r4, [dstq+8] - mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - -.pass2_loop: - mov [rsp+gprsize*3+16*67], r3d - mov eobd, [rsp+gprsize*1+16*67] - - mova m0, [coeffq+16*4 ] ;in1 - mova m1, [coeffq+16*12] ;in3 - mova m2, [coeffq+16*20] ;in5 - mova m3, [coeffq+16*28] ;in7 - mova m4, [coeffq+16*5 ] ;in9 - mova m5, [coeffq+16*13] ;in11 - mova m6, [coeffq+16*21] ;in13 - mova m7, [coeffq+16*29] ;in15 - mova [rsp+gprsize+16*35], m0 ;in1 - mova [rsp+gprsize+16*49], m1 ;in3 - mova [rsp+gprsize+16*43], m2 ;in5 - mova [rsp+gprsize+16*41], m3 ;in7 - mova [rsp+gprsize+16*39], m4 ;in9 - mova [rsp+gprsize+16*45], m5 ;in11 - mova [rsp+gprsize+16*47], m6 ;in13 - mova 
[rsp+gprsize+16*37], m7 ;in15 - - pxor m4, m4 - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - - test eobd, eobd - jl .fast - -.full: - mova m2, [coeffq+16*2] - mova m3, [coeffq+16*3] - - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - pxor m4, m4 - mova m0, [coeffq+16*16] - mova m1, [coeffq+16*17] - mova m2, [coeffq+16*18] - mova m3, [coeffq+16*19] - - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - mova m0, [coeffq+16*8 ] - mova m1, [coeffq+16*24] - mova m2, [coeffq+16*9 ] - mova m3, [coeffq+16*25] - mova m4, [coeffq+16*10] - mova m5, [coeffq+16*26] - mova m6, [coeffq+16*11] - mova m7, [coeffq+16*27] - mova [rsp+gprsize+16*19], m0 - mova [rsp+gprsize+16*26], m1 - mova [rsp+gprsize+16*23], m2 - mova [rsp+gprsize+16*22], m3 - mova [rsp+gprsize+16*21], m4 - mova [rsp+gprsize+16*24], m5 - mova [rsp+gprsize+16*25], m6 - mova [rsp+gprsize+16*20], m7 - - call m(idct_8x32_internal).main_fast - SAVE_8ROWS rsp+gprsize+16*3, 16 - - mova m0, [coeffq+16*6 ] ;in17 - mova m1, [coeffq+16*14] ;in19 - mova m2, [coeffq+16*22] ;in21 - mova m3, [coeffq+16*30] ;in23 - mova m4, [coeffq+16*7 ] ;in25 - mova m5, [coeffq+16*15] ;in27 - mova m6, [coeffq+16*23] ;in29 - mova m7, [coeffq+16*31] ;in31 - mova [rsp+gprsize+16*63], m0 ;in17 - mova [rsp+gprsize+16*53], m1 ;in19 - mova [rsp+gprsize+16*55], m2 ;in21 - mova [rsp+gprsize+16*61], m3 ;in23 - mova [rsp+gprsize+16*59], m4 ;in25 - mova [rsp+gprsize+16*57], m5 ;in27 - mova [rsp+gprsize+16*51], m6 ;in29 - mova [rsp+gprsize+16*65], m7 ;in31 - - call .main - jmp .end - -.fast: - REPX {mova x, m4}, m2, m3, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - pxor m4, m4 - mova m0, [coeffq+16*16] - mova m1, [coeffq+16*17] - - REPX {mova x, m4}, m2, m3, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - mova m0, [coeffq+16*8 ] - mova m1, [coeffq+16*24] - mova m2, [coeffq+16*9 ] - mova m3, [coeffq+16*25] - mova [rsp+gprsize+16*19], m0 ;in1 - mova [rsp+gprsize+16*26], m1 ;in3 - mova [rsp+gprsize+16*23], m2 ;in5 - mova [rsp+gprsize+16*22], m3 ;in7 - - call m(idct_8x32_internal).main_veryfast - SAVE_8ROWS rsp+gprsize+16*3, 16 - - call .main_fast - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mov r3, r4 - jmp m(idct_8x32_internal).end2 - -.end1: - LOAD_8ROWS rsp+gprsize+16*35, 16 - lea dstq, [dstq+strideq*2] - add rsp, 16*32 - lea r3, [o(m(idct_16x64_internal).end2)] - jmp m(idct_8x32_internal).end - -.end2: - add coeffq, 16*32 - sub rsp, 16*32 - - mov dstq, [rsp+gprsize*2+16*67] - mov r3d, [rsp+gprsize*3+16*67] - lea r4, [dstq+8] - mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - - dec r3d - jg .pass2_loop - ret - - -ALIGN function_align -.main_fast: - mova m0, [rsp+gprsize*2+16*35] ;in1 - pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 - pmulhrsw m0, [o(pw_101x8)] ;t32,t33 - mova m7, [o(pd_2048)] - mova [rsp+gprsize*2+16*35], m0 ;t32 - mova [rsp+gprsize*2+16*66], m3 ;t63 - ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a - mova [rsp+gprsize*2+16*36], m3 ;t33a - mova [rsp+gprsize*2+16*65], m0 ;t62a - - mova m1, [rsp+gprsize*2+16*37] ;in15 - pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 - pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 - mova [rsp+gprsize*2+16*38], m1 ;t35 - mova [rsp+gprsize*2+16*63], m2 ;t60 - ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a - mova [rsp+gprsize*2+16*37], m2 ;t34a - mova 
[rsp+gprsize*2+16*64], m1 ;t61a - - mova m0, [rsp+gprsize*2+16*39] ;in9 - pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 - pmulhrsw m0, [o(pw_897x8)] ;t36,t37 - mova [rsp+gprsize*2+16*39], m0 ;t36 - mova [rsp+gprsize*2+16*62], m3 ;t59 - ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a - mova [rsp+gprsize*2+16*40], m3 ;t37a - mova [rsp+gprsize*2+16*61], m0 ;t58a - - mova m1, [rsp+gprsize*2+16*41] ;in7 - pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 - pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 - mova [rsp+gprsize*2+16*42], m1 ;t39 - mova [rsp+gprsize*2+16*59], m2 ;t56 - ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a - mova [rsp+gprsize*2+16*41], m2 ;t38a - mova [rsp+gprsize*2+16*60], m1 ;t57a - - mova m0, [rsp+gprsize*2+16*43] ;in5 - pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 - pmulhrsw m0, [o(pw_501x8)] ;t40,t41 - mova [rsp+gprsize*2+16*43], m0 ;t40 - mova [rsp+gprsize*2+16*58], m3 ;t55 - ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a - mova [rsp+gprsize*2+16*44], m3 ;t41a - mova [rsp+gprsize*2+16*57], m0 ;t54a - - mova m1, [rsp+gprsize*2+16*45] ;in11 - pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 - pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 - mova [rsp+gprsize*2+16*46], m1 ;t43 - mova [rsp+gprsize*2+16*55], m2 ;t52 - ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a - mova [rsp+gprsize*2+16*45], m2 ;t42a - mova [rsp+gprsize*2+16*56], m1 ;t53a - - mova m0, [rsp+gprsize*2+16*47] ;in13 - pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51 - pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 - mova m6, m0 - mova [rsp+gprsize*2+16*54], m3 ;t51 - ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a - mova [rsp+gprsize*2+16*48], m3 ;t45a - mova [rsp+gprsize*2+16*53], m0 ;t50a - - mova m0, [rsp+gprsize*2+16*49] ;in3 - pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49 - pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 - mova m4, m3 - mova m5, m0 - - jmp .main2 - -ALIGN function_align -.main: - mova m0, [rsp+gprsize*2+16*35] ;in1 - mova m1, [rsp+gprsize*2+16*65] ;in31 - pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a - pmulhrsw m0, [o(pw_101x8)] ;t32a - pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a - pmulhrsw m1, [o(pw_m2824x8)] ;t33a - mova m7, [o(pd_2048)] - psubsw m4, m0, m1 ;t33 - paddsw m0, m1 ;t32 - psubsw m5, m3, m2 ;t62 - paddsw m3, m2 ;t63 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a - mova [rsp+gprsize*2+16*35], m0 ;t32 - mova [rsp+gprsize*2+16*36], m5 ;t33a - mova [rsp+gprsize*2+16*65], m4 ;t62a - mova [rsp+gprsize*2+16*66], m3 ;t63 - - mova m0, [rsp+gprsize*2+16*63] ;in17 - mova m1, [rsp+gprsize*2+16*37] ;in15 - pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a - pmulhrsw m0, [o(pw_1660x8)] ;t34a - pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a - pmulhrsw m1, [o(pw_m1474x8)] ;t35a - psubsw m4, m1, m0 ;t34 - paddsw m0, m1 ;t35 - psubsw m5, m2, m3 ;t61 - paddsw m3, m2 ;t60 - ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a - mova [rsp+gprsize*2+16*37], m5 ;t34a - mova [rsp+gprsize*2+16*38], m0 ;t35 - mova [rsp+gprsize*2+16*63], m3 ;t60 - mova [rsp+gprsize*2+16*64], m4 ;t61a - - mova m0, [rsp+gprsize*2+16*39] ;in9 - mova m1, [rsp+gprsize*2+16*61] ;in23 - pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a - pmulhrsw m0, [o(pw_897x8)] ;t36a - pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a - pmulhrsw m1, [o(pw_m2191x8)] ;t37a - psubsw m4, m0, m1 ;t37 - paddsw m0, m1 ;t36 - psubsw m5, m3, m2 ;t58 - paddsw m3, m2 ;t59 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a - mova [rsp+gprsize*2+16*39], m0 ;t36 - mova [rsp+gprsize*2+16*40], m5 ;t37a - mova [rsp+gprsize*2+16*61], m4 ;t58a - mova [rsp+gprsize*2+16*62], m3 ;t59 - - mova m0, [rsp+gprsize*2+16*59] ;in25 - mova m1, [rsp+gprsize*2+16*41] ;in7 - 
pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a - pmulhrsw m0, [o(pw_2359x8)] ;t38a - pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a - pmulhrsw m1, [o(pw_m700x8)] ;t39a - psubsw m4, m1, m0 ;t38 - paddsw m0, m1 ;t39 - psubsw m5, m2, m3 ;t57 - paddsw m3, m2 ;t56 - ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a - mova [rsp+gprsize*2+16*41], m5 ;t38a - mova [rsp+gprsize*2+16*42], m0 ;t39 - mova [rsp+gprsize*2+16*59], m3 ;t56 - mova [rsp+gprsize*2+16*60], m4 ;t57a - - mova m0, [rsp+gprsize*2+16*43] ;in5 - mova m1, [rsp+gprsize*2+16*57] ;in27 - pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a - pmulhrsw m0, [o(pw_501x8)] ;t40a - pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a - pmulhrsw m1, [o(pw_m2520x8)] ;t41a - psubsw m4, m0, m1 ;t41 - paddsw m0, m1 ;t40 - psubsw m5, m3, m2 ;t54 - paddsw m3, m2 ;t55 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a - mova [rsp+gprsize*2+16*43], m0 ;t40 - mova [rsp+gprsize*2+16*44], m5 ;t41a - mova [rsp+gprsize*2+16*57], m4 ;t54a - mova [rsp+gprsize*2+16*58], m3 ;t55 - - mova m0, [rsp+gprsize*2+16*55] ;in21 - mova m1, [rsp+gprsize*2+16*45] ;in11 - pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a - pmulhrsw m0, [o(pw_2019x8)] ;t42a - pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a - pmulhrsw m1, [o(pw_m1092x8)] ;t43a - psubsw m4, m1, m0 ;t42 - paddsw m0, m1 ;t43 - psubsw m5, m2, m3 ;t53 - paddsw m3, m2 ;t52 - ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a - mova [rsp+gprsize*2+16*45], m5 ;t42a - mova [rsp+gprsize*2+16*46], m0 ;t43 - mova [rsp+gprsize*2+16*55], m3 ;t52 - mova [rsp+gprsize*2+16*56], m4 ;t53a - - mova m0, [rsp+gprsize*2+16*47] ;in13 - mova m1, [rsp+gprsize*2+16*53] ;in19 - pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a - pmulhrsw m0, [o(pw_1285x8)] ;t44a - pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a - pmulhrsw m1, [o(pw_m1842x8)] ;t45a - psubsw m4, m0, m1 ;t45 - paddsw m0, m1 ;t44 - psubsw m5, m3, m2 ;t50 - paddsw m3, m2 ;t51 - ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a - mova m6, m0 - mova [rsp+gprsize*2+16*48], m5 ;t45a - mova [rsp+gprsize*2+16*53], m4 ;t50a - mova [rsp+gprsize*2+16*54], m3 ;t51 - - mova m0, [rsp+gprsize*2+16*51] ;in29 - mova m1, [rsp+gprsize*2+16*49] ;in3 - pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a - pmulhrsw m0, [o(pw_2675x8)] ;t46a - pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a - pmulhrsw m1, [o(pw_m301x8)] ;t47a - psubsw m5, m1, m0 ;t46 - paddsw m0, m1 ;t47 - psubsw m4, m2, m3 ;t49 - paddsw m3, m2 ;t48 - -ALIGN function_align -.main2: - ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a - mova m1, [rsp+gprsize*2+16*54] ;t51 - psubsw m2, m0, m6 ;t44a - paddsw m0, m6 ;t47a - psubsw m6, m3, m1 ;t51a - paddsw m3, m1 ;t48a - mova [rsp+gprsize*2+16*50], m0 ;t47a - mova [rsp+gprsize*2+16*51], m3 ;t48a - ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 - mova [rsp+gprsize*2+16*47], m6 ;t44 - mova [rsp+gprsize*2+16*54], m2 ;t51 - - mova m0, [rsp+gprsize*2+16*48] ;t45a - mova m3, [rsp+gprsize*2+16*53] ;t50a - psubsw m2, m4, m0 ;t45 - paddsw m4, m0 ;t46 - psubsw m6, m5, m3 ;t50 - paddsw m5, m3 ;t49 - ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a - mova [rsp+gprsize*2+16*48], m6 ;t45a - mova [rsp+gprsize*2+16*49], m4 ;t46 - mova [rsp+gprsize*2+16*52], m5 ;t49 - mova [rsp+gprsize*2+16*53], m2 ;t50a - - mova m0, [rsp+gprsize*2+16*43] ;t40 - mova m2, [rsp+gprsize*2+16*46] ;t43 - mova m3, [rsp+gprsize*2+16*55] ;t52 - mova m1, [rsp+gprsize*2+16*58] ;t55 - psubsw m4, m0, m2 ;t43a - paddsw m0, m2 ;t40a - psubsw m5, m1, m3 ;t52a - paddsw m1, m3 ;t55a - ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52 - mova [rsp+gprsize*2+16*43], m0 ;t40a - mova [rsp+gprsize*2+16*46], m5 ;t43 - mova 
[rsp+gprsize*2+16*55], m4 ;t52 - mova [rsp+gprsize*2+16*58], m1 ;t55a - - mova m0, [rsp+gprsize*2+16*44] ;t41a - mova m2, [rsp+gprsize*2+16*45] ;t42a - mova m3, [rsp+gprsize*2+16*56] ;t53a - mova m1, [rsp+gprsize*2+16*57] ;t54a - psubsw m4, m0, m2 ;t42 - paddsw m0, m2 ;t41 - psubsw m5, m1, m3 ;t53 - paddsw m1, m3 ;t54 - ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a - mova [rsp+gprsize*2+16*44], m0 ;t41 - mova [rsp+gprsize*2+16*45], m5 ;t42a - mova [rsp+gprsize*2+16*56], m4 ;t53a - mova [rsp+gprsize*2+16*57], m1 ;t54 - - mova m0, [rsp+gprsize*2+16*41] ;t38a - mova m2, [rsp+gprsize*2+16*40] ;t37a - mova m3, [rsp+gprsize*2+16*61] ;t58a - mova m1, [rsp+gprsize*2+16*60] ;t57a - psubsw m4, m0, m2 ;t37 - paddsw m0, m2 ;t38 - psubsw m5, m1, m3 ;t58 - paddsw m1, m3 ;t57 - ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a - mova [rsp+gprsize*2+16*41], m0 ;t38 - mova [rsp+gprsize*2+16*40], m5 ;t37a - mova [rsp+gprsize*2+16*61], m4 ;t58a - mova [rsp+gprsize*2+16*60], m1 ;t57 - - mova m0, [rsp+gprsize*2+16*42] ;t39 - mova m2, [rsp+gprsize*2+16*39] ;t36 - mova m3, [rsp+gprsize*2+16*62] ;t59 - mova m1, [rsp+gprsize*2+16*59] ;t56 - psubsw m4, m0, m2 ;t36a - paddsw m0, m2 ;t39a - psubsw m5, m1, m3 ;t59a - paddsw m1, m3 ;t56a - ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 - mova [rsp+gprsize*2+16*42], m0 ;t39a - mova [rsp+gprsize*2+16*39], m5 ;t36 - mova [rsp+gprsize*2+16*62], m4 ;t59 - mova [rsp+gprsize*2+16*59], m1 ;t56a - - mova m0, [rsp+gprsize*2+16*35] ;t32 - mova m2, [rsp+gprsize*2+16*38] ;t35 - mova m3, [rsp+gprsize*2+16*63] ;t60 - mova m1, [rsp+gprsize*2+16*66] ;t63 - psubsw m4, m0, m2 ;t35a - paddsw m0, m2 ;t32a - psubsw m5, m1, m3 ;t60a - paddsw m1, m3 ;t63a - ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60 - mova [rsp+gprsize*2+16*35], m0 ;t32a - mova [rsp+gprsize*2+16*38], m5 ;t35 - mova [rsp+gprsize*2+16*63], m4 ;t60 - mova [rsp+gprsize*2+16*66], m1 ;t63a - - mova m0, [rsp+gprsize*2+16*36] ;t33a - mova m2, [rsp+gprsize*2+16*37] ;t34a - mova m3, [rsp+gprsize*2+16*64] ;t61a - mova m1, [rsp+gprsize*2+16*65] ;t62a - psubsw m4, m0, m2 ;t34 - paddsw m0, m2 ;t33 - psubsw m5, m1, m3 ;t61 - paddsw m1, m3 ;t62 - ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a - - mova m2, [rsp+gprsize*2+16*41] ;t38 - mova m3, [rsp+gprsize*2+16*60] ;t57 - psubsw m6, m0, m2 ;t38a - paddsw m0, m2 ;t33a - psubsw m2, m1, m3 ;t57a - paddsw m1, m3 ;t62a - mova [rsp+gprsize*2+16*36], m0 ;t33a - mova [rsp+gprsize*2+16*65], m1 ;t62a - ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 - mova [rsp+gprsize*2+16*41], m2 ;t38 - mova [rsp+gprsize*2+16*60], m6 ;t57 - - mova m2, [rsp+gprsize*2+16*40] ;t37 - mova m3, [rsp+gprsize*2+16*61] ;t58 - psubsw m0, m5, m2 ;t37 - paddsw m5, m2 ;t34 - psubsw m1, m4, m3 ;t58 - paddsw m4, m3 ;t61 - ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a - mova [rsp+gprsize*2+16*37], m5 ;t34 - mova [rsp+gprsize*2+16*64], m4 ;t61 - mova [rsp+gprsize*2+16*40], m1 ;t37a - mova [rsp+gprsize*2+16*61], m0 ;t58a - - mova m0, [rsp+gprsize*2+16*38] ;t35 - mova m2, [rsp+gprsize*2+16*39] ;t36 - mova m3, [rsp+gprsize*2+16*62] ;t59 - mova m1, [rsp+gprsize*2+16*63] ;t60 - psubsw m4, m0, m2 ;t36a - paddsw m0, m2 ;t35a - psubsw m5, m1, m3 ;t59a - paddsw m1, m3 ;t60a - ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 - mova [rsp+gprsize*2+16*38], m0 ;t35a - mova [rsp+gprsize*2+16*39], m5 ;t36 - mova [rsp+gprsize*2+16*62], m4 ;t59 - mova [rsp+gprsize*2+16*63], m1 ;t60a - - mova m0, [rsp+gprsize*2+16*35] ;t32a - mova m2, [rsp+gprsize*2+16*42] ;t39a - mova m3, [rsp+gprsize*2+16*59] ;t56a - mova m1, 
[rsp+gprsize*2+16*66] ;t63a - psubsw m4, m0, m2 ;t39 - paddsw m0, m2 ;t32 - psubsw m5, m1, m3 ;t56 - paddsw m1, m3 ;t63 - ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a - mova [rsp+gprsize*2+16*35], m0 ;t32 - mova [rsp+gprsize*2+16*42], m5 ;t39a - mova [rsp+gprsize*2+16*59], m4 ;t56a - mova [rsp+gprsize*2+16*66], m1 ;t63 - - mova m0, [rsp+gprsize*2+16*50] ;t47a - mova m2, [rsp+gprsize*2+16*43] ;t40a - mova m3, [rsp+gprsize*2+16*58] ;t55a - mova m1, [rsp+gprsize*2+16*51] ;t48a - psubsw m4, m0, m2 ;t40 - paddsw m0, m2 ;t47 - psubsw m5, m1, m3 ;t55 - paddsw m1, m3 ;t48 - ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a - mova [rsp+gprsize*2+16*50], m0 ;t47 - mova [rsp+gprsize*2+16*43], m5 ;t40a - mova [rsp+gprsize*2+16*58], m4 ;t55a - mova [rsp+gprsize*2+16*51], m1 ;t48 - - mova m0, [rsp+gprsize*2+16*49] ;t46 - mova m2, [rsp+gprsize*2+16*44] ;t41 - mova m3, [rsp+gprsize*2+16*57] ;t54 - mova m1, [rsp+gprsize*2+16*52] ;t49 - psubsw m4, m0, m2 ;t41a - paddsw m0, m2 ;t46a - psubsw m5, m1, m3 ;t54a - paddsw m1, m3 ;t49a - ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 - mova [rsp+gprsize*2+16*49], m0 ;t46a - mova [rsp+gprsize*2+16*44], m5 ;t41 - mova [rsp+gprsize*2+16*57], m4 ;t54 - mova [rsp+gprsize*2+16*52], m1 ;t49a - - mova m0, [rsp+gprsize*2+16*48] ;t45a - mova m2, [rsp+gprsize*2+16*45] ;t42a - mova m3, [rsp+gprsize*2+16*56] ;t53a - mova m1, [rsp+gprsize*2+16*53] ;t50a - psubsw m4, m0, m2 ;t42 - paddsw m0, m2 ;t45 - psubsw m5, m1, m3 ;t53 - paddsw m1, m3 ;t50 - ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a - mova [rsp+gprsize*2+16*48], m0 ;t45 - mova [rsp+gprsize*2+16*45], m5 ;t42a - mova [rsp+gprsize*2+16*56], m4 ;t53a - mova [rsp+gprsize*2+16*53], m1 ;t50 - - mova m0, [rsp+gprsize*2+16*47] ;t44 - mova m2, [rsp+gprsize*2+16*46] ;t43 - mova m3, [rsp+gprsize*2+16*55] ;t52 - mova m1, [rsp+gprsize*2+16*54] ;t51 - psubsw m4, m0, m2 ;t43a - paddsw m0, m2 ;t44a - psubsw m5, m1, m3 ;t52a - paddsw m1, m3 ;t51a - ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 - - mova m2, [rsp+gprsize*2+16*38] ;t35a - mova m3, [rsp+gprsize*2+16*31] ;tmp[28] - psubsw m6, m2, m0 ;t44 - paddsw m2, m0 ;t35 - psubsw m0, m3, m2 ;out35 - paddsw m2, m3 ;out28 - mova m3, [rsp+gprsize*2+16*63] ;t60a - mova [rsp+gprsize*2+16*38], m0 ;out35 - mova [rsp+gprsize*2+16*31], m2 ;out28 - psubsw m0, m3, m1 ;t51 - paddsw m3, m1 ;t60 - ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a - mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] - psubsw m1, m2, m3 ;out60 - paddsw m2, m3 ;out3 - mova m3, [rsp+gprsize*2+16*22] ;tmp[19] - mova [rsp+gprsize*2+16*63], m1 ;out60 - mova [rsp+gprsize*2+16*6 ], m2 ;out3 - psubsw m1, m3, m0 ;out44 - paddsw m3, m0 ;out19 - mova m2, [rsp+gprsize*2+16*15] ;tmp[12] - - mova m0, [rsp+gprsize*2+16*39] ;t36 - mova [rsp+gprsize*2+16*47], m1 ;out44 - mova [rsp+gprsize*2+16*22], m3 ;out19 - mova m1, [rsp+gprsize*2+16*62] ;t59 - psubsw m3, m2, m6 ;out51 - paddsw m2, m6 ;out12 - mova [rsp+gprsize*2+16*54], m3 ;out51 - mova [rsp+gprsize*2+16*15], m2 ;out12 - psubsw m2, m0, m5 ;t43a - paddsw m0, m5 ;t36a - mova m5, [rsp+gprsize*2+16*30] ;tmp[27] - psubsw m3, m1, m4 ;t52a - paddsw m1, m4 ;t59a - ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 - mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] - psubsw m6, m5, m0 ;out36 - paddsw m5, m0 ;out27 - psubsw m0, m4, m1 ;out59 - paddsw m4, m1 ;out4 - mova [rsp+gprsize*2+16*39], m6 ;out36 - mova [rsp+gprsize*2+16*30], m5 ;out27 - mova [rsp+gprsize*2+16*62], m0 ;out59 - mova [rsp+gprsize*2+16*7 ], m4 ;out4 - mova m0, [rsp+gprsize*2+16*23] ;tmp[20] - mova m5, 
[rsp+gprsize*2+16*14] ;tmp[11] - psubsw m4, m0, m3 ;out43 - paddsw m0, m3 ;out20 - psubsw m6, m5, m2 ;out52 - paddsw m5, m2 ;out11 - mova [rsp+gprsize*2+16*46], m4 ;out43 - mova [rsp+gprsize*2+16*23], m0 ;out20 - mova [rsp+gprsize*2+16*55], m6 ;out52 - mova [rsp+gprsize*2+16*14], m5 ;out11 - - mova m0, [rsp+gprsize*2+16*40] ;t37a - mova m5, [rsp+gprsize*2+16*45] ;t42a - mova m3, [rsp+gprsize*2+16*56] ;t53a - mova m1, [rsp+gprsize*2+16*61] ;t58a - mova m2, [rsp+gprsize*2+16*29] ;tmp[26] - psubsw m4, m0, m5 ;t42 - paddsw m0, m5 ;t37 - psubsw m5, m1, m3 ;t53 - paddsw m1, m3 ;t58 - ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 - mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] - psubsw m6, m2, m0 ;out37 - paddsw m2, m0 ;out26 - psubsw m0, m3, m1 ;out58 - paddsw m3, m1 ;out5 - mova [rsp+gprsize*2+16*40], m6 ;out37 - mova [rsp+gprsize*2+16*29], m2 ;out26 - mova [rsp+gprsize*2+16*61], m0 ;out58 - mova [rsp+gprsize*2+16*8 ], m3 ;out5 - mova m0, [rsp+gprsize*2+16*24] ;tmp[21] - mova m1, [rsp+gprsize*2+16*13] ;tmp[10] - psubsw m2, m0, m5 ;out42 - paddsw m0, m5 ;out21 - psubsw m3, m1, m4 ;out53 - paddsw m1, m4 ;out10 - mova [rsp+gprsize*2+16*45], m2 ;out42 - mova [rsp+gprsize*2+16*24], m0 ;out21 - mova [rsp+gprsize*2+16*56], m3 ;out53 - mova [rsp+gprsize*2+16*13], m1 ;out10 - - mova m0, [rsp+gprsize*2+16*41] ;t38 - mova m5, [rsp+gprsize*2+16*44] ;t41 - mova m3, [rsp+gprsize*2+16*57] ;t54 - mova m1, [rsp+gprsize*2+16*60] ;t57 - mova m2, [rsp+gprsize*2+16*28] ;tmp[25] - psubsw m4, m0, m5 ;t41a - paddsw m0, m5 ;t38a - psubsw m5, m1, m3 ;t54a - paddsw m1, m3 ;t57a - ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a - mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] - psubsw m6, m2, m0 ;out38 - paddsw m2, m0 ;out25 - psubsw m0, m3, m1 ;out57 - paddsw m3, m1 ;out6 - mova [rsp+gprsize*2+16*41], m6 ;out38 - mova [rsp+gprsize*2+16*28], m2 ;out25 - mova [rsp+gprsize*2+16*60], m0 ;out57 - mova [rsp+gprsize*2+16*9 ], m3 ;out6 - mova m0, [rsp+gprsize*2+16*25] ;tmp[22] - mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] - psubsw m2, m0, m5 ;out41 - paddsw m0, m5 ;out22 - psubsw m3, m1, m4 ;out54 - paddsw m1, m4 ;out9 - mova [rsp+gprsize*2+16*44], m2 ;out41 - mova [rsp+gprsize*2+16*25], m0 ;out22 - mova [rsp+gprsize*2+16*57], m3 ;out54 - mova [rsp+gprsize*2+16*12], m1 ;out9 - - mova m0, [rsp+gprsize*2+16*42] ;t39a - mova m5, [rsp+gprsize*2+16*43] ;t40a - mova m3, [rsp+gprsize*2+16*58] ;t55a - mova m1, [rsp+gprsize*2+16*59] ;t56a - mova m2, [rsp+gprsize*2+16*27] ;tmp[24] - psubsw m4, m0, m5 ;t40 - paddsw m0, m5 ;t39 - psubsw m5, m1, m3 ;t55 - paddsw m1, m3 ;t56 - ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a - mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] - psubsw m6, m2, m0 ;out39 - paddsw m2, m0 ;out24 - psubsw m0, m3, m1 ;out56 - paddsw m3, m1 ;out7 - mova [rsp+gprsize*2+16*42], m6 ;out39 - mova [rsp+gprsize*2+16*27], m2 ;out24 - mova [rsp+gprsize*2+16*59], m0 ;out56 - mova [rsp+gprsize*2+16*10], m3 ;out7 - mova m0, [rsp+gprsize*2+16*26] ;tmp[23] - mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] - psubsw m2, m0, m5 ;out40 - paddsw m0, m5 ;out23 - psubsw m3, m1, m4 ;out55 - paddsw m1, m4 ;out8 - mova [rsp+gprsize*2+16*43], m2 ;out40 - mova [rsp+gprsize*2+16*26], m0 ;out23 - mova [rsp+gprsize*2+16*58], m3 ;out55 - mova [rsp+gprsize*2+16*11], m1 ;out8 - - mova m0, [rsp+gprsize*2+16*37] ;t34 - mova m5, [rsp+gprsize*2+16*48] ;t45 - mova m3, [rsp+gprsize*2+16*53] ;t50 - mova m1, [rsp+gprsize*2+16*64] ;t61 - mova m2, [rsp+gprsize*2+16*32] ;tmp[29] - psubsw m4, m0, m5 ;t45a - paddsw m0, m5 ;t34a - psubsw m5, m1, m3 ;t50a - paddsw m1, m3 ;t61a - 
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 - mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] - psubsw m6, m2, m0 ;out34 - paddsw m2, m0 ;out29 - psubsw m0, m3, m1 ;out61 - paddsw m3, m1 ;out2 - mova [rsp+gprsize*2+16*37], m6 ;out34 - mova [rsp+gprsize*2+16*32], m2 ;out29 - mova [rsp+gprsize*2+16*64], m0 ;out61 - mova [rsp+gprsize*2+16*5 ], m3 ;out2 - mova m0, [rsp+gprsize*2+16*21] ;tmp[18] - mova m1, [rsp+gprsize*2+16*16] ;tmp[13] - psubsw m2, m0, m5 ;out45 - paddsw m0, m5 ;out18 - psubsw m3, m1, m4 ;out50 - paddsw m1, m4 ;out13 - mova [rsp+gprsize*2+16*48], m2 ;out45 - mova [rsp+gprsize*2+16*21], m0 ;out18 - mova [rsp+gprsize*2+16*53], m3 ;out50 - mova [rsp+gprsize*2+16*16], m1 ;out13 - - mova m0, [rsp+gprsize*2+16*36] ;t33a - mova m5, [rsp+gprsize*2+16*49] ;t46a - mova m3, [rsp+gprsize*2+16*52] ;t49a - mova m1, [rsp+gprsize*2+16*65] ;t62a - mova m2, [rsp+gprsize*2+16*33] ;tmp[30] - psubsw m4, m0, m5 ;t46 - paddsw m0, m5 ;t33 - psubsw m5, m1, m3 ;t49 - paddsw m1, m3 ;t62 - ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 - mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] - psubsw m6, m2, m0 ;out33 - paddsw m2, m0 ;out30 - psubsw m0, m3, m1 ;out62 - paddsw m3, m1 ;out1 - mova [rsp+gprsize*2+16*36], m6 ;out33 - mova [rsp+gprsize*2+16*33], m2 ;out30 - mova [rsp+gprsize*2+16*65], m0 ;out62 - mova [rsp+gprsize*2+16*4 ], m3 ;out1 - mova m0, [rsp+gprsize*2+16*20] ;tmp[17] - mova m1, [rsp+gprsize*2+16*17] ;tmp[14] - psubsw m2, m0, m5 ;out46 - paddsw m0, m5 ;out17 - psubsw m3, m1, m4 ;out49 - paddsw m1, m4 ;out14 - mova [rsp+gprsize*2+16*49], m2 ;out46 - mova [rsp+gprsize*2+16*20], m0 ;out17 - mova [rsp+gprsize*2+16*52], m3 ;out49 - mova [rsp+gprsize*2+16*17], m1 ;out14 - - mova m0, [rsp+gprsize*2+16*35] ;t32 - mova m5, [rsp+gprsize*2+16*50] ;t47 - mova m3, [rsp+gprsize*2+16*51] ;t48 - mova m1, [rsp+gprsize*2+16*66] ;t63 - mova m2, [rsp+gprsize*2+16*34] ;tmp[31] - psubsw m4, m0, m5 ;t47a - paddsw m0, m5 ;t32a - psubsw m5, m1, m3 ;t48a - paddsw m1, m3 ;t63a - ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48 - mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ] - psubsw m6, m2, m0 ;out32 - paddsw m2, m0 ;out31 - psubsw m0, m3, m1 ;out63 - paddsw m3, m1 ;out0 - mova [rsp+gprsize*2+16*35], m6 ;out32 - mova [rsp+gprsize*2+16*34], m2 ;out31 - mova [rsp+gprsize*2+16*66], m0 ;out63 - mova [rsp+gprsize*2+16*3 ], m3 ;out0 - mova m0, [rsp+gprsize*2+16*19] ;tmp[16] - mova m1, [rsp+gprsize*2+16*18] ;tmp[15] - psubsw m2, m0, m5 ;out47 - paddsw m0, m5 ;out16 - psubsw m3, m1, m4 ;out48 - paddsw m1, m4 ;out15 - mova [rsp+gprsize*2+16*50], m2 ;out47 - mova [rsp+gprsize*2+16*19], m0 ;out16 - mova [rsp+gprsize*2+16*51], m3 ;out48 - mova [rsp+gprsize*2+16*18], m1 ;out15 - ret - - -cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - - call m(idct_64x16_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_8192)] - mov [coeffq], eobd - mov r3d, 16 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x16).end)] - -.body: - pmulhrsw m0, m2 - movd m2, [o(pw_2048)] ;intentionally rip-relative - pmulhrsw m0, m1 - pmulhrsw m0, m2 - pshuflw m0, m0, q0000 - punpcklwd m0, m0 - pxor m7, m7 - -.loop: - mova m1, [dstq+16*0] - mova m3, [dstq+16*1] - mova m5, [dstq+16*2] - mova m6, [dstq+16*3] - punpckhbw m2, m1, m7 - punpcklbw m1, m7 - punpckhbw m4, m3, m7 - punpcklbw m3, m7 - paddw m2, m0 - paddw m1, m0 - paddw m4, m0 - paddw m3, m0 - packuswb m1, m2 - packuswb m3, m4 - punpckhbw m2, m5, m7 - punpcklbw m5, m7 - punpckhbw 
m4, m6, m7 - punpcklbw m6, m7 - paddw m2, m0 - paddw m5, m0 - paddw m4, m0 - paddw m6, m0 - packuswb m5, m2 - packuswb m6, m4 - mova [dstq+16*0], m1 - mova [dstq+16*1], m3 - mova [dstq+16*2], m5 - mova [dstq+16*3], m6 - add dstq, strideq - dec r3d - jg .loop - jmp tx2q - -.end: - RET - - -%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 - -%if %3 - mova m3, [o(pw_2896x8)] - pmulhrsw m0, m3, [%1+%2*0] - pmulhrsw m1, m3, [%1+%2*1] - pmulhrsw m2, m3, [%1+%2*2] - pmulhrsw m3, [%1+%2*3] -%else - mova m0, [%1+%2*0] - mova m1, [%1+%2*1] - mova m2, [%1+%2*2] - mova m3, [%1+%2*3] -%endif -%endmacro - -%macro LOAD_4ROWS_H 2 ;src, stride - mova m4, [%1+%2*0] - mova m5, [%1+%2*1] - mova m6, [%1+%2*2] - mova m7, [%1+%2*3] -%endmacro - -cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - mov r3d, 2 - mov [rsp+gprsize*2+16*67], dstq - lea dstq, [rsp+gprsize+16*68] - -.pass1_loop: - LOAD_4ROWS coeffq+32*0, 32*8 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - pxor m4, m4 - LOAD_4ROWS coeffq+32*4, 32*8 - - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - LOAD_8ROWS coeffq+32*2, 32*4 - mova [rsp+gprsize+16*19], m0 - mova [rsp+gprsize+16*26], m1 - mova [rsp+gprsize+16*23], m2 - mova [rsp+gprsize+16*22], m3 - mova [rsp+gprsize+16*21], m4 - mova [rsp+gprsize+16*24], m5 - mova [rsp+gprsize+16*25], m6 - mova [rsp+gprsize+16*20], m7 - - call m(idct_8x32_internal).main_fast - SAVE_8ROWS rsp+gprsize+16*3, 16 - - LOAD_8ROWS coeffq+32*1, 32*2 - mova [rsp+gprsize+16*35], m0 ;in1 - mova [rsp+gprsize+16*49], m1 ;in3 - mova [rsp+gprsize+16*43], m2 ;in5 - mova [rsp+gprsize+16*41], m3 ;in7 - mova [rsp+gprsize+16*39], m4 ;in9 - mova [rsp+gprsize+16*45], m5 ;in11 - mova [rsp+gprsize+16*47], m6 ;in13 - mova [rsp+gprsize+16*37], m7 ;in15 - - LOAD_8ROWS coeffq+32*17, 32*2 - mova [rsp+gprsize+16*63], m0 ;in17 - mova [rsp+gprsize+16*53], m1 ;in19 - mova [rsp+gprsize+16*55], m2 ;in21 - mova [rsp+gprsize+16*61], m3 ;in23 - mova [rsp+gprsize+16*59], m4 ;in25 - mova [rsp+gprsize+16*57], m5 ;in27 - mova [rsp+gprsize+16*51], m6 ;in29 - mova [rsp+gprsize+16*65], m7 ;in31 - - call m(idct_16x64_internal).main - - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end: - SAVE_8ROWS coeffq+32*0, 32 - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end1: - SAVE_8ROWS coeffq+32*8, 32 - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end2: - SAVE_8ROWS coeffq+32*16, 32 - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end3: - SAVE_8ROWS coeffq+32*24, 32 - LOAD_8ROWS rsp+gprsize+16*35, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end4: - SAVE_8ROWS dstq+32*0, 32 - LOAD_8ROWS rsp+gprsize+16*43, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end5)] - jmp 
m(idct_8x8_internal).pass1_end1 - -.pass1_end5: - SAVE_8ROWS dstq+32*8, 32 - LOAD_8ROWS rsp+gprsize+16*51, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end6: - SAVE_8ROWS dstq+32*16, 32 - LOAD_8ROWS rsp+gprsize+16*59, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x16_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end7: - SAVE_8ROWS dstq+32*24, 32 - - add coeffq, 16 - add dstq, 16 - dec r3d - jg .pass1_loop - -.pass2: - mov dstq, [rsp+gprsize*2+16*67] - sub coeffq, 32 - mov r3d, 4 - -.pass2_loop: - mov [rsp+gprsize*1+16*67], r3d - - LOAD_4ROWS coeffq+16*0, 32*2 - LOAD_4ROWS_H coeffq+16*1, 32*2 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_4ROWS coeffq+16*2, 32*2 - LOAD_4ROWS_H coeffq+16*3, 32*2 - call m(idct_16x8_internal).main - - mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal).end)] - lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end - -.end: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal).end1)] - mov dstq, r3 - jmp m(idct_8x8_internal).end - -.end1: - pxor m7, m7 - REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - - add coeffq, 16*16 - mov r3d, [rsp+gprsize*1+16*67] - mov dstq, [rsp+gprsize*2+16*67] - add dstq, 8 - mov [rsp+gprsize*2+16*67], dstq - dec r3d - jg .pass2_loop - - mov r3d, 4 - lea coeffq, [rsp+gprsize+16*68] -.pass2_loop2: - mov [rsp+gprsize*1+16*67], r3d - - LOAD_4ROWS coeffq+16*0, 32*2 - LOAD_4ROWS_H coeffq+16*1, 32*2 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_4ROWS coeffq+16*2, 32*2 - LOAD_4ROWS_H coeffq+16*3, 32*2 - call m(idct_16x8_internal).main - - mov r3, dstq - lea tx2q, [o(m(idct_64x16_internal).end2)] - lea dstq, [dstq+strideq*8] - jmp m(idct_8x8_internal).end - -.end2: - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x16_internal).end3)] - mov dstq, r3 - jmp m(idct_8x8_internal).end - -.end3: - - add coeffq, 16*16 - mov r3d, [rsp+gprsize*1+16*67] - mov dstq, [rsp+gprsize*2+16*67] - add dstq, 8 - mov [rsp+gprsize*2+16*67], dstq - dec r3d - jg .pass2_loop2 - ret - - -cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - - call m(idct_32x64_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_16384)] - mov [coeffq], eobd - pmulhrsw m0, m1 - mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_32x64).end)] - jmp m(inv_txfm_add_dct_dct_32x8).body - -.end: - RET - - -cglobal idct_32x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r4d, 2 - sub eobd, 136 - mov [rsp+gprsize*1+16*67], eobd - mov r3d, 4 - cmovs r3d, r4d - -%if ARCH_X86_32 - LEA r5, $$ -%endif - - mov [rsp+gprsize*2+16*67], coeffq - -.pass1_loop: - LOAD_8ROWS coeffq+64*1, 64*2, 1 - mova [rsp+gprsize+16*19], m0 ;in1 - mova [rsp+gprsize+16*26], m1 ;in3 - mova [rsp+gprsize+16*23], m2 ;in5 - mova [rsp+gprsize+16*22], m3 ;in7 - mova [rsp+gprsize+16*21], m4 ;in9 - mova [rsp+gprsize+16*24], m5 ;in11 - mova [rsp+gprsize+16*25], m6 ;in13 - mova [rsp+gprsize+16*20], m7 ;in15 - - mov tx2d, [rsp+gprsize*1+16*67] - test tx2d, tx2d - jl .fast - -.full: - LOAD_8ROWS coeffq+64*0, 64*4, 1 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_8ROWS coeffq+64*2, 64*4, 1 - call 
m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - LOAD_8ROWS coeffq+64*17, 64*2, 1 - mova [rsp+gprsize+16*33], m0 ;in17 - mova [rsp+gprsize+16*28], m1 ;in19 - mova [rsp+gprsize+16*29], m2 ;in21 - mova [rsp+gprsize+16*32], m3 ;in23 - mova [rsp+gprsize+16*31], m4 ;in25 - mova [rsp+gprsize+16*30], m5 ;in27 - mova [rsp+gprsize+16*27], m6 ;in29 - mova [rsp+gprsize+16*34], m7 ;in31 - - call m(idct_8x32_internal).main - jmp .pass1_end - -.fast: - LOAD_4ROWS coeffq, 256, 1 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - - SAVE_7ROWS rsp+gprsize+16*3, 16 - LOAD_4ROWS coeffq+128*1, 256, 1 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - call m(idct_8x32_internal).main_fast - -.pass1_end: - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end1: - SAVE_8ROWS coeffq+64*0, 64 - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end2: - SAVE_8ROWS coeffq+64*8, 64 - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end3: - SAVE_8ROWS coeffq+64*16, 64 - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_32x64_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end4: - SAVE_8ROWS coeffq+64*24, 64 - - add coeffq, 16 - dec r3d - jg .pass1_loop - -.pass2: - mov coeffq, [rsp+gprsize*2+16*67] - mov r3d, 4 - lea r4, [dstq+8] - mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - jmp m(idct_16x64_internal).pass2_loop - - -cglobal inv_txfm_add_dct_dct_64x32, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - - call m(idct_64x32_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_16384)] - pmulhrsw m0, m1 - mov [coeffq], eobd - mov r3d, 32 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] - jmp m(inv_txfm_add_dct_dct_64x16).body - -.end: - RET - -cglobal idct_64x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r4d, 2 - sub eobd, 136 - mov [rsp+gprsize*1+16*67], eobd - mov r3d, 4 - cmovs r3d, r4d - -%if ARCH_X86_32 - LEA r5, $$ -%endif - - mov [rsp+gprsize*2+16*67], coeffq - mov [rsp+gprsize*3+16*67], dstq - lea dstq, [rsp+gprsize+16*69] - mov [rsp+gprsize*4+16*67], dstq - -.pass1_loop: - LOAD_4ROWS coeffq+64*0, 64*8, 1 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - pxor m4, m4 - LOAD_4ROWS coeffq+64*4, 64*8, 1 - - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - LOAD_8ROWS coeffq+64*2, 64*4, 1 - mova [rsp+gprsize+16*19], m0 - mova [rsp+gprsize+16*26], m1 - mova [rsp+gprsize+16*23], m2 - mova [rsp+gprsize+16*22], m3 - mova [rsp+gprsize+16*21], m4 - mova [rsp+gprsize+16*24], m5 - mova [rsp+gprsize+16*25], m6 - mova [rsp+gprsize+16*20], m7 - - call m(idct_8x32_internal).main_fast - SAVE_8ROWS rsp+gprsize+16*3, 16 - - LOAD_8ROWS coeffq+64*1, 64*2, 1 - mova [rsp+gprsize+16*35], m0 ;in1 - mova [rsp+gprsize+16*49], m1 ;in3 - mova [rsp+gprsize+16*43], m2 ;in5 - mova [rsp+gprsize+16*41], m3 ;in7 - 
mova [rsp+gprsize+16*39], m4 ;in9 - mova [rsp+gprsize+16*45], m5 ;in11 - mova [rsp+gprsize+16*47], m6 ;in13 - mova [rsp+gprsize+16*37], m7 ;in15 - - LOAD_8ROWS coeffq+64*17, 64*2, 1 - mova [rsp+gprsize+16*63], m0 ;in17 - mova [rsp+gprsize+16*53], m1 ;in19 - mova [rsp+gprsize+16*55], m2 ;in21 - mova [rsp+gprsize+16*61], m3 ;in23 - mova [rsp+gprsize+16*59], m4 ;in25 - mova [rsp+gprsize+16*57], m5 ;in27 - mova [rsp+gprsize+16*51], m6 ;in29 - mova [rsp+gprsize+16*65], m7 ;in31 - - call m(idct_16x64_internal).main - - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end: - SAVE_8ROWS coeffq+64*0, 64 - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end1: - SAVE_8ROWS coeffq+64*8, 64 - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end2: - SAVE_8ROWS coeffq+64*16, 64 - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end3: - SAVE_8ROWS coeffq+64*24, 64 - LOAD_8ROWS rsp+gprsize+16*35, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end4: - SAVE_8ROWS dstq+64*0, 64 - LOAD_8ROWS rsp+gprsize+16*43, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end5: - SAVE_8ROWS dstq+64*8, 64 - LOAD_8ROWS rsp+gprsize+16*51, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end6: - SAVE_8ROWS dstq+64*16, 64 - LOAD_8ROWS rsp+gprsize+16*59, 16 - mova [rsp+gprsize+16*0], m7 - lea tx2q, [o(m(idct_64x32_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end - -.pass1_end7: - SAVE_8ROWS dstq+64*24, 64 - - add coeffq, 16 - add dstq, 16 - dec r3d - jg .pass1_loop - -.pass2: - mov coeffq, [rsp+gprsize*4+16*67] - mov dstq, [rsp+gprsize*3+16*67] - mov eobd, [rsp+gprsize*1+16*67] - lea dstq, [dstq+32] - mov [rsp+gprsize*1+16*35], eobd - lea tx2q, [o(m(idct_64x32_internal).pass2_end)] - mov r3d, 4 - jmp m(idct_32x32_internal).pass2_loop - -.pass2_end: - mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x32_internal).pass2_end1)] - jmp m(idct_8x32_internal).end2 - -.pass2_end1: - lea tx2q, [o(m(idct_64x32_internal).pass2_end)] - add coeffq, 16*32 - mov dstq, [rsp+gprsize*2+16*35] - mov r3d, [rsp+gprsize*3+16*35] - dec r3d - jg m(idct_32x32_internal).pass2_loop - -.pass2_end2: - mov dstq, [rsp+gprsize*3+16*67] - mov coeffq, [rsp+gprsize*2+16*67] - lea tx2q, [o(m(idct_32x32_internal).pass2_end)] - mov r3d, 4 - jmp m(idct_32x32_internal).pass2_loop - - -cglobal inv_txfm_add_dct_dct_64x64, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 -%if ARCH_X86_32 - LEA r5, $$ -%endif - test eobd, eobd - jz .dconly - - call m(idct_64x64_internal) - RET - -.dconly: - movd m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - movd m2, [o(pw_8192)] - mov [coeffq], eobd - mov r3d, 64 - lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32).end)] - jmp m(inv_txfm_add_dct_dct_64x16).body - -cglobal idct_64x64_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 - %undef cmp - - mov r5d, 4 - mov r4d, 2 - sub eobd, 136 - cmovns r4d, r5d - -%if ARCH_X86_32 - LEA r5, $$ -%endif - - mov 
[rsp+gprsize*1+16*67], eobd - mov r3d, r4d - mov [rsp+gprsize*4+16*67], coeffq - mov [rsp+gprsize*3+16*67], dstq - lea dstq, [rsp+gprsize+16*69] - mov [rsp+gprsize*2+16*67], dstq - -.pass1_loop: - LOAD_4ROWS coeffq+64*0, 64*8 - pxor m4, m4 - REPX {mova x, m4}, m5, m6, m7 - call m(idct_8x8_internal).main - SAVE_7ROWS rsp+gprsize+16*3, 16 - - pxor m4, m4 - LOAD_4ROWS coeffq+64*4, 64*8 - - REPX {mova x, m4}, m5, m6, m7 - call m(idct_16x8_internal).main - mova m7, [rsp+gprsize+16*0] - SAVE_8ROWS rsp+gprsize+16*11, 16 - - LOAD_8ROWS coeffq+64*2, 64*4 - mova [rsp+gprsize+16*19], m0 - mova [rsp+gprsize+16*26], m1 - mova [rsp+gprsize+16*23], m2 - mova [rsp+gprsize+16*22], m3 - mova [rsp+gprsize+16*21], m4 - mova [rsp+gprsize+16*24], m5 - mova [rsp+gprsize+16*25], m6 - mova [rsp+gprsize+16*20], m7 - - call m(idct_8x32_internal).main_fast - SAVE_8ROWS rsp+gprsize+16*3, 16 - - LOAD_8ROWS coeffq+64*1, 64*2 - mova [rsp+gprsize+16*35], m0 ;in1 - mova [rsp+gprsize+16*49], m1 ;in3 - mova [rsp+gprsize+16*43], m2 ;in5 - mova [rsp+gprsize+16*41], m3 ;in7 - mova [rsp+gprsize+16*39], m4 ;in9 - mova [rsp+gprsize+16*45], m5 ;in11 - mova [rsp+gprsize+16*47], m6 ;in13 - mova [rsp+gprsize+16*37], m7 ;in15 - - LOAD_8ROWS coeffq+64*17, 64*2 - mova [rsp+gprsize+16*63], m0 ;in17 - mova [rsp+gprsize+16*53], m1 ;in19 - mova [rsp+gprsize+16*55], m2 ;in21 - mova [rsp+gprsize+16*61], m3 ;in23 - mova [rsp+gprsize+16*59], m4 ;in25 - mova [rsp+gprsize+16*57], m5 ;in27 - mova [rsp+gprsize+16*51], m6 ;in29 - mova [rsp+gprsize+16*65], m7 ;in31 - - call m(idct_16x64_internal).main - - LOAD_8ROWS rsp+gprsize+16*3, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end: - SAVE_8ROWS coeffq+64*0, 64 - LOAD_8ROWS rsp+gprsize+16*11, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end1)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end1: - SAVE_8ROWS coeffq+64*8, 64 - LOAD_8ROWS rsp+gprsize+16*19, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end2)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end2: - SAVE_8ROWS coeffq+64*16, 64 - LOAD_8ROWS rsp+gprsize+16*27, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end3)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end3: - SAVE_8ROWS coeffq+64*24, 64 - LOAD_8ROWS rsp+gprsize+16*35, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end4)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end4: - SAVE_8ROWS dstq+64*0, 64 - LOAD_8ROWS rsp+gprsize+16*43, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end5)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end5: - SAVE_8ROWS dstq+64*8, 64 - LOAD_8ROWS rsp+gprsize+16*51, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end6)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end6: - SAVE_8ROWS dstq+64*16, 64 - LOAD_8ROWS rsp+gprsize+16*59, 16 - mova [rsp+gprsize+16*0], m7 - mova m7, [o(pw_8192)] - lea tx2q, [o(m(idct_64x64_internal).pass1_end7)] - jmp m(idct_8x8_internal).pass1_end1 - -.pass1_end7: - SAVE_8ROWS dstq+64*24, 64 - - add coeffq, 16 - add dstq, 16 - dec r3d - jg .pass1_loop - -.pass2: - mov dstq, [rsp+gprsize*3+16*67] - mov coeffq, [rsp+gprsize*2+16*67] - lea dstq, [dstq+32] - mov r3d, 4 - lea r4, [dstq+8] - mov 
[rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal).pass2_end)] - jmp m(idct_16x64_internal).pass2_loop - -.pass2_end: - LOAD_8ROWS rsp+gprsize+16*35, 16 - lea dstq, [dstq+strideq*2] - add rsp, 16*32 - mova [rsp+gprsize+16*0], m7 - lea r3, [o(m(idct_64x64_internal).pass2_end1)] - jmp m(idct_8x32_internal).end2 - -.pass2_end1: - add coeffq, 16*32 - sub rsp, 16*32 - - mov dstq, [rsp+gprsize*2+16*67] - mov r3d, [rsp+gprsize*3+16*67] - lea r4, [dstq+8] - mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_64x64_internal).pass2_end)] - - dec r3d - jg m(idct_16x64_internal).pass2_loop - -.pass2_end2: - mov coeffq, [rsp+gprsize*4+16*67] - mov dstq, [rsp+gprsize*2+16*67] - mov r3d, 4 - sub dstq, 72 - lea r4, [dstq+8] - mov [rsp+gprsize*2+16*67], r4 - lea r4, [o(m(idct_16x64_internal).end1)] - jmp m(idct_16x64_internal).pass2_loop diff -Nru dav1d-0.7.1/src/x86/loopfilter16_avx2.asm dav1d-0.9.1/src/x86/loopfilter16_avx2.asm --- dav1d-0.7.1/src/x86/loopfilter16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/loopfilter16_avx2.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,1163 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
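; The 16bpc loop filter that follows reuses the 8-bit E/I/H thresholds from
; the filter level/LUT bytes and scales them up to the pixel range at run
; time. Roughly, in C terms (illustrative only, not the dav1d source):
;
;   int scale = 1 << (bitdepth - 8);   /* 4 for 10-bit, 16 for 12-bit */
;   E *= scale;  I *= scale;  H *= scale;
;
; This is why pw_4 and pw_16 are kept adjacent in the constant section below:
; the entry points derive an offset of 0 or 32 bytes from bitdepth_max and
; use it to select one of the two words as the threshold multiplier.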
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 16 dw 1 +pw_2: times 16 dw 2 +pw_3: times 16 dw 3 +; 4 and 16 need to be next to each other since they are used as alternates +; depending on whether bitdepth is 10 or 12 +pw_4: times 16 dw 4 +pw_16: times 16 dw 16 +pw_8: times 16 dw 8 +pw_4096: times 16 dw 4096 + +pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; xmm%1 a b c d e f g h a i q y 6 E M U +; xmm%2 i j k l m n o p b j r z 7 F N V +; xmm%3 q r s t u v w x c k s 0 8 G O W +; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; xmm%6 E F G H I J K L f n v 3 B J R Z +; xmm%7 M N O P Q R S T g o w 4 C K S + +; xmm%8 U V W X Y Z + = h p x 5 D L T = +%macro TRANSPOSE8X8W 9 + ; xmm%1 a b c d e f g h a i q y b j r z + ; xmm%2 i j k l m n o p c k s 0 d l t 1 + ; xmm%3 q r s t u v w x -> e m u 2 f n v 3 + ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; xmm%6 E F G H I J K L 8 G O W 9 H P X + ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z + ; xmm%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; xmm%1 a i q y b j r z a i q y 6 E M U + ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V + ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W + ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X + ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z + ; xmm%7 A I Q Y B J R Z g o w 4 C K S + + ; xmm%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro + +; transpose and write m3-6, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x16 0 + ; transpose 8x4 + punpcklwd m0, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckldq m6, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m3, m5 + punpckhdq m3, m5 + + ; write out + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm0 + movhps [dstq+stride3q -4], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm4 + movhps [dstq+strideq*1-4], xm4 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + + vextracti128 xm6, m6, 1 + vextracti128 xm0, m0, 1 + vextracti128 xm4, m4, 1 + vextracti128 xm3, m3, 1 + + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm0 + movhps [dstq+stride3q -4], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm4 + movhps [dstq+strideq*1-4], xm4 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 
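; Pixel naming convention used throughout FILTER: pN are the pixels on one
; side of the edge, counted outward from it, and qN the pixels on the other
; side, so in the vertical ('v') case p1/p0 sit above dst and q0/q1 at/below
; it. The horizontal ('h') case loads columns instead and therefore goes
; through the word transposes defined above; conceptually (illustrative C,
; not the actual helpers):
;
;   for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 8; j++)
;           out[j][i] = in[i][j];   /* done with punpck{l,h}{wd,dq,qdq} */
;
; so that the same row-oriented filter math serves both edge directions.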
+ mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] + ; we load p3 later + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif +%endif +%else + ; load lines +%if %1 == 4 + movq xm3, [dstq+strideq*0-4] + movq xm4, [dstq+strideq*1-4] + movq xm5, [dstq+strideq*2-4] + movq xm6, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq xm11, [tmpq+strideq*0-4] + movq xm13, [tmpq+strideq*1-4] + movq xm14, [tmpq+strideq*2-4] + movq xm15, [tmpq+stride3q -4] + lea tmpq, [tmpq+strideq*4] + ; this overreads by 8 bytes but the buffers are padded + ; so that should be ok + vinserti128 m3, [tmpq+strideq*0-4], 1 + vinserti128 m4, [tmpq+strideq*1-4], 1 + vinserti128 m5, [tmpq+strideq*2-4], 1 + vinserti128 m6, [tmpq+stride3q -4], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m11, [tmpq+strideq*0-4], 1 + vinserti128 m13, [tmpq+strideq*1-4], 1 + vinserti128 m14, [tmpq+strideq*2-4], 1 + vinserti128 m15, [tmpq+stride3q -4], 1 + + ; transpose 4x8 + ; xm3: A-D0,A-D4 + ; xm4: A-D1,A-D5 + ; xm5: A-D2,A-D6 + ; xm6: A-D3,A-D7 + punpcklwd m7, m3, m4 + punpcklwd m3, m11, m13 + punpcklwd m4, m5, m6 + punpcklwd m5, m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: A4-5,B4-5,C4-5,D4-5 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: A6-7,B6-7,C6-7,D6-7 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m3, m5 + punpckhdq m5, m3, m5 + ; xm6: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm8: A4-7,B4-7 + ; xm5: C4-7,D4-7 + punpcklqdq m3, m6, m8 + punpckhqdq m4, m6, m8 + punpckhqdq m6, m7, m5 + punpcklqdq m5, m7, m5 + ; xm3: A0-7 + ; xm4: B0-7 + ; xm5: C0-7 + ; xm6: D0-7 +%elif %1 == 6 || %1 == 8 + movu xm3, [dstq+strideq*0-8] + movu xm4, [dstq+strideq*1-8] + movu xm5, [dstq+strideq*2-8] + movu xm6, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu xm11, [tmpq+strideq*0-8] + movu xm13, [tmpq+strideq*1-8] + movu xm14, [tmpq+strideq*2-8] + movu xm15, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + vinserti128 m3, [tmpq+strideq*0-8], 1 + vinserti128 m4, [tmpq+strideq*1-8], 1 + vinserti128 m5, [tmpq+strideq*2-8], 1 + vinserti128 m6, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m11, [tmpq+strideq*0-8], 1 + vinserti128 m13, [tmpq+strideq*1-8], 1 + vinserti128 m14, [tmpq+strideq*2-8], 1 + vinserti128 m15, [tmpq+stride3q -8], 1 + + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm11: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklwd m7, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m11, m13 + punpckhwd m11, m13 + punpcklwd m13, m14, m15 + punpckhwd m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: E0-1,F0-1,G0-1,H0-1 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: E2-3,F2-3,G2-3,H2-3 + ; xm6: A4-5,B4-5,C4-5,D4-5 + ; xm11: E4-5,F4-5,G4-5,H4-5 + ; xm13: A6-7,B6-7,C6-7,D6-7 + ; xm14: E6-7,F6-7,G6-7,H6-7 + punpckldq m15, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m3, m5 + punpckhdq m8, m3, m5 + punpckldq m3, m6, m13 + punpckhdq m6, m13 + punpckldq m10, m11, m14 + punpckhdq m11, m14 + ; xm15: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm9: E0-3,F0-3 + ; xm8: G0-3,H0-3 + ; xm3: A4-7,B4-7 + ; xm6: C4-7,D4-7 + ; xm10: E4-7,F4-7 + ; xm11: G4-7,H4-7 +%if %1 != 6 + punpcklqdq m0, m15, m3 +%endif + punpckhqdq m13, m15, m3 + punpcklqdq 
m3, m7, m6 + punpckhqdq m4, m7, m6 + punpcklqdq m5, m9, m10 + punpckhqdq m6, m9, m10 + punpcklqdq m14, m8, m11 +%if %1 != 6 + punpckhqdq m15, m8, m11 + mova [rsp+5*32], m0 +%endif +%else + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova xm0, [dstq+strideq*0-16] + mova xm1, [dstq+strideq*1-16] + mova xm2, [dstq+strideq*2-16] + mova xm3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] + mova xm4, [tmpq+strideq*0-16] + mova xm5, [tmpq+strideq*1-16] + mova xm6, [tmpq+strideq*2-16] + mova xm7, [tmpq+stride3q -16] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, m0, [tmpq+strideq*0-16], 1 + vinserti128 m1, m1, [tmpq+strideq*1-16], 1 + vinserti128 m2, m2, [tmpq+strideq*2-16], 1 + vinserti128 m3, m3, [tmpq+stride3q -16], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m4, m4, [tmpq+strideq*0-16], 1 + vinserti128 m5, m5, [tmpq+strideq*1-16], 1 + vinserti128 m6, m6, [tmpq+strideq*2-16], 1 + vinserti128 m7, m7, [tmpq+stride3q -16], 1 + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + + mova [rsp+6*32], m0 + mova [rsp+7*32], m1 + mova [rsp+8*32], m2 + mova [rsp+9*32], m3 + mova [rsp+5*32], m4 + + mova xm0, [dstq+strideq*0] + mova xm1, [dstq+strideq*1] + mova xm2, [dstq+strideq*2] + mova xm3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova xm8, [tmpq+strideq*0] + mova xm9, [tmpq+strideq*1] + mova xm10, [tmpq+strideq*2] + mova xm11, [tmpq+stride3q ] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, m0, [tmpq+strideq*0], 1 + vinserti128 m1, m1, [tmpq+strideq*1], 1 + vinserti128 m2, m2, [tmpq+strideq*2], 1 + vinserti128 m3, m3, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m8, m8, [tmpq+strideq*0], 1 + vinserti128 m9, m9, [tmpq+strideq*1], 1 + vinserti128 m10, m10, [tmpq+strideq*2], 1 + vinserti128 m11, m11, [tmpq+stride3q ], 1 + + TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4 + + mova [rsp+10*32], m8 + mova [rsp+11*32], m9 + mova [rsp+12*32], m10 + mova [rsp+13*32], m11 + + ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15 + SWAP 13, 5, 0 + SWAP 3, 6, 1, 15 + SWAP 4, 7 + SWAP 2, 14 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else + vpbroadcastq m0, [lq] ; l0, l1 + vpbroadcastq m1, [lq+l_strideq] ; l2, l3 + vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5 + vpbroadcastq m10, [lq+l_stride3q] ; l6, l7 + punpckldq m0, m1 ; l0, l2, l1, l3 [2x] + punpckldq m2, m10 ; l4, l6, l5, l7 [2x] + vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2, l4, l6 + punpckhbw m0, m2 ; l1, l3, l5, l7 +%endif + pcmpeqw m10, m2, m0 + pand m1, m10 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] + pcmpeqw m10, m2, m0 ; !L + psrlw m10, 1 + psrlw m2, m0, [lutq+128] + vpbroadcastw m1, [lutq+136] + pminuw m2, m1 + pmaxuw m2, [pw_1] ; I + psrlw m1, m0, 4 ; H + paddw m0, [pw_2] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, [r11]}, m0, m1, m2 + + psubw m8, m3, m4 ; p1-p0 + psubw m9, m5, m6 ; q1-q0 + REPX {pabsw x, x}, m8, m9 + pmaxuw m8, m10 + pmaxuw m8, m9 + pcmpgtw m7, m8, m1 ; hev +%if %1 != 4 + psubw m9, m13, m4 ; p2-p0 + pabsw m9, m9 + pmaxuw m9, m8 +%if %1 != 6 +%ifidn %2, v + mova m11, [tmpq+strideq*0] ; p3 +%else + mova m11, [rsp+5*32] ; p3 +%endif + psubw m10, m11, m4 ; p3-p0 + pabsw m10, m10 + pmaxuw m9, m10 +%endif + psubw m10, m5, m14 ; q2-q0 + pabsw m10, m10 + pmaxuw m9, m10 +%if %1 != 6 + psubw m10, m5, m15 ; q3-q0 + pabsw m10, m10 + pmaxuw m9, m10 +%endif + pcmpgtw m9, [r11] ; !flat8in + + psubw m10, m13, m3 ; p2-p1 + pabsw m10, m10 +%if %1 != 6 + psubw m11, m13 ; p3-p2 + pabsw m11, m11 + pmaxuw m10, m11 + psubw m11, m14, m15 ; q3-q2 + pabsw m11, m11 + pmaxuw m10, m11 +%endif + psubw m11, m14, m6 ; q2-q1 + pabsw m11, m11 + pmaxuw m10, m11 + +%if %1 == 16 + vpbroadcastd m11, [maskq+8] + vpbroadcastd m1, [maskq+4] + por m11, m1 + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 +%else + vpbroadcastd m11, [maskq+4] + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 ; only apply fm-wide to wd>4 blocks +%endif + pmaxuw m8, m10 +%endif + pcmpgtw m8, m2 + + psubw m10, m3, m6 ; p1-q1 + psubw m11, m4, m5 ; p0-q0 + REPX {pabsw x, x}, m10, m11 + paddw m11, m11 + psrlw m10, 1 + paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m8, m10 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else + mova m0, [rsp+7*32] + mova m1, [rsp+8*32] + mova m2, [rsp+9*32] +%endif + REPX {psubw x, m4}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxuw m1, m0 + pmaxuw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m10, [tmpq+strideq*2] +%else + mova m0, [rsp+10*32] + mova m2, [rsp+11*32] + mova m10, [rsp+12*32] +%endif + REPX {psubw x, m5}, m0, m2, m10 + REPX {pabsw x, x}, m0, m2, m10 + pmaxuw m0, m2 + pmaxuw m1, m10 + pmaxuw m1, m0 + pcmpgtw m1, [r11] ; !flat8out + por m1, m9 ; !flat8in | !flat8out + vpbroadcastd m2, [maskq+8] + pand m10, m2, m12 + pcmpeqd m10, m12 + pandn m1, m10 ; flat16 + pandn m1, m8, m1 ; flat16 & fm + + vpbroadcastd m10, [maskq+4] + por m10, m2 + pand m2, m10, m12 + pcmpeqd m2, m12 + pandn m9, m2 ; flat8in + pandn m9, m8, m9 + vpbroadcastd m2, [maskq+0] + por m2, m10 + pand m2, m12 + pcmpeqd m2, m12 + pandn m8, m2 + pandn m8, m9, m8 ; fm & !flat8 & !flat16 + pandn m9, m1, m9 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m9, m2 + pandn m9, m8, m9 ; flat8 & fm + vpbroadcastd m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 + pandn m8, m9, m8 ; fm & !flat8 +%else + vpbroadcastd m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 ; fm +%endif + + ; short filter + + vpbroadcastw m0, r7m + pcmpeqw m2, m2 + psrlw m0, 1 ; 511 or 2047 + pxor m2, m0 ; -512 or -2048 + + psubw m10, m5, m4 + paddw m11, m10, m10 + paddw m11, m10 + psubw m10, m3, m6 ; iclip_diff(p1-q1) + pminsw m10, m0 + pmaxsw m10, m2 + pand m10, m7 ; f=iclip_diff(p1-q1)&hev + paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m10, m0 + pmaxsw 
m10, m2 + pand m8, m10 ; f&=fm + paddw m10, m8, [pw_3] + paddw m8, [pw_4] + REPX {pminsw x, m0}, m10, m8 + psraw m10, 3 ; f2 + psraw m8, 3 ; f1 + paddw m4, m10 + psubw m5, m8 + + paddw m8, [pw_1] + psraw m8, 1 ; f=(f1+1)>>1 + pandn m8, m7, m8 ; f&=!hev + paddw m3, m8 + psubw m6, m8 + pxor m8, m8 + psubw m0, m2 ; 1023 or 4095 + REPX {pminsw x, m0}, m3, m4, m5, m6 + REPX {pmaxsw x, m8}, m3, m4, m5, m6 + +%if %1 == 16 + +; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2,7-8,10-11 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m11, [tmpq+strideq*4] ; p3 +%else + mova m0, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + mova m11, [rsp+5*32] +%endif + + mova [rsp+ 0*32], m9 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psllw m8, m0, 3 ; p6*8 + paddw m8, [pw_8] + paddw m10, m2, m7 ; p5+p4 + psubw m8, m0 + paddw m10, m10 ; (p5+p4)*2 + paddw m8, m11 ; p6*7+p3 + paddw m10, m13 ; (p5+p4)*2+p2 + paddw m8, m3 ; p6*7+p3+p1 + paddw m10, m4 ; (p5+p4)*2+p2+p0 + paddw m8, m5 ; p6*7+p3+p1+q0 + paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m10, m8, 4 + vpblendvb m10, m2, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*2], m10 ; p5 +%else + mova [rsp+8*32], m10 +%endif + + ; sub p6*2, add p3/q1 + paddw m8, m11 + paddw m10, m0, m0 + paddw m8, m6 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m7, m10, m1 +%ifidn %2, v + mova [tmpq+stride3q], m10 ; p4 +%else + mova [rsp+9*32], m10 +%endif + + ; sub p6/p5, add p2/q2 + psubw m8, m0 + paddw m10, m13, m14 + psubw m8, m2 + paddw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m11, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*4], m10 ; p3 + lea tmpq, [dstq+strideq*4] +%else + mova [rsp+5*32], m10 +%endif + + ; sub p6/p4, add p1/q3 + paddw m8, m3 + paddw m10, m0, m7 + paddw m8, m15 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m13, m10, m1 + mova [rsp+1*32], m10 ; don't clobber p2/m13 + + ; sub p6/p3, add p0/q4 + paddw m8, m4 + paddw m10, m0, m11 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m3, m10, m1 + mova [rsp+2*32], m10 ; don't clobber p1/m3 + + ; sub p6/p2, add q0/q5 + paddw m8, m5 + paddw m10, m0, m13 +%ifidn %2, v + paddw m8, [tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m4, m10, m1 + mova [rsp+3*32], m10 ; don't clobber p0/m4 + + ; sub p6/p1, add q1/q6 + paddw m8, m6 + paddw m10, m0, m3 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else + mova m0, [rsp+12*32] ; q6 +%endif + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m5, m10, m1 + mova [rsp+4*32], m10 ; don't clobber q0/m5 + + ; sub p5/p0, add q2/q6 + paddw m8, m14 + paddw m10, m2, m4 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6 + + ; sub p4/q0, add q3/q6 + paddw m8, m15 + paddw m10, m7, m5 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14 + + ; sub p3/q1, add q4/q6 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + paddw m10, m11, m6 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m15, m10, m1 +%ifidn %2, v + mova [tmpq+mstrideq], m10 ; q3 +%else + mova [rsp+14*32], m10 +%endif + + ; sub p2/q2, add q5/q6 +%ifidn %2, v + paddw m8, [tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + paddw m10, m13, m14 
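; The flat16 path above and below computes each output as a 16-tap rounded
; average, maintained as a sliding sum rather than recomputed per pixel.
; Approximately, in C (illustrative only):
;
;   sum = 7*p6 + 2*p5 + 2*p4 + p3 + p2 + p1 + p0 + q0 + 8;
;   out_p5 = sum >> 4;
;   sum += p3 + q1 - 2*p6;      out_p4 = sum >> 4;
;   sum += p2 + q2 - p6 - p5;   out_p3 = sum >> 4;
;   /* ...and so on, dropping the oldest taps and adding the next q's */
;
; which matches the "sub X, add Y" step comments, with vpblendvb applying the
; result only in the lanes where the flat16 condition holds.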
+ paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 +%ifidn %2, v + mova m9, [tmpq+strideq*0] +%else + mova m9, [rsp+10*32] +%endif + vpblendvb m10, m9, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*0], m10 ; q4 +%else + mova [rsp+10*32], m10 +%endif + + ; sub p1/q3, add q6*2 + psubw m8, m3 + paddw m0, m0 + psubw m8, m15 + paddw m8, m0 + psrlw m10, m8, 4 + pand m10, m1 +%ifidn %2, v + mova m9, [tmpq+strideq*1] +%else + mova m9, [rsp+11*32] +%endif + vpblendvb m10, m9, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+11*32], m10 +%endif + + mova m9, [rsp+0*32] + mova m13, [rsp+1*32] + mova m3, [rsp+2*32] + mova m4, [rsp+3*32] + mova m5, [rsp+4*32] + SWAP 2, 6 + SWAP 7, 14 +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%else + mova m15, [rsp+14*32] +%endif +%endif + +%if %1 >= 8 + ; flat8 filter +%ifidn %2, v + mova m0, [tmpq+strideq*0] ; p3 +%else + mova m0, [rsp+5*32] ; p3 +%endif + paddw m1, m0, m13 ; p3+p2 + paddw m2, m3, m4 ; p1+p0 + paddw m8, m1, m1 ; 2*(p3+p2) + paddw m2, m0 ; p1+p0+p3 + paddw m8, m5 ; 2*(p3+p2)+q0 + paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m7, m2, [pw_4096] + + paddw m8, m3, m6 + psubw m2, m1 + paddw m2, m8 + pmulhrsw m8, m2, [pw_4096] + + paddw m10, m0, m3 + paddw m11, m4, m14 + psubw m2, m10 + paddw m2, m11 + pmulhrsw m10, m2, [pw_4096] + + paddw m11, m0, m4 + paddw m1, m5, m15 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m11, m2, [pw_4096] + + paddw m2, m6 + paddw m2, m15 + paddw m1, m13, m5 + psubw m2, m1 + pmulhrsw m1, m2, [pw_4096] + + psubw m2, m3 + psubw m2, m6 + paddw m0, m15, m14 + paddw m2, m0 + pmulhrsw m2, [pw_4096] + + vpblendvb m13, m13, m7, m9 + vpblendvb m3, m3, m8, m9 + vpblendvb m4, m4, m10, m9 + vpblendvb m5, m5, m11, m9 + vpblendvb m6, m6, m1, m9 + vpblendvb m14, m14, m2, m9 + +%ifidn %2, v + mova [tmpq+strideq*1], m13 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 + mova [dstq+strideq*2], m14 ; q2 +%else + mova m0, [rsp+5*32] +%if %1 == 8 + TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 + + ; write 8x16 + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm3 + movu [dstq+stride3q -8], xm4 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm5 + movu [dstq+strideq*1-8], xm6 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m0, 1 + vextracti128 [dstq+strideq*1-8], m13, 1 + vextracti128 [dstq+strideq*2-8], m3, 1 + vextracti128 [dstq+stride3q -8], m4, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m5, 1 + vextracti128 [dstq+strideq*1-8], m6, 1 + vextracti128 [dstq+strideq*2-8], m14, 1 + vextracti128 [dstq+stride3q -8], m15, 1 + lea dstq, [dstq+strideq*4] +%else + mova m0, [rsp+6*32] + mova m1, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + mova m8, [rsp+5*32] + TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9 + + mova [dstq+strideq*0-16], xm0 + mova [dstq+strideq*1-16], xm1 + mova [dstq+strideq*2-16], xm2 + mova [dstq+stride3q -16], xm7 + lea tmpq, [dstq+strideq*4] + mova [tmpq+strideq*0-16], xm8 + mova [tmpq+strideq*1-16], xm13 + mova [tmpq+strideq*2-16], xm3 + mova [tmpq+stride3q -16], xm4 + lea tmpq, [tmpq+strideq*4] + vextracti128 [tmpq+strideq*0-16], m0, 1 + vextracti128 [tmpq+strideq*1-16], m1, 1 + vextracti128 [tmpq+strideq*2-16], m2, 1 + vextracti128 [tmpq+stride3q -16], m7, 1 + lea tmpq, [tmpq+strideq*4] + vextracti128 [tmpq+strideq*0-16], m8, 1 + vextracti128 [tmpq+strideq*1-16], m13, 1 + 
vextracti128 [tmpq+strideq*2-16], m3, 1 + vextracti128 [tmpq+stride3q -16], m4, 1 + + mova m0, [rsp+10*32] + mova m1, [rsp+11*32] + mova m2, [rsp+12*32] + mova m3, [rsp+13*32] + TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4 + mova [dstq+strideq*0], xm5 + mova [dstq+strideq*1], xm6 + mova [dstq+strideq*2], xm14 + mova [dstq+stride3q ], xm15 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0], m5, 1 + vextracti128 [dstq+strideq*1], m6, 1 + vextracti128 [dstq+strideq*2], m14, 1 + vextracti128 [dstq+stride3q ], m15, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0], m0, 1 + vextracti128 [dstq+strideq*1], m1, 1 + vextracti128 [dstq+strideq*2], m2, 1 + vextracti128 [dstq+stride3q ], m3, 1 + lea dstq, [dstq+strideq*4] +%endif +%endif +%elif %1 == 6 + ; flat6 filter + + paddw m8, m3, m4 + paddw m8, m13 ; p2+p1+p0 + paddw m11, m13, m5 + paddw m8, m8 + paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m8, [pw_4096] + + paddw m8, m5 + paddw m11, m13, m13 + paddw m8, m6 + psubw m8, m11 + pmulhrsw m10, m8, [pw_4096] + + paddw m8, m6 + paddw m11, m13, m3 + paddw m8, m14 + psubw m8, m11 + pmulhrsw m11, m8, [pw_4096] + + psubw m8, m3 + paddw m14, m14 + psubw m8, m4 + paddw m8, m14 + pmulhrsw m8, [pw_4096] + + vpblendvb m3, m3, m2, m9 + vpblendvb m4, m4, m10, m9 + vpblendvb m5, m5, m11, m9 + vpblendvb m6, m6, m8, m9 + +%ifidn %2, v + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x16 +%endif +%else +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x16 +%endif +%endif +%endmacro + +INIT_YMM avx2 +cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + FILTER 4, v + +.end: + pslld m12, 4 + add lq, 16 + add dstq, 32 + shl mask_bitsd, 4 + sub wd, 4 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 4 + lea lq, [lq+l_strideq*4] + shl mask_bitsd, 4 + sub hd, 4 + jg .loop + RET + +INIT_YMM avx2 +cglobal 
lpf_v_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + FILTER 4, v + +.end: + pslld m12, 4 + add lq, 16 + add dstq, 32 + shl mask_bitsd, 4 + sub wd, 4 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + rorx r6d, r7m, 6 + and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + lea r11, [pw_4] + add r11, r6 + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 4 + lea lq, [lq+l_strideq*4] + shl mask_bitsd, 4 + sub hd, 4 + jg .loop + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/loopfilter16_sse.asm dav1d-0.9.1/src/x86/loopfilter16_sse.asm --- dav1d-0.7.1/src/x86/loopfilter16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/loopfilter16_sse.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,1801 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
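; Unlike the AVX2 file above, this SSE version also targets x86-32, which has
; only 8 XMM registers, no RIP-relative addressing, and possibly a stack that
; is not 16-byte aligned. Hence the PIC_sym() wrapper below (constants are
; addressed relative to a register holding $$ on 32-bit) and the RELOC_ARGS
; macro, which copies arguments it cannot keep in registers into local stack
; slots. Roughly (illustrative, mirroring the %define below):
;
;   /* x86-64 */  #define PIC_sym(a)  (a)                        /* rip-relative */
;   /* x86-32 */  #define PIC_sym(a)  (pic_reg + (a) - PIC_base)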
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%if ARCH_X86_64 +%define PIC_sym(a) a +%else +%define PIC_base $$ +%define PIC_sym(a) pic_regq+a-PIC_base +%endif + +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_3: times 8 dw 3 +; 4 and 16 need to be next to each other since they are used as alternates +; depending on whether bitdepth is 10 or 12 +pw_4: times 8 dw 4 +pw_16: times 8 dw 16 +pw_8: times 8 dw 8 +pw_4096: times 8 dw 4096 + +pb_mask: dd 1, 1, 2, 2 + +SECTION .text + +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 +%define extra_stack 2 +%else +%define extra_stack 0 +%endif +%endif + +%macro RELOC_ARGS 2 ; h/v, off +ASSERT ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + mov r5d, [rstk + stack_offset + 4*4 + 4] +%define lstridem [esp+%2+0*gprsize] + mov lstridem, r5d + mov r5d, [rstk + stack_offset + 4*5 + 4] +%define lutm [esp+%2+1*gprsize] + mov lutm, r5d + mov r5d, [rstk + stack_offset + 4*6 + 4] +%ifidn %1, v +%define wm [esp+%2+2*gprsize] + mov wm, r5d + mov r5d, [rstk + stack_offset + 4*3 + 4] +%define lm [esp+%2+3*gprsize] + mov lm, r5d +%else ; %1 == h +%define hm [esp+%2+2*gprsize] + mov hm, r5d +%endif ; %1==v + mov r5d, r7m +%define bdmulm [esp+%2+4*gprsize] + mov bdmulm, r5d +%else +%define lstridem r4m +%define lutm r5m +%ifidn %1, v +%define wm r6m +%define lm r3m +%else +%define hm r6m +%endif +%define bdmulm r7m +%endif ; STACK_ALIGNMENT +%endmacro + +%macro UNRELOC_ARGS 0 +%if ARCH_X86_32 +%undef lm +%undef lstridem +%undef wm +%undef hm +%undef lutm +%endif +%endmacro + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro SPLATD 2 + movd %1, %2 + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 + movd %1, %2 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; m%1 a b c d e f g h a i q y 6 E M U +; m%2 i j k l m n o p b j r z 7 F N V +; m%3 q r s t u v w x c k s 0 8 G O W +; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; m%6 E F G H I J K L f n v 3 B J R Z +; m%7 M N O P Q R S T g o w 4 C K S + +; m%8 U V W X Y Z + = h p x 5 D L T = +%if ARCH_X86_64 +%macro TRANSPOSE8X8W 9 + ; m%1 a b c d e f g h a i q y b j r z + ; m%2 i j k l m n o p c k s 0 d l t 1 + ; m%3 q r s t u v w x -> e m u 2 f n v 3 + ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; m%6 E F G H I J K L 8 G O W 9 H P X + ; m%7 M N O P Q R S T -> A I Q Y B J R Z + ; m%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; m%1 a i q y b j r z a i q y 6 E M U + ; m%2 c k s 0 d l t 1 b j r z 7 F N V + ; m%3 e m u 2 f n v 3 c k s 0 8 G O W + ; m%4 g o w 4 h p x 5 d l t 1 9 H P X + ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; m%6 8 G O W 9 H P X f n v 3 B J R Z + ; m%7 A I Q Y B J R Z g o w 4 C K S + + ; m%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro 
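The TRANSPOSE8X8W macro above assembles an 8x8 transpose of 16-bit pixels out of word/dword/qword interleaves so that rows loaded for the horizontal (h) filters become columns. A plain-C sketch of the same operation, with illustrative names, for reference:

    #include <stdint.h>

    /* Sketch only: scalar equivalent of the 8x8 word transpose. */
    static void transpose_8x8_w(uint16_t dst[8][8], const uint16_t src[8][8])
    {
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                dst[j][i] = src[i][j];
    }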
+%else ; x86-32 +; input: 1-7 in registers, 8 in first memory [read-only] +; second memory is scratch, and may overlap with first or third memory +; output: 1-5,7-8 in registers, 6 in third memory [write-only] +%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x] + TRANSPOSE4X4W %1, %2, %3, %4, %8 +%ifnidn %9, "" + mov%12 m%8, %9 +%else + mova m%8, %10 +%endif + mova %10, m%4 + TRANSPOSE4X4W %5, %6, %7, %8, %4 + punpckhqdq m%4, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + mova m%7, %10 +%ifnidn %11, "" + mov%13 %11, m%6 +%else + mova %10, m%6 +%endif + punpckhqdq m%6, m%7, m%8 + punpcklqdq m%7, m%8 + + ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8 + SWAP %2, %4, %5, %3 + SWAP %6, %8 +%endmacro +%endif ; x86-32/64 + +; transpose and write m8-11, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp + ; transpose 8x4 + punpcklwd %5, %1, %2 + punpckhwd %1, %2 + punpcklwd %2, %3, %4 + punpckhwd %3, %4 + punpckldq %4, %5, %2 + punpckhdq %5, %2 + punpckldq %2, %1, %3 + punpckhdq %1, %3 + + ; write out + movq [dstq+strideq*0-4], %4 + movhps [dstq+strideq*1-4], %4 + movq [dstq+strideq*2-4], %5 + movhps [dstq+stride3q -4], %5 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], %2 + movhps [dstq+strideq*1-4], %2 + movq [dstq+strideq*2-4], %1 + movhps [dstq+stride3q -4], %1 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 +%if ARCH_X86_64 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 + mova P1, [dstq+mstrideq*2] ; p1 + mova P0, [dstq+mstrideq*1] ; p0 + mova Q0, [dstq+strideq*0] ; q0 + mova Q1, [dstq+strideq*1] ; q1 +%else ; x86-32 +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%endif ; x86-32/64 +%else ; %1 != 4 + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] +%if ARCH_X86_64 + ; we load p3 later +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq+strideq*0] + mova Q1, [dstq+strideq*1] + mova Q2, [dstq+strideq*2] +%if %1 != 6 +%define P3 [tmpq+strideq*0] +%define Q3 m15 + mova Q3, [dstq+stride3q] +%endif ; %1 != 6 +%else ; x86-32 +%define P2 [tmpq+strideq*1] +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%define Q2 [dstq+strideq*2] +%if %1 != 6 +%define P3 [dstq+mstrideq*4] +%define Q3 [dstq+stride3q] +%endif ; %1 != 6 +%endif ; x86-32/64 +%endif ; %1 ==/!= 4 +%else ; %2 != v + ; load lines +%if %1 == 4 + movq m0, [dstq+strideq*0-4] + movq m2, [dstq+strideq*1-4] + movq m4, [dstq+strideq*2-4] + movq m5, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq m3, [tmpq+strideq*0-4] + movq m6, [tmpq+strideq*1-4] + movq m1, [tmpq+strideq*2-4] + movq m7, [tmpq+stride3q -4] + + ; transpose 4x8 + ; m0: A-D0 + ; m2: A-D1 + ; m4: A-D2 + ; m5: A-D3 + ; m3: A-D4 + ; m6: A-D5 + ; m1: A-D6 + ; m7: A-D7 + punpcklwd m0, m2 + punpcklwd m4, m5 + punpcklwd m3, m6 + punpcklwd m1, m7 + ; m0: A0-1,B0-1,C0-1,D0-1 + ; m4: A2-3,B2-3,C2-3,D2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m1: A6-7,B6-7,C6-7,D6-7 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + ; m0: A0-3,B0-3 + ; m2: C0-3,D0-3 + ; m3: A4-7,B4-7 + ; m4: C4-7,D4-7 + punpckhqdq m1, m0, 
m3 + punpcklqdq m0, m3 + punpckhqdq m3, m2, m4 + punpcklqdq m2, m4 + ; m0: A0-7 + ; m1: B0-7 + ; m2: C0-7 + ; m3: D0-7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%else +%define P1 [esp+3*mmsize] +%define P0 [esp+4*mmsize] +%define Q0 [esp+5*mmsize] +%define Q1 [esp+6*mmsize] + mova P1, m0 + mova P0, m1 + mova Q0, m2 + mova Q1, m3 +%endif +%elif %1 == 6 || %1 == 8 + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] +%if ARCH_X86_64 + movu m7, [tmpq+stride3q -8] +%endif + + ; transpose 8x16 + ; m0: A-H0,A-H8 + ; m1: A-H1,A-H9 + ; m2: A-H2,A-H10 + ; m3: A-H3,A-H11 + ; m4: A-H4,A-H12 + ; m5: A-H5,A-H13 + ; m6: A-H6,A-H14 + ; m7: A-H7,A-H15 +%if ARCH_X86_64 + punpcklwd m8, m0, m1 +%else + punpcklwd m7, m0, m1 +%endif + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 +%if ARCH_X86_64 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 +%else + mova [rsp+3*16], m4 + movu m4, [tmpq+stride3q -8] + punpcklwd m5, m6, m4 + punpckhwd m6, m4 +%endif + ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] + ; m0: E0-1,F0-1,G0-1,H0-1 + ; m1: A2-3,B2-3,C2-3,D2-3 + ; m2: E2-3,F2-3,G2-3,H2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] + ; m5: A6-7,B6-7,C6-7,D6-7 + ; m6: E6-7,F6-7,G6-7,H6-7 +%if ARCH_X86_64 + punpckldq m7, m8, m1 + punpckhdq m8, m1 +%else + punpckldq m4, m7, m1 + punpckhdq m7, m1 +%endif + punpckldq m1, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 +%if ARCH_X86_64 + punpckldq m5, m4, m6 + punpckhdq m4, m6 +%else + mova [rsp+4*16], m3 + mova m3, [rsp+3*16] + punpckldq m5, m3, m6 + punpckhdq m3, m6 +%endif + ; m7: A0-3,B0-3 [m4 on x86-32] + ; m8: C0-3,D0-3 [m7 on x86-32] + ; m1: E0-3,F0-3 + ; m0: G0-3,H0-3 + ; m2: A4-7,B4-7 + ; m3: C4-7,D4-7 [r4 on x86-32] + ; m5: E4-7,F4-7 + ; m4: G4-7,H4-7 [m3 on x86-32] +%if ARCH_X86_64 +%if %1 != 6 + punpcklqdq m6, m7, m2 +%endif + punpckhqdq m7, m2 + punpcklqdq m2, m8, m3 + punpckhqdq m8, m3 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 +%if %1 != 6 + punpckhqdq m5, m0, m4 +%endif + punpcklqdq m0, m4 +%if %1 == 8 + mova [rsp+1*16], m6 +%define P3 [rsp+1*16] +%endif + ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 + SWAP 7, 13 + SWAP 8, 2, 9 + SWAP 3, 10 + SWAP 1, 11 + SWAP 0, 14 + SWAP 5, 15 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%if %1 == 8 +%define Q3 m15 +%endif +%else ; x86-32 +%if %1 == 8 +%define P3 [rsp+ 6*16] + punpcklqdq m6, m4, m2 + mova P3, m6 +%endif + mova m6, [rsp+4*16] + punpckhqdq m4, m2 + punpcklqdq m2, m7, m6 + punpckhqdq m7, m6 + punpcklqdq m6, m1, m5 + punpckhqdq m1, m5 +%if %1 == 8 +%define Q3 [rsp+24*16] + punpckhqdq m5, m0, m3 + mova Q3, m5 +%endif + punpcklqdq m0, m3 +%if %1 == 8 +%define P2 [rsp+18*16] +%define P1 [rsp+19*16] +%define P0 [rsp+20*16] +%define Q0 [rsp+21*16] +%define Q1 [rsp+22*16] +%define Q2 [rsp+23*16] +%else +%define P2 [rsp+3*16] +%define P1 [rsp+4*16] +%define P0 [rsp+5*16] +%define Q0 [rsp+6*16] +%define Q1 [rsp+7*16] +%define Q2 [rsp+8*16] +%endif + mova P2, m4 + mova P1, m2 + mova P0, m7 + mova Q0, m6 + mova Q1, m1 + mova Q2, m0 +%endif ; x86-32/64 +%else ; %1 == 16 + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova m0, [dstq+strideq*0-16] + mova m1, 
[dstq+strideq*1-16] + mova m2, [dstq+strideq*2-16] + mova m3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] + mova m4, [tmpq+strideq*0-16] + mova m5, [tmpq+strideq*1-16] + mova m6, [tmpq+strideq*2-16] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q -16] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + SWAP 5, 13 + SWAP 6, 8 + SWAP 7, 9 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%else ; x86-32 +%define P2 [esp+18*16] +%define P1 [esp+19*16] +%define P0 [esp+20*16] + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q -16], P2, "", a, a + mova P1, m6 + mova P0, m7 +%endif ; x86-32/64 + mova [rsp+ 7*16], m0 + mova [rsp+ 8*16], m1 + mova [rsp+ 9*16], m2 + mova [rsp+10*16], m3 +%define P3 [rsp+6*16] + mova P3, m4 + + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + mova m2, [dstq+strideq*2] + mova m3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova m4, [tmpq+strideq*0] + mova m5, [tmpq+strideq*1] + mova m6, [tmpq+strideq*2] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q ] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 + SWAP 0, 10 + SWAP 1, 11 + SWAP 2, 14 + SWAP 3, 15 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%define Q3 m15 +%else ; x86-32 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q ], [rsp+12*16], "", a, a +%define Q0 [esp+21*16] +%define Q1 [esp+22*16] +%define Q2 [esp+23*16] +%define Q3 [esp+24*16] + mova Q0, m0 + mova Q1, m1 + mova Q2, m2 + mova Q3, m3 +%endif ; x86-32/64 + + mova [rsp+11*16], m4 +%if ARCH_X86_64 + mova [rsp+12*16], m5 +%endif + mova [rsp+13*16], m6 + mova [rsp+14*16], m7 +%endif ; %1 == 4/6/8/16 +%endif ; %2 ==/!= v + + ; load L/E/I/H +%if ARCH_X86_32 +%define l_strideq r5 + mov l_strideq, dword lstridem +%ifidn %2, v +%define lq r3 + mov lq, dword lm +%endif +%endif +%ifidn %2, v +%if cpuflag(sse4) + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else ; ssse3 + movq m1, [lq] + movq m0, [lq+l_strideq] + pxor m2, m2 + REPX {punpcklbw x, m2}, m1, m0 +%endif ; ssse3/sse4 +%else ; %2 != v + movq m0, [lq] ; l0, l1 + movq m1, [lq+l_strideq] ; l2, l3 + punpckldq m0, m1 ; l0, l2, l1, l3 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2 + punpckhbw m0, m2 ; l1, l3 +%endif ; %2==/!=v +%if ARCH_X86_32 +%ifidn %2, v +%undef lq + mov mstrideq, mstridem +%endif +%endif + pcmpeqw m5, m2, m0 + pand m1, m5 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] + pcmpeqw m5, m2, m0 ; !L + psrlw m5, 1 +%if ARCH_X86_64 + psrlw m2, m0, [lutq+128] + SPLATW m1, [lutq+136] +%else ; x86-32 + mov r5, lutm + psrlw m2, m0, [r5+128] + SPLATW m1, [r5+136] +%endif ; x86-32/64 + pminsw m2, m1 + pmaxsw m2, [PIC_sym(pw_1)] ; I + psrlw m1, m0, 4 ; H + paddw m0, [PIC_sym(pw_2)] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, [bdmulq]}, m0, m1, m2 +%if ARCH_X86_32 +%undef l_strideq + lea stride3q, [strideq*3] +%endif + + psubw m3, P1, P0 ; p1-p0 + psubw m4, Q0, Q1 ; q0-q1 + REPX {pabsw x, x}, m3, m4 + pmaxsw m3, m5 + pmaxsw m3, m4 + pcmpgtw m7, m3, m1 ; hev +%if %1 != 4 + psubw m4, P2, P0 ; p2-p0 + pabsw m4, m4 + pmaxsw m4, m3 +%if %1 != 6 + mova m6, P3 ; p3 + psubw m5, m6, P0 ; p3-p0 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + psubw m5, Q0, Q2 ; q0-q2 + pabsw m5, m5 + pmaxsw m4, m5 +%if %1 != 6 + psubw m5, Q0, Q3 ; q0-q3 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + pcmpgtw m4, [bdmulq] ; !flat8in + + psubw m5, P2, P1 ; p2-p1 + pabsw m5, m5 +%if %1 != 6 + psubw m6, P2 ; p3-p2 + pabsw m6, m6 + pmaxsw m5, m6 + psubw m6, Q2, Q3 ; q2-q3 + pabsw m6, m6 + pmaxsw m5, m6 +%endif ; %1 != 6 + psubw m6, Q2, Q1 ; q2-q1 + pabsw m6, m6 + pmaxsw m5, m6 + +%if %1 == 16 + SPLATD m6, [maskq+8] + SPLATD m1, [maskq+4] + por m6, m1 + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 +%else ; %1 != 16 + SPLATD m6, [maskq+4] + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 ; only apply fm-wide to wd>4 blocks +%endif ; %1==/!=16 + pmaxsw m3, m5 +%endif ; %1 != 4 + pcmpgtw m3, m2 + + psubw m5, P1, Q1 ; p1-q1 + psubw m6, P0, Q0 ; p0-q0 + REPX {pabsw x, x}, m5, m6 + paddw m6, m6 + psrlw m5, 1 + paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m3, m5 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m1, [rsp+ 9*16] + mova m2, [rsp+10*16] +%endif ; %2==/!=v + REPX {psubw x, P0}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxsw m1, m0 + pmaxsw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m5, [tmpq+strideq*2] +%else ; %2 != v + mova m0, [rsp+11*16] + mova m2, [rsp+12*16] + mova m5, [rsp+13*16] +%endif ; %2==/!=v + REPX {psubw x, Q0}, m0, m2, m5 + REPX {pabsw x, x}, m0, m2, m5 + pmaxsw m0, m2 + pmaxsw m1, m5 + pmaxsw m1, m0 + pcmpgtw m1, [bdmulq] ; !flat8out + por m1, m4 ; !flat8in | !flat8out + SPLATD m2, [maskq+8] + pand m5, m2, m12 + pcmpeqd m5, m12 + pandn m1, m5 ; flat16 + pandn m5, m3, m1 ; flat16 & fm + SWAP 1, 5 + + SPLATD m5, [maskq+4] + por m5, m2 + pand m2, m5, m12 + pcmpeqd m2, m12 + pandn m4, m2 ; flat8in + pandn m2, m3, m4 + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m2, m5 + pand m2, m12 + pcmpeqd m2, m12 + pandn m3, m2 + pandn m0, m4, m3 ; fm & !flat8 & !flat16 + SWAP 0, 3 + pandn m0, m1, m4 ; flat8 & !flat16 + SWAP 0, 4 +%elif %1 != 4 + SPLATD m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m4, m2 + pandn m2, m3, m4 ; flat8 & fm + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 + pandn m0, m4, m3 ; fm & !flat8 + SWAP 0, 3 +%else ; %1 == 4 + SPLATD m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 ; fm +%endif ; %1==/!=4 + + ; short filter +%if ARCH_X86_64 + SPLATW m0, r7m +%else + SPLATW m0, bdmulm +%endif + pcmpeqw m2, m2 + psrlw m0, 1 ; 511 or 2047 + pxor m2, m0 ; 
-512 or -2048 + + psubw m5, Q0, P0 ; q0-p0 + paddw m6, m5, m5 + paddw m6, m5 ; 3*(q0-p0) + psubw m5, P1, Q1 ; iclip_diff(p1-q1) + pminsw m5, m0 + pmaxsw m5, m2 + pand m5, m7 ; f=iclip_diff(p1-q1)&hev + paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m5, m0 + pmaxsw m5, m2 + pand m3, m5 ; f&=fm + paddw m5, m3, [PIC_sym(pw_3)] + paddw m3, [PIC_sym(pw_4)] + REPX {pminsw x, m0}, m5, m3 + psraw m5, 3 ; f2 + psraw m3, 3 ; f1 + psubw m0, m2 ; 1023 or 4095 + pxor m2, m2 +%if ARCH_X86_64 + paddw P0, m5 + psubw Q0, m3 +%else + paddw m5, P0 + psubw m6, Q0, m3 + REPX {pminsw x, m0}, m5, m6 + REPX {pmaxsw x, m2}, m5, m6 +%endif + + paddw m3, [PIC_sym(pw_1)] + psraw m3, 1 ; f=(f1+1)>>1 + pandn m7, m3 ; f&=!hev + SWAP 7, 3 +%if ARCH_X86_64 + paddw P1, m3 + psubw Q1, m3 + REPX {pminsw x, m0}, P1, P0, Q0, Q1 + REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 +%else + psubw m7, Q1, m3 + paddw m3, P1 + REPX {pminsw x, m0}, m7, m3 + REPX {pmaxsw x, m2}, m7, m3 +%if %1 > 4 + mova P1, m3 + mova P0, m5 + mova Q0, m6 + mova Q1, m7 +%endif +%endif + +%if %1 == 16 + +; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2-3,5-7 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m6, [tmpq+strideq*4] ; p3 + lea tmpq, [dstq+mstrideq*4] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m2, [rsp+ 9*16] + mova m7, [rsp+10*16] + mova m6, [rsp+ 6*16] +%endif ; %2==/!=v + + mova [rsp+ 0*16], m4 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psllw m3, m0, 3 ; p6*8 + paddw m3, [PIC_sym(pw_8)] + paddw m5, m2, m7 ; p5+p4 + psubw m3, m0 + paddw m5, m5 ; (p5+p4)*2 + paddw m3, m6 ; p6*7+p3 + paddw m5, P2 ; (p5+p4)*2+p2 + paddw m3, P1 ; p6*7+p3+p1 + paddw m5, P0 ; (p5+p4)*2+p2+p0 + paddw m3, Q0 ; p6*7+p3+p1+q0 + paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m2 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*2], m5 ; p5 +%else ; %2 != v + mova [rsp+9*16], m5 +%endif ; %2==/!=v + + ; sub p6*2, add p3/q1 + paddw m3, m6 + paddw m5, m0, m0 + paddw m3, Q1 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m7 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*1], m5 ; p4 +%else ; %2 != v + mova [rsp+10*16], m5 +%endif ; %2==/!=v + + ; sub p6/p5, add p2/q2 + psubw m3, m0 + paddw m5, P2, Q2 + psubw m3, m2 + paddw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m6 + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*0], m5 ; p3 +%else ; %2 != v + mova [rsp+6*16], m5 +%endif ; %2==/!=v + +%define WRITE_IN_PLACE 0 +%ifidn %2, v +%if ARCH_X86_64 +%define WRITE_IN_PLACE 1 +%endif +%endif + + ; sub p6/p4, add p1/q3 + paddw m3, P1 + paddw m5, m0, m7 + paddw m3, Q3 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P2 + por m5, m4 +%if WRITE_IN_PLACE + mova [tmpq+strideq*1], m5 +%else + mova [rsp+1*16], m5 ; don't clobber p2/m13 +%endif + + ; sub p6/p3, add p0/q4 + paddw m3, P0 + paddw m5, m0, m6 +%ifidn %2, v + paddw m3, [dstq+strideq*4] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P1 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*2], m5 +%else + mova [rsp+2*16], m5 ; don't clobber p1/m3 +%endif + + ; sub p6/p2, add q0/q5 + paddw m3, Q0 + paddw m5, m0, P2 +%ifidn %2, v +%if ARCH_X86_32 + lea r4, P2 +%endif + lea tmpq, [dstq+strideq*4] + paddw m3, [tmpq+strideq*1] +%else ; %2 != v + paddw m3, [rsp+12*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 
+ pand m5, m1 + pandn m4, m1, P0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*1], m5 +%else + mova [rsp+3*16], m5 ; don't clobber p0/m4 +%endif + + ; sub p6/p1, add q1/q6 + paddw m3, Q1 + paddw m5, m0, P1 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else ; %2 != v + mova m0, [rsp+13*16] ; q6 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q0 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq], m5 +%else + mova [rsp+4*16], m5 ; don't clobber q0/m5 +%endif + + ; sub p5/p0, add q2/q6 + paddw m3, Q2 + paddw m5, m2, P0 + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, Q1 + por m2, m5, m4 ; don't clobber q1/m6 + + ; sub p4/q0, add q3/q6 + paddw m3, Q3 + paddw m7, Q0 + paddw m3, m0 + psubw m3, m7 + psrlw m7, m3, 4 + pand m7, m1 + pandn m4, m1, Q2 + por m7, m4 ; don't clobber q2/m14 + + ; sub p3/q1, add q4/q6 +%ifidn %2, v + paddw m3, [tmpq+strideq*0] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + paddw m6, Q1 + paddw m3, m0 + psubw m3, m6 + psrlw m6, m3, 4 + pand m6, m1 + pandn m4, m1, Q3 + por m6, m4 +%if WRITE_IN_PLACE + mova [tmpq+mstrideq], m6 ; q3 +%else ; %2 != v + mova [rsp+5*16], m6 +%endif ; %2==/!=v + + ; sub p2/q2, add q5/q6 +%ifidn %2, v + paddw m3, [tmpq+strideq*1] +%if ARCH_X86_64 + paddw m5, P2, Q2 +%else + ; because tmpq is clobbered, so we use a backup pointer for P2 instead + paddw m5, [r4], Q2 + mov pic_regq, pic_regm +%endif +%else ; %2 != v + paddw m3, [rsp+12*16] + paddw m5, P2, Q2 +%endif ; %2==/!=v + paddw m3, m0 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 +%ifidn %2, v + pandn m4, m1, [tmpq+strideq*0] +%else ; %2 != v + pandn m4, m1, [rsp+11*16] +%endif ; %2==/!=v + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*0], m5 ; q4 +%else ; %2 != v + mova [rsp+11*16], m5 +%endif ; %2==/!=v + + ; sub p1/q3, add q6*2 + psubw m3, P1 + paddw m0, m0 + psubw m3, Q3 + paddw m3, m0 + psrlw m5, m3, 4 + pand m5, m1 +%ifidn %2, v + pandn m4, m1, [tmpq+strideq*1] +%else ; %2 != v + pandn m4, m1, [rsp+12*16] +%endif ; %2==/!=v + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*1], m5 ; q5 +%else ; %2 != v + mova [rsp+12*16], m5 +%endif ; %2==/!=v + + mova m4, [rsp+0*16] +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%if ARCH_X86_64 + SWAP 2, 11 + SWAP 7, 14 + SWAP 6, 15 +%else ; x86-32 + mova Q1, m2 + mova Q2, m7 +%endif ; x86-32/64 +%if WRITE_IN_PLACE + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq] +%elif ARCH_X86_64 + mova P2, [rsp+1*16] + mova P1, [rsp+2*16] + mova P0, [rsp+3*16] + mova Q0, [rsp+4*16] +%else ; !WRITE_IN_PLACE & x86-32 + mova m0, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m3, [rsp+4*16] + mova m7, [rsp+5*16] + mova P2, m0 + mova P1, m1 + mova P0, m2 + mova Q0, m3 + mova Q3, m7 +%endif ; WRITE_IN_PLACE / x86-32/64 +%undef WRITE_IN_PLACE +%endif ; %1 == 16 + +%if %1 >= 8 + + ; flat8 filter + mova m0, P3 ; p3 + paddw m1, m0, P2 ; p3+p2 + paddw m2, P1, P0 ; p1+p0 + paddw m3, m1, m1 ; 2*(p3+p2) + paddw m2, m0 ; p1+p0+p3 + paddw m3, Q0 ; 2*(p3+p2)+q0 + paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m7, m2, [PIC_sym(pw_4096)] + psubw m7, P2 + pand m7, m4 + + paddw m3, P1, Q1 ; p1+q1 + psubw m2, m1 ; 2*p3+p2+p1+p0+q0 + paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1 + pmulhrsw m3, m2, [PIC_sym(pw_4096)] + psubw m3, P1 + pand m3, m4 + + paddw m5, m0, P1 ; p3+p1 + paddw m6, P0, Q2 ; p0+q2 + psubw m2, m5 ; p3+p2+p1+p0+q0+q1 + paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2 + pmulhrsw m5, m2, [PIC_sym(pw_4096)] + psubw m5, P0 + pand 
m5, m4 + + paddw m6, m0, P0 ; p3+p0 + paddw m1, Q0, Q3 ; q0+q3 + psubw m2, m6 ; p2+p1+p0+q0+q1+q2 + paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 + pmulhrsw m6, m2, [PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 + paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 + paddw m1, P2, Q0 ; p2+q0 + psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 + pmulhrsw m1, m2, [PIC_sym(pw_4096)] + psubw m1, Q1 + pand m1, m4 + + psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 + psubw m2, Q1 ; p0+q0+q1+q2+2*q3 + paddw m0, Q3, Q2 ; q3+q2 + paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 + pmulhrsw m2, [PIC_sym(pw_4096)] + psubw m2, Q2 + pand m2, m4 + + paddw m7, P2 + paddw m3, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m1, Q1 + paddw m2, Q2 + +%ifidn %2, v + mova [tmpq+strideq*1], m7 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m1 ; q1 + mova [dstq+strideq*2], m2 ; q2 +%else ; %2 != v + mova m0, P3 + +%if %1 == 8 + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + SWAP 4, 15 + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 +%else + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ + Q3, [tmpq+strideq*1-8], a, u +%endif + + ; write 8x8 + movu [dstq+strideq*0-8], m0 + movu [dstq+strideq*1-8], m7 + movu [dstq+strideq*2-8], m3 + movu [dstq+stride3q -8], m5 + movu [tmpq+strideq*0-8], m6 +%if ARCH_X86_64 + movu [tmpq+strideq*1-8], m1 +%endif + movu [tmpq+strideq*2-8], m2 + movu [tmpq+stride3q -8], m4 + lea dstq, [dstq+strideq*8] +%else ; %1 != 8 +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 +%else + mova [rsp+1*16], m6 + mova [rsp+2*16], m1 + mova [rsp+3*16], m2 +%endif + + mova m1, [rsp+ 7*16] + mova m2, [rsp+ 8*16] + mova m4, [rsp+ 9*16] + mova m6, [rsp+10*16] + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 +%else + mova [rsp+7*16], m5 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ + [rsp+7*16], [tmpq+strideq*1-16], a, a +%endif + + mova [dstq+strideq*0-16], m1 + mova [dstq+strideq*1-16], m2 + mova [dstq+strideq*2-16], m4 + mova [dstq+stride3q -16], m6 + mova [tmpq+strideq*0-16], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1-16], m7 +%endif + mova [tmpq+strideq*2-16], m3 + mova [tmpq+stride3q -16], m5 + +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 4, 15 +%else + mova m6, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m4, Q3 +%endif + mova m0, [rsp+11*16] + mova m3, [rsp+12*16] + mova m5, [rsp+13*16] +%if ARCH_X86_64 + mova m7, [rsp+14*16] + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 +%else + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ + [rsp+14*16], [tmpq+strideq*1], a, a +%endif + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + mova [tmpq+strideq*0], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1], m3 +%endif + mova [tmpq+strideq*2], m5 + mova [tmpq+stride3q ], m7 + lea dstq, [dstq+strideq*8] +%endif ; %1==/!=8 +%endif ; %2==/!=v +%elif %1 == 6 + ; flat6 filter + paddw m3, P1, P0 ; p1+p0 + paddw m3, P2 ; p2+p1+p0 + paddw m6, P2, Q0 ; p2+q0 + paddw m3, m3 ; 2*(p2+p1+p0) + paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m3, [PIC_sym(pw_4096)] + psubw m2, P1 + pand m2, m4 + + paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) + paddw m6, P2, P2 ; 2*p2 + paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 + psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 + pmulhrsw m5, m3, [PIC_sym(pw_4096)] + psubw m5, P0 + pand m5, m4 + + paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) + paddw m6, P2, P1 ; p2+p1 + paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 + psubw m3, m6 ; p1+2*(p0+q0+q1)+q2 + pmulhrsw m6, m3, 
[PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + psubw m3, P1 ; 2*(p0+q0+q1)+q2 +%if ARCH_X86_64 + paddw Q2, Q2 ; q2*2 +%else + mova m0, Q2 + paddw m0, m0 +%endif + psubw m3, P0 ; p0+2*(q0+q1)+q2 +%if ARCH_X86_64 + paddw m3, Q2 ; p0+q*(q0+q1+q2)+q2 +%else + paddw m3, m0 +%endif + pmulhrsw m3, [PIC_sym(pw_4096)] + psubw m3, Q1 + pand m3, m4 + + paddw m2, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m3, Q1 + +%ifidn %2, v + mova [dstq+mstrideq*2], m2 ; p1 + mova [dstq+mstrideq*1], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m3 ; q1 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0 +%endif ; %2==/!=v +%else ; %1 == 4 +%if ARCH_X86_64 +%ifidn %2, v + mova [dstq+mstrideq*2], P1 ; p1 + mova [dstq+mstrideq*1], P0 ; p0 + mova [dstq+strideq*0], Q0 ; q0 + mova [dstq+strideq*1], Q1 ; q1 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0 +%endif ; %2==/!=v +%else ; x86-32 +%ifidn %2, v + mova [dstq+mstrideq*2], m3 + mova [dstq+mstrideq*1], m5 + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m7 +%else ; %2 != v + TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0 +%endif ; %2==/!=v +%endif ; x86-32/64 +%endif ; %1 +%undef P3 +%undef P2 +%undef P1 +%undef P0 +%undef Q0 +%undef Q1 +%undef Q2 +%undef Q3 +%endmacro + +INIT_XMM ssse3 +; stack layout: +; r0 - flat8 backup inside flat16 code +%if ARCH_X86_64 +cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq +%else +; stack layout [32bit only]: +; r1-4 - p2-q0 post-filter16 +; r5 - p3 +; r6 - q3 post-filter16 +; r7 - GPRs [mask_bitsm, mstridem] +; r8 - m12/pb_mask +; r9 - bdmulq +cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 10*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base +%define pic_regm dword [esp+7*16+2*gprsize] + mov pic_regm, pic_regq + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+9*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mstridem dword [esp+7*16+1*gprsize] + mov mstridem, mstrideq +%define mask_bitsm dword [esp+7*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+8*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif + jz .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 8, v + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .end + + FILTER 4, v + +.end: +%if ARCH_X86_64 + pslld m12, 2 + add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif + add dstq, 16 +%if ARCH_X86_64 + shl mask_bitsd, 2 + sub wd, 2 +%else + shl mask_bitsm, 2 + sub dword wm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +; stack layout: +; r0 - flat8 backup inside flat16 +; r1-4 - p2-q0 post-filter16 backup +; r5 - q3 
post-filter16 backup +; r6 - p3 +; r7-10 - p7-4 +; r11-14 - q4-7 +%if ARCH_X86_64 +cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov hd, hm + shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r15 - GPRs [mask_bitsm] +; r16 - m12/pb_mask +; r17 - bdmulq +; r18-24 - p2-q3 +cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 25*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+17*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif + sub lq, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+15*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+16*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: +%if ARCH_X86_64 + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm, mstridem] +; r1 - m12/pb_mask +; r2 - bdmulq +cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 3*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] +%define mstridem dword [esp+1*gprsize] + mov mask_bitsm, 0x3 + mov mstridem, mstrideq + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .end + + FILTER 4, v + +.end: +%if ARCH_X86_64 + pslld m12, 2 + add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif + add dstq, 16 
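The .loop bodies in these ssse3 entry points all follow the same dispatch: test the vmask bit planes from widest to narrowest, run the widest filter that applies, then advance by two 4-pixel units (shift the mask bits and pb_mask by 2, step dst/l accordingly). A hedged C sketch of that control flow for the uv (wd<=6) case, with illustrative names and a stub standing in for the FILTER macro:

    #include <stdint.h>

    static void apply_filter(int wd) { (void)wd; /* SIMD kernel goes here */ }

    static void lpf_v_sb_uv_dispatch(const uint32_t vmask[2], int w)
    {
        uint32_t bits = 0x3;            /* two 4-pixel blocks per xmm pass */
        for (; w > 0; w -= 2, bits <<= 2) {
            if (vmask[1] & bits)
                apply_filter(6);        /* flat6 (wd=6) path */
            else if (vmask[0] & bits)
                apply_filter(4);        /* wd=4 path */
            /* else: nothing to filter at this position */
        }
    }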
+%if ARCH_X86_64 + shl mask_bitsd, 2 + sub wd, 2 +%else + shl mask_bitsm, 2 + sub dword wm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov hd, hm + shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm] +; r1 - m12/pb_mask +; r2 - bdmulq +; r3-8 - p2-q2 +cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 9*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif + sub lq, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 6, h + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: +%if ARCH_X86_64 + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET diff -Nru dav1d-0.7.1/src/x86/loopfilter.asm dav1d-0.9.1/src/x86/loopfilter.asm --- dav1d-0.7.1/src/x86/loopfilter.asm 2020-06-21 11:48:55.028126500 +0000 +++ dav1d-0.9.1/src/x86/loopfilter.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1600 +0,0 @@ -; Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 32 - -pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 -pb_7_1: times 16 db 7, 1 -pb_3_1: times 16 db 3, 1 -pb_2_1: times 16 db 2, 1 -pb_m1_0: times 16 db -1, 0 -pb_m1_1: times 16 db -1, 1 -pb_m1_2: times 16 db -1, 2 -pb_1: times 32 db 1 -pb_2: times 32 db 2 -pb_3: times 32 db 3 -pb_4: times 32 db 4 -pb_16: times 32 db 16 -pb_63: times 32 db 63 -pb_64: times 32 db 64 -pb_128: times 32 db 0x80 -pb_129: times 32 db 0x81 -pb_240: times 32 db 0xf0 -pb_248: times 32 db 0xf8 -pb_254: times 32 db 0xfe - -pw_2048: times 16 dw 2048 -pw_4096: times 16 dw 4096 - -pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128 - -SECTION .text - -%macro ABSSUB 4 ; dst, a, b, tmp - psubusb %1, %2, %3 - psubusb %4, %3, %2 - por %1, %4 -%endmacro - -%macro TRANSPOSE_16x4_AND_WRITE_4x32 5 - ; transpose 16x4 - punpcklbw m%5, m%1, m%2 - punpckhbw m%1, m%2 - punpcklbw m%2, m%3, m%4 - punpckhbw m%3, m%4 - punpcklwd m%4, m%5, m%2 - punpckhwd m%5, m%2 - punpcklwd m%2, m%1, m%3 - punpckhwd m%1, m%3 - - ; write out - movd [dstq+strideq*0-2], xm%4 - pextrd [dstq+strideq*1-2], xm%4, 1 - pextrd [dstq+strideq*2-2], xm%4, 2 - pextrd [dstq+stride3q-2], xm%4, 3 - lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0-2], xm%5 - pextrd [dstq+strideq*1-2], xm%5, 1 - pextrd [dstq+strideq*2-2], xm%5, 2 - pextrd [dstq+stride3q-2], xm%5, 3 - lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0-2], xm%2 - pextrd [dstq+strideq*1-2], xm%2, 1 - pextrd [dstq+strideq*2-2], xm%2, 2 - pextrd [dstq+stride3q-2], xm%2, 3 - lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0-2], xm%1 - pextrd [dstq+strideq*1-2], xm%1, 1 - pextrd [dstq+strideq*2-2], xm%1, 2 - pextrd [dstq+stride3q-2], xm%1, 3 - lea dstq, [dstq+strideq*4] - - vextracti128 xm%4, m%4, 1 - vextracti128 xm%5, m%5, 1 - vextracti128 xm%2, m%2, 1 - vextracti128 xm%1, m%1, 1 - - movd [dstq+strideq*0-2], xm%4 - pextrd [dstq+strideq*1-2], xm%4, 1 - pextrd [dstq+strideq*2-2], xm%4, 2 - pextrd [dstq+stride3q-2], xm%4, 3 - lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0-2], xm%5 - pextrd [dstq+strideq*1-2], xm%5, 1 - pextrd [dstq+strideq*2-2], xm%5, 2 - pextrd [dstq+stride3q-2], xm%5, 3 - lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0-2], xm%2 - pextrd [dstq+strideq*1-2], xm%2, 1 - pextrd [dstq+strideq*2-2], xm%2, 2 - pextrd [dstq+stride3q-2], xm%2, 3 - lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0-2], xm%1 - pextrd [dstq+strideq*1-2], xm%1, 1 - pextrd [dstq+strideq*2-2], xm%1, 2 - pextrd [dstq+stride3q-2], xm%1, 3 - lea dstq, [dstq+strideq*4] -%endmacro - -%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem -%if %1 == 0 - mova %3, m15 -%endif - - ; input in m0-15 - punpcklbw m15, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m2, m3 - punpckhbw m2, m3 - punpcklbw m3, m4, m5 - punpckhbw m4, m5 - punpcklbw m5, m6, m7 - punpckhbw m6, m7 - punpcklbw m7, m8, m9 - punpckhbw m8, m9 - punpcklbw m9, m10, m11 - punpckhbw m10, m11 - punpcklbw m11, m12, m13 - punpckhbw m12, m13 - mova m13, %3 - mova %3, m12 - punpcklbw m12, m14, m13 - punpckhbw m13, m14, m13 - - ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 - punpcklwd m14, m15, m1 - punpckhwd m15, m1 - punpcklwd m1, m0, m2 - punpckhwd m0, m2 - punpcklwd m2, m3, m5 - punpckhwd m3, m5 - punpcklwd m5, m4, m6 - punpckhwd m4, m6 - punpcklwd m6, m7, m9 - punpckhwd m7, m9 - punpcklwd m9, m8, m10 - punpckhwd m8, m10 - punpcklwd m10, m11, m12 - punpckhwd m11, m12 - mova m12, %3 - mova %3, m11 - punpcklwd m11, m12, m13 - punpckhwd m12, m13 - - ; 
interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 - punpckldq m13, m14, m2 - punpckhdq m14, m2 - punpckldq m2, m15, m3 - punpckhdq m15, m3 - punpckldq m3, m1, m5 - punpckhdq m1, m5 - punpckldq m5, m0, m4 - punpckhdq m0, m4 - punpckldq m4, m6, m10 - punpckhdq m6, m10 - punpckldq m10, m9, m11 - punpckhdq m9, m11 - punpckldq m11, m8, m12 - punpckhdq m8, m12 - mova m12, %3 - mova %3, m8 - punpckldq m8, m7, m12 - punpckhdq m7, m12 - - ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 - punpcklqdq m12, m13, m4 - punpckhqdq m13, m4 - punpcklqdq m4, m14, m6 - punpckhqdq m14, m6 - punpcklqdq m6, m2, m8 - punpckhqdq m2, m8 - punpcklqdq m8, m15, m7 - punpckhqdq m15, m7 - punpcklqdq m7, m3, m10 - punpckhqdq m3, m10 - punpcklqdq m10, m1, m9 - punpckhqdq m1, m9 - punpcklqdq m9, m5, m11 - punpckhqdq m5, m11 - mova m11, %3 - mova %3, m12 - punpcklqdq m12, m0, m11 - punpckhqdq m0, m11 -%if %2 == 0 - mova m11, %3 -%endif - - ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 - SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 - SWAP 3, 14, 12, 9 -%endmacro - -%macro FILTER 2 ; width [4/6/8/16], dir [h/v] - ; load data -%ifidn %2, v -%if %1 == 4 - lea tmpq, [dstq+mstrideq*2] - mova m3, [tmpq+strideq*0] ; p1 - mova m4, [tmpq+strideq*1] ; p0 - mova m5, [tmpq+strideq*2] ; q0 - mova m6, [tmpq+stride3q] ; q1 -%else - ; load 6-8 pixels, remainder (for wd=16) will be read inline - lea tmpq, [dstq+mstrideq*4] -%if %1 != 6 - mova m12, [tmpq+strideq*0] -%endif - mova m13, [tmpq+strideq*1] - mova m3, [tmpq+strideq*2] - mova m4, [tmpq+stride3q] - mova m5, [dstq+strideq*0] - mova m6, [dstq+strideq*1] - mova m14, [dstq+strideq*2] -%if %1 != 6 - mova m15, [dstq+stride3q] -%endif -%endif -%else - ; load lines -%if %1 == 4 - movd xm3, [dstq+strideq*0-2] - movd xm4, [dstq+strideq*1-2] - movd xm5, [dstq+strideq*2-2] - movd xm6, [dstq+stride3q -2] - lea tmpq, [dstq+strideq*4] - pinsrd xm3, [tmpq+strideq*0-2], 2 - pinsrd xm4, [tmpq+strideq*1-2], 2 - pinsrd xm5, [tmpq+strideq*2-2], 2 - pinsrd xm6, [tmpq+stride3q -2], 2 - lea tmpq, [tmpq+strideq*4] - pinsrd xm3, [tmpq+strideq*0-2], 1 - pinsrd xm4, [tmpq+strideq*1-2], 1 - pinsrd xm5, [tmpq+strideq*2-2], 1 - pinsrd xm6, [tmpq+stride3q -2], 1 - lea tmpq, [tmpq+strideq*4] - pinsrd xm3, [tmpq+strideq*0-2], 3 - pinsrd xm4, [tmpq+strideq*1-2], 3 - pinsrd xm5, [tmpq+strideq*2-2], 3 - pinsrd xm6, [tmpq+stride3q -2], 3 - lea tmpq, [tmpq+strideq*4] - movd xm12, [tmpq+strideq*0-2] - movd xm13, [tmpq+strideq*1-2] - movd xm14, [tmpq+strideq*2-2] - movd xm15, [tmpq+stride3q -2] - lea tmpq, [tmpq+strideq*4] - pinsrd xm12, [tmpq+strideq*0-2], 2 - pinsrd xm13, [tmpq+strideq*1-2], 2 - pinsrd xm14, [tmpq+strideq*2-2], 2 - pinsrd xm15, [tmpq+stride3q -2], 2 - lea tmpq, [tmpq+strideq*4] - pinsrd xm12, [tmpq+strideq*0-2], 1 - pinsrd xm13, [tmpq+strideq*1-2], 1 - pinsrd xm14, [tmpq+strideq*2-2], 1 - pinsrd xm15, [tmpq+stride3q -2], 1 - lea tmpq, [tmpq+strideq*4] - pinsrd xm12, [tmpq+strideq*0-2], 3 - pinsrd xm13, [tmpq+strideq*1-2], 3 - pinsrd xm14, [tmpq+strideq*2-2], 3 - pinsrd xm15, [tmpq+stride3q -2], 3 - vinserti128 m3, xm12, 1 - vinserti128 m4, xm13, 1 - vinserti128 m5, xm14, 1 - vinserti128 m6, xm15, 1 - - ; transpose 4x16 - ; xm3: A-D0,A-D8,A-D4,A-D12 - ; xm4: A-D1,A-D9,A-D5,A-D13 - ; xm5: A-D2,A-D10,A-D6,A-D14 - ; xm6: A-D3,A-D11,A-D7,A-D15 - punpcklbw m7, m3, m4 - punpckhbw m3, m4 - punpcklbw m4, m5, m6 - punpckhbw m5, m6 - ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 - ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 - ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 - ; 
xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 - punpcklwd m6, m7, m4 - punpckhwd m7, m4 - punpcklwd m4, m3, m5 - punpckhwd m3, m5 - ; xm6: A0-3,B0-3,C0-3,D0-3 - ; xm7: A8-11,B8-11,C8-11,D8-11 - ; xm4: A4-7,B4-7,C4-7,D4-7 - ; xm3: A12-15,B12-15,C12-15,D12-15 - punpckldq m5, m6, m4 - punpckhdq m6, m4 - punpckldq m4, m7, m3 - punpckhdq m7, m3 - ; xm5: A0-7,B0-7 - ; xm6: C0-7,D0-7 - ; xm4: A8-15,B8-15 - ; xm7: C8-15,D8-15 - punpcklqdq m3, m5, m4 - punpckhqdq m4, m5, m4 - punpcklqdq m5, m6, m7 - punpckhqdq m6, m7 - ; xm3: A0-15 - ; xm5: B0-15 - ; xm4: C0-15 - ; xm6: D0-15 -%elif %1 == 6 || %1 == 8 - movq xm3, [dstq+strideq*0-%1/2] - movq xm4, [dstq+strideq*1-%1/2] - movq xm5, [dstq+strideq*2-%1/2] - movq xm6, [dstq+stride3q -%1/2] - lea tmpq, [dstq+strideq*8] - movhps xm3, [tmpq+strideq*0-%1/2] - movhps xm4, [tmpq+strideq*1-%1/2] - movhps xm5, [tmpq+strideq*2-%1/2] - movhps xm6, [tmpq+stride3q -%1/2] - lea tmpq, [tmpq+strideq*8] - movq xm7, [tmpq+strideq*0-%1/2] - movq xm8, [tmpq+strideq*1-%1/2] - movq xm9, [tmpq+strideq*2-%1/2] - movq xm11, [tmpq+stride3q -%1/2] - lea tmpq, [tmpq+strideq*8] - movhps xm7, [tmpq+strideq*0-%1/2] - movhps xm8, [tmpq+strideq*1-%1/2] - movhps xm9, [tmpq+strideq*2-%1/2] - movhps xm11, [tmpq+stride3q -%1/2] - vinserti128 m3, xm7, 1 - vinserti128 m4, xm8, 1 - vinserti128 m5, xm9, 1 - vinserti128 m6, xm11, 1 - lea tmpq, [dstq+strideq*4] - movq xm12, [tmpq+strideq*0-%1/2] - movq xm13, [tmpq+strideq*1-%1/2] - movq xm14, [tmpq+strideq*2-%1/2] - movq xm15, [tmpq+stride3q -%1/2] - lea tmpq, [tmpq+strideq*8] - movhps xm12, [tmpq+strideq*0-%1/2] - movhps xm13, [tmpq+strideq*1-%1/2] - movhps xm14, [tmpq+strideq*2-%1/2] - movhps xm15, [tmpq+stride3q -%1/2] - lea tmpq, [tmpq+strideq*8] - movq xm7, [tmpq+strideq*0-%1/2] - movq xm8, [tmpq+strideq*1-%1/2] - movq xm9, [tmpq+strideq*2-%1/2] - movq xm11, [tmpq+stride3q -%1/2] - lea tmpq, [tmpq+strideq*8] - movhps xm7, [tmpq+strideq*0-%1/2] - movhps xm8, [tmpq+strideq*1-%1/2] - movhps xm9, [tmpq+strideq*2-%1/2] - movhps xm11, [tmpq+stride3q -%1/2] - vinserti128 m12, xm7, 1 - vinserti128 m13, xm8, 1 - vinserti128 m14, xm9, 1 - vinserti128 m15, xm11, 1 - - ; transpose 8x16 - ; xm3: A-H0,A-H8 - ; xm4: A-H1,A-H9 - ; xm5: A-H2,A-H10 - ; xm6: A-H3,A-H11 - ; xm12: A-H4,A-H12 - ; xm13: A-H5,A-H13 - ; xm14: A-H6,A-H14 - ; xm15: A-H7,A-H15 - punpcklbw m7, m3, m4 - punpckhbw m3, m4 - punpcklbw m4, m5, m6 - punpckhbw m5, m6 - punpcklbw m6, m12, m13 - punpckhbw m12, m13 - punpcklbw m13, m14, m15 - punpckhbw m14, m15 - ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 - ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 - ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 - ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 - ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 - ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 - ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 - ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 - punpcklwd m15, m7, m4 - punpckhwd m7, m4 - punpcklwd m4, m3, m5 - punpckhwd m3, m5 - punpcklwd m5, m6, m13 - punpckhwd m6, m13 - punpcklwd m13, m12, m14 - punpckhwd m12, m14 - ; xm15: A0-3,B0-3,C0-3,D0-3 - ; xm7: E0-3,F0-3,G0-3,H0-3 - ; xm4: A8-11,B8-11,C8-11,D8-11 - ; xm3: E8-11,F8-11,G8-11,H8-11 - ; xm5: A4-7,B4-7,C4-7,D4-7 - ; xm6: E4-7,F4-7,G4-7,H4-7 - ; xm13: A12-15,B12-15,C12-15,D12-15 - ; xm12: E12-15,F12-15,G12-15,H12-15 - punpckldq m14, m15, m5 - punpckhdq m15, m5 - punpckldq m5, m7, m6 -%if %1 != 6 - punpckhdq m7, m6 -%endif - punpckldq m6, m4, m13 - punpckhdq m4, m13 - punpckldq m13, 
m3, m12 -%if %1 != 6 - punpckhdq m12, m3, m12 -%endif - ; xm14: A0-7,B0-7 - ; xm15: C0-7,D0-7 - ; xm5: E0-7,F0-7 - ; xm7: G0-7,H0-7 - ; xm6: A8-15,B8-15 - ; xm4: C8-15,D8-15 - ; xm13: E8-15,F8-15 - ; xm12: G8-15,H8-15 - punpcklqdq m3, m14, m6 - punpckhqdq m14, m6 - punpckhqdq m6, m15, m4 - punpcklqdq m15, m4 - punpcklqdq m4, m5, m13 - punpckhqdq m13, m5, m13 -%if %1 == 8 - punpcklqdq m5, m7, m12 - punpckhqdq m12, m7, m12 - ; xm3: A0-15 - ; xm14: B0-15 - ; xm15: C0-15 - ; xm6: D0-15 - ; xm4: E0-15 - ; xm13: F0-15 - ; xm5: G0-15 - ; xm12: H0-15 - SWAP 12, 3, 15 - SWAP 13, 14, 5, 4, 6 - ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 -%else - SWAP 13, 3, 14 - SWAP 6, 4, 15, 5 - ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 -%endif -%else - ; load and 16x16 transpose. We only use 14 pixels but we'll need the - ; remainder at the end for the second transpose - movu xm0, [dstq+strideq*0-8] - movu xm1, [dstq+strideq*1-8] - movu xm2, [dstq+strideq*2-8] - movu xm3, [dstq+stride3q -8] - lea tmpq, [dstq+strideq*4] - movu xm4, [tmpq+strideq*0-8] - movu xm5, [tmpq+strideq*1-8] - movu xm6, [tmpq+strideq*2-8] - movu xm7, [tmpq+stride3q -8] - lea tmpq, [tmpq+strideq*4] - movu xm8, [tmpq+strideq*0-8] - movu xm9, [tmpq+strideq*1-8] - movu xm10, [tmpq+strideq*2-8] - movu xm11, [tmpq+stride3q -8] - lea tmpq, [tmpq+strideq*4] - movu xm12, [tmpq+strideq*0-8] - movu xm13, [tmpq+strideq*1-8] - movu xm14, [tmpq+strideq*2-8] - movu xm15, [tmpq+stride3q -8] - lea tmpq, [tmpq+strideq*4] - vinserti128 m0, [tmpq+strideq*0-8], 1 - vinserti128 m1, [tmpq+strideq*1-8], 1 - vinserti128 m2, [tmpq+strideq*2-8], 1 - vinserti128 m3, [tmpq+stride3q -8], 1 - lea tmpq, [tmpq+strideq*4] - vinserti128 m4, [tmpq+strideq*0-8], 1 - vinserti128 m5, [tmpq+strideq*1-8], 1 - vinserti128 m6, [tmpq+strideq*2-8], 1 - vinserti128 m7, [tmpq+stride3q -8], 1 - lea tmpq, [tmpq+strideq*4] - vinserti128 m8, [tmpq+strideq*0-8], 1 - vinserti128 m9, [tmpq+strideq*1-8], 1 - vinserti128 m10, [tmpq+strideq*2-8], 1 - vinserti128 m11, [tmpq+stride3q -8], 1 - lea tmpq, [tmpq+strideq*4] - vinserti128 m12, [tmpq+strideq*0-8], 1 - vinserti128 m13, [tmpq+strideq*1-8], 1 - vinserti128 m14, [tmpq+strideq*2-8], 1 - vinserti128 m15, [tmpq+stride3q -8], 1 - - TRANSPOSE_16X16B 0, 1, [rsp+11*32] - mova [rsp+12*32], m1 - mova [rsp+13*32], m2 - mova [rsp+14*32], m3 - mova [rsp+15*32], m12 - mova [rsp+16*32], m13 - mova [rsp+17*32], m14 - mova [rsp+18*32], m15 - ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 - SWAP 12, 4, 7 - SWAP 13, 5, 8 - SWAP 3, 6, 9 - SWAP 10, 14 - SWAP 11, 15 -%endif -%endif - - ; load L/E/I/H -%ifidn %2, v - movu m1, [lq] - movu m0, [lq+l_strideq] -%else - movq xm1, [lq] - movq xm2, [lq+l_strideq*2] - movhps xm1, [lq+l_strideq] - movhps xm2, [lq+l_stride3q] - lea lq, [lq+l_strideq*4] - movq xm10, [lq] - movq xm0, [lq+l_strideq*2] - movhps xm10, [lq+l_strideq] - movhps xm0, [lq+l_stride3q] - lea lq, [lq+l_strideq*4] - vinserti128 m1, xm10, 1 - vinserti128 m2, xm0, 1 - shufps m0, m1, m2, q3131 - shufps m1, m2, q2020 -%endif - pxor m2, m2 - pcmpeqb m10, m2, m0 - pand m1, m10 - por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] - pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] - pcmpeqb m10, m2, m0 ; !L - psrlq m2, m0, [lutq+128] - pand m2, [pb_63] - vpbroadcastb m1, [lutq+136] - pminub m2, m1 - pmaxub m2, [pb_1] ; I - pand m1, m0, [pb_240] - psrlq m1, 4 ; H - paddb m0, [pb_2] - paddb m0, m0 - paddb m0, m2 ; E - pxor m1, [pb_128] - pxor m2, [pb_128] - pxor m0, [pb_128] - - ABSSUB m8, m3, m4, m9 ; abs(p1-p0) - pmaxub m8, m10 - ABSSUB m9, m5, m6, m10 ; abs(q1-q0) - pmaxub m8, m9 -%if %1 == 4 - pxor m8, [pb_128] - pcmpgtb m7, m8, m1 ; hev -%else - pxor m7, m8, [pb_128] - pcmpgtb m7, m1 ; hev - -%if %1 == 6 - ABSSUB m9, m13, m4, m10 ; abs(p2-p0) - pmaxub m9, m8 -%else - ABSSUB m9, m12, m4, m10 ; abs(p3-p0) - pmaxub m9, m8 - ABSSUB m10, m13, m4, m11 ; abs(p2-p0) - pmaxub m9, m10 -%endif - ABSSUB m10, m5, m14, m11 ; abs(q2-q0) - pmaxub m9, m10 -%if %1 != 6 - ABSSUB m10, m5, m15, m11 ; abs(q3-q0) - pmaxub m9, m10 -%endif - pxor m9, [pb_128] - pcmpgtb m9, [pb_129] ; !flat8in - -%if %1 == 6 - ABSSUB m10, m13, m3, m1 ; abs(p2-p1) -%else - ABSSUB m10, m12, m13, m11 ; abs(p3-p2) - ABSSUB m11, m13, m3, m1 ; abs(p2-p1) - pmaxub m10, m11 - ABSSUB m11, m14, m15, m1 ; abs(q3-q2) - pmaxub m10, m11 -%endif - ABSSUB m11, m14, m6, m1 ; abs(q2-q1) - pmaxub m10, m11 -%if %1 == 16 - vpbroadcastd m11, [maskq+8] - vpbroadcastd m1, [maskq+4] - por m11, m1 - pand m11, [pb_mask] - pcmpeqd m11, [pb_mask] - pand m10, m11 -%else - vpbroadcastd m11, [maskq+4] - pand m11, [pb_mask] - pcmpeqd m11, [pb_mask] - pand m10, m11 ; only apply fm-wide to wd>4 blocks -%endif - pmaxub m8, m10 - - pxor m8, [pb_128] -%endif - pcmpgtb m8, m2 - - ABSSUB m10, m3, m6, m11 ; abs(p1-q1) - ABSSUB m11, m4, m5, m2 ; abs(p0-q0) - paddusb m11, m11 - pand m10, [pb_254] - psrlq m10, 1 - paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) - pxor m10, [pb_128] - pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E - por m8, m10 - -%if %1 == 16 -%ifidn %2, v - lea tmpq, [dstq+mstrideq*8] - mova m0, [tmpq+strideq*1] -%else - mova m0, [rsp+12*32] -%endif - ABSSUB m1, m0, m4, m2 -%ifidn %2, v - mova m0, [tmpq+strideq*2] -%else - mova m0, [rsp+13*32] -%endif - ABSSUB m2, m0, m4, m10 - pmaxub m1, m2 -%ifidn %2, v - mova m0, [tmpq+stride3q] -%else - mova m0, [rsp+14*32] -%endif - ABSSUB m2, m0, m4, m10 - pmaxub m1, m2 -%ifidn %2, v - lea tmpq, [dstq+strideq*4] - mova m0, [tmpq+strideq*0] -%else - mova m0, [rsp+15*32] -%endif - ABSSUB m2, m0, m5, m10 - pmaxub m1, m2 -%ifidn %2, v - mova m0, [tmpq+strideq*1] -%else - mova m0, [rsp+16*32] -%endif - ABSSUB m2, m0, m5, m10 - pmaxub m1, m2 -%ifidn %2, v - mova m0, [tmpq+strideq*2] -%else - mova m0, [rsp+17*32] -%endif - ABSSUB m2, m0, m5, m10 - pmaxub m1, m2 - pxor m1, [pb_128] - pcmpgtb m1, [pb_129] ; !flat8out - por m1, m9 ; !flat8in | !flat8out - vpbroadcastd m2, [maskq+8] - pand m10, m2, [pb_mask] - pcmpeqd m10, [pb_mask] - pandn m1, m10 ; flat16 - pandn m1, m8, m1 ; flat16 & fm - - vpbroadcastd m10, [maskq+4] - por m10, m2 - pand m2, m10, [pb_mask] - pcmpeqd m2, [pb_mask] - pandn m9, m2 ; flat8in - pandn m9, m8, m9 - vpbroadcastd m2, [maskq+0] - por m2, m10 - pand m2, [pb_mask] - pcmpeqd m2, [pb_mask] - pandn m8, m2 - pandn m8, m9, m8 ; fm & !flat8 & !flat16 - pandn m9, m1, m9 ; flat8 & !flat16 -%elif %1 != 4 - vpbroadcastd m0, [maskq+4] - pand m2, m0, [pb_mask] - pcmpeqd m2, [pb_mask] - pandn m9, m2 - pandn m9, m8, m9 ; flat8 & fm - vpbroadcastd m2, [maskq+0] - por m0, m2 - pand m0, [pb_mask] - pcmpeqd m0, [pb_mask] - pandn m8, m0 - pandn m8, m9, m8 ; fm & !flat8 -%else - vpbroadcastd m0, [maskq+0] - pand m0, [pb_mask] 
- pcmpeqd m0, [pb_mask] - pandn m8, m0 ; fm -%endif - - ; short filter - - pxor m3, [pb_128] - pxor m6, [pb_128] - psubsb m10, m3, m6 ; iclip_diff(p1-q1) - pand m10, m7 ; f=iclip_diff(p1-q1)&hev - pxor m4, [pb_128] - pxor m5, [pb_128] - psubsb m11, m5, m4 - paddsb m10, m11 - paddsb m10, m11 - paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f) - pand m8, m10 ; f&=fm - paddsb m10, m8, [pb_3] - paddsb m8, [pb_4] - pand m10, [pb_248] - pand m8, [pb_248] - psrlq m10, 3 - psrlq m8, 3 - pxor m10, [pb_16] - pxor m8, [pb_16] - psubb m10, [pb_16] ; f2 - psubb m8, [pb_16] ; f1 - paddsb m4, m10 - psubsb m5, m8 - pxor m4, [pb_128] - pxor m5, [pb_128] - - pxor m8, [pb_128] - pxor m10, m10 - pavgb m8, m10 ; f=(f1+1)>>1 - psubb m8, [pb_64] - pandn m8, m7, m8 ; f&=!hev - paddsb m3, m8 - psubsb m6, m8 - pxor m3, [pb_128] - pxor m6, [pb_128] - -%if %1 == 16 - ; flat16 filter -%ifidn %2, v - lea tmpq, [dstq+mstrideq*8] - mova m0, [tmpq+strideq*1] ; p6 - mova m2, [tmpq+strideq*2] ; p5 - mova m7, [tmpq+stride3q] ; p4 -%else - mova m0, [rsp+12*32] - mova m2, [rsp+13*32] - mova m7, [rsp+14*32] -%endif - - mova [rsp+0*32], m9 - mova [rsp+1*32], m14 - mova [rsp+2*32], m15 - - ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A - ; write -6 - punpcklbw m14, m0, m12 - punpckhbw m15, m0, m12 - pmaddubsw m10, m14, [pb_7_1] - pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3 - punpcklbw m8, m2, m7 - punpckhbw m9, m2, m7 - pmaddubsw m8, [pb_2] - pmaddubsw m9, [pb_2] - paddw m10, m8 - paddw m11, m9 ; p6*7+p5*2+p4*2+p3 - punpcklbw m8, m13, m3 - punpckhbw m9, m13, m3 - pmaddubsw m8, [pb_1] - pmaddubsw m9, [pb_1] - paddw m10, m8 - paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 - punpcklbw m8, m4, m5 - punpckhbw m9, m4, m5 - pmaddubsw m8, [pb_1] - pmaddubsw m9, [pb_1] - paddw m10, m8 - paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m2 - por m8, m9 -%ifidn %2, v - mova [tmpq+strideq*2], m8 ; p5 -%else - mova [rsp+13*32], m8 -%endif - - ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B - ; write -5 - pmaddubsw m14, [pb_m1_1] - pmaddubsw m15, [pb_m1_1] - paddw m10, m14 - paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 - punpcklbw m8, m0, m6 - punpckhbw m9, m0, m6 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - mova [rsp+3*32], m8 - mova [rsp+4*32], m9 - paddw m10, m8 - paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m7 - por m8, m9 -%ifidn %2, v - mova [tmpq+stride3q], m8 ; p4 -%else - mova [rsp+14*32], m8 -%endif - - ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C - ; write -4 - mova m14, [rsp+1*32] - punpcklbw m8, m0, m13 - punpckhbw m9, m0, m13 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - paddw m10, m8 - paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 - punpcklbw m8, m2, m14 - punpckhbw m2, m14 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m2, [pb_m1_1] - mova [rsp+1*32], m8 - paddw m10, m8 - paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m12 - por m8, m9 -%ifidn %2, v - mova [tmpq+strideq*4], m8 ; p3 -%else - mova [rsp+19*32], m8 -%endif - - ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D - ; write -3 - mova m15, [rsp+2*32] - punpcklbw m8, m0, m3 - punpckhbw m9, m0, m3 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - paddw m10, m8 - paddw m11, m9 ; 
p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 - punpcklbw m8, m7, m15 - punpckhbw m7, m15 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m7, [pb_m1_1] - mova [rsp+2*32], m8 - paddw m10, m8 - paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m13 - por m8, m9 - mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F - - ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E - ; write -2 -%ifidn %2, v - lea tmpq, [dstq+strideq*4] -%endif - punpcklbw m8, m0, m4 - punpckhbw m9, m0, m4 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - paddw m10, m8 - paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 -%ifidn %2, v - mova m9, [tmpq+strideq*0] ; q4 -%else - mova m9, [rsp+15*32] -%endif - punpcklbw m8, m12, m9 - punpckhbw m9, m12, m9 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - mova [rsp+7*32], m8 - mova [rsp+5*32], m9 - paddw m10, m8 - paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m3 - por m8, m9 - mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G - - ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F - ; write -1 -%ifidn %2, v - mova m9, [tmpq+strideq*1] ; q5 -%else - mova m9, [rsp+16*32] -%endif - punpcklbw m8, m0, m5 - punpckhbw m0, m5 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m0, [pb_m1_1] - paddw m10, m8 - paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 - punpcklbw m0, m13, m9 - punpckhbw m9, m13, m9 - mova m13, [rsp+6*32] - pmaddubsw m0, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - mova [rsp+ 9*32], m0 - mova [rsp+10*32], m9 - paddw m10, m0 - paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 - pmulhrsw m0, m10, [pw_2048] - pmulhrsw m8, m11, [pw_2048] - packuswb m0, m8 - pand m0, m1 - pandn m8, m1, m4 - por m0, m8 - mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H - - ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G - ; write +0 -%ifidn %2, v - mova m0, [tmpq+strideq*2] ; q6 -%else - mova m0, [rsp+17*32] -%endif - paddw m10, [rsp+3*32] - paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 - punpcklbw m8, m3, m0 - punpckhbw m9, m3, m0 - mova m3, [rsp+8*32] - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - mova [rsp+3*32], m8 - mova [rsp+4*32], m9 - paddw m10, m8 - paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m5 - por m8, m9 - mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I - - ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H - ; write +1 - paddw m10, [rsp+1*32] - paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 - punpcklbw m8, m4, m0 - punpckhbw m2, m4, m0 - mova m4, [rsp+6*32] - pmaddubsw m8, [pb_m1_1] - pmaddubsw m2, [pb_m1_1] - paddw m10, m8 - paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 - pmulhrsw m2, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m2, m9 - pand m2, m1 - pandn m9, m1, m6 - por m2, m9 ; don't clobber q1/m6 since we need it in K - - ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I - ; write +2 - paddw m10, [rsp+2*32] - paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 - punpcklbw m8, m5, m0 - punpckhbw m9, m5, m0 - mova m5, [rsp+8*32] - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - paddw m10, m8 - paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 - 
pmulhrsw m7, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m7, m9 - pand m7, m1 - pandn m9, m1, m14 - por m7, m9 ; don't clobber q2/m14 since we need it in K - - ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J - ; write +3 - paddw m10, [rsp+7*32] - paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 - punpcklbw m8, m6, m0 - punpckhbw m9, m6, m0 - SWAP 2, 6 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - paddw m10, m8 - paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 - pandn m9, m1, m15 - por m8, m9 -%ifidn %2, v - mova [tmpq+mstrideq], m8 ; q3 -%else - mova [rsp+20*32], m8 -%endif - - ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K - ; write +4 - paddw m10, [rsp+ 9*32] - paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - punpcklbw m8, m14, m0 - punpckhbw m9, m14, m0 - SWAP 14, 7 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - paddw m10, m8 - paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 - pmulhrsw m8, m10, [pw_2048] - pmulhrsw m9, m11, [pw_2048] - packuswb m8, m9 - pand m8, m1 -%ifidn %2, v - pandn m9, m1, [tmpq+strideq*0] -%else - pandn m9, m1, [rsp+15*32] -%endif - por m8, m9 -%ifidn %2, v - mova [tmpq+strideq*0], m8 ; q4 -%else - mova [rsp+15*32], m8 -%endif - - ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L - ; write +5 - paddw m10, [rsp+3*32] - paddw m11, [rsp+4*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - punpcklbw m8, m15, m0 - punpckhbw m9, m15, m0 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m9, [pb_m1_1] - paddw m10, m8 - paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 - pmulhrsw m10, [pw_2048] - pmulhrsw m11, [pw_2048] - packuswb m10, m11 - pand m10, m1 -%ifidn %2, v - pandn m11, m1, [tmpq+strideq*1] -%else - pandn m11, m1, [rsp+16*32] -%endif - por m10, m11 -%ifidn %2, v - mova [tmpq+strideq*1], m10 ; q5 -%else - mova [rsp+16*32], m10 -%endif - - mova m9, [rsp+0*32] -%ifidn %2, v - lea tmpq, [dstq+mstrideq*4] -%endif -%endif -%if %1 >= 8 - ; flat8 filter - punpcklbw m0, m12, m3 - punpckhbw m1, m12, m3 - pmaddubsw m2, m0, [pb_3_1] - pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1 - punpcklbw m8, m13, m4 - punpckhbw m11, m13, m4 - pmaddubsw m8, [pb_2_1] - pmaddubsw m11, [pb_2_1] - paddw m2, m8 - paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 - punpcklbw m8, m5, [pb_4] - punpckhbw m11, m5, [pb_4] - pmaddubsw m8, [pb_1] - pmaddubsw m11, [pb_1] - paddw m2, m8 - paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 - psrlw m8, m2, 3 - psrlw m11, m7, 3 - packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m13 - por m10, m8, m11 ; p2 -%ifidn %2, v - mova [tmpq+strideq*1], m10 ; p2 -%endif - - pmaddubsw m8, m0, [pb_m1_1] - pmaddubsw m11, m1, [pb_m1_1] - paddw m2, m8 - paddw m7, m11 - punpcklbw m8, m13, m6 - punpckhbw m11, m13, m6 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m11, [pb_m1_1] - paddw m2, m8 - paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 - psrlw m8, m2, 3 - psrlw m11, m7, 3 - packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m3 - por m8, m11 ; p1 -%ifidn %2, v - mova [tmpq+strideq*2], m8 ; p1 -%else - mova [rsp+0*32], m8 -%endif - - pmaddubsw m0, [pb_1] - pmaddubsw m1, [pb_1] - psubw m2, m0 - psubw m7, m1 - punpcklbw m8, m4, m14 - punpckhbw m11, m4, m14 - pmaddubsw m8, [pb_1] - pmaddubsw m11, [pb_1] - paddw m2, m8 - paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 - psrlw m8, m2, 3 - psrlw m11, m7, 3 - packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m4 - por m8, m11 ; p0 -%ifidn %2, v - mova [tmpq+stride3q ], m8 ; 
p0 -%else - mova [rsp+1*32], m8 -%endif - - punpcklbw m0, m5, m15 - punpckhbw m1, m5, m15 - pmaddubsw m8, m0, [pb_1] - pmaddubsw m11, m1, [pb_1] - paddw m2, m8 - paddw m7, m11 - punpcklbw m8, m4, m12 - punpckhbw m11, m4, m12 - pmaddubsw m8, [pb_1] - pmaddubsw m11, [pb_1] - psubw m2, m8 - psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 - psrlw m8, m2, 3 - psrlw m11, m7, 3 - packuswb m8, m11 - pand m8, m9 - pandn m11, m9, m5 - por m11, m8, m11 ; q0 -%ifidn %2, v - mova [dstq+strideq*0], m11 ; q0 -%endif - - pmaddubsw m0, [pb_m1_1] - pmaddubsw m1, [pb_m1_1] - paddw m2, m0 - paddw m7, m1 - punpcklbw m8, m13, m6 - punpckhbw m13, m6 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m13, [pb_m1_1] - paddw m2, m8 - paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 - psrlw m8, m2, 3 - psrlw m13, m7, 3 - packuswb m8, m13 - pand m8, m9 - pandn m13, m9, m6 - por m13, m8, m13 ; q1 -%ifidn %2, v - mova [dstq+strideq*1], m13 ; q1 -%endif - - punpcklbw m0, m3, m6 - punpckhbw m1, m3, m6 - pmaddubsw m0, [pb_1] - pmaddubsw m1, [pb_1] - psubw m2, m0 - psubw m7, m1 - punpcklbw m0, m14, m15 - punpckhbw m1, m14, m15 - pmaddubsw m0, [pb_1] - pmaddubsw m1, [pb_1] - paddw m2, m0 - paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 - psrlw m2, 3 - psrlw m7, 3 - packuswb m2, m7 - pand m2, m9 - pandn m7, m9, m14 - por m2, m7 ; q2 -%ifidn %2, v - mova [dstq+strideq*2], m2 ; q2 -%else - mova m0, [rsp+0*32] - mova m1, [rsp+1*32] -%if %1 == 8 - ; 16x8 transpose - punpcklbw m3, m12, m10 - punpckhbw m12, m10 - punpcklbw m10, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m11, m13 - punpckhbw m11, m13 - punpcklbw m13, m2, m15 - punpckhbw m2, m15 - - punpcklwd m15, m3, m10 - punpckhwd m3, m10 - punpcklwd m10, m12, m0 - punpckhwd m12, m0 - punpcklwd m0, m1, m13 - punpckhwd m1, m13 - punpcklwd m13, m11, m2 - punpckhwd m11, m2 - - punpckldq m2, m15, m0 - punpckhdq m15, m0 - punpckldq m0, m3, m1 - punpckhdq m3, m1 - punpckldq m1, m10, m13 - punpckhdq m10, m13 - punpckldq m13, m12, m11 - punpckhdq m12, m11 - - ; write 8x32 - movq [dstq+strideq*0-4], xm2 - movhps [dstq+strideq*1-4], xm2 - movq [dstq+strideq*2-4], xm15 - movhps [dstq+stride3q -4], xm15 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm0 - movhps [dstq+strideq*1-4], xm0 - movq [dstq+strideq*2-4], xm3 - movhps [dstq+stride3q -4], xm3 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm1 - movhps [dstq+strideq*1-4], xm1 - movq [dstq+strideq*2-4], xm10 - movhps [dstq+stride3q -4], xm10 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm13 - movhps [dstq+strideq*1-4], xm13 - movq [dstq+strideq*2-4], xm12 - movhps [dstq+stride3q -4], xm12 - lea dstq, [dstq+strideq*4] - - vextracti128 xm2, m2, 1 - vextracti128 xm15, m15, 1 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m3, 1 - vextracti128 xm1, m1, 1 - vextracti128 xm10, m10, 1 - vextracti128 xm13, m13, 1 - vextracti128 xm12, m12, 1 - - movq [dstq+strideq*0-4], xm2 - movhps [dstq+strideq*1-4], xm2 - movq [dstq+strideq*2-4], xm15 - movhps [dstq+stride3q -4], xm15 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm0 - movhps [dstq+strideq*1-4], xm0 - movq [dstq+strideq*2-4], xm3 - movhps [dstq+stride3q -4], xm3 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm1 - movhps [dstq+strideq*1-4], xm1 - movq [dstq+strideq*2-4], xm10 - movhps [dstq+stride3q -4], xm10 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm13 - movhps [dstq+strideq*1-4], xm13 - movq [dstq+strideq*2-4], xm12 - movhps [dstq+stride3q -4], xm12 - lea dstq, [dstq+strideq*4] -%else - ; 16x16 transpose and store - SWAP 
5, 10, 2 - SWAP 6, 0 - SWAP 7, 1 - SWAP 8, 11 - SWAP 9, 13 - mova m0, [rsp+11*32] - mova m1, [rsp+12*32] - mova m2, [rsp+13*32] - mova m3, [rsp+14*32] - mova m4, [rsp+19*32] - mova m11, [rsp+20*32] - mova m12, [rsp+15*32] - mova m13, [rsp+16*32] - mova m14, [rsp+17*32] - TRANSPOSE_16X16B 1, 0, [rsp+18*32] - movu [dstq+strideq*0-8], xm0 - movu [dstq+strideq*1-8], xm1 - movu [dstq+strideq*2-8], xm2 - movu [dstq+stride3q -8], xm3 - lea dstq, [dstq+strideq*4] - movu [dstq+strideq*0-8], xm4 - movu [dstq+strideq*1-8], xm5 - movu [dstq+strideq*2-8], xm6 - movu [dstq+stride3q -8], xm7 - lea dstq, [dstq+strideq*4] - movu [dstq+strideq*0-8], xm8 - movu [dstq+strideq*1-8], xm9 - movu [dstq+strideq*2-8], xm10 - movu [dstq+stride3q -8], xm11 - lea dstq, [dstq+strideq*4] - movu [dstq+strideq*0-8], xm12 - movu [dstq+strideq*1-8], xm13 - movu [dstq+strideq*2-8], xm14 - movu [dstq+stride3q -8], xm15 - lea dstq, [dstq+strideq*4] - vextracti128 [dstq+strideq*0-8], m0, 1 - vextracti128 [dstq+strideq*1-8], m1, 1 - vextracti128 [dstq+strideq*2-8], m2, 1 - vextracti128 [dstq+stride3q -8], m3, 1 - lea dstq, [dstq+strideq*4] - vextracti128 [dstq+strideq*0-8], m4, 1 - vextracti128 [dstq+strideq*1-8], m5, 1 - vextracti128 [dstq+strideq*2-8], m6, 1 - vextracti128 [dstq+stride3q -8], m7, 1 - lea dstq, [dstq+strideq*4] - vextracti128 [dstq+strideq*0-8], m8, 1 - vextracti128 [dstq+strideq*1-8], m9, 1 - vextracti128 [dstq+strideq*2-8], m10, 1 - vextracti128 [dstq+stride3q -8], m11, 1 - lea dstq, [dstq+strideq*4] - vextracti128 [dstq+strideq*0-8], m12, 1 - vextracti128 [dstq+strideq*1-8], m13, 1 - vextracti128 [dstq+strideq*2-8], m14, 1 - vextracti128 [dstq+stride3q -8], m15, 1 - lea dstq, [dstq+strideq*4] -%endif -%endif -%elif %1 == 6 - ; flat6 filter - - punpcklbw m8, m13, m5 - punpckhbw m11, m13, m5 - pmaddubsw m0, m8, [pb_3_1] - pmaddubsw m1, m11, [pb_3_1] - punpcklbw m7, m4, m3 - punpckhbw m10, m4, m3 - pmaddubsw m2, m7, [pb_2] - pmaddubsw m12, m10, [pb_2] - paddw m0, m2 - paddw m1, m12 - pmulhrsw m2, m0, [pw_4096] - pmulhrsw m12, m1, [pw_4096] - packuswb m2, m12 - pand m2, m9 - pandn m12, m9, m3 - por m2, m12 -%ifidn %2, v - mova [tmpq+strideq*2], m2 ; p1 -%endif - - pmaddubsw m8, [pb_m1_1] - pmaddubsw m11, [pb_m1_1] - paddw m0, m8 - paddw m1, m11 - punpcklbw m8, m13, m6 - punpckhbw m11, m13, m6 - pmaddubsw m8, [pb_m1_1] - pmaddubsw m11, [pb_m1_1] - paddw m0, m8 - paddw m1, m11 - pmulhrsw m12, m0, [pw_4096] - pmulhrsw m13, m1, [pw_4096] - packuswb m12, m13 - pand m12, m9 - pandn m13, m9, m4 - por m12, m13 -%ifidn %2, v - mova [tmpq+stride3q], m12 ; p0 -%endif - - paddw m0, m8 - paddw m1, m11 - punpcklbw m8, m3, m14 - punpckhbw m11, m3, m14 - pmaddubsw m14, m8, [pb_m1_1] - pmaddubsw m13, m11, [pb_m1_1] - paddw m0, m14 - paddw m1, m13 - pmulhrsw m14, m0, [pw_4096] - pmulhrsw m13, m1, [pw_4096] - packuswb m14, m13 - pand m14, m9 - pandn m13, m9, m5 - por m14, m13 -%ifidn %2, v - mova [dstq+strideq*0], m14 ; q0 -%endif - - pmaddubsw m8, [pb_m1_2] - pmaddubsw m11, [pb_m1_2] - paddw m0, m8 - paddw m1, m11 - pmaddubsw m7, [pb_m1_0] - pmaddubsw m10, [pb_m1_0] - paddw m0, m7 - paddw m1, m10 - pmulhrsw m0, [pw_4096] - pmulhrsw m1, [pw_4096] - packuswb m0, m1 - pand m0, m9 - pandn m9, m6 - por m0, m9 -%ifidn %2, v - mova [dstq+strideq*1], m0 ; q1 -%else - TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 -%endif -%else -%ifidn %2, v - mova [tmpq+strideq*0], m3 ; p1 - mova [tmpq+strideq*1], m4 ; p0 - mova [tmpq+strideq*2], m5 ; q0 - mova [tmpq+stride3q ], m6 ; q1 -%else - TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 -%endif -%endif 
-%endmacro - -INIT_YMM avx2 -cglobal lpf_v_sb_y, 7, 10, 16, 32 * 11, \ - dst, stride, mask, l, l_stride, lut, \ - w, stride3, mstride, tmp - shl l_strideq, 2 - sub lq, l_strideq - mov mstrideq, strideq - neg mstrideq - lea stride3q, [strideq*3] - -.loop: - cmp byte [maskq+8], 0 ; vmask[2] - je .no_flat16 - - FILTER 16, v - jmp .end - -.no_flat16: - cmp byte [maskq+4], 0 ; vmask[1] - je .no_flat - - FILTER 8, v - jmp .end - -.no_flat: - cmp byte [maskq+0], 0 ; vmask[0] - je .end - - FILTER 4, v - -.end: - add lq, 32 - add dstq, 32 - add maskq, 1 - sub wd, 8 - jg .loop - RET - -INIT_YMM avx2 -cglobal lpf_h_sb_y, 7, 10, 16, 32 * 21, \ - dst, stride, mask, l, l_stride, lut, \ - h, stride3, l_stride3, tmp - shl l_strideq, 2 - sub lq, 4 - lea stride3q, [strideq*3] - lea l_stride3q, [l_strideq*3] - -.loop: - cmp byte [maskq+8], 0 ; vmask[2] - je .no_flat16 - - FILTER 16, h - jmp .end - -.no_flat16: - cmp byte [maskq+4], 0 ; vmask[1] - je .no_flat - - FILTER 8, h - jmp .end - -.no_flat: - cmp byte [maskq+0], 0 ; vmask[0] - je .no_filter - - FILTER 4, h - jmp .end - -.no_filter: - lea dstq, [dstq+stride3q*8] - lea lq, [lq+l_strideq*8] - lea dstq, [dstq+strideq*8] -.end: - add maskq, 1 - sub hd, 8 - jg .loop - RET - -INIT_YMM avx2 -cglobal lpf_v_sb_uv, 7, 10, 16, \ - dst, stride, mask, l, l_stride, lut, \ - w, stride3, mstride, tmp - shl l_strideq, 2 - sub lq, l_strideq - mov mstrideq, strideq - neg mstrideq - lea stride3q, [strideq*3] - -.loop: - cmp byte [maskq+4], 0 ; vmask[1] - je .no_flat - - FILTER 6, v - jmp .end - -.no_flat: - cmp byte [maskq+0], 0 ; vmask[0] - je .end - - FILTER 4, v - -.end: - add lq, 32 - add dstq, 32 - add maskq, 1 - sub wd, 8 - jg .loop - RET - -INIT_YMM avx2 -cglobal lpf_h_sb_uv, 7, 10, 16, \ - dst, stride, mask, l, l_stride, lut, \ - h, stride3, l_stride3, tmp - shl l_strideq, 2 - sub lq, 4 - lea stride3q, [strideq*3] - lea l_stride3q, [l_strideq*3] - -.loop: - cmp byte [maskq+4], 0 ; vmask[1] - je .no_flat - - FILTER 6, h - jmp .end - -.no_flat: - cmp byte [maskq+0], 0 ; vmask[0] - je .no_filter - - FILTER 4, h - jmp .end - -.no_filter: - lea dstq, [dstq+stride3q*8] - lea lq, [lq+l_strideq*8] - lea dstq, [dstq+strideq*8] -.end: - add maskq, 1 - sub hd, 8 - jg .loop - RET - -%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/loopfilter_avx2.asm dav1d-0.9.1/src/x86/loopfilter_avx2.asm --- dav1d-0.7.1/src/x86/loopfilter_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/loopfilter_avx2.asm 2021-07-28 21:38:28.905852000 +0000 @@ -0,0 +1,1561 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +pb_7_1: times 16 db 7, 1 +pb_3_1: times 16 db 3, 1 +pb_2_1: times 16 db 2, 1 +pb_m1_0: times 16 db -1, 0 +pb_m1_1: times 16 db -1, 1 +pb_m1_2: times 16 db -1, 2 +pb_1: times 32 db 1 +pb_2: times 32 db 2 +pb_3: times 32 db 3 +pb_4: times 32 db 4 +pb_16: times 32 db 16 +pb_63: times 32 db 63 +pb_64: times 32 db 64 +pb_128: times 32 db 0x80 +pb_129: times 32 db 0x81 +pb_240: times 32 db 0xf0 +pb_248: times 32 db 0xf8 +pb_254: times 32 db 0xfe + +pw_2048: times 16 dw 2048 +pw_4096: times 16 dw 4096 + +pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128 + +SECTION .text + +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 + psubusb %4, %3, %2 + por %1, %4 +%endmacro + +%macro TRANSPOSE_16x4_AND_WRITE_4x32 5 + ; transpose 16x4 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + + ; write out + movd [dstq+strideq*0-2], xm%4 + pextrd [dstq+strideq*1-2], xm%4, 1 + pextrd [dstq+strideq*2-2], xm%4, 2 + pextrd [dstq+stride3q-2], xm%4, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%5 + pextrd [dstq+strideq*1-2], xm%5, 1 + pextrd [dstq+strideq*2-2], xm%5, 2 + pextrd [dstq+stride3q-2], xm%5, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%2 + pextrd [dstq+strideq*1-2], xm%2, 1 + pextrd [dstq+strideq*2-2], xm%2, 2 + pextrd [dstq+stride3q-2], xm%2, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%1 + pextrd [dstq+strideq*1-2], xm%1, 1 + pextrd [dstq+strideq*2-2], xm%1, 2 + pextrd [dstq+stride3q-2], xm%1, 3 + lea dstq, [dstq+strideq*4] + + vextracti128 xm%4, m%4, 1 + vextracti128 xm%5, m%5, 1 + vextracti128 xm%2, m%2, 1 + vextracti128 xm%1, m%1, 1 + + movd [dstq+strideq*0-2], xm%4 + pextrd [dstq+strideq*1-2], xm%4, 1 + pextrd [dstq+strideq*2-2], xm%4, 2 + pextrd [dstq+stride3q-2], xm%4, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%5 + pextrd [dstq+strideq*1-2], xm%5, 1 + pextrd [dstq+strideq*2-2], xm%5, 2 + pextrd [dstq+stride3q-2], xm%5, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%2 + pextrd [dstq+strideq*1-2], xm%2, 1 + pextrd [dstq+strideq*2-2], xm%2, 2 + pextrd [dstq+stride3q-2], xm%2, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%1 + pextrd [dstq+strideq*1-2], xm%1, 1 + pextrd [dstq+strideq*2-2], xm%1, 2 + pextrd [dstq+stride3q-2], xm%1, 3 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem +%if %1 == 0 + mova %3, m15 +%endif + + ; input in m0-15 + punpcklbw m15, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklbw m7, m8, m9 + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m12, m13 + 
punpckhbw m12, m13 + mova m13, %3 + mova %3, m12 + punpcklbw m12, m14, m13 + punpckhbw m13, m14, m13 + + ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 + punpcklwd m14, m15, m1 + punpckhwd m15, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd m10, m11, m12 + punpckhwd m11, m12 + mova m12, %3 + mova %3, m11 + punpcklwd m11, m12, m13 + punpckhwd m12, m13 + + ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 + punpckldq m13, m14, m2 + punpckhdq m14, m2 + punpckldq m2, m15, m3 + punpckhdq m15, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m6, m10 + punpckhdq m6, m10 + punpckldq m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m12 + punpckhdq m8, m12 + mova m12, %3 + mova %3, m8 + punpckldq m8, m7, m12 + punpckhdq m7, m12 + + ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 + punpcklqdq m12, m13, m4 + punpckhqdq m13, m4 + punpcklqdq m4, m14, m6 + punpckhqdq m14, m6 + punpcklqdq m6, m2, m8 + punpckhqdq m2, m8 + punpcklqdq m8, m15, m7 + punpckhqdq m15, m7 + punpcklqdq m7, m3, m10 + punpckhqdq m3, m10 + punpcklqdq m10, m1, m9 + punpckhqdq m1, m9 + punpcklqdq m9, m5, m11 + punpckhqdq m5, m11 + mova m11, %3 + mova %3, m12 + punpcklqdq m12, m0, m11 + punpckhqdq m0, m11 +%if %2 == 0 + mova m11, %3 +%endif + + ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 + SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 + SWAP 3, 14, 12, 9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] +%if %1 != 6 + mova m12, [tmpq+strideq*0] +%endif + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif +%endif +%else + ; load lines +%if %1 == 4 + movd xm3, [dstq+strideq*0-2] + movd xm4, [dstq+strideq*1-2] + movd xm5, [dstq+strideq*2-2] + movd xm6, [dstq+stride3q -2] + lea tmpq, [dstq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 2 + pinsrd xm4, [tmpq+strideq*1-2], 2 + pinsrd xm5, [tmpq+strideq*2-2], 2 + pinsrd xm6, [tmpq+stride3q -2], 2 + lea tmpq, [tmpq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 1 + pinsrd xm4, [tmpq+strideq*1-2], 1 + pinsrd xm5, [tmpq+strideq*2-2], 1 + pinsrd xm6, [tmpq+stride3q -2], 1 + lea tmpq, [tmpq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 3 + pinsrd xm4, [tmpq+strideq*1-2], 3 + pinsrd xm5, [tmpq+strideq*2-2], 3 + pinsrd xm6, [tmpq+stride3q -2], 3 + lea tmpq, [tmpq+strideq*4] + movd xm12, [tmpq+strideq*0-2] + movd xm13, [tmpq+strideq*1-2] + movd xm14, [tmpq+strideq*2-2] + movd xm15, [tmpq+stride3q -2] + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 2 + pinsrd xm13, [tmpq+strideq*1-2], 2 + pinsrd xm14, [tmpq+strideq*2-2], 2 + pinsrd xm15, [tmpq+stride3q -2], 2 + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 1 + pinsrd xm13, [tmpq+strideq*1-2], 1 + pinsrd xm14, [tmpq+strideq*2-2], 1 + pinsrd xm15, [tmpq+stride3q -2], 1 + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 3 + pinsrd xm13, [tmpq+strideq*1-2], 3 + pinsrd xm14, 
[tmpq+strideq*2-2], 3 + pinsrd xm15, [tmpq+stride3q -2], 3 + vinserti128 m3, xm12, 1 + vinserti128 m4, xm13, 1 + vinserti128 m5, xm14, 1 + vinserti128 m6, xm15, 1 + + ; transpose 4x16 + ; xm3: A-D0,A-D8,A-D4,A-D12 + ; xm4: A-D1,A-D9,A-D5,A-D13 + ; xm5: A-D2,A-D10,A-D6,A-D14 + ; xm6: A-D3,A-D11,A-D7,A-D15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 + ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 + ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 + ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 + punpcklwd m6, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + ; xm6: A0-3,B0-3,C0-3,D0-3 + ; xm7: A8-11,B8-11,C8-11,D8-11 + ; xm4: A4-7,B4-7,C4-7,D4-7 + ; xm3: A12-15,B12-15,C12-15,D12-15 + punpckldq m5, m6, m4 + punpckhdq m6, m4 + punpckldq m4, m7, m3 + punpckhdq m7, m3 + ; xm5: A0-7,B0-7 + ; xm6: C0-7,D0-7 + ; xm4: A8-15,B8-15 + ; xm7: C8-15,D8-15 + punpcklqdq m3, m5, m4 + punpckhqdq m4, m5, m4 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + ; xm3: A0-15 + ; xm5: B0-15 + ; xm4: C0-15 + ; xm6: D0-15 +%elif %1 == 6 || %1 == 8 + movq xm3, [dstq+strideq*0-%1/2] + movq xm4, [dstq+strideq*1-%1/2] + movq xm5, [dstq+strideq*2-%1/2] + movq xm6, [dstq+stride3q -%1/2] + lea tmpq, [dstq+strideq*8] + movhps xm3, [tmpq+strideq*0-%1/2] + movhps xm4, [tmpq+strideq*1-%1/2] + movhps xm5, [tmpq+strideq*2-%1/2] + movhps xm6, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movq xm7, [tmpq+strideq*0-%1/2] + movq xm8, [tmpq+strideq*1-%1/2] + movq xm9, [tmpq+strideq*2-%1/2] + movq xm11, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm7, [tmpq+strideq*0-%1/2] + movhps xm8, [tmpq+strideq*1-%1/2] + movhps xm9, [tmpq+strideq*2-%1/2] + movhps xm11, [tmpq+stride3q -%1/2] + vinserti128 m3, xm7, 1 + vinserti128 m4, xm8, 1 + vinserti128 m5, xm9, 1 + vinserti128 m6, xm11, 1 + lea tmpq, [dstq+strideq*4] + movq xm12, [tmpq+strideq*0-%1/2] + movq xm13, [tmpq+strideq*1-%1/2] + movq xm14, [tmpq+strideq*2-%1/2] + movq xm15, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm12, [tmpq+strideq*0-%1/2] + movhps xm13, [tmpq+strideq*1-%1/2] + movhps xm14, [tmpq+strideq*2-%1/2] + movhps xm15, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movq xm7, [tmpq+strideq*0-%1/2] + movq xm8, [tmpq+strideq*1-%1/2] + movq xm9, [tmpq+strideq*2-%1/2] + movq xm11, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm7, [tmpq+strideq*0-%1/2] + movhps xm8, [tmpq+strideq*1-%1/2] + movhps xm9, [tmpq+strideq*2-%1/2] + movhps xm11, [tmpq+stride3q -%1/2] + vinserti128 m12, xm7, 1 + vinserti128 m13, xm8, 1 + vinserti128 m14, xm9, 1 + vinserti128 m15, xm11, 1 + + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm12: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + punpcklbw m6, m12, m13 + punpckhbw m12, m13 + punpcklbw m13, m14, m15 + punpckhbw m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m15, m7, m4 + punpckhwd m7, m4 + 
punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m13 + punpckhwd m6, m13 + punpcklwd m13, m12, m14 + punpckhwd m12, m14 + ; xm15: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm13: A12-15,B12-15,C12-15,D12-15 + ; xm12: E12-15,F12-15,G12-15,H12-15 + punpckldq m14, m15, m5 + punpckhdq m15, m5 + punpckldq m5, m7, m6 +%if %1 != 6 + punpckhdq m7, m6 +%endif + punpckldq m6, m4, m13 + punpckhdq m4, m13 + punpckldq m13, m3, m12 +%if %1 != 6 + punpckhdq m12, m3, m12 +%endif + ; xm14: A0-7,B0-7 + ; xm15: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm13: E8-15,F8-15 + ; xm12: G8-15,H8-15 + punpcklqdq m3, m14, m6 + punpckhqdq m14, m6 + punpckhqdq m6, m15, m4 + punpcklqdq m15, m4 + punpcklqdq m4, m5, m13 + punpckhqdq m13, m5, m13 +%if %1 == 8 + punpcklqdq m5, m7, m12 + punpckhqdq m12, m7, m12 + ; xm3: A0-15 + ; xm14: B0-15 + ; xm15: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm13: F0-15 + ; xm5: G0-15 + ; xm12: H0-15 + SWAP 12, 3, 15 + SWAP 13, 14, 5, 4, 6 + ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 +%else + SWAP 13, 3, 14 + SWAP 6, 4, 15, 5 + ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 +%endif +%else + ; load and 16x16 transpose. We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose + movu xm0, [dstq+strideq*0-8] + movu xm1, [dstq+strideq*1-8] + movu xm2, [dstq+strideq*2-8] + movu xm3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu xm4, [tmpq+strideq*0-8] + movu xm5, [tmpq+strideq*1-8] + movu xm6, [tmpq+strideq*2-8] + movu xm7, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu xm8, [tmpq+strideq*0-8] + movu xm9, [tmpq+strideq*1-8] + movu xm10, [tmpq+strideq*2-8] + movu xm11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu xm12, [tmpq+strideq*0-8] + movu xm13, [tmpq+strideq*1-8] + movu xm14, [tmpq+strideq*2-8] + movu xm15, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, [tmpq+strideq*0-8], 1 + vinserti128 m1, [tmpq+strideq*1-8], 1 + vinserti128 m2, [tmpq+strideq*2-8], 1 + vinserti128 m3, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m4, [tmpq+strideq*0-8], 1 + vinserti128 m5, [tmpq+strideq*1-8], 1 + vinserti128 m6, [tmpq+strideq*2-8], 1 + vinserti128 m7, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m8, [tmpq+strideq*0-8], 1 + vinserti128 m9, [tmpq+strideq*1-8], 1 + vinserti128 m10, [tmpq+strideq*2-8], 1 + vinserti128 m11, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m12, [tmpq+strideq*0-8], 1 + vinserti128 m13, [tmpq+strideq*1-8], 1 + vinserti128 m14, [tmpq+strideq*2-8], 1 + vinserti128 m15, [tmpq+stride3q -8], 1 + + TRANSPOSE_16X16B 0, 1, [rsp+11*32] + mova [rsp+12*32], m1 + mova [rsp+13*32], m2 + mova [rsp+14*32], m3 + mova [rsp+15*32], m12 + mova [rsp+16*32], m13 + mova [rsp+17*32], m14 + mova [rsp+18*32], m15 + ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 + SWAP 12, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 15 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + movq xm1, [lq] + movq xm2, [lq+l_strideq*2] + movhps xm1, [lq+l_strideq] + movhps xm2, [lq+l_stride3q] + lea lq, [lq+l_strideq*4] + movq xm10, [lq] + movq xm0, [lq+l_strideq*2] + movhps xm10, [lq+l_strideq] + movhps xm0, [lq+l_stride3q] + lea lq, [lq+l_strideq*4] + vinserti128 m1, xm10, 1 + vinserti128 m2, xm0, 1 + shufps m0, m1, m2, q3131 + shufps m1, m2, q2020 
+%endif + pxor m2, m2 + pcmpeqb m10, m2, m0 + pand m1, m10 + por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] + pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] + pcmpeqb m10, m2, m0 ; !L + psrlq m2, m0, [lutq+128] + pand m2, [pb_63] + vpbroadcastb m1, [lutq+136] + pminub m2, m1 + pmaxub m2, [pb_1] ; I + pand m1, m0, [pb_240] + psrlq m1, 4 ; H + paddb m0, [pb_2] + paddb m0, m0 + paddb m0, m2 ; E + pxor m1, [pb_128] + pxor m2, [pb_128] + pxor m0, [pb_128] + + ABSSUB m8, m3, m4, m9 ; abs(p1-p0) + pmaxub m8, m10 + ABSSUB m9, m5, m6, m10 ; abs(q1-q0) + pmaxub m8, m9 +%if %1 == 4 + pxor m8, [pb_128] + pcmpgtb m7, m8, m1 ; hev +%else + pxor m7, m8, [pb_128] + pcmpgtb m7, m1 ; hev + +%if %1 == 6 + ABSSUB m9, m13, m4, m10 ; abs(p2-p0) + pmaxub m9, m8 +%else + ABSSUB m9, m12, m4, m10 ; abs(p3-p0) + pmaxub m9, m8 + ABSSUB m10, m13, m4, m11 ; abs(p2-p0) + pmaxub m9, m10 +%endif + ABSSUB m10, m5, m14, m11 ; abs(q2-q0) + pmaxub m9, m10 +%if %1 != 6 + ABSSUB m10, m5, m15, m11 ; abs(q3-q0) + pmaxub m9, m10 +%endif + pxor m9, [pb_128] + pcmpgtb m9, [pb_129] ; !flat8in + +%if %1 == 6 + ABSSUB m10, m13, m3, m1 ; abs(p2-p1) +%else + ABSSUB m10, m12, m13, m11 ; abs(p3-p2) + ABSSUB m11, m13, m3, m1 ; abs(p2-p1) + pmaxub m10, m11 + ABSSUB m11, m14, m15, m1 ; abs(q3-q2) + pmaxub m10, m11 +%endif + ABSSUB m11, m14, m6, m1 ; abs(q2-q1) + pmaxub m10, m11 +%if %1 == 16 + vpbroadcastd m11, [maskq+8] + vpbroadcastd m1, [maskq+4] + por m11, m1 + pand m11, [pb_mask] + pcmpeqd m11, [pb_mask] + pand m10, m11 +%else + vpbroadcastd m11, [maskq+4] + pand m11, [pb_mask] + pcmpeqd m11, [pb_mask] + pand m10, m11 ; only apply fm-wide to wd>4 blocks +%endif + pmaxub m8, m10 + + pxor m8, [pb_128] +%endif + pcmpgtb m8, m2 + + ABSSUB m10, m3, m6, m11 ; abs(p1-q1) + ABSSUB m11, m4, m5, m2 ; abs(p0-q0) + paddusb m11, m11 + pand m10, [pb_254] + psrlq m10, 1 + paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pxor m10, [pb_128] + pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m8, m10 + +%if %1 == 16 +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+12*32] +%endif + ABSSUB m1, m0, m4, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+13*32] +%endif + ABSSUB m2, m0, m4, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+stride3q] +%else + mova m0, [rsp+14*32] +%endif + ABSSUB m2, m0, m4, m10 + pmaxub m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] +%else + mova m0, [rsp+15*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+16*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+17*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 + pxor m1, [pb_128] + pcmpgtb m1, [pb_129] ; !flat8out + por m1, m9 ; !flat8in | !flat8out + vpbroadcastd m2, [maskq+8] + pand m10, m2, [pb_mask] + pcmpeqd m10, [pb_mask] + pandn m1, m10 ; flat16 + pandn m1, m8, m1 ; flat16 & fm + + vpbroadcastd m10, [maskq+4] + por m10, m2 + pand m2, m10, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m9, m2 ; flat8in + pandn m9, m8, m9 + vpbroadcastd m2, [maskq+0] + por m2, m10 + pand m2, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m8, m2 + pandn m8, m9, m8 ; fm & !flat8 & !flat16 + pandn m9, m1, m9 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + pand m2, m0, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m9, m2 + pandn m9, m8, m9 ; flat8 & fm + vpbroadcastd m2, [maskq+0] + por m0, m2 + pand m0, [pb_mask] + pcmpeqd m0, [pb_mask] + pandn m8, m0 + 
pandn m8, m9, m8 ; fm & !flat8 +%else + vpbroadcastd m0, [maskq+0] + pand m0, [pb_mask] + pcmpeqd m0, [pb_mask] + pandn m8, m0 ; fm +%endif + + ; short filter + + pxor m3, [pb_128] + pxor m6, [pb_128] + psubsb m10, m3, m6 ; iclip_diff(p1-q1) + pand m10, m7 ; f=iclip_diff(p1-q1)&hev + pxor m4, [pb_128] + pxor m5, [pb_128] + psubsb m11, m5, m4 + paddsb m10, m11 + paddsb m10, m11 + paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f) + pand m8, m10 ; f&=fm + paddsb m10, m8, [pb_3] + paddsb m8, [pb_4] + pand m10, [pb_248] + pand m8, [pb_248] + psrlq m10, 3 + psrlq m8, 3 + pxor m10, [pb_16] + pxor m8, [pb_16] + psubb m10, [pb_16] ; f2 + psubb m8, [pb_16] ; f1 + paddsb m4, m10 + psubsb m5, m8 + pxor m4, [pb_128] + pxor m5, [pb_128] + + pxor m8, [pb_128] + pxor m10, m10 + pavgb m8, m10 ; f=(f1+1)>>1 + psubb m8, [pb_64] + pandn m8, m7, m8 ; f&=!hev + paddsb m3, m8 + psubsb m6, m8 + pxor m3, [pb_128] + pxor m6, [pb_128] + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 +%else + mova m0, [rsp+12*32] + mova m2, [rsp+13*32] + mova m7, [rsp+14*32] +%endif + + mova [rsp+0*32], m9 + mova [rsp+1*32], m14 + mova [rsp+2*32], m15 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + punpcklbw m14, m0, m12 + punpckhbw m15, m0, m12 + pmaddubsw m10, m14, [pb_7_1] + pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3 + punpcklbw m8, m2, m7 + punpckhbw m9, m2, m7 + pmaddubsw m8, [pb_2] + pmaddubsw m9, [pb_2] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3 + punpcklbw m8, m13, m3 + punpckhbw m9, m13, m3 + pmaddubsw m8, [pb_1] + pmaddubsw m9, [pb_1] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m8, m4, m5 + punpckhbw m9, m4, m5 + pmaddubsw m8, [pb_1] + pmaddubsw m9, [pb_1] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m2 + por m8, m9 +%ifidn %2, v + mova [tmpq+strideq*2], m8 ; p5 +%else + mova [rsp+13*32], m8 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, [pb_m1_1] + pmaddubsw m15, [pb_m1_1] + paddw m10, m14 + paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m8, m0, m6 + punpckhbw m9, m0, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+3*32], m8 + mova [rsp+4*32], m9 + paddw m10, m8 + paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m7, m8, m1 +%ifidn %2, v + mova [tmpq+stride3q], m8 ; p4 +%else + mova [rsp+14*32], m8 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + mova m14, [rsp+1*32] + punpcklbw m8, m0, m13 + punpckhbw m9, m0, m13 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m8, m2, m14 + punpckhbw m2, m14 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m2, [pb_m1_1] + mova [rsp+1*32], m8 + paddw m10, m8 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m12, m8, m1 +%ifidn %2, v + mova [tmpq+strideq*4], m8 ; p3 +%else + mova [rsp+19*32], m8 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + mova m15, [rsp+2*32] + punpcklbw m8, m0, m3 + punpckhbw m9, m0, m3 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + 
paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m8, m7, m15 + punpckhbw m7, m15 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m7, [pb_m1_1] + mova [rsp+2*32], m8 + paddw m10, m8 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m13, m8, m1 + mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] +%endif + punpcklbw m8, m0, m4 + punpckhbw m9, m0, m4 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 +%ifidn %2, v + mova m9, [tmpq+strideq*0] ; q4 +%else + mova m9, [rsp+15*32] +%endif + punpcklbw m8, m12, m9 + punpckhbw m9, m12, m9 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+7*32], m8 + mova [rsp+5*32], m9 + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m3, m8, m1 + mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, v + mova m9, [tmpq+strideq*1] ; q5 +%else + mova m9, [rsp+16*32] +%endif + punpcklbw m8, m0, m5 + punpckhbw m0, m5 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m0, [pb_m1_1] + paddw m10, m8 + paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m0, m13, m9 + punpckhbw m9, m13, m9 + mova m13, [rsp+6*32] + pmaddubsw m0, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+ 9*32], m0 + mova [rsp+10*32], m9 + paddw m10, m0 + paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + pmulhrsw m0, m10, [pw_2048] + pmulhrsw m8, m11, [pw_2048] + packuswb m0, m8 + vpblendvb m0, m4, m0, m1 + mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else + mova m0, [rsp+17*32] +%endif + paddw m10, [rsp+3*32] + paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + punpcklbw m8, m3, m0 + punpckhbw m9, m3, m0 + mova m3, [rsp+8*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+3*32], m8 + mova [rsp+4*32], m9 + paddw m10, m8 + paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m5, m8, m1 + mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + paddw m10, [rsp+1*32] + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 + punpcklbw m8, m4, m0 + punpckhbw m2, m4, m0 + mova m4, [rsp+6*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m2, [pb_m1_1] + paddw m10, m8 + paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 + pmulhrsw m2, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m2, m9 + vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, [rsp+2*32] + paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + punpcklbw m8, m5, m0 + punpckhbw m9, m5, m0 + mova m5, [rsp+8*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 + pmulhrsw m7, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m7, m9 + 
vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K + + ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J + ; write +3 + paddw m10, [rsp+7*32] + paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + punpcklbw m8, m6, m0 + punpckhbw m9, m6, m0 + SWAP 2, 6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m15, m8, m1 +%ifidn %2, v + mova [tmpq+mstrideq], m8 ; q3 +%else + mova [rsp+20*32], m8 +%endif + + ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K + ; write +4 + paddw m10, [rsp+ 9*32] + paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m8, m14, m0 + punpckhbw m9, m14, m0 + SWAP 14, 7 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 +%ifidn %2, v + mova m9, [tmpq+strideq*0] +%else + mova m9, [rsp+15*32] +%endif + vpblendvb m8, m9, m8, m1 +%ifidn %2, v + mova [tmpq+strideq*0], m8 ; q4 +%else + mova [rsp+15*32], m8 +%endif + + ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L + ; write +5 + paddw m10, [rsp+3*32] + paddw m11, [rsp+4*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m8, m15, m0 + punpckhbw m9, m15, m0 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m10, [pw_2048] + pmulhrsw m11, [pw_2048] + packuswb m10, m11 +%ifidn %2, v + mova m11, [tmpq+strideq*1] +%else + mova m11, [rsp+16*32] +%endif + vpblendvb m10, m11, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+16*32], m10 +%endif + + mova m9, [rsp+0*32] +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%endif +%if %1 >= 8 + ; flat8 filter + punpcklbw m0, m12, m3 + punpckhbw m1, m12, m3 + pmaddubsw m2, m0, [pb_3_1] + pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1 + punpcklbw m8, m13, m4 + punpckhbw m11, m13, m4 + pmaddubsw m8, [pb_2_1] + pmaddubsw m11, [pb_2_1] + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + punpcklbw m8, m5, [pb_4] + punpckhbw m11, m5, [pb_4] + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m10, m13, m8, m9 ; p2 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; p2 +%endif + + pmaddubsw m8, m0, [pb_m1_1] + pmaddubsw m11, m1, [pb_m1_1] + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m2, m8 + paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m8, m3, m8, m9 ; p1 +%ifidn %2, v + mova [tmpq+strideq*2], m8 ; p1 +%else + mova [rsp+0*32], m8 +%endif + + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + psubw m2, m0 + psubw m7, m1 + punpcklbw m8, m4, m14 + punpckhbw m11, m4, m14 + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + paddw m2, m8 + paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m8, m4, m8, m9 ; p0 +%ifidn %2, v + mova [tmpq+stride3q ], m8 ; p0 +%else + mova [rsp+1*32], m8 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m8, m0, [pb_1] + pmaddubsw m11, m1, [pb_1] + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m4, m12 + 
punpckhbw m11, m4, m12 + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + psubw m2, m8 + psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m11, m5, m8, m9 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m11 ; q0 +%endif + + pmaddubsw m0, [pb_m1_1] + pmaddubsw m1, [pb_m1_1] + paddw m2, m0 + paddw m7, m1 + punpcklbw m8, m13, m6 + punpckhbw m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m13, [pb_m1_1] + paddw m2, m8 + paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 + psrlw m8, m2, 3 + psrlw m13, m7, 3 + packuswb m8, m13 + vpblendvb m13, m6, m8, m9 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m13 ; q1 +%endif + + punpcklbw m0, m3, m6 + punpckhbw m1, m3, m6 + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, m14, m15 + punpckhbw m1, m14, m15 + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + paddw m2, m0 + paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 + psrlw m2, 3 + psrlw m7, 3 + packuswb m2, m7 + vpblendvb m2, m14, m2, m9 ; q2 +%ifidn %2, v + mova [dstq+strideq*2], m2 ; q2 +%else + mova m0, [rsp+0*32] + mova m1, [rsp+1*32] +%if %1 == 8 + ; 16x8 transpose + punpcklbw m3, m12, m10 + punpckhbw m12, m10 + punpcklbw m10, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m11, m13 + punpckhbw m11, m13 + punpcklbw m13, m2, m15 + punpckhbw m2, m15 + + punpcklwd m15, m3, m10 + punpckhwd m3, m10 + punpcklwd m10, m12, m0 + punpckhwd m12, m0 + punpcklwd m0, m1, m13 + punpckhwd m1, m13 + punpcklwd m13, m11, m2 + punpckhwd m11, m2 + + punpckldq m2, m15, m0 + punpckhdq m15, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + punpckldq m1, m10, m13 + punpckhdq m10, m13 + punpckldq m13, m12, m11 + punpckhdq m12, m11 + + ; write 8x32 + movq [dstq+strideq*0-4], xm2 + movhps [dstq+strideq*1-4], xm2 + movq [dstq+strideq*2-4], xm15 + movhps [dstq+stride3q -4], xm15 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm10 + movhps [dstq+stride3q -4], xm10 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm13 + movhps [dstq+strideq*1-4], xm13 + movq [dstq+strideq*2-4], xm12 + movhps [dstq+stride3q -4], xm12 + lea dstq, [dstq+strideq*4] + + vextracti128 xm2, m2, 1 + vextracti128 xm15, m15, 1 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m3, 1 + vextracti128 xm1, m1, 1 + vextracti128 xm10, m10, 1 + vextracti128 xm13, m13, 1 + vextracti128 xm12, m12, 1 + + movq [dstq+strideq*0-4], xm2 + movhps [dstq+strideq*1-4], xm2 + movq [dstq+strideq*2-4], xm15 + movhps [dstq+stride3q -4], xm15 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm10 + movhps [dstq+stride3q -4], xm10 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm13 + movhps [dstq+strideq*1-4], xm13 + movq [dstq+strideq*2-4], xm12 + movhps [dstq+stride3q -4], xm12 + lea dstq, [dstq+strideq*4] +%else + ; 16x16 transpose and store + SWAP 5, 10, 2 + SWAP 6, 0 + SWAP 7, 1 + SWAP 8, 11 + SWAP 9, 13 + mova m0, [rsp+11*32] + mova m1, [rsp+12*32] + mova m2, [rsp+13*32] + mova m3, [rsp+14*32] + mova m4, [rsp+19*32] + mova m11, [rsp+20*32] + mova m12, [rsp+15*32] + mova m13, [rsp+16*32] + mova m14, [rsp+17*32] 
+ TRANSPOSE_16X16B 1, 0, [rsp+18*32] + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm4 + movu [dstq+strideq*1-8], xm5 + movu [dstq+strideq*2-8], xm6 + movu [dstq+stride3q -8], xm7 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm8 + movu [dstq+strideq*1-8], xm9 + movu [dstq+strideq*2-8], xm10 + movu [dstq+stride3q -8], xm11 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm12 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m0, 1 + vextracti128 [dstq+strideq*1-8], m1, 1 + vextracti128 [dstq+strideq*2-8], m2, 1 + vextracti128 [dstq+stride3q -8], m3, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m4, 1 + vextracti128 [dstq+strideq*1-8], m5, 1 + vextracti128 [dstq+strideq*2-8], m6, 1 + vextracti128 [dstq+stride3q -8], m7, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m8, 1 + vextracti128 [dstq+strideq*1-8], m9, 1 + vextracti128 [dstq+strideq*2-8], m10, 1 + vextracti128 [dstq+stride3q -8], m11, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m12, 1 + vextracti128 [dstq+strideq*1-8], m13, 1 + vextracti128 [dstq+strideq*2-8], m14, 1 + vextracti128 [dstq+stride3q -8], m15, 1 + lea dstq, [dstq+strideq*4] +%endif +%endif +%elif %1 == 6 + ; flat6 filter + + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, [pb_3_1] + pmaddubsw m1, m11, [pb_3_1] + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + pmaddubsw m2, m7, [pb_2] + pmaddubsw m12, m10, [pb_2] + paddw m0, m2 + paddw m1, m12 + pmulhrsw m2, m0, [pw_4096] + pmulhrsw m12, m1, [pw_4096] + packuswb m2, m12 + vpblendvb m2, m3, m2, m9 +%ifidn %2, v + mova [tmpq+strideq*2], m2 ; p1 +%endif + + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m0, m8 + paddw m1, m11 + pmulhrsw m12, m0, [pw_4096] + pmulhrsw m13, m1, [pw_4096] + packuswb m12, m13 + vpblendvb m12, m4, m12, m9 +%ifidn %2, v + mova [tmpq+stride3q], m12 ; p0 +%endif + + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 + pmaddubsw m14, m8, [pb_m1_1] + pmaddubsw m13, m11, [pb_m1_1] + paddw m0, m14 + paddw m1, m13 + pmulhrsw m14, m0, [pw_4096] + pmulhrsw m13, m1, [pw_4096] + packuswb m14, m13 + vpblendvb m14, m5, m14, m9 +%ifidn %2, v + mova [dstq+strideq*0], m14 ; q0 +%endif + + pmaddubsw m8, [pb_m1_2] + pmaddubsw m11, [pb_m1_2] + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, [pb_m1_0] + pmaddubsw m10, [pb_m1_0] + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, [pw_4096] + pmulhrsw m1, [pw_4096] + packuswb m0, m1 + vpblendvb m0, m6, m0, m9 +%ifidn %2, v + mova [dstq+strideq*1], m0 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 +%endif +%else +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 +%endif +%endif +%endmacro + +INIT_YMM avx2 +cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + +.loop: + cmp byte [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, v + jmp .end + 
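Compared with the removed loopfilter.asm above, the new loopfilter_avx2.asm keeps the same per-superblock structure but selects between filtered and unfiltered pixels with single vpblendvb instructions where the old code used pand/pandn/por triples, and its entry points now carry an explicit _8bpc suffix. The .loop shown here walks the superblock edge 32 luma pixels at a time and calls the widest FILTER variant whose vmask byte is non-zero for that step (the FILTER macro then refines the selection per 4-pixel lane via pb_mask). The following is a minimal C sketch of that dispatch only, with assumed helper names and a simplified callback signature; it is not dav1d's actual C driver, just an illustration of the branch structure visible in the assembly:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical, simplified filter callback; stands in for "FILTER 16/8/4, v".
 * The real code also consumes the mask, l, l_stride and lut arguments. */
typedef void (*lpf_fn)(uint8_t *dst, ptrdiff_t stride);

/* Sketch of the vmask dispatch in lpf_v_sb_y_8bpc: each iteration covers
 * 32 pixels ("sub wd, 8" together with "add dstq, 32"), i.e. one byte of
 * each 32-bit vmask plane ("cmp byte [maskq+8/4/0]" with maskq advancing
 * by one byte per iteration).  w is assumed to count 4-pixel units. */
static void lpf_v_sb_y_sketch(uint8_t *dst, ptrdiff_t stride,
                              const uint32_t vmask[3], int w,
                              lpf_fn wd16, lpf_fn wd8, lpf_fn wd4)
{
    const uint8_t *m4  = (const uint8_t *) &vmask[0]; /* wd >= 4  */
    const uint8_t *m8  = (const uint8_t *) &vmask[1]; /* wd >= 8  */
    const uint8_t *m16 = (const uint8_t *) &vmask[2]; /* wd == 16 */

    for (int i = 0; w > 0; w -= 8, i++, dst += 32) {
        if (m16[i])     wd16(dst, stride); /* cmp byte [maskq+8], 0 */
        else if (m8[i]) wd8 (dst, stride); /* cmp byte [maskq+4], 0 */
        else if (m4[i]) wd4 (dst, stride); /* cmp byte [maskq+0], 0 */
        /* all three bytes zero: nothing to filter in this 32-pixel step */
    }
}

The horizontal variants (lpf_h_sb_y_8bpc below) follow the same pattern but also skip ahead over unfiltered rows in the .no_filter path.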
+.no_flat16: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, v + +.end: + add lq, 32 + add dstq, 32 + add maskq, 1 + sub wd, 8 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + +.loop: + cmp byte [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+stride3q*8] + lea lq, [lq+l_strideq*8] + lea dstq, [dstq+strideq*8] +.end: + add maskq, 1 + sub hd, 8 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + +.loop: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .end + + FILTER 4, v + +.end: + add lq, 32 + add dstq, 32 + add maskq, 1 + sub wd, 8 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + +.loop: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+stride3q*8] + lea lq, [lq+l_strideq*8] + lea dstq, [dstq+strideq*8] +.end: + add maskq, 1 + sub hd, 8 + jg .loop + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/loopfilter_init_tmpl.c dav1d-0.9.1/src/x86/loopfilter_init_tmpl.c --- dav1d-0.7.1/src/x86/loopfilter_init_tmpl.c 2020-06-21 11:48:55.028126500 +0000 +++ dav1d-0.9.1/src/x86/loopfilter_init_tmpl.c 2021-07-28 21:38:28.905852000 +0000 @@ -1,5 +1,5 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018-2021, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* @@ -29,10 +29,10 @@ #include "src/loopfilter.h" #define decl_loopfilter_sb_fns(ext) \ -decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \ -decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \ -decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \ -decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext) +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext)) decl_loopfilter_sb_fns(ssse3); decl_loopfilter_sb_fns(avx2); @@ -43,18 +43,23 @@ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; #if BITDEPTH == 8 - c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3; - c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3; - c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3; - c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3; + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); +#else + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); #endif if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if BITDEPTH == 8 && ARCH_X86_64 - c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_avx2; - c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_avx2; - c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_avx2; - c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_avx2; +#if ARCH_X86_64 + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2); #endif } diff -Nru dav1d-0.7.1/src/x86/loopfilter_sse.asm dav1d-0.9.1/src/x86/loopfilter_sse.asm --- dav1d-0.7.1/src/x86/loopfilter_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/loopfilter_sse.asm 2021-07-28 21:38:28.909852300 +0000 @@ -0,0 +1,2348 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
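The loopfilter_init_tmpl.c hunk above replaces the hard-coded dav1d_lpf_*_sb_*_{ssse3,avx2} symbol names with BF()-wrapped ones, matching the _8bpc suffix now present in the assembly entry points (cglobal lpf_v_sb_y_8bpc and friends), so the same init template can bind 8 bpc and 16 bpc builds to differently suffixed symbols. BF() itself is defined elsewhere in dav1d and is not part of this diff; purely as an illustration, a bitdepth-suffixing macro of this kind can be assembled as follows (an assumed definition, not the upstream one):

/* Illustration only: an assumed BF()-style macro, not dav1d's definition. */
#define BITDEPTH 8                        /* the template is compiled once per bitdepth */
#define BF_PASTE(name, bd, suffix)   name##_##bd##bpc_##suffix
#define BF_EXPAND(name, bd, suffix)  BF_PASTE(name, bd, suffix)
#define BF(name, suffix)             BF_EXPAND(name, BITDEPTH, suffix)

/* BF(dav1d_lpf_h_sb_y, ssse3) expands to dav1d_lpf_h_sb_y_8bpc_ssse3;
 * in the same way, "cglobal lpf_v_sb_y_8bpc" under "INIT_YMM avx2" above
 * yields dav1d_lpf_v_sb_y_8bpc_avx2 (x86inc's cglobal supplies the dav1d_
 * prefix and the ISA suffix). */

The #else branch added in the hunk binds the same BF() expressions for the 16 bpc build, where the expansion presumably resolves to the corresponding _16bpc_ssse3 symbols instead.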
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +pb_7_1: times 8 db 7, 1 +pb_3_1: times 8 db 3, 1 +pb_2_1: times 8 db 2, 1 +pb_m1_0: times 8 db -1, 0 +pb_m1_1: times 8 db -1, 1 +pb_m1_2: times 8 db -1, 2 +pb_1: times 16 db 1 +pb_2: times 16 db 2 +pb_3: times 16 db 3 +pb_4: times 16 db 4 +pb_16: times 16 db 16 +pb_63: times 16 db 63 +pb_64: times 16 db 64 +pb_128: times 16 db 0x80 +pb_129: times 16 db 0x81 +pb_240: times 16 db 0xf0 +pb_248: times 16 db 0xf8 +pb_254: times 16 db 0xfe + +pw_2048: times 8 dw 2048 +pw_4096: times 8 dw 4096 + +pd_mask: dd 1, 2, 4, 8 + +SECTION .text + +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 + psubusb %4, %3, %2 + por %1, %4 +%endmacro + +%macro TRANSPOSE_16x4_AND_WRITE_4x16 5 + ; transpose 16x4 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + + ; write out +%assign %%n 0 +%rep 4 + movd [dstq+strideq *0-2], xm%4 + movd [dstq+strideq *4-2], xm%5 + movd [dstq+strideq *8-2], xm%2 + movd [dstq+stride3q*4-2], xm%1 + add dstq, strideq +%if %%n < 3 + psrldq xm%4, 4 + psrldq xm%5, 4 + psrldq xm%2, 4 + psrldq xm%1, 4 +%endif +%assign %%n (%%n+1) +%endrep + lea dstq, [dstq+stride3q*4] +%endmacro + +%macro TRANSPOSE_16X16B 2 ; output_transpose, mem +%if %1 == 0 + mova %2, m15 ; m7 in 32-bit +%endif + + ; input in m0-7 + punpcklbw m15, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 +%if ARCH_X86_64 + SWAP 4, 5, 7 +%else + %if %1 == 0 + mova m5, %2 + %else + mova m5, [esp+1*16] + %endif + mova %2, m4 +%endif + punpcklbw m4, m6, m5 + punpckhbw m6, m5 + + ; interleaved in m15,0,1,2,3,7,4,6 + punpcklwd m5, m15, m1 + punpckhwd m15, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m4 + punpckhwd m3, m4 +%if ARCH_X86_64 + SWAP 3, 4, 7 +%else + mova m4, %2 + mova %2, m3 +%endif + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + + ; interleaved in m5,15,1,0,2,7,3,4 + punpckldq m6, m5, m2 + punpckhdq m5, m2 +%if ARCH_X86_64 + SWAP 2, 7, 5 +%else + mova m2, %2 + mova [esp+1*16], m5 +%endif + punpckldq m5, m15, m2 + punpckhdq m15, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m3, m0, m4 + punpckhdq m0, m4 + +%if ARCH_X86_32 + mova [esp+0*16], m6 + mova [esp+2*16], m5 + mova [esp+3*16], m15 + mova [esp+4*16], m2 + mova [esp+5*16], m1 + mova [esp+6*16], m3 + mova [esp+7*16], m0 + mova m8, [esp+ 8*16] + mova m9, [esp+ 9*16] + mova m10, [esp+10*16] + %if %1 == 0 + mova m11, [esp+11*16] + mova m12, [esp+12*16] + mova m13, [esp+13*16] + mova m14, [esp+14*16] + %else + mova m11, [esp+20*16] + mova m12, [esp+15*16] + mova m13, [esp+16*16] + mova m14, [esp+17*16] + %endif +%endif + + ; input in m8-m15 +%if ARCH_X86_64 + SWAP 7, 4 +%endif + punpcklbw m7, m8, m9 + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m12, m13 + punpckhbw m12, m13 +%if ARCH_X86_64 + mova m13, %2 +%else + %if %1 == 0 + mova m13, [esp+15*16] + %else + mova m13, [esp+18*16] + %endif +%endif + mova %2, m12 + punpcklbw m12, m14, m13 + punpckhbw m14, m14, m13 + + ; interleaved in m7,8,9,10,11,rsp%2,12,14 + punpcklwd m13, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd m10, m11, m12 + punpckhwd m11, m12 + mova m12, %2 + mova %2, m11 + punpcklwd m11, m12, m14 + punpckhwd m12, m14 + + ; interleaved in m13,7,9,8,10,rsp%2,11,12 
+ punpckldq m14, m13, m10 + punpckhdq m13, m10 + punpckldq m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m12 + punpckhdq m8, m12 + mova m12, %2 + mova %2, m8 + punpckldq m8, m7, m12 + punpckhdq m7, m12 + +%if ARCH_X86_32 + mova [esp+ 8*16], m10 + mova [esp+ 9*16], m9 + mova [esp+10*16], m11 + SWAP 6, 1 + SWAP 4, 2 + SWAP 5, 3 + mova m6, [esp+0*16] + mova m4, [esp+1*16] + mova m5, [esp+2*16] +%endif + + ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7 + punpcklqdq m12, m6, m14 + punpckhqdq m6, m14 + punpcklqdq m14, m4, m13 + punpckhqdq m4, m13 + punpcklqdq m13, m5, m8 + punpckhqdq m5, m8 +%if ARCH_X86_64 + SWAP 8, 5 +%else + mova m8, [esp+3*16] + mova [esp+27*16], m5 + %define m15 m8 +%endif + punpcklqdq m5, m15, m7 + punpckhqdq m15, m7 + +%if ARCH_X86_32 + mova [esp+11*16], m12 + mova [esp+12*16], m6 + mova [esp+13*16], m14 + mova [esp+14*16], m4 + mova [esp+26*16], m13 + mova [esp+ 0*16], m5 + mova [esp+ 1*16], m15 + mova m2, [esp+ 4*16] + mova m10, [esp+ 8*16] + mova m1, [esp+ 5*16] + mova m9, [esp+ 9*16] + mova m3, [esp+ 6*16] + mova m11, [esp+10*16] + mova m0, [esp+ 7*16] +%endif + + punpcklqdq m7, m2, m10 + punpckhqdq m2, m10 + punpcklqdq m10, m1, m9 + punpckhqdq m1, m9 + punpcklqdq m9, m3, m11 + punpckhqdq m3, m11 + mova m11, %2 +%if ARCH_X86_32 + %define m12 m3 +%endif + mova %2, m12 + punpcklqdq m12, m0, m11 + punpckhqdq m0, m11 +%if %1 == 1 + mova m11, %2 +%endif + +%if ARCH_X86_64 + ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0 + SWAP 0, 11, 1, 6, 5, 8, 7, 15 + SWAP 2, 14, 12, 9 + SWAP 3, 4, 13 +%else + %if %1 == 0 + mova [esp+15*16], m9 + mova [esp+17*16], m12 + mova [esp+18*16], m0 + mova [esp+28*16], m10 + mova [esp+29*16], m1 + mova m3, [esp+0*16] + mova m4, [esp+1*16] + SWAP m5, m7 + SWAP m6, m2 + %else + SWAP 0, 7 + SWAP 3, 1, 2, 4, 6 + %endif +%endif +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] +%if ARCH_X86_64 + %define %%flat8mem [rsp+0*16] + %define %%q2mem [rsp+1*16] + %define %%q3mem [rsp+2*16] +%else + %if %1 == 4 || %1 == 6 + %define %%p2mem [esp+ 8*16] + %define %%q2mem [esp+ 9*16] + %define %%flat8mem [esp+10*16] + %else + %ifidn %2, v + %define %%p2mem [esp+16*16] + %define %%q2mem [esp+ 1*16] + %define %%q3mem [esp+18*16] + %define %%flat8mem [esp+ 0*16] + %define %%flat16mem [esp+20*16] + %else + %define %%p2mem [esp+27*16] + %define %%q2mem [esp+28*16] + %define %%q3mem [esp+29*16] + %define %%flat8mem [esp+21*16] + %define %%flat16mem [esp+30*16] + %endif + %endif + %xdefine m12reg m12 +%endif + +%if ARCH_X86_32 + lea stride3q, [strideq*3] +%endif + ; load data +%ifidn %2, v +%if ARCH_X86_32 + mov mstrideq, strideq + neg mstrideq +%endif +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] + ; we load p3 later +%define %%p3mem [dstq+mstrideq*4] + %if ARCH_X86_32 + %define m13 m0 + %define m14 m1 + %define m15 m2 + %endif + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif + %if ARCH_X86_32 + mova %%p2mem, m13 + mova %%q2mem, m14 + %define m13 %%p2mem + %define m14 %%q2mem + %if %1 != 6 + mova %%q3mem, m15 + %define m15 %%q3mem + %endif + %endif +%endif +%else ; %2 == h + ; load lines +%if %1 == 4 + ; transpose 4x16 + movd m7, 
[dstq+strideq*0-2] + movd m3, [dstq+strideq*1-2] + movd m4, [dstq+strideq*2-2] + movd m5, [dstq+stride3q -2] + lea tmpq, [dstq+strideq*4] + punpcklbw m7, m3 + punpcklbw m4, m5 + movd m3, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + movd m5, [tmpq+strideq*2-2] + movd m6, [tmpq+stride3q -2] + lea tmpq, [tmpq+strideq*4] + punpcklbw m3, m1 + punpcklbw m5, m6 + movd m0, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + punpcklbw m0, m1 + movd m1, [tmpq+strideq*2-2] + movd m2, [tmpq+stride3q -2] + punpcklbw m1, m2 + punpcklqdq m7, m0 + punpcklqdq m4, m1 + lea tmpq, [tmpq+strideq*4] + movd m0, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + punpcklbw m0, m1 + movd m1, [tmpq+strideq*2-2] + movd m2, [tmpq+stride3q -2] + punpcklbw m1, m2 + punpcklqdq m3, m0 + punpcklqdq m5, m1 + ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 + ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 + ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 + ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 + punpcklwd m6, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + ; xm6: A0-3,B0-3,C0-3,D0-3 + ; xm7: A8-11,B8-11,C8-11,D8-11 + ; xm4: A4-7,B4-7,C4-7,D4-7 + ; xm3: A12-15,B12-15,C12-15,D12-15 + punpckldq m5, m6, m4 + punpckhdq m6, m4 + punpckldq m4, m7, m3 + punpckhdq m7, m3 + ; xm5: A0-7,B0-7 + ; xm6: C0-7,D0-7 + ; xm4: A8-15,B8-15 + ; xm7: C8-15,D8-15 + punpcklqdq m3, m5, m4 + punpckhqdq m5, m5, m4 + punpcklqdq m4, m6, m7 + punpckhqdq m6, m7 + ; xm3: A0-15 + ; xm5: B0-15 + ; xm4: C0-15 + ; xm6: D0-15 + SWAP 4, 5 +%elif %1 == 6 || %1 == 8 + ; transpose 8x16 + movq m7, [dstq+strideq*0-%1/2] + movq m3, [dstq+strideq*1-%1/2] + movq m4, [dstq+strideq*2-%1/2] + movq m5, [dstq+stride3q -%1/2] + lea tmpq, [dstq+strideq*8] + punpcklbw m7, m3 + punpcklbw m4, m5 + movq m3, [tmpq+strideq*0-%1/2] + movq m1, [tmpq+strideq*1-%1/2] + movq m5, [tmpq+strideq*2-%1/2] + movq m6, [tmpq+stride3q -%1/2] + lea tmpq, [dstq+strideq*4] + punpcklbw m3, m1 + punpcklbw m5, m6 + movq m6, [tmpq+strideq*0-%1/2] + movq m0, [tmpq+strideq*1-%1/2] + movq m1, [tmpq+strideq*2-%1/2] + movq m2, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + punpcklbw m6, m0 + punpcklbw m1, m2 + movq m2, [tmpq+strideq*2-%1/2] + movq m0, [tmpq+stride3q -%1/2] + punpcklbw m2, m0 +%if ARCH_X86_64 + SWAP m15, m2 +%else + %define m15 [esp+3*16] + mova m15, m2 +%endif + movq m0, [tmpq+strideq*0-%1/2] + movq m2, [tmpq+strideq*1-%1/2] + punpcklbw m0, m2 + ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m2, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m1 + punpckhwd m6, m1 + punpcklwd m1, m0, m15 + punpckhwd m0, m15 +%if ARCH_X86_64 + SWAP m15, m0 +%else + mova m15, m0 +%endif + ; xm2: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm1: A12-15,B12-15,C12-15,D12-15 + ; xm0: E12-15,F12-15,G12-15,H12-15 + punpckldq m0, m2, m5 + punpckhdq m2, m5 + punpckldq m5, m7, m6 +%if %1 != 6 + punpckhdq m7, m6 +%endif + punpckldq m6, m4, m1 + punpckhdq m4, m1 + punpckldq m1, m3, m15 +%if %1 
!= 6 + punpckhdq m3, m15 + %if ARCH_X86_64 + SWAP m15, m3 + %else + mova m15, m3 + %endif +%endif + ; xm0: A0-7,B0-7 + ; xm2: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm1: E8-15,F8-15 + ; xm3: G8-15,H8-15 + punpcklqdq m3, m0, m6 + punpckhqdq m0, m6 + punpckhqdq m6, m2, m4 + punpcklqdq m2, m4 + punpcklqdq m4, m5, m1 + punpckhqdq m5, m1 +%if %1 == 8 + punpcklqdq m1, m7, m15 + punpckhqdq m7, m15 + ; xm3: A0-15 + ; xm0: B0-15 + ; xm2: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm5: F0-15 + ; xm1: G0-15 + ; xm7: H0-15 +%if ARCH_X86_64 + SWAP 11, 3, 2 + SWAP 13, 0 + SWAP 6, 5, 4 + SWAP 14, 1 + SWAP 15, 7 + ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15 + mova [rsp+21*16], m11 + %define %%p3mem [rsp+21*16] +%else + %define m11 [esp+26*16] + %define m13 [esp+27*16] + %define m14 [esp+28*16] + %define m15 [esp+29*16] + mova m11, m3 + mova m13, m0 + SWAP 3, 2 + SWAP 6, 5, 4 + mova m14, m1 + mova m15, m7 + %define %%p3mem [esp+26*16] +%endif +%else + %if ARCH_X86_64 + SWAP 13, 3, 0 + SWAP 14, 5, 6, 4, 2 + ; 3,0,2,6,4,5 -> 13,3,4,5,6,14 + %else + %define m13 %%p2mem + %define m14 %%q2mem + mova m13, m3 + mova m14, m5 + SWAP 3, 0 + SWAP 5, 6, 4, 2 + ; 0,2,6,4 -> 3,4,5,6 + %endif +%endif +%else +%if ARCH_X86_64 + mova [rsp+20*16], m12 +%endif + ; load and 16x16 transpose. We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose +%if ARCH_X86_32 + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + lea tmpq, [dstq+strideq*8] + movu m8, [tmpq+strideq*0-8] + movu m9, [tmpq+strideq*1-8] + movu m10, [tmpq+strideq*2-8] + movu m11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu m12, [tmpq+strideq*0-8] + movu m13, [tmpq+strideq*1-8] + movu m14, [tmpq+strideq*2-8] + movu m15, [tmpq+stride3q -8] + mova [esp+ 8*16], m8 + mova [esp+ 9*16], m9 + mova [esp+10*16], m10 + mova [esp+11*16], m11 + mova [esp+12*16], m12 + mova [esp+13*16], m13 + mova [esp+14*16], m14 + mova [esp+15*16], m15 +%endif + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] + movu m7, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] +%if ARCH_X86_64 + movu m8, [tmpq+strideq*0-8] + movu m9, [tmpq+strideq*1-8] + movu m10, [tmpq+strideq*2-8] + movu m11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu m12, [tmpq+strideq*0-8] + movu m13, [tmpq+strideq*1-8] + movu m14, [tmpq+strideq*2-8] + movu m15, [tmpq+stride3q -8] +%endif + +%if ARCH_X86_64 + TRANSPOSE_16X16B 0, [rsp+11*16] + mova [rsp+12*16], m1 + mova [rsp+13*16], m2 + mova [rsp+14*16], m3 + mova [rsp+15*16], m12 + mova [rsp+16*16], m13 + mova [rsp+17*16], m14 + mova [rsp+18*16], m15 + ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 + SWAP 12, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 15 + mova [rsp+21*16], m12 + %define %%p3mem [rsp+21*16] + mova m12, [rsp+20*16] +%else + TRANSPOSE_16X16B 0, [esp+16*16] + %define %%p3mem [esp+26*16] + %define m11 %%p3mem + %define m13 %%p2mem + %define m14 %%q2mem + %define m15 %%q3mem +%endif +%endif ; if 4 elif 6 or 8 else 16 +%endif ; if v else h + + ; load L/E/I/H +%if ARCH_X86_32 + mov l_strideq, l_stridem +%endif +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + %if ARCH_X86_32 + lea l_stride3q, [l_strideq*3] + %endif + movq xm1, [lq] + movq xm2, [lq+l_strideq*2] + 
movhps xm1, [lq+l_strideq] + movhps xm2, [lq+l_stride3q] + shufps m0, m1, m2, q3131 + shufps m1, m2, q2020 + %if ARCH_X86_32 + lea stride3q, [strideq*3] + %endif +%endif + +%if ARCH_X86_32 + %ifidn %2, v + mov lutd, lutm + %endif +%endif + pxor m2, m2 + pcmpeqb m7, m2, m0 + pand m1, m7 + por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] + pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1] + pcmpeqb m2, m0 ; !L + psrlq m7, m0, [lutq+128] + pand m7, [PIC_sym(pb_63)] + pminub m7, minlvl + pmaxub m7, [PIC_sym(pb_1)] ; I + pand m1, m0, [PIC_sym(pb_240)] + psrlq m1, 4 ; H + paddb m0, [PIC_sym(pb_2)] + paddb m0, m0 + paddb m0, m7 ; E + pxor m1, [PIC_sym(pb_128)] + pxor m7, [PIC_sym(pb_128)] + pxor m0, [PIC_sym(pb_128)] + SWAP 2, 7 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 2, 10 +%else + %ifidn %2, v + mov mstrideq, strideq + neg mstrideq + %if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + %elif %1 == 6 || %1 == 8 + lea tmpq, [dstq+mstrideq*4] + %endif + %endif + mova [esp+3*16], m0 + mova [esp+4*16], m2 +%endif + + ABSSUB m0, m3, m4, m2 ; abs(p1-p0) + pmaxub m0, m7 + ABSSUB m2, m5, m6, m7 ; abs(q1-q0) + pmaxub m0, m2 +%if %1 == 4 + pxor m0, [PIC_sym(pb_128)] + pcmpgtb m7, m0, m1 ; hev + %if ARCH_X86_64 + SWAP 7, 11 + %else + mova [esp+5*16], m7 + %endif +%else + pxor m7, m0, [PIC_sym(pb_128)] + pcmpgtb m7, m1 ; hev +%if ARCH_X86_64 + SWAP 7, 11 +%else + mova [esp+5*16], m7 +%endif + +%if %1 == 6 + ABSSUB m1, m13, m4, m7 ; abs(p2-p0) + pmaxub m1, m0 +%else + mova m2, %%p3mem + ABSSUB m1, m2, m4, m7 ; abs(p3-p0) + pmaxub m1, m0 + ABSSUB m7, m13, m4, m2 ; abs(p2-p0) + pmaxub m1, m7 +%endif + ABSSUB m7, m5, m14, m2 ; abs(p2-p0) + pmaxub m1, m7 +%if %1 != 6 + ABSSUB m7, m5, m15, m2 ; abs(q3-q0) + pmaxub m1, m7 +%endif + pxor m1, [PIC_sym(pb_128)] + pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in +%if ARCH_X86_64 + SWAP 1, 9 +%else + mova [esp+6*16], m1 +%endif + +%if %1 == 6 + ABSSUB m7, m13, m3, m1 ; abs(p2-p1) +%else + mova m2, %%p3mem + ABSSUB m7, m2, m13, m1 ; abs(p3-p2) + ABSSUB m2, m13, m3, m1 ; abs(p2-p1) + pmaxub m7, m2 + ABSSUB m2, m14, m15, m1 ; abs(q3-q2) + pmaxub m7, m2 +%endif + ABSSUB m2, m14, m6, m1 ; abs(q2-q1) + pmaxub m7, m2 +%if ARCH_X86_32 + %define m12 m1 + mova m12, maskmem +%endif + pand m2, m12, mask1 + pcmpeqd m2, m12 + pand m7, m2 ; only apply fm-wide to wd>4 blocks + pmaxub m0, m7 + + pxor m0, [PIC_sym(pb_128)] +%endif ; %if %1 == 4 else +%if ARCH_X86_64 + SWAP 2, 10 + pcmpgtb m0, m2 +%else + pcmpgtb m0, [esp+4*16] +%endif + + ABSSUB m1, m3, m6, m7 ; abs(p1-q1) + ABSSUB m7, m4, m5, m2 ; abs(p0-q0) + paddusb m7, m7 + pand m1, [PIC_sym(pb_254)] + psrlq m1, 1 + paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pxor m1, [PIC_sym(pb_128)] +%if ARCH_X86_64 + pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E +%else + pcmpgtb m1, [esp+3*16] +%endif + por m0, m1 + +%if %1 == 16 +%if ARCH_X86_64 + SWAP 0, 8 +%else + mova [esp+3*16], m0 +%endif +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+12*16] +%endif + ABSSUB m1, m0, m4, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+13*16] +%endif + ABSSUB m2, m0, m4, m7 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+stride3q] +%else + mova m0, [rsp+14*16] +%endif + ABSSUB m2, m0, m4, m7 + pmaxub m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] +%else + mova m0, [rsp+15*16] +%endif + ABSSUB m2, m0, m5, m7 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+16*16] +%endif + ABSSUB m2, m0, m5, m7 + pmaxub m1, m2 +%ifidn %2, v + mova m0, 
[tmpq+strideq*2] +%else + mova m0, [rsp+17*16] +%endif + ABSSUB m2, m0, m5, m7 + pmaxub m1, m2 + pxor m1, [PIC_sym(pb_128)] + pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out +%if ARCH_X86_64 + por m1, m9 ; !flat8in | !flat8out +%else + por m1, [esp+6*16] + %define m12 m7 + mova m12, maskmem +%endif + pand m2, m12, mask2 + pcmpeqd m2, m12 + pandn m1, m2 ; flat16 +%if ARCH_X86_64 + pandn m2, m8, m1 ; flat16 & fm +%else + pandn m2, [esp+3*16], m1 ; flat16 & fm + mova %%flat16mem, m2 +%endif + SWAP 1, 2 + + pand m2, m12, mask1 + pcmpeqd m2, m12 +%if ARCH_X86_64 + pandn m9, m2 ; flat8in + pandn m2, m8, m9 + SWAP 2, 9 +%else + pandn m0, [esp+6*16], m2 + pandn m2, [esp+3*16], m0 + mova [esp+6*16], m2 +%endif + pand m2, m12, mask0 + pcmpeqd m2, m12 +%if ARCH_X86_64 + pandn m8, m2 + pandn m2, m9, m8 ; fm & !flat8 & !flat16 + SWAP 2, 8 + pandn m2, m1, m9 ; flat8 & !flat16 + SWAP 2, 9 + SWAP 0, 8 + SWAP 1, 10 +%else + pandn m0, [esp+3*16], m2 + pandn m2, [esp+6*16], m0 + SWAP 2, 0 + pandn m2, m1, [esp+6*16] + mova %%flat8mem, m2 +%endif +%elif %1 != 4 + %if ARCH_X86_64 + SWAP 1, 9 + %else + %define m12 m7 + mova m12, maskmem + mova m1, [esp+6*16] + %endif + pand m2, m12, mask1 + pcmpeqd m2, m12 + pandn m1, m2 + pandn m2, m0, m1 ; flat8 & fm + pand m1, m12, mask0 + pcmpeqd m1, m12 + pandn m0, m1 + pandn m1, m2, m0 ; fm & !flat8 + SWAP 1, 2, 0 + %if ARCH_X86_64 + SWAP 1, 9 + %else + mova %%flat8mem, m1 + %endif +%else +%if ARCH_X86_32 + %define m12 m1 + mova m12, maskmem +%endif + pand m2, m12, mask0 + pcmpeqd m2, m12 + pandn m0, m2 ; fm +%endif + + ; short filter + + mova m1, [PIC_sym(pb_128)] +%if ARCH_X86_64 + SWAP 7, 11 +%else + mova m7, [esp+5*16] +%endif + pxor m3, m1 + pxor m6, m1 + pxor m4, m1 + pxor m5, m1 + psubsb m1, m3, m6 ; iclip_diff(p1-q1) + pand m1, m7 ; f=iclip_diff(p1-q1)&hev + psubsb m2, m5, m4 + paddsb m1, m2 + paddsb m1, m2 + paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f) + mova m2, [PIC_sym(pb_16)] + pand m0, m1 ; f&=fm + paddsb m1, m0, [PIC_sym(pb_3)] + paddsb m0, [PIC_sym(pb_4)] + pand m1, [PIC_sym(pb_248)] + pand m0, [PIC_sym(pb_248)] + psrlq m1, 3 + psrlq m0, 3 + pxor m1, m2 + pxor m0, m2 + psubb m1, m2 ; f2 + psubb m0, m2 ; f1 + mova m2, [PIC_sym(pb_128)] + paddsb m4, m1 + psubsb m5, m0 + pxor m4, m2 + pxor m5, m2 + + pxor m0, m2 + pxor m1, m1 + pavgb m0, m1 ; f=(f1+1)>>1 + psubb m0, [PIC_sym(pb_64)] + pandn m7, m0 ; f&=!hev + paddsb m3, m7 + psubsb m6, m7 + pxor m3, m2 + pxor m6, m2 + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 +%else + mova m0, [rsp+12*16] + mova m2, [rsp+13*16] + mova m7, [rsp+14*16] +%endif + +%if ARCH_X86_64 + SWAP 1, 10 + mova %%flat8mem, m9 + mova %%q2mem, m14 + mova %%q3mem, m15 + SWAP 0, 8 + SWAP 1, 9 +%else + %ifidn %2, v + mova [esp+17*16], m0 + mova [esp+19*16], m3 + mova [esp+21*16], m4 + mova [esp+22*16], m5 + mova [esp+23*16], m6 + %xdefine m11 m3 + %xdefine m14 m4 + %xdefine m15 m5 + %xdefine m10 m6 + %define m13 %%p2mem + %define m8 [esp+17*16] + %define m9 %%flat16mem + %define m3 [esp+19*16] + %define m4 [esp+21*16] + %define m5 [esp+22*16] + %define m6 [esp+23*16] + %else + mova [esp+31*16], m0 + mova [esp+32*16], m3 + mova [esp+33*16], m4 + mova [esp+34*16], m5 + mova [esp+35*16], m6 + %xdefine m11 m3 + %xdefine m14 m4 + %xdefine m15 m5 + %xdefine m10 m6 + %define m13 %%p2mem + %define m8 [esp+31*16] + %define m9 %%flat16mem + %define m3 [esp+32*16] + %define m4 [esp+33*16] + %define m5 [esp+34*16] + %define m6 
[esp+35*16] + %endif +%endif + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + mova m11, %%p3mem +%if ARCH_X86_64 + punpcklbw m14, m8, m11 + punpckhbw m15, m8, m11 +%else + punpcklbw m14, m0, m11 + punpckhbw m15, m0, m11 +%endif +%ifidn %2, v + mova [rsp+5*16], m11 +%endif + pmaddubsw m10, m14, [PIC_sym(pb_7_1)] + pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3 + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + pmaddubsw m0, [PIC_sym(pb_2)] + pmaddubsw m1, [PIC_sym(pb_2)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3 + punpcklbw m0, m13, m3 + punpckhbw m1, m13, m3 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m0, m4, m5 + punpckhbw m1, m4, m5 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m2 + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*2], m0 ; p5 +%else + mova [rsp+13*16], m0 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, [PIC_sym(pb_m1_1)] + pmaddubsw m15, [PIC_sym(pb_m1_1)] + paddw m10, m14 + paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m0, m8, m6 + punpckhbw m1, m8, m6 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+3*16], m0 + mova [rsp+4*16], m1 + paddw m10, m0 + paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m7 + por m0, m1 +%ifidn %2, v + mova [tmpq+stride3q], m0 ; p4 +%else + mova [rsp+14*16], m0 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + mova m14, %%q2mem + punpcklbw m0, m8, m13 + punpckhbw m1, m8, m13 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m0, m2, m14 + punpckhbw m2, m14 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m2, [PIC_sym(pb_m1_1)] + mova [rsp+1*16], m0 + paddw m10, m0 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, %%p3mem + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*4], m0 ; p3 +%else + mova [rsp+19*16], m0 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + mova m15, %%q3mem + punpcklbw m0, m8, m3 + punpckhbw m1, m8, m3 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m0, m7, m15 + punpckhbw m7, m15 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m7, [PIC_sym(pb_m1_1)] + mova [rsp+2*16], m0 +%if ARCH_X86_32 + %ifidn %2, v + mova [esp+24*16], m7 + %else + mova [esp+36*16], m7 + %endif +%endif + paddw m10, m0 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m13 + por m0, m1 + mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 + punpcklbw m0, m8, m4 + punpckhbw m1, m8, m4 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; 
p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 +%if ARCH_X86_64 + SWAP 7, 8 +%endif +%ifidn %2, v + mova m1, [dstq+strideq*4] ; q4 + mova m7, [rsp+5*16] ; (pre-filter) p3 +%else + mova m1, [rsp+15*16] + mova m7, %%p3mem ; (pre-filter) p3 +%endif + punpcklbw m0, m1, m7 + punpckhbw m1, m1, m7 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+7*16], m0 + mova [rsp+5*16], m1 + psubw m10, m0 + psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m3 + por m0, m1 + mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, v + mova m7, [tmpq+strideq*1] ; p6 + lea tmpq, [dstq+strideq*4] + mova m1, [tmpq+strideq*1] ; q5 +%else + mova m7, [rsp+12*16] ; p6 + mova m1, [rsp+16*16] +%endif + punpcklbw m0, m7, m5 + punpckhbw m7, m5 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m7, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m7, m13, m1 + pmaddubsw m7, [PIC_sym(pb_m1_1)] + mova [rsp+9*16], m7 + paddw m10, m7 +%if ARCH_X86_64 + punpckhbw m13, m1 + mova m1, [rsp+6*16] + SWAP 1, 13 +%else + punpckhbw m7, m13, m1 + mova m1, [esp+6*16] + mova m13, m1 + SWAP 1, 7 +%endif + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+10*16], m1 + paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + pmulhrsw m7, m10, [PIC_sym(pw_2048)] + pmulhrsw m0, m11, [PIC_sym(pw_2048)] + packuswb m7, m0 + pand m7, m9 + pandn m0, m9, m4 + por m7, m0 + mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 +%ifidn %2, v + mova m7, [tmpq+strideq*2] ; q6 +%else + mova m7, [rsp+17*16] +%endif + paddw m10, [rsp+3*16] + paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + punpcklbw m0, m3, m7 + punpckhbw m1, m3, m7 +%if ARCH_X86_64 + mova m3, [rsp+8*16] +%endif + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+3*16], m0 + mova [rsp+4*16], m1 + paddw m10, m0 + paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m5 + por m0, m1 +%if ARCH_X86_32 + mova m1, [esp+8*16] + mova m3, m1 +%endif + mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + paddw m10, [rsp+1*16] + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 + punpcklbw m0, m4, m7 + punpckhbw m2, m4, m7 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m2, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 +%if ARCH_X86_64 + mova m4, [rsp+6*16] +%else + %define m4 [esp+6*16] +%endif + pmulhrsw m2, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m2, m1 + pand m2, m9 + pandn m1, m9, m6 + por m2, m1 ; don't clobber q1/m6 since we need it in K + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, [rsp+2*16] +%if ARCH_X86_64 + SWAP 7, 8 + paddw m11, m7 +%else + mova m8, m7 + %ifidn %2, v + paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + %else + paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + %endif +%endif + punpcklbw m0, m5, m8 + punpckhbw m1, m5, m8 +%if ARCH_X86_64 + mova m5, 
[rsp+8*16] +%else + %define m5 [esp+8*16] +%endif + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 + pmulhrsw m7, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m7, m1 + pand m7, m9 + pandn m1, m9, m14 + por m7, m1 ; don't clobber q2/m14 since we need it in K + + ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J + ; write +3 + psubw m10, [rsp+7*16] + psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + punpcklbw m0, m6, m8 + punpckhbw m1, m6, m8 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m15 + por m0, m1 +%ifidn %2, v + mova [tmpq+mstrideq], m0 ; q3 +%else + mova [rsp+20*16], m0 +%endif + + ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K + ; write +4 + paddw m10, [rsp+ 9*16] + paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m0, m14, m8 + punpckhbw m1, m14, m8 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 +%ifidn %2, v + pandn m1, m9, [tmpq+strideq*0] +%else + pandn m1, m9, [rsp+15*16] +%endif + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*0], m0 ; q4 +%else + mova [rsp+15*16], m0 +%endif + + ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L + ; write +5 + paddw m10, [rsp+3*16] + paddw m11, [rsp+4*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m0, m15, m8 + punpckhbw m1, m15, m8 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m10, [PIC_sym(pw_2048)] + pmulhrsw m11, [PIC_sym(pw_2048)] + packuswb m10, m11 + pand m10, m9 +%ifidn %2, v + pandn m11, m9, [tmpq+strideq*1] +%else + pandn m11, m9, [rsp+16*16] +%endif + por m10, m11 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+16*16], m10 +%endif + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 14, 7 +%else + %xdefine m3 m11 + %xdefine m4 m14 + %xdefine m5 m15 + %xdefine m6 m10 + mova %%q2mem, m7 + %ifidn %2, v + mova m3, [esp+19*16] + %else + mova m3, [esp+32*16] + %endif + mova m4, [esp+ 6*16] + mova m5, [esp+ 8*16] +%endif + SWAP m6, m2 + +%if ARCH_X86_64 + mova m9, %%flat8mem +%endif +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%endif ; if %1 == 16 +%if %1 >= 8 + ; flat8 filter +%if ARCH_X86_32 + %define m9 %%flat8mem + %define m11 m1 + %define m13 %%p2mem + %define m14 %%q2mem + %define m15 %%q3mem +%endif + mova m11, %%p3mem + punpcklbw m0, m11, m3 + punpcklbw m7, m13, m4 + pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 + pmaddubsw m7, [PIC_sym(pb_2_1)] + paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + punpcklbw m7, m5, [PIC_sym(pb_4)] + pmaddubsw m7, [PIC_sym(pb_1)] + paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + punpckhbw m1, m11, m3 + pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 + punpckhbw m0, m13, m4 + pmaddubsw m0, [PIC_sym(pb_2_1)] + paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + punpckhbw m0, m5, [PIC_sym(pb_4)] + pmaddubsw m0, [PIC_sym(pb_1)] + paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m13 + por m0, 
m1 ; p2 +%ifidn %2, v + mova [tmpq+strideq*1], m0 +%else + %if ARCH_X86_64 + SWAP 0, 10 + %else + mova [esp+2*16], m0 + %endif +%endif + +%if ARCH_X86_32 + mova m11, %%p3mem +%endif + punpcklbw m0, m11, m3 + punpckhbw m1, m11, m3 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 + punpcklbw m0, m13, m6 + punpckhbw m1, m13, m6 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m3 + por m0, m1 ; p1 +%ifidn %2, v + mova [tmpq+strideq*2], m0 +%else + mova [rsp+0*16], m0 +%endif + +%if ARCH_X86_32 + mova m11, %%p3mem +%endif + punpcklbw m0, m11, m3 + punpckhbw m1, m11, m3 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, m4, m14 + punpckhbw m1, m4, m14 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m2, m0 + paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m4 + por m0, m1 ; p0 +%ifidn %2, v + mova [tmpq+stride3q], m0 +%else + mova [rsp+1*16], m0 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m2, m0 + paddw m7, m1 +%if ARCH_X86_32 + mova m11, %%p3mem +%endif + punpcklbw m0, m11, m4 + punpckhbw m11, m11, m4 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m11, [PIC_sym(pb_1)] + psubw m2, m0 + psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 + psrlw m0, m2, 3 + psrlw m11, m7, 3 + packuswb m0, m11 + pand m0, m9 + pandn m11, m9, m5 + por m11, m0 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m11 +%elif ARCH_X86_32 + mova [esp+8*16], m11 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 + punpcklbw m0, m13, m6 + punpckhbw m1, m13, m6 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m2, m0 + paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 + psrlw m0, m2, 3 + psrlw m1, m7, 3 + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m6 + por m0, m1 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m0 +%else + %if ARCH_X86_64 + SWAP 0, 13 + %else + mova [esp+9*16], m0 + %endif +%endif + + punpcklbw m0, m3, m6 + punpckhbw m1, m3, m6 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, m14, m15 + punpckhbw m1, m14, m15 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m2, m0 + paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 + psrlw m2, 3 + psrlw m7, 3 + packuswb m2, m7 + pand m2, m9 + pandn m7, m9, m14 + por m2, m7 ; q2 +%ifidn %2, v + mova [dstq+strideq*2], m2 +%else + mova m0, [rsp+0*16] +%if %1 == 8 + mova m1, [rsp+1*16] + mova m4, %%p3mem + +%if ARCH_X86_32 + %define m10 [esp+2*16] + %define m11 [esp+8*16] + %define m13 [esp+9*16] +%endif + + ; 16x8 transpose + punpcklbw m3, m4, m10 + punpckhbw m4, m10 + punpcklbw m5, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m11, m13 + punpckhbw m6, m11, m13 + punpcklbw m7, m2, m15 + punpckhbw m2, m15 +%if ARCH_X86_64 + SWAP 2, 15 +%else + mova m15, m2 +%endif + + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m7 + punpckhwd m1, m7 + punpcklwd m7, m6, m15 + punpckhwd m6, m15 +%if ARCH_X86_64 + SWAP 6, 15 +%else + mova m15, m6 
+%endif + + punpckldq m6, m2, m0 + punpckhdq m2, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + punpckldq m1, m5, m7 + punpckhdq m5, m7 + punpckldq m7, m4, m15 + punpckhdq m4, m15 + + ; write 8x16 + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm2 + movhps [dstq+stride3q -4], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm5 + movhps [dstq+stride3q -4], xm5 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm7 + movhps [dstq+strideq*1-4], xm7 + movq [dstq+strideq*2-4], xm4 + movhps [dstq+stride3q -4], xm4 + lea dstq, [dstq+strideq*4] +%else + ; 16x16 transpose and store + SWAP 6, 0 + SWAP 7, 1 + %if ARCH_X86_64 + SWAP 5, 10, 2 + SWAP 8, 11 + SWAP 9, 13 + mova [rsp+21*16], m12 + %else + mova [esp+10*16], m2 + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + %endif + mova m0, [rsp+11*16] + mova m1, [rsp+12*16] + mova m2, [rsp+13*16] + mova m3, [rsp+14*16] + mova m4, [rsp+19*16] +%if ARCH_X86_64 + mova m7, [rsp+ 1*16] + mova m11, [rsp+20*16] + mova m12, [rsp+15*16] + mova m13, [rsp+16*16] + mova m14, [rsp+17*16] + TRANSPOSE_16X16B 1, [rsp+18*16] +%else + mova m5, [esp+ 2*16] + TRANSPOSE_16X16B 1, [esp+32*16] + mov tmpq, dstq + lea dstq, [dstq+strideq*8] +%endif + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm4 + movu [dstq+strideq*1-8], xm5 + movu [dstq+strideq*2-8], xm6 + movu [dstq+stride3q -8], xm7 +%if ARCH_X86_64 + lea dstq, [dstq+strideq*4] +%else + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + mova m8, [esp+11*16] + mova m9, [esp+12*16] + mova m10, [esp+13*16] + mova m11, [esp+14*16] + mova m12, [esp+26*16] + mova m13, [esp+27*16] + mova m14, [esp+ 0*16] + mova m15, [esp+ 1*16] + mov dstq, tmpq +%endif + movu [dstq+strideq*0-8], xm8 + movu [dstq+strideq*1-8], xm9 + movu [dstq+strideq*2-8], xm10 + movu [dstq+stride3q -8], xm11 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm12 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] +%if ARCH_X86_32 + lea dstq, [dstq+strideq*8] +%else + mova m12, [rsp+21*16] +%endif + +%endif ; if %1 == 8 +%endif ; ifidn %2, v +%elif %1 == 6 + ; flat6 filter +%if ARCH_X86_32 + mova [esp+3*16], m3 + mova [esp+4*16], m4 + mova [esp+5*16], m5 + mova [esp+6*16], m6 + %xdefine m8 m3 + %xdefine m10 m4 + %xdefine m11 m5 + %xdefine m15 m6 + %define m3 [esp+3*16] + %define m4 [esp+4*16] + %define m5 [esp+5*16] + %define m6 [esp+6*16] + %define m9 %%flat8mem + %define m13 %%p2mem + %define m14 %%q2mem +%endif + + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, [PIC_sym(pb_3_1)] + pmaddubsw m1, m11, [PIC_sym(pb_3_1)] + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + pmaddubsw m2, m7, [PIC_sym(pb_2)] + pmaddubsw m15, m10, [PIC_sym(pb_2)] + paddw m0, m2 + paddw m1, m15 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m3 + por m2, m15 +%ifidn %2, v + mova [tmpq+strideq*2], m2 ; p1 +%elif ARCH_X86_32 
+ mova [esp+11*16], m2 +%endif + + pmaddubsw m8, [PIC_sym(pb_m1_1)] + pmaddubsw m11, [PIC_sym(pb_m1_1)] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 +%if ARCH_X86_64 + SWAP 2, 13 +%endif + pmaddubsw m8, [PIC_sym(pb_m1_1)] + pmaddubsw m11, [PIC_sym(pb_m1_1)] + paddw m0, m8 + paddw m1, m11 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m4 + por m2, m15 +%ifidn %2, v + mova [tmpq+stride3q], m2 ; p0 +%elif ARCH_X86_32 + mova [esp+8*16], m2 +%endif + + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 +%if ARCH_X86_64 + SWAP 2, 14 +%endif + pmaddubsw m2, m8, [PIC_sym(pb_m1_1)] + pmaddubsw m15, m11, [PIC_sym(pb_m1_1)] + paddw m0, m2 + paddw m1, m15 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m5 + por m2, m15 +%ifidn %2, v + mova [dstq+strideq*0], m2 ; q0 +%endif + + pmaddubsw m8, [PIC_sym(pb_m1_2)] + pmaddubsw m11, [PIC_sym(pb_m1_2)] + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, [PIC_sym(pb_m1_0)] + pmaddubsw m10, [PIC_sym(pb_m1_0)] + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, [PIC_sym(pw_4096)] + pmulhrsw m1, [PIC_sym(pw_4096)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m6 + por m0, m1 +%if ARCH_X86_32 + %xdefine m3 m8 + %xdefine m4 m10 + %xdefine m5 m11 + %xdefine m6 m15 +%endif +%ifidn %2, v + mova [dstq+strideq*1], m0 ; q1 +%else + %if ARCH_X86_64 + SWAP 3, 13 + SWAP 4, 14 + %else + mova m3, [esp+11*16] + mova m4, [esp+ 8*16] + %endif + SWAP 5, 2 + SWAP 6, 0 + TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 +%endif +%else ; if %1 == 4 +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 +%endif +%endif +%if ARCH_X86_32 + %define m12 m12reg +%endif +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 32-bit PIC helpers ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if ARCH_X86_32 + %define PIC_base_offset $$ + + %macro SETUP_PIC 0 ; PIC_reg + %define PIC_reg r2 + %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4) + LEA PIC_reg, $$ + %endmacro + + %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base + %if %1 == 0 + mov [esp+PIC_reg_stk_offset], PIC_reg + mov PIC_reg, maskm + %else + mov PIC_reg, [esp+PIC_reg_stk_offset] + %endif + %endmacro + + %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) + +%else + %macro XCHG_PIC_REG 1 + %endmacro + %define PIC_sym(sym) (sym) +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < required_stack_alignment + %assign copy_args 1 + %else + %assign copy_args 0 + %endif +%endif + +%macro RELOC_ARGS 1 + %if copy_args + %define maskm [esp+stack_size-gprsize*1] + %define l_stridem [esp+stack_size-gprsize*2] + %define lutm [esp+stack_size-gprsize*3] + %define %1m [esp+stack_size-gprsize*4] + mov r6d, r6m + mov maskm, maskd + mov lutm, lutd + mov %1m, r6d + %else + %define %1m r6m + %endif +%endmacro + +%if ARCH_X86_32 + %define tmpq r4 + %define mstrideq r5 + %define stride3q r6 + %define l_stride3q r6 +%endif + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits +%else +cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS w + SETUP_PIC + %define m12 m5 +%endif + 
shl l_strideq, 2 + sub lq, l_strideq +%if ARCH_X86_64 + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movu m0, [maskq] + pxor m4, m4 + movd m3, [lutq+136] + pshufb m3, m4 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m1, m2 + por m0, m1 + mova [rsp+11*16], m0 + mova [rsp+12*16], m1 + mova [rsp+13*16], m2 + mova [rsp+14*16], m3 + +%define maskmem [esp+15*16] +%define mask0 [rsp+11*16] +%define mask1 [rsp+12*16] +%define mask2 [rsp+13*16] +%define minlvl [rsp+14*16] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + je .no_flat16 + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 16, v + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 8, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, v + +.end: +%if ARCH_X86_32 + mova m12, maskmem + mov mask_bitsd, [esp+25*16] +%endif +.no_filter: + pslld m12, 4 + shl mask_bitsd, 4 + add lq, 16 + add dstq, 16 +%if ARCH_X86_64 + sub wd, 4 +%else + sub dword wm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits +%else +cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS h + SETUP_PIC + %define m12 m5 +%endif + sub lq, 4 + shl l_strideq, 2 +%if ARCH_X86_64 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movu m0, [maskq] + pxor m4, m4 + movd m3, [lutq+136] + pshufb m3, m4 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m1, m2 + por m0, m1 + mova [rsp+22*16], m0 + mova [rsp+23*16], m1 + mova [rsp+24*16], m2 + mova [rsp+25*16], m3 + +%define maskmem [esp+37*16] +%define mask0 [rsp+22*16] +%define mask1 [rsp+23*16] +%define mask2 [rsp+24*16] +%define minlvl [rsp+25*16] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + je .no_flat16 + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 16, h + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 8, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +%if ARCH_X86_32 + jmp .end_noload +.end: + mova m12, maskmem + mov l_strideq, l_stridem + mov mask_bitsd, [esp+38*16] +.end_noload: +%else +.end: +%endif + lea lq, [lq+l_strideq*4] + pslld m12, 4 + shl mask_bitsd, 4 +%if ARCH_X86_64 + sub hd, 4 +%else + sub dword hm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits +%else +cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + 
copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS w + SETUP_PIC + %define m12 m4 +%endif + shl l_strideq, 2 + sub lq, l_strideq +%if ARCH_X86_64 + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movq m0, [maskq] + pxor m3, m3 + movd m2, [lutq+136] + pshufb m2, m3 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m0, m1 + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + +%define maskmem [esp+7*16] +%define mask0 [rsp+0*16] +%define mask1 [rsp+1*16] +%define minlvl [rsp+2*16] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+11*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 6, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[1] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+11*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, v + +.end: +%if ARCH_X86_32 + mova m12, maskmem + mov mask_bitsd, [esp+11*16] +%endif +.no_filter: + pslld m12, 4 + shl mask_bitsd, 4 + add lq, 16 + add dstq, 16 +%if ARCH_X86_64 + sub wd, 4 +%else + sub dword wm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits +%else +cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS h + SETUP_PIC + %define m12 m4 +%endif + sub lq, 4 + shl l_strideq, 2 +%if ARCH_X86_64 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movq m0, [maskq] + pxor m3, m3 + movd m2, [lutq+136] + pshufb m2, m3 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m0, m1 + mova [rsp+0*16], m0 + mova [rsp+1*16], m1 + mova [rsp+2*16], m2 + +%define maskmem [esp+7*16] +%define mask0 [rsp+0*16] +%define mask1 [rsp+1*16] +%define minlvl [rsp+2*16] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+12*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 6, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[1] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+12*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +%if ARCH_X86_32 + jmp .end_noload +.end: + mova m12, maskmem + mov l_strided, l_stridem + mov mask_bitsd, [esp+12*16] +.end_noload: +%else +.end: +%endif + lea lq, [lq+l_strideq*4] + pslld m12, 4 + shl mask_bitsd, 4 +%if ARCH_X86_64 + sub hd, 4 +%else + sub dword hm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET diff -Nru dav1d-0.7.1/src/x86/loopfilter_ssse3.asm dav1d-0.9.1/src/x86/loopfilter_ssse3.asm --- dav1d-0.7.1/src/x86/loopfilter_ssse3.asm 2020-06-21 11:48:55.028126500 +0000 +++ dav1d-0.9.1/src/x86/loopfilter_ssse3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,2348 +0,0 @@ -; Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. 
Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "config.asm" -%include "ext/x86/x86inc.asm" - -SECTION_RODATA 16 - -pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 -pb_7_1: times 8 db 7, 1 -pb_3_1: times 8 db 3, 1 -pb_2_1: times 8 db 2, 1 -pb_m1_0: times 8 db -1, 0 -pb_m1_1: times 8 db -1, 1 -pb_m1_2: times 8 db -1, 2 -pb_1: times 16 db 1 -pb_2: times 16 db 2 -pb_3: times 16 db 3 -pb_4: times 16 db 4 -pb_16: times 16 db 16 -pb_63: times 16 db 63 -pb_64: times 16 db 64 -pb_128: times 16 db 0x80 -pb_129: times 16 db 0x81 -pb_240: times 16 db 0xf0 -pb_248: times 16 db 0xf8 -pb_254: times 16 db 0xfe - -pw_2048: times 8 dw 2048 -pw_4096: times 8 dw 4096 - -pd_mask: dd 1, 2, 4, 8 - -SECTION .text - -%macro ABSSUB 4 ; dst, a, b, tmp - psubusb %1, %2, %3 - psubusb %4, %3, %2 - por %1, %4 -%endmacro - -%macro TRANSPOSE_16x4_AND_WRITE_4x16 5 - ; transpose 16x4 - punpcklbw m%5, m%1, m%2 - punpckhbw m%1, m%2 - punpcklbw m%2, m%3, m%4 - punpckhbw m%3, m%4 - punpcklwd m%4, m%5, m%2 - punpckhwd m%5, m%2 - punpcklwd m%2, m%1, m%3 - punpckhwd m%1, m%3 - - ; write out -%assign %%n 0 -%rep 4 - movd [dstq+strideq *0-2], xm%4 - movd [dstq+strideq *4-2], xm%5 - movd [dstq+strideq *8-2], xm%2 - movd [dstq+stride3q*4-2], xm%1 - add dstq, strideq -%if %%n < 3 - psrldq xm%4, 4 - psrldq xm%5, 4 - psrldq xm%2, 4 - psrldq xm%1, 4 -%endif -%assign %%n (%%n+1) -%endrep - lea dstq, [dstq+stride3q*4] -%endmacro - -%macro TRANSPOSE_16X16B 2 ; output_transpose, mem -%if %1 == 0 - mova %2, m15 ; m7 in 32-bit -%endif - - ; input in m0-7 - punpcklbw m15, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m2, m3 - punpckhbw m2, m3 - punpcklbw m3, m4, m5 - punpckhbw m4, m5 -%if ARCH_X86_64 - SWAP 4, 5, 7 -%else - %if %1 == 0 - mova m5, %2 - %else - mova m5, [esp+1*16] - %endif - mova %2, m4 -%endif - punpcklbw m4, m6, m5 - punpckhbw m6, m5 - - ; interleaved in m15,0,1,2,3,7,4,6 - punpcklwd m5, m15, m1 - punpckhwd m15, m1 - punpcklwd m1, m0, m2 - punpckhwd m0, m2 - punpcklwd m2, m3, m4 - punpckhwd m3, m4 -%if ARCH_X86_64 - SWAP 3, 4, 7 -%else - mova m4, %2 - mova %2, m3 -%endif - punpcklwd m3, m4, m6 - punpckhwd m4, m6 - - ; interleaved in m5,15,1,0,2,7,3,4 - punpckldq m6, m5, m2 - punpckhdq m5, m2 -%if ARCH_X86_64 - SWAP 2, 7, 5 -%else - mova m2, %2 - mova [esp+1*16], m5 -%endif - punpckldq m5, m15, m2 - punpckhdq m15, m2 - punpckldq m2, m1, m3 - punpckhdq m1, m3 - punpckldq m3, m0, m4 - punpckhdq m0, m4 - -%if ARCH_X86_32 - mova 
[esp+0*16], m6 - mova [esp+2*16], m5 - mova [esp+3*16], m15 - mova [esp+4*16], m2 - mova [esp+5*16], m1 - mova [esp+6*16], m3 - mova [esp+7*16], m0 - mova m8, [esp+ 8*16] - mova m9, [esp+ 9*16] - mova m10, [esp+10*16] - %if %1 == 0 - mova m11, [esp+11*16] - mova m12, [esp+12*16] - mova m13, [esp+13*16] - mova m14, [esp+14*16] - %else - mova m11, [esp+20*16] - mova m12, [esp+15*16] - mova m13, [esp+16*16] - mova m14, [esp+17*16] - %endif -%endif - - ; input in m8-m15 -%if ARCH_X86_64 - SWAP 7, 4 -%endif - punpcklbw m7, m8, m9 - punpckhbw m8, m9 - punpcklbw m9, m10, m11 - punpckhbw m10, m11 - punpcklbw m11, m12, m13 - punpckhbw m12, m13 -%if ARCH_X86_64 - mova m13, %2 -%else - %if %1 == 0 - mova m13, [esp+15*16] - %else - mova m13, [esp+18*16] - %endif -%endif - mova %2, m12 - punpcklbw m12, m14, m13 - punpckhbw m14, m14, m13 - - ; interleaved in m7,8,9,10,11,rsp%2,12,14 - punpcklwd m13, m7, m9 - punpckhwd m7, m9 - punpcklwd m9, m8, m10 - punpckhwd m8, m10 - punpcklwd m10, m11, m12 - punpckhwd m11, m12 - mova m12, %2 - mova %2, m11 - punpcklwd m11, m12, m14 - punpckhwd m12, m14 - - ; interleaved in m13,7,9,8,10,rsp%2,11,12 - punpckldq m14, m13, m10 - punpckhdq m13, m10 - punpckldq m10, m9, m11 - punpckhdq m9, m11 - punpckldq m11, m8, m12 - punpckhdq m8, m12 - mova m12, %2 - mova %2, m8 - punpckldq m8, m7, m12 - punpckhdq m7, m12 - -%if ARCH_X86_32 - mova [esp+ 8*16], m10 - mova [esp+ 9*16], m9 - mova [esp+10*16], m11 - SWAP 6, 1 - SWAP 4, 2 - SWAP 5, 3 - mova m6, [esp+0*16] - mova m4, [esp+1*16] - mova m5, [esp+2*16] -%endif - - ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7 - punpcklqdq m12, m6, m14 - punpckhqdq m6, m14 - punpcklqdq m14, m4, m13 - punpckhqdq m4, m13 - punpcklqdq m13, m5, m8 - punpckhqdq m5, m8 -%if ARCH_X86_64 - SWAP 8, 5 -%else - mova m8, [esp+3*16] - mova [esp+27*16], m5 - %define m15 m8 -%endif - punpcklqdq m5, m15, m7 - punpckhqdq m15, m7 - -%if ARCH_X86_32 - mova [esp+11*16], m12 - mova [esp+12*16], m6 - mova [esp+13*16], m14 - mova [esp+14*16], m4 - mova [esp+26*16], m13 - mova [esp+ 0*16], m5 - mova [esp+ 1*16], m15 - mova m2, [esp+ 4*16] - mova m10, [esp+ 8*16] - mova m1, [esp+ 5*16] - mova m9, [esp+ 9*16] - mova m3, [esp+ 6*16] - mova m11, [esp+10*16] - mova m0, [esp+ 7*16] -%endif - - punpcklqdq m7, m2, m10 - punpckhqdq m2, m10 - punpcklqdq m10, m1, m9 - punpckhqdq m1, m9 - punpcklqdq m9, m3, m11 - punpckhqdq m3, m11 - mova m11, %2 -%if ARCH_X86_32 - %define m12 m3 -%endif - mova %2, m12 - punpcklqdq m12, m0, m11 - punpckhqdq m0, m11 -%if %1 == 1 - mova m11, %2 -%endif - -%if ARCH_X86_64 - ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0 - SWAP 0, 11, 1, 6, 5, 8, 7, 15 - SWAP 2, 14, 12, 9 - SWAP 3, 4, 13 -%else - %if %1 == 0 - mova [esp+15*16], m9 - mova [esp+17*16], m12 - mova [esp+18*16], m0 - mova [esp+28*16], m10 - mova [esp+29*16], m1 - mova m3, [esp+0*16] - mova m4, [esp+1*16] - SWAP m5, m7 - SWAP m6, m2 - %else - SWAP 0, 7 - SWAP 3, 1, 2, 4, 6 - %endif -%endif -%endmacro - -%macro FILTER 2 ; width [4/6/8/16], dir [h/v] -%if ARCH_X86_64 - %define %%flat8mem [rsp+0*16] - %define %%q2mem [rsp+1*16] - %define %%q3mem [rsp+2*16] -%else - %if %1 == 4 || %1 == 6 - %define %%p2mem [esp+ 8*16] - %define %%q2mem [esp+ 9*16] - %define %%flat8mem [esp+10*16] - %else - %ifidn %2, v - %define %%p2mem [esp+16*16] - %define %%q2mem [esp+ 1*16] - %define %%q3mem [esp+18*16] - %define %%flat8mem [esp+ 0*16] - %define %%flat16mem [esp+20*16] - %else - %define %%p2mem [esp+27*16] - %define %%q2mem [esp+28*16] - %define %%q3mem [esp+29*16] - %define %%flat8mem 
[esp+21*16] - %define %%flat16mem [esp+30*16] - %endif - %endif - %xdefine m12reg m12 -%endif - -%if ARCH_X86_32 - lea stride3q, [strideq*3] -%endif - ; load data -%ifidn %2, v -%if ARCH_X86_32 - mov mstrideq, strideq - neg mstrideq -%endif -%if %1 == 4 - lea tmpq, [dstq+mstrideq*2] - mova m3, [tmpq+strideq*0] ; p1 - mova m4, [tmpq+strideq*1] ; p0 - mova m5, [tmpq+strideq*2] ; q0 - mova m6, [tmpq+stride3q] ; q1 -%else - ; load 6-8 pixels, remainder (for wd=16) will be read inline - lea tmpq, [dstq+mstrideq*4] - ; we load p3 later -%define %%p3mem [dstq+mstrideq*4] - %if ARCH_X86_32 - %define m13 m0 - %define m14 m1 - %define m15 m2 - %endif - mova m13, [tmpq+strideq*1] - mova m3, [tmpq+strideq*2] - mova m4, [tmpq+stride3q] - mova m5, [dstq+strideq*0] - mova m6, [dstq+strideq*1] - mova m14, [dstq+strideq*2] -%if %1 != 6 - mova m15, [dstq+stride3q] -%endif - %if ARCH_X86_32 - mova %%p2mem, m13 - mova %%q2mem, m14 - %define m13 %%p2mem - %define m14 %%q2mem - %if %1 != 6 - mova %%q3mem, m15 - %define m15 %%q3mem - %endif - %endif -%endif -%else ; %2 == h - ; load lines -%if %1 == 4 - ; transpose 4x16 - movd m7, [dstq+strideq*0-2] - movd m3, [dstq+strideq*1-2] - movd m4, [dstq+strideq*2-2] - movd m5, [dstq+stride3q -2] - lea tmpq, [dstq+strideq*4] - punpcklbw m7, m3 - punpcklbw m4, m5 - movd m3, [tmpq+strideq*0-2] - movd m1, [tmpq+strideq*1-2] - movd m5, [tmpq+strideq*2-2] - movd m6, [tmpq+stride3q -2] - lea tmpq, [tmpq+strideq*4] - punpcklbw m3, m1 - punpcklbw m5, m6 - movd m0, [tmpq+strideq*0-2] - movd m1, [tmpq+strideq*1-2] - punpcklbw m0, m1 - movd m1, [tmpq+strideq*2-2] - movd m2, [tmpq+stride3q -2] - punpcklbw m1, m2 - punpcklqdq m7, m0 - punpcklqdq m4, m1 - lea tmpq, [tmpq+strideq*4] - movd m0, [tmpq+strideq*0-2] - movd m1, [tmpq+strideq*1-2] - punpcklbw m0, m1 - movd m1, [tmpq+strideq*2-2] - movd m2, [tmpq+stride3q -2] - punpcklbw m1, m2 - punpcklqdq m3, m0 - punpcklqdq m5, m1 - ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 - ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 - ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 - ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 - punpcklwd m6, m7, m4 - punpckhwd m7, m4 - punpcklwd m4, m3, m5 - punpckhwd m3, m5 - ; xm6: A0-3,B0-3,C0-3,D0-3 - ; xm7: A8-11,B8-11,C8-11,D8-11 - ; xm4: A4-7,B4-7,C4-7,D4-7 - ; xm3: A12-15,B12-15,C12-15,D12-15 - punpckldq m5, m6, m4 - punpckhdq m6, m4 - punpckldq m4, m7, m3 - punpckhdq m7, m3 - ; xm5: A0-7,B0-7 - ; xm6: C0-7,D0-7 - ; xm4: A8-15,B8-15 - ; xm7: C8-15,D8-15 - punpcklqdq m3, m5, m4 - punpckhqdq m5, m5, m4 - punpcklqdq m4, m6, m7 - punpckhqdq m6, m7 - ; xm3: A0-15 - ; xm5: B0-15 - ; xm4: C0-15 - ; xm6: D0-15 - SWAP 4, 5 -%elif %1 == 6 || %1 == 8 - ; transpose 8x16 - movq m7, [dstq+strideq*0-%1/2] - movq m3, [dstq+strideq*1-%1/2] - movq m4, [dstq+strideq*2-%1/2] - movq m5, [dstq+stride3q -%1/2] - lea tmpq, [dstq+strideq*8] - punpcklbw m7, m3 - punpcklbw m4, m5 - movq m3, [tmpq+strideq*0-%1/2] - movq m1, [tmpq+strideq*1-%1/2] - movq m5, [tmpq+strideq*2-%1/2] - movq m6, [tmpq+stride3q -%1/2] - lea tmpq, [dstq+strideq*4] - punpcklbw m3, m1 - punpcklbw m5, m6 - movq m6, [tmpq+strideq*0-%1/2] - movq m0, [tmpq+strideq*1-%1/2] - movq m1, [tmpq+strideq*2-%1/2] - movq m2, [tmpq+stride3q -%1/2] - lea tmpq, [tmpq+strideq*8] - punpcklbw m6, m0 - punpcklbw m1, m2 - movq m2, [tmpq+strideq*2-%1/2] - movq m0, [tmpq+stride3q -%1/2] - punpcklbw m2, m0 -%if ARCH_X86_64 - SWAP m15, m2 -%else - %define m15 [esp+3*16] - mova m15, m2 -%endif - movq m0, [tmpq+strideq*0-%1/2] - movq m2, [tmpq+strideq*1-%1/2] 
- punpcklbw m0, m2 - ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 - ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 - ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 - ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 - ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 - ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 - ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 - ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 - punpcklwd m2, m7, m4 - punpckhwd m7, m4 - punpcklwd m4, m3, m5 - punpckhwd m3, m5 - punpcklwd m5, m6, m1 - punpckhwd m6, m1 - punpcklwd m1, m0, m15 - punpckhwd m0, m15 -%if ARCH_X86_64 - SWAP m15, m0 -%else - mova m15, m0 -%endif - ; xm2: A0-3,B0-3,C0-3,D0-3 - ; xm7: E0-3,F0-3,G0-3,H0-3 - ; xm4: A8-11,B8-11,C8-11,D8-11 - ; xm3: E8-11,F8-11,G8-11,H8-11 - ; xm5: A4-7,B4-7,C4-7,D4-7 - ; xm6: E4-7,F4-7,G4-7,H4-7 - ; xm1: A12-15,B12-15,C12-15,D12-15 - ; xm0: E12-15,F12-15,G12-15,H12-15 - punpckldq m0, m2, m5 - punpckhdq m2, m5 - punpckldq m5, m7, m6 -%if %1 != 6 - punpckhdq m7, m6 -%endif - punpckldq m6, m4, m1 - punpckhdq m4, m1 - punpckldq m1, m3, m15 -%if %1 != 6 - punpckhdq m3, m15 - %if ARCH_X86_64 - SWAP m15, m3 - %else - mova m15, m3 - %endif -%endif - ; xm0: A0-7,B0-7 - ; xm2: C0-7,D0-7 - ; xm5: E0-7,F0-7 - ; xm7: G0-7,H0-7 - ; xm6: A8-15,B8-15 - ; xm4: C8-15,D8-15 - ; xm1: E8-15,F8-15 - ; xm3: G8-15,H8-15 - punpcklqdq m3, m0, m6 - punpckhqdq m0, m6 - punpckhqdq m6, m2, m4 - punpcklqdq m2, m4 - punpcklqdq m4, m5, m1 - punpckhqdq m5, m1 -%if %1 == 8 - punpcklqdq m1, m7, m15 - punpckhqdq m7, m15 - ; xm3: A0-15 - ; xm0: B0-15 - ; xm2: C0-15 - ; xm6: D0-15 - ; xm4: E0-15 - ; xm5: F0-15 - ; xm1: G0-15 - ; xm7: H0-15 -%if ARCH_X86_64 - SWAP 11, 3, 2 - SWAP 13, 0 - SWAP 6, 5, 4 - SWAP 14, 1 - SWAP 15, 7 - ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15 - mova [rsp+21*16], m11 - %define %%p3mem [rsp+21*16] -%else - %define m11 [esp+26*16] - %define m13 [esp+27*16] - %define m14 [esp+28*16] - %define m15 [esp+29*16] - mova m11, m3 - mova m13, m0 - SWAP 3, 2 - SWAP 6, 5, 4 - mova m14, m1 - mova m15, m7 - %define %%p3mem [esp+26*16] -%endif -%else - %if ARCH_X86_64 - SWAP 13, 3, 0 - SWAP 14, 5, 6, 4, 2 - ; 3,0,2,6,4,5 -> 13,3,4,5,6,14 - %else - %define m13 %%p2mem - %define m14 %%q2mem - mova m13, m3 - mova m14, m5 - SWAP 3, 0 - SWAP 5, 6, 4, 2 - ; 0,2,6,4 -> 3,4,5,6 - %endif -%endif -%else -%if ARCH_X86_64 - mova [rsp+20*16], m12 -%endif - ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the - ; remainder at the end for the second transpose -%if ARCH_X86_32 - %xdefine m8 m0 - %xdefine m9 m1 - %xdefine m10 m2 - %xdefine m11 m3 - %xdefine m12 m4 - %xdefine m13 m5 - %xdefine m14 m6 - %xdefine m15 m7 - lea tmpq, [dstq+strideq*8] - movu m8, [tmpq+strideq*0-8] - movu m9, [tmpq+strideq*1-8] - movu m10, [tmpq+strideq*2-8] - movu m11, [tmpq+stride3q -8] - lea tmpq, [tmpq+strideq*4] - movu m12, [tmpq+strideq*0-8] - movu m13, [tmpq+strideq*1-8] - movu m14, [tmpq+strideq*2-8] - movu m15, [tmpq+stride3q -8] - mova [esp+ 8*16], m8 - mova [esp+ 9*16], m9 - mova [esp+10*16], m10 - mova [esp+11*16], m11 - mova [esp+12*16], m12 - mova [esp+13*16], m13 - mova [esp+14*16], m14 - mova [esp+15*16], m15 -%endif - movu m0, [dstq+strideq*0-8] - movu m1, [dstq+strideq*1-8] - movu m2, [dstq+strideq*2-8] - movu m3, [dstq+stride3q -8] - lea tmpq, [dstq+strideq*4] - movu m4, [tmpq+strideq*0-8] - movu m5, [tmpq+strideq*1-8] - movu m6, [tmpq+strideq*2-8] - movu m7, [tmpq+stride3q -8] - lea tmpq, [tmpq+strideq*4] -%if ARCH_X86_64 - movu m8, [tmpq+strideq*0-8] - movu m9, [tmpq+strideq*1-8] - movu m10, [tmpq+strideq*2-8] - movu m11, [tmpq+stride3q -8] - lea tmpq, [tmpq+strideq*4] - movu m12, [tmpq+strideq*0-8] - movu m13, [tmpq+strideq*1-8] - movu m14, [tmpq+strideq*2-8] - movu m15, [tmpq+stride3q -8] -%endif - -%if ARCH_X86_64 - TRANSPOSE_16X16B 0, [rsp+11*16] - mova [rsp+12*16], m1 - mova [rsp+13*16], m2 - mova [rsp+14*16], m3 - mova [rsp+15*16], m12 - mova [rsp+16*16], m13 - mova [rsp+17*16], m14 - mova [rsp+18*16], m15 - ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 - SWAP 12, 4, 7 - SWAP 13, 5, 8 - SWAP 3, 6, 9 - SWAP 10, 14 - SWAP 11, 15 - mova [rsp+21*16], m12 - %define %%p3mem [rsp+21*16] - mova m12, [rsp+20*16] -%else - TRANSPOSE_16X16B 0, [esp+16*16] - %define %%p3mem [esp+26*16] - %define m11 %%p3mem - %define m13 %%p2mem - %define m14 %%q2mem - %define m15 %%q3mem -%endif -%endif ; if 4 elif 6 or 8 else 16 -%endif ; if v else h - - ; load L/E/I/H -%if ARCH_X86_32 - mov l_strideq, l_stridem -%endif -%ifidn %2, v - movu m1, [lq] - movu m0, [lq+l_strideq] -%else - %if ARCH_X86_32 - lea l_stride3q, [l_strideq*3] - %endif - movq xm1, [lq] - movq xm2, [lq+l_strideq*2] - movhps xm1, [lq+l_strideq] - movhps xm2, [lq+l_stride3q] - shufps m0, m1, m2, q3131 - shufps m1, m2, q2020 - %if ARCH_X86_32 - lea stride3q, [strideq*3] - %endif -%endif - -%if ARCH_X86_32 - %ifidn %2, v - mov lutd, lutm - %endif -%endif - pxor m2, m2 - pcmpeqb m7, m2, m0 - pand m1, m7 - por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] - pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1] - pcmpeqb m2, m0 ; !L - psrlq m7, m0, [lutq+128] - pand m7, [PIC_sym(pb_63)] - pminub m7, minlvl - pmaxub m7, [PIC_sym(pb_1)] ; I - pand m1, m0, [PIC_sym(pb_240)] - psrlq m1, 4 ; H - paddb m0, [PIC_sym(pb_2)] - paddb m0, m0 - paddb m0, m7 ; E - pxor m1, [PIC_sym(pb_128)] - pxor m7, [PIC_sym(pb_128)] - pxor m0, [PIC_sym(pb_128)] - SWAP 2, 7 - -%if ARCH_X86_64 - SWAP 0, 8 - SWAP 2, 10 -%else - %ifidn %2, v - mov mstrideq, strideq - neg mstrideq - %if %1 == 4 - lea tmpq, [dstq+mstrideq*2] - %elif %1 == 6 || %1 == 8 - lea tmpq, [dstq+mstrideq*4] - %endif - %endif - mova [esp+3*16], m0 - mova [esp+4*16], m2 -%endif - - ABSSUB m0, m3, m4, m2 ; abs(p1-p0) - pmaxub m0, m7 - ABSSUB m2, m5, m6, m7 ; abs(q1-q0) - pmaxub m0, m2 -%if %1 == 4 - pxor m0, [PIC_sym(pb_128)] - pcmpgtb m7, m0, m1 ; hev - %if ARCH_X86_64 - SWAP 7, 11 - %else - mova [esp+5*16], m7 - %endif -%else - pxor m7, m0, [PIC_sym(pb_128)] - pcmpgtb m7, m1 ; hev -%if ARCH_X86_64 - SWAP 7, 11 -%else - mova [esp+5*16], m7 -%endif - -%if %1 == 6 - ABSSUB m1, m13, m4, m7 ; abs(p2-p0) - pmaxub m1, m0 -%else - mova m2, %%p3mem - ABSSUB m1, m2, m4, m7 ; abs(p3-p0) - pmaxub m1, m0 - ABSSUB m7, m13, m4, m2 ; abs(p2-p0) - pmaxub m1, m7 -%endif - ABSSUB m7, m5, m14, m2 ; abs(p2-p0) - pmaxub m1, m7 -%if %1 != 6 - ABSSUB m7, m5, m15, m2 ; abs(q3-q0) - pmaxub m1, m7 -%endif - pxor m1, [PIC_sym(pb_128)] - pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in -%if ARCH_X86_64 - SWAP 1, 9 -%else - mova [esp+6*16], m1 -%endif - -%if %1 == 6 - ABSSUB m7, m13, m3, m1 ; abs(p2-p1) -%else - mova m2, %%p3mem - ABSSUB m7, m2, m13, m1 ; abs(p3-p2) - ABSSUB m2, m13, m3, m1 ; abs(p2-p1) - pmaxub m7, m2 - ABSSUB m2, m14, m15, m1 ; abs(q3-q2) - pmaxub m7, m2 -%endif - ABSSUB m2, m14, m6, m1 ; abs(q2-q1) - pmaxub m7, m2 -%if ARCH_X86_32 - %define m12 m1 - mova m12, maskmem -%endif - pand m2, m12, mask1 - pcmpeqd m2, m12 - pand m7, m2 ; only apply fm-wide to wd>4 blocks - pmaxub m0, m7 - - pxor m0, [PIC_sym(pb_128)] -%endif ; %if %1 == 4 else -%if ARCH_X86_64 - SWAP 2, 10 - pcmpgtb m0, m2 -%else - pcmpgtb m0, [esp+4*16] -%endif - - ABSSUB m1, m3, m6, m7 ; abs(p1-q1) - ABSSUB m7, m4, m5, m2 ; abs(p0-q0) - paddusb m7, m7 - pand m1, [PIC_sym(pb_254)] - psrlq m1, 1 - paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1) - pxor m1, [PIC_sym(pb_128)] -%if ARCH_X86_64 - pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E -%else - pcmpgtb m1, [esp+3*16] -%endif - por m0, m1 - -%if %1 == 16 -%if ARCH_X86_64 - SWAP 0, 8 -%else - mova [esp+3*16], m0 -%endif -%ifidn %2, v - lea tmpq, [dstq+mstrideq*8] - mova m0, [tmpq+strideq*1] -%else - mova m0, [rsp+12*16] -%endif - ABSSUB m1, m0, m4, m2 -%ifidn %2, v - mova m0, [tmpq+strideq*2] -%else - mova m0, [rsp+13*16] -%endif - ABSSUB m2, m0, m4, m7 - pmaxub m1, m2 -%ifidn %2, v - mova m0, [tmpq+stride3q] -%else - mova m0, [rsp+14*16] -%endif - ABSSUB m2, m0, m4, m7 - pmaxub m1, m2 -%ifidn %2, v - lea tmpq, [dstq+strideq*4] - mova m0, [tmpq+strideq*0] -%else - mova m0, [rsp+15*16] -%endif - ABSSUB m2, m0, m5, m7 - pmaxub m1, m2 -%ifidn %2, v - mova m0, [tmpq+strideq*1] -%else - mova m0, [rsp+16*16] -%endif - ABSSUB m2, m0, m5, m7 - pmaxub m1, m2 -%ifidn %2, v - mova m0, [tmpq+strideq*2] -%else - mova m0, [rsp+17*16] -%endif - ABSSUB m2, m0, m5, m7 - pmaxub m1, m2 - pxor m1, [PIC_sym(pb_128)] - pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out -%if ARCH_X86_64 - por m1, m9 ; !flat8in | !flat8out -%else - por m1, [esp+6*16] - %define m12 m7 - mova m12, maskmem -%endif - pand m2, m12, mask2 - 
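The sequence above rebuilds the per-edge thresholds from the filter level L picked out of the l[] buffer (falling back to l[x - stride] when l[x] is 0): I is the sharpness-limited inner threshold, H = L >> 4 and E = 2 * (L + 2) + I, and the following absolute-difference compares assemble the fm/hev/flat masks. A scalar sketch of the threshold derivation and of the basic filter-enable test, assuming sharp_shift and limit are the two values the real code reads from the lut (illustrative names, one column at a time instead of 16):

#include <stdlib.h>

typedef struct { int E, I, H; } LfThresh;

/* Per-edge thresholds from the filter level L. */
static LfThresh lf_thresholds(int L, int sharp_shift, int limit)
{
    LfThresh t;
    int i = (L >> sharp_shift) & 63;
    if (i > limit) i = limit;
    if (i < 1) i = 1;
    t.I = i;                  /* inner-difference limit */
    t.H = L >> 4;             /* "high edge variance" threshold (hev) */
    t.E = 2 * (L + 2) + t.I;  /* edge-difference limit */
    return t;
}

/* The basic filter-enable decision (the fm mask) for one column of
 * pixels across the edge. */
static int filter_mask(const LfThresh *t, int p1, int p0, int q0, int q1)
{
    return abs(p1 - p0) <= t->I && abs(q1 - q0) <= t->I &&
           abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= t->E;
}

The flat8/flat16 masks computed next follow the same pattern with more taps and a fixed threshold of 1, which is why the wider filters only engage on very smooth edges.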
pcmpeqd m2, m12 - pandn m1, m2 ; flat16 -%if ARCH_X86_64 - pandn m2, m8, m1 ; flat16 & fm -%else - pandn m2, [esp+3*16], m1 ; flat16 & fm - mova %%flat16mem, m2 -%endif - SWAP 1, 2 - - pand m2, m12, mask1 - pcmpeqd m2, m12 -%if ARCH_X86_64 - pandn m9, m2 ; flat8in - pandn m2, m8, m9 - SWAP 2, 9 -%else - pandn m0, [esp+6*16], m2 - pandn m2, [esp+3*16], m0 - mova [esp+6*16], m2 -%endif - pand m2, m12, mask0 - pcmpeqd m2, m12 -%if ARCH_X86_64 - pandn m8, m2 - pandn m2, m9, m8 ; fm & !flat8 & !flat16 - SWAP 2, 8 - pandn m2, m1, m9 ; flat8 & !flat16 - SWAP 2, 9 - SWAP 0, 8 - SWAP 1, 10 -%else - pandn m0, [esp+3*16], m2 - pandn m2, [esp+6*16], m0 - SWAP 2, 0 - pandn m2, m1, [esp+6*16] - mova %%flat8mem, m2 -%endif -%elif %1 != 4 - %if ARCH_X86_64 - SWAP 1, 9 - %else - %define m12 m7 - mova m12, maskmem - mova m1, [esp+6*16] - %endif - pand m2, m12, mask1 - pcmpeqd m2, m12 - pandn m1, m2 - pandn m2, m0, m1 ; flat8 & fm - pand m1, m12, mask0 - pcmpeqd m1, m12 - pandn m0, m1 - pandn m1, m2, m0 ; fm & !flat8 - SWAP 1, 2, 0 - %if ARCH_X86_64 - SWAP 1, 9 - %else - mova %%flat8mem, m1 - %endif -%else -%if ARCH_X86_32 - %define m12 m1 - mova m12, maskmem -%endif - pand m2, m12, mask0 - pcmpeqd m2, m12 - pandn m0, m2 ; fm -%endif - - ; short filter - - mova m1, [PIC_sym(pb_128)] -%if ARCH_X86_64 - SWAP 7, 11 -%else - mova m7, [esp+5*16] -%endif - pxor m3, m1 - pxor m6, m1 - pxor m4, m1 - pxor m5, m1 - psubsb m1, m3, m6 ; iclip_diff(p1-q1) - pand m1, m7 ; f=iclip_diff(p1-q1)&hev - psubsb m2, m5, m4 - paddsb m1, m2 - paddsb m1, m2 - paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f) - mova m2, [PIC_sym(pb_16)] - pand m0, m1 ; f&=fm - paddsb m1, m0, [PIC_sym(pb_3)] - paddsb m0, [PIC_sym(pb_4)] - pand m1, [PIC_sym(pb_248)] - pand m0, [PIC_sym(pb_248)] - psrlq m1, 3 - psrlq m0, 3 - pxor m1, m2 - pxor m0, m2 - psubb m1, m2 ; f2 - psubb m0, m2 ; f1 - mova m2, [PIC_sym(pb_128)] - paddsb m4, m1 - psubsb m5, m0 - pxor m4, m2 - pxor m5, m2 - - pxor m0, m2 - pxor m1, m1 - pavgb m0, m1 ; f=(f1+1)>>1 - psubb m0, [PIC_sym(pb_64)] - pandn m7, m0 ; f&=!hev - paddsb m3, m7 - psubsb m6, m7 - pxor m3, m2 - pxor m6, m2 - -%if %1 == 16 - ; flat16 filter -%ifidn %2, v - lea tmpq, [dstq+mstrideq*8] - mova m0, [tmpq+strideq*1] ; p6 - mova m2, [tmpq+strideq*2] ; p5 - mova m7, [tmpq+stride3q] ; p4 -%else - mova m0, [rsp+12*16] - mova m2, [rsp+13*16] - mova m7, [rsp+14*16] -%endif - -%if ARCH_X86_64 - SWAP 1, 10 - mova %%flat8mem, m9 - mova %%q2mem, m14 - mova %%q3mem, m15 - SWAP 0, 8 - SWAP 1, 9 -%else - %ifidn %2, v - mova [esp+17*16], m0 - mova [esp+19*16], m3 - mova [esp+21*16], m4 - mova [esp+22*16], m5 - mova [esp+23*16], m6 - %xdefine m11 m3 - %xdefine m14 m4 - %xdefine m15 m5 - %xdefine m10 m6 - %define m13 %%p2mem - %define m8 [esp+17*16] - %define m9 %%flat16mem - %define m3 [esp+19*16] - %define m4 [esp+21*16] - %define m5 [esp+22*16] - %define m6 [esp+23*16] - %else - mova [esp+31*16], m0 - mova [esp+32*16], m3 - mova [esp+33*16], m4 - mova [esp+34*16], m5 - mova [esp+35*16], m6 - %xdefine m11 m3 - %xdefine m14 m4 - %xdefine m15 m5 - %xdefine m10 m6 - %define m13 %%p2mem - %define m8 [esp+31*16] - %define m9 %%flat16mem - %define m3 [esp+32*16] - %define m4 [esp+33*16] - %define m5 [esp+34*16] - %define m6 [esp+35*16] - %endif -%endif - - ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A - ; write -6 - mova m11, %%p3mem -%if ARCH_X86_64 - punpcklbw m14, m8, m11 - punpckhbw m15, m8, m11 -%else - punpcklbw m14, m0, m11 - punpckhbw m15, m0, m11 -%endif -%ifidn %2, v - mova [rsp+5*16], m11 -%endif - pmaddubsw m10, m14, 
[PIC_sym(pb_7_1)] - pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3 - punpcklbw m0, m2, m7 - punpckhbw m1, m2, m7 - pmaddubsw m0, [PIC_sym(pb_2)] - pmaddubsw m1, [PIC_sym(pb_2)] - paddw m10, m0 - paddw m11, m1 ; p6*7+p5*2+p4*2+p3 - punpcklbw m0, m13, m3 - punpckhbw m1, m13, m3 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m1, [PIC_sym(pb_1)] - paddw m10, m0 - paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1 - punpcklbw m0, m4, m5 - punpckhbw m1, m4, m5 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m1, [PIC_sym(pb_1)] - paddw m10, m0 - paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m2 - por m0, m1 -%ifidn %2, v - mova [tmpq+strideq*2], m0 ; p5 -%else - mova [rsp+13*16], m0 -%endif - - ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B - ; write -5 - pmaddubsw m14, [PIC_sym(pb_m1_1)] - pmaddubsw m15, [PIC_sym(pb_m1_1)] - paddw m10, m14 - paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 - punpcklbw m0, m8, m6 - punpckhbw m1, m8, m6 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - mova [rsp+3*16], m0 - mova [rsp+4*16], m1 - paddw m10, m0 - paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m7 - por m0, m1 -%ifidn %2, v - mova [tmpq+stride3q], m0 ; p4 -%else - mova [rsp+14*16], m0 -%endif - - ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C - ; write -4 - mova m14, %%q2mem - punpcklbw m0, m8, m13 - punpckhbw m1, m8, m13 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 - punpcklbw m0, m2, m14 - punpckhbw m2, m14 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m2, [PIC_sym(pb_m1_1)] - mova [rsp+1*16], m0 - paddw m10, m0 - paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, %%p3mem - por m0, m1 -%ifidn %2, v - mova [tmpq+strideq*4], m0 ; p3 -%else - mova [rsp+19*16], m0 -%endif - - ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D - ; write -3 - mova m15, %%q3mem - punpcklbw m0, m8, m3 - punpckhbw m1, m8, m3 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 - punpcklbw m0, m7, m15 - punpckhbw m7, m15 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m7, [PIC_sym(pb_m1_1)] - mova [rsp+2*16], m0 -%if ARCH_X86_32 - %ifidn %2, v - mova [esp+24*16], m7 - %else - mova [esp+36*16], m7 - %endif -%endif - paddw m10, m0 - paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m13 - por m0, m1 - mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F - - ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E - ; write -2 - punpcklbw m0, m8, m4 - punpckhbw m1, m8, m4 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 -%if ARCH_X86_64 - SWAP 7, 8 -%endif -%ifidn %2, v - mova m1, [dstq+strideq*4] ; q4 - mova m7, [rsp+5*16] ; (pre-filter) p3 -%else - mova m1, [rsp+15*16] - mova m7, %%p3mem ; (pre-filter) p3 -%endif - punpcklbw m0, m1, m7 - punpckhbw m1, m1, m7 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, 
[PIC_sym(pb_m1_1)] - mova [rsp+7*16], m0 - mova [rsp+5*16], m1 - psubw m10, m0 - psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m3 - por m0, m1 - mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G - - ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F - ; write -1 -%ifidn %2, v - mova m7, [tmpq+strideq*1] ; p6 - lea tmpq, [dstq+strideq*4] - mova m1, [tmpq+strideq*1] ; q5 -%else - mova m7, [rsp+12*16] ; p6 - mova m1, [rsp+16*16] -%endif - punpcklbw m0, m7, m5 - punpckhbw m7, m5 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m7, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 - punpcklbw m7, m13, m1 - pmaddubsw m7, [PIC_sym(pb_m1_1)] - mova [rsp+9*16], m7 - paddw m10, m7 -%if ARCH_X86_64 - punpckhbw m13, m1 - mova m1, [rsp+6*16] - SWAP 1, 13 -%else - punpckhbw m7, m13, m1 - mova m1, [esp+6*16] - mova m13, m1 - SWAP 1, 7 -%endif - pmaddubsw m1, [PIC_sym(pb_m1_1)] - mova [rsp+10*16], m1 - paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 - pmulhrsw m7, m10, [PIC_sym(pw_2048)] - pmulhrsw m0, m11, [PIC_sym(pw_2048)] - packuswb m7, m0 - pand m7, m9 - pandn m0, m9, m4 - por m7, m0 - mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H - - ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G - ; write +0 -%ifidn %2, v - mova m7, [tmpq+strideq*2] ; q6 -%else - mova m7, [rsp+17*16] -%endif - paddw m10, [rsp+3*16] - paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 - punpcklbw m0, m3, m7 - punpckhbw m1, m3, m7 -%if ARCH_X86_64 - mova m3, [rsp+8*16] -%endif - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - mova [rsp+3*16], m0 - mova [rsp+4*16], m1 - paddw m10, m0 - paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m5 - por m0, m1 -%if ARCH_X86_32 - mova m1, [esp+8*16] - mova m3, m1 -%endif - mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I - - ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H - ; write +1 - paddw m10, [rsp+1*16] - paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 - punpcklbw m0, m4, m7 - punpckhbw m2, m4, m7 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m2, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 -%if ARCH_X86_64 - mova m4, [rsp+6*16] -%else - %define m4 [esp+6*16] -%endif - pmulhrsw m2, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m2, m1 - pand m2, m9 - pandn m1, m9, m6 - por m2, m1 ; don't clobber q1/m6 since we need it in K - - ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I - ; write +2 - paddw m10, [rsp+2*16] -%if ARCH_X86_64 - SWAP 7, 8 - paddw m11, m7 -%else - mova m8, m7 - %ifidn %2, v - paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 - %else - paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 - %endif -%endif - punpcklbw m0, m5, m8 - punpckhbw m1, m5, m8 -%if ARCH_X86_64 - mova m5, [rsp+8*16] -%else - %define m5 [esp+8*16] -%endif - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 - pmulhrsw m7, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m7, m1 - pand m7, m9 - pandn m1, m9, m14 - por m7, m1 ; 
don't clobber q2/m14 since we need it in K - - ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J - ; write +3 - psubw m10, [rsp+7*16] - psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 - punpcklbw m0, m6, m8 - punpckhbw m1, m6, m8 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m15 - por m0, m1 -%ifidn %2, v - mova [tmpq+mstrideq], m0 ; q3 -%else - mova [rsp+20*16], m0 -%endif - - ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K - ; write +4 - paddw m10, [rsp+ 9*16] - paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - punpcklbw m0, m14, m8 - punpckhbw m1, m14, m8 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 - pmulhrsw m0, m10, [PIC_sym(pw_2048)] - pmulhrsw m1, m11, [PIC_sym(pw_2048)] - packuswb m0, m1 - pand m0, m9 -%ifidn %2, v - pandn m1, m9, [tmpq+strideq*0] -%else - pandn m1, m9, [rsp+15*16] -%endif - por m0, m1 -%ifidn %2, v - mova [tmpq+strideq*0], m0 ; q4 -%else - mova [rsp+15*16], m0 -%endif - - ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L - ; write +5 - paddw m10, [rsp+3*16] - paddw m11, [rsp+4*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - punpcklbw m0, m15, m8 - punpckhbw m1, m15, m8 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m10, m0 - paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 - pmulhrsw m10, [PIC_sym(pw_2048)] - pmulhrsw m11, [PIC_sym(pw_2048)] - packuswb m10, m11 - pand m10, m9 -%ifidn %2, v - pandn m11, m9, [tmpq+strideq*1] -%else - pandn m11, m9, [rsp+16*16] -%endif - por m10, m11 -%ifidn %2, v - mova [tmpq+strideq*1], m10 ; q5 -%else - mova [rsp+16*16], m10 -%endif - -%if ARCH_X86_64 - SWAP 0, 8 - SWAP 1, 9 - SWAP 14, 7 -%else - %xdefine m3 m11 - %xdefine m4 m14 - %xdefine m5 m15 - %xdefine m6 m10 - mova %%q2mem, m7 - %ifidn %2, v - mova m3, [esp+19*16] - %else - mova m3, [esp+32*16] - %endif - mova m4, [esp+ 6*16] - mova m5, [esp+ 8*16] -%endif - SWAP m6, m2 - -%if ARCH_X86_64 - mova m9, %%flat8mem -%endif -%ifidn %2, v - lea tmpq, [dstq+mstrideq*4] -%endif -%endif ; if %1 == 16 -%if %1 >= 8 - ; flat8 filter -%if ARCH_X86_32 - %define m9 %%flat8mem - %define m11 m1 - %define m13 %%p2mem - %define m14 %%q2mem - %define m15 %%q3mem -%endif - mova m11, %%p3mem - punpcklbw m0, m11, m3 - punpcklbw m7, m13, m4 - pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 - pmaddubsw m7, [PIC_sym(pb_2_1)] - paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 - punpcklbw m7, m5, [PIC_sym(pb_4)] - pmaddubsw m7, [PIC_sym(pb_1)] - paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 - punpckhbw m1, m11, m3 - pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 - punpckhbw m0, m13, m4 - pmaddubsw m0, [PIC_sym(pb_2_1)] - paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 - punpckhbw m0, m5, [PIC_sym(pb_4)] - pmaddubsw m0, [PIC_sym(pb_1)] - paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 - psrlw m0, m2, 3 - psrlw m1, m7, 3 - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m13 - por m0, m1 ; p2 -%ifidn %2, v - mova [tmpq+strideq*1], m0 -%else - %if ARCH_X86_64 - SWAP 0, 10 - %else - mova [esp+2*16], m0 - %endif -%endif - -%if ARCH_X86_32 - mova m11, %%p3mem -%endif - punpcklbw m0, m11, m3 - punpckhbw m1, m11, m3 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m2, m0 - paddw m7, m1 - punpcklbw 
m0, m13, m6 - punpckhbw m1, m13, m6 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m2, m0 - paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 - psrlw m0, m2, 3 - psrlw m1, m7, 3 - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m3 - por m0, m1 ; p1 -%ifidn %2, v - mova [tmpq+strideq*2], m0 -%else - mova [rsp+0*16], m0 -%endif - -%if ARCH_X86_32 - mova m11, %%p3mem -%endif - punpcklbw m0, m11, m3 - punpckhbw m1, m11, m3 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m1, [PIC_sym(pb_1)] - psubw m2, m0 - psubw m7, m1 - punpcklbw m0, m4, m14 - punpckhbw m1, m4, m14 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m1, [PIC_sym(pb_1)] - paddw m2, m0 - paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 - psrlw m0, m2, 3 - psrlw m1, m7, 3 - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m4 - por m0, m1 ; p0 -%ifidn %2, v - mova [tmpq+stride3q], m0 -%else - mova [rsp+1*16], m0 -%endif - - punpcklbw m0, m5, m15 - punpckhbw m1, m5, m15 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m1, [PIC_sym(pb_1)] - paddw m2, m0 - paddw m7, m1 -%if ARCH_X86_32 - mova m11, %%p3mem -%endif - punpcklbw m0, m11, m4 - punpckhbw m11, m11, m4 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m11, [PIC_sym(pb_1)] - psubw m2, m0 - psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 - psrlw m0, m2, 3 - psrlw m11, m7, 3 - packuswb m0, m11 - pand m0, m9 - pandn m11, m9, m5 - por m11, m0 ; q0 -%ifidn %2, v - mova [dstq+strideq*0], m11 -%elif ARCH_X86_32 - mova [esp+8*16], m11 -%endif - - punpcklbw m0, m5, m15 - punpckhbw m1, m5, m15 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m2, m0 - paddw m7, m1 - punpcklbw m0, m13, m6 - punpckhbw m1, m13, m6 - pmaddubsw m0, [PIC_sym(pb_m1_1)] - pmaddubsw m1, [PIC_sym(pb_m1_1)] - paddw m2, m0 - paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 - psrlw m0, m2, 3 - psrlw m1, m7, 3 - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m6 - por m0, m1 ; q1 -%ifidn %2, v - mova [dstq+strideq*1], m0 -%else - %if ARCH_X86_64 - SWAP 0, 13 - %else - mova [esp+9*16], m0 - %endif -%endif - - punpcklbw m0, m3, m6 - punpckhbw m1, m3, m6 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m1, [PIC_sym(pb_1)] - psubw m2, m0 - psubw m7, m1 - punpcklbw m0, m14, m15 - punpckhbw m1, m14, m15 - pmaddubsw m0, [PIC_sym(pb_1)] - pmaddubsw m1, [PIC_sym(pb_1)] - paddw m2, m0 - paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 - psrlw m2, 3 - psrlw m7, 3 - packuswb m2, m7 - pand m2, m9 - pandn m7, m9, m14 - por m2, m7 ; q2 -%ifidn %2, v - mova [dstq+strideq*2], m2 -%else - mova m0, [rsp+0*16] -%if %1 == 8 - mova m1, [rsp+1*16] - mova m4, %%p3mem - -%if ARCH_X86_32 - %define m10 [esp+2*16] - %define m11 [esp+8*16] - %define m13 [esp+9*16] -%endif - - ; 16x8 transpose - punpcklbw m3, m4, m10 - punpckhbw m4, m10 - punpcklbw m5, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m11, m13 - punpckhbw m6, m11, m13 - punpcklbw m7, m2, m15 - punpckhbw m2, m15 -%if ARCH_X86_64 - SWAP 2, 15 -%else - mova m15, m2 -%endif - - punpcklwd m2, m3, m5 - punpckhwd m3, m5 - punpcklwd m5, m4, m0 - punpckhwd m4, m0 - punpcklwd m0, m1, m7 - punpckhwd m1, m7 - punpcklwd m7, m6, m15 - punpckhwd m6, m15 -%if ARCH_X86_64 - SWAP 6, 15 -%else - mova m15, m6 -%endif - - punpckldq m6, m2, m0 - punpckhdq m2, m0 - punpckldq m0, m3, m1 - punpckhdq m3, m1 - punpckldq m1, m5, m7 - punpckhdq m5, m7 - punpckldq m7, m4, m15 - punpckhdq m4, m15 - - ; write 8x16 - movq [dstq+strideq*0-4], xm6 - movhps [dstq+strideq*1-4], xm6 - movq [dstq+strideq*2-4], xm2 - movhps [dstq+stride3q -4], xm2 - lea dstq, 
[dstq+strideq*4] - movq [dstq+strideq*0-4], xm0 - movhps [dstq+strideq*1-4], xm0 - movq [dstq+strideq*2-4], xm3 - movhps [dstq+stride3q -4], xm3 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm1 - movhps [dstq+strideq*1-4], xm1 - movq [dstq+strideq*2-4], xm5 - movhps [dstq+stride3q -4], xm5 - lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0-4], xm7 - movhps [dstq+strideq*1-4], xm7 - movq [dstq+strideq*2-4], xm4 - movhps [dstq+stride3q -4], xm4 - lea dstq, [dstq+strideq*4] -%else - ; 16x16 transpose and store - SWAP 6, 0 - SWAP 7, 1 - %if ARCH_X86_64 - SWAP 5, 10, 2 - SWAP 8, 11 - SWAP 9, 13 - mova [rsp+21*16], m12 - %else - mova [esp+10*16], m2 - %xdefine m8 m0 - %xdefine m9 m1 - %xdefine m10 m2 - %xdefine m11 m3 - %xdefine m12 m4 - %xdefine m13 m5 - %xdefine m14 m6 - %xdefine m15 m7 - %endif - mova m0, [rsp+11*16] - mova m1, [rsp+12*16] - mova m2, [rsp+13*16] - mova m3, [rsp+14*16] - mova m4, [rsp+19*16] -%if ARCH_X86_64 - mova m7, [rsp+ 1*16] - mova m11, [rsp+20*16] - mova m12, [rsp+15*16] - mova m13, [rsp+16*16] - mova m14, [rsp+17*16] - TRANSPOSE_16X16B 1, [rsp+18*16] -%else - mova m5, [esp+ 2*16] - TRANSPOSE_16X16B 1, [esp+32*16] - mov tmpq, dstq - lea dstq, [dstq+strideq*8] -%endif - movu [dstq+strideq*0-8], xm0 - movu [dstq+strideq*1-8], xm1 - movu [dstq+strideq*2-8], xm2 - movu [dstq+stride3q -8], xm3 - lea dstq, [dstq+strideq*4] - movu [dstq+strideq*0-8], xm4 - movu [dstq+strideq*1-8], xm5 - movu [dstq+strideq*2-8], xm6 - movu [dstq+stride3q -8], xm7 -%if ARCH_X86_64 - lea dstq, [dstq+strideq*4] -%else - %xdefine m8 m0 - %xdefine m9 m1 - %xdefine m10 m2 - %xdefine m11 m3 - %xdefine m12 m4 - %xdefine m13 m5 - %xdefine m14 m6 - %xdefine m15 m7 - mova m8, [esp+11*16] - mova m9, [esp+12*16] - mova m10, [esp+13*16] - mova m11, [esp+14*16] - mova m12, [esp+26*16] - mova m13, [esp+27*16] - mova m14, [esp+ 0*16] - mova m15, [esp+ 1*16] - mov dstq, tmpq -%endif - movu [dstq+strideq*0-8], xm8 - movu [dstq+strideq*1-8], xm9 - movu [dstq+strideq*2-8], xm10 - movu [dstq+stride3q -8], xm11 - lea dstq, [dstq+strideq*4] - movu [dstq+strideq*0-8], xm12 - movu [dstq+strideq*1-8], xm13 - movu [dstq+strideq*2-8], xm14 - movu [dstq+stride3q -8], xm15 - lea dstq, [dstq+strideq*4] -%if ARCH_X86_32 - lea dstq, [dstq+strideq*8] -%else - mova m12, [rsp+21*16] -%endif - -%endif ; if %1 == 8 -%endif ; ifidn %2, v -%elif %1 == 6 - ; flat6 filter -%if ARCH_X86_32 - mova [esp+3*16], m3 - mova [esp+4*16], m4 - mova [esp+5*16], m5 - mova [esp+6*16], m6 - %xdefine m8 m3 - %xdefine m10 m4 - %xdefine m11 m5 - %xdefine m15 m6 - %define m3 [esp+3*16] - %define m4 [esp+4*16] - %define m5 [esp+5*16] - %define m6 [esp+6*16] - %define m9 %%flat8mem - %define m13 %%p2mem - %define m14 %%q2mem -%endif - - punpcklbw m8, m13, m5 - punpckhbw m11, m13, m5 - pmaddubsw m0, m8, [PIC_sym(pb_3_1)] - pmaddubsw m1, m11, [PIC_sym(pb_3_1)] - punpcklbw m7, m4, m3 - punpckhbw m10, m4, m3 - pmaddubsw m2, m7, [PIC_sym(pb_2)] - pmaddubsw m15, m10, [PIC_sym(pb_2)] - paddw m0, m2 - paddw m1, m15 - pmulhrsw m2, m0, [PIC_sym(pw_4096)] - pmulhrsw m15, m1, [PIC_sym(pw_4096)] - packuswb m2, m15 - pand m2, m9 - pandn m15, m9, m3 - por m2, m15 -%ifidn %2, v - mova [tmpq+strideq*2], m2 ; p1 -%elif ARCH_X86_32 - mova [esp+11*16], m2 -%endif - - pmaddubsw m8, [PIC_sym(pb_m1_1)] - pmaddubsw m11, [PIC_sym(pb_m1_1)] - paddw m0, m8 - paddw m1, m11 - punpcklbw m8, m13, m6 - punpckhbw m11, m13, m6 -%if ARCH_X86_64 - SWAP 2, 13 -%endif - pmaddubsw m8, [PIC_sym(pb_m1_1)] - pmaddubsw m11, [PIC_sym(pb_m1_1)] - paddw m0, m8 - paddw m1, m11 - pmulhrsw 
m2, m0, [PIC_sym(pw_4096)] - pmulhrsw m15, m1, [PIC_sym(pw_4096)] - packuswb m2, m15 - pand m2, m9 - pandn m15, m9, m4 - por m2, m15 -%ifidn %2, v - mova [tmpq+stride3q], m2 ; p0 -%elif ARCH_X86_32 - mova [esp+8*16], m2 -%endif - - paddw m0, m8 - paddw m1, m11 - punpcklbw m8, m3, m14 - punpckhbw m11, m3, m14 -%if ARCH_X86_64 - SWAP 2, 14 -%endif - pmaddubsw m2, m8, [PIC_sym(pb_m1_1)] - pmaddubsw m15, m11, [PIC_sym(pb_m1_1)] - paddw m0, m2 - paddw m1, m15 - pmulhrsw m2, m0, [PIC_sym(pw_4096)] - pmulhrsw m15, m1, [PIC_sym(pw_4096)] - packuswb m2, m15 - pand m2, m9 - pandn m15, m9, m5 - por m2, m15 -%ifidn %2, v - mova [dstq+strideq*0], m2 ; q0 -%endif - - pmaddubsw m8, [PIC_sym(pb_m1_2)] - pmaddubsw m11, [PIC_sym(pb_m1_2)] - paddw m0, m8 - paddw m1, m11 - pmaddubsw m7, [PIC_sym(pb_m1_0)] - pmaddubsw m10, [PIC_sym(pb_m1_0)] - paddw m0, m7 - paddw m1, m10 - pmulhrsw m0, [PIC_sym(pw_4096)] - pmulhrsw m1, [PIC_sym(pw_4096)] - packuswb m0, m1 - pand m0, m9 - pandn m1, m9, m6 - por m0, m1 -%if ARCH_X86_32 - %xdefine m3 m8 - %xdefine m4 m10 - %xdefine m5 m11 - %xdefine m6 m15 -%endif -%ifidn %2, v - mova [dstq+strideq*1], m0 ; q1 -%else - %if ARCH_X86_64 - SWAP 3, 13 - SWAP 4, 14 - %else - mova m3, [esp+11*16] - mova m4, [esp+ 8*16] - %endif - SWAP 5, 2 - SWAP 6, 0 - TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 -%endif -%else ; if %1 == 4 -%ifidn %2, v - mova [tmpq+strideq*0], m3 ; p1 - mova [tmpq+strideq*1], m4 ; p0 - mova [tmpq+strideq*2], m5 ; q0 - mova [tmpq+stride3q ], m6 ; q1 -%else - TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 -%endif -%endif -%if ARCH_X86_32 - %define m12 m12reg -%endif -%endmacro - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; 32-bit PIC helpers ;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%if ARCH_X86_32 - %define PIC_base_offset $$ - - %macro SETUP_PIC 0 ; PIC_reg - %define PIC_reg r2 - %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4) - LEA PIC_reg, $$ - %endmacro - - %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base - %if %1 == 0 - mov [esp+PIC_reg_stk_offset], PIC_reg - mov PIC_reg, maskm - %else - mov PIC_reg, [esp+PIC_reg_stk_offset] - %endif - %endmacro - - %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) - -%else - %macro XCHG_PIC_REG 1 - %endmacro - %define PIC_sym(sym) (sym) -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%if ARCH_X86_32 - %if STACK_ALIGNMENT < required_stack_alignment - %assign copy_args 1 - %else - %assign copy_args 0 - %endif -%endif - -%macro RELOC_ARGS 1 - %if copy_args - %define maskm [esp+stack_size-gprsize*1] - %define l_stridem [esp+stack_size-gprsize*2] - %define lutm [esp+stack_size-gprsize*3] - %define %1m [esp+stack_size-gprsize*4] - mov r6d, r6m - mov maskm, maskd - mov lutm, lutd - mov %1m, r6d - %else - %define %1m r6m - %endif -%endmacro - -%if ARCH_X86_32 - %define tmpq r4 - %define mstrideq r5 - %define stride3q r6 - %define l_stride3q r6 -%endif - -INIT_XMM ssse3 -%if ARCH_X86_64 -cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \ - dst, stride, mask, l, l_stride, lut, \ - w, stride3, mstride, tmp, mask_bits -%else -cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \ - dst, stride, mask, l, l_stride, lut, mask_bits - RELOC_ARGS w - SETUP_PIC - %define m12 m5 -%endif - shl l_strideq, 2 - sub lq, l_strideq -%if ARCH_X86_64 - mov mstrideq, strideq - neg mstrideq - lea stride3q, [strideq*3] -%else - mov l_stridem, l_strided -%endif - mov mask_bitsd, 0xf - mova m12, [PIC_sym(pd_mask)] - XCHG_PIC_REG 0 - movu m0, [maskq] - pxor m4, m4 - movd m3, [lutq+136] - pshufb m3, m4 - pshufd m2, m0, q2222 - pshufd m1, m0, 
q1111 - pshufd m0, m0, q0000 - por m1, m2 - por m0, m1 - mova [rsp+11*16], m0 - mova [rsp+12*16], m1 - mova [rsp+13*16], m2 - mova [rsp+14*16], m3 - -%define maskmem [esp+15*16] -%define mask0 [rsp+11*16] -%define mask1 [rsp+12*16] -%define mask2 [rsp+13*16] -%define minlvl [rsp+14*16] - -.loop: - test [maskq+8], mask_bitsd ; vmask[2] - je .no_flat16 - -%if ARCH_X86_32 - XCHG_PIC_REG 1 - mov [esp+25*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 16, v - jmp .end - -.no_flat16: - test [maskq+4], mask_bitsd ; vmask[1] - je .no_flat - -%if ARCH_X86_32 - XCHG_PIC_REG 1 - mov [esp+25*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 8, v - jmp .end - -.no_flat: - test [maskq+0], mask_bitsd ; vmask[0] - XCHG_PIC_REG 1 - je .no_filter - -%if ARCH_X86_32 - mov [esp+25*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 4, v - -.end: -%if ARCH_X86_32 - mova m12, maskmem - mov mask_bitsd, [esp+25*16] -%endif -.no_filter: - pslld m12, 4 - shl mask_bitsd, 4 - add lq, 16 - add dstq, 16 -%if ARCH_X86_64 - sub wd, 4 -%else - sub dword wm, 4 -%endif - XCHG_PIC_REG 0 - jg .loop - RET - -INIT_XMM ssse3 -%if ARCH_X86_64 -cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \ - dst, stride, mask, l, l_stride, lut, \ - h, stride3, l_stride3, tmp, mask_bits -%else -cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \ - dst, stride, mask, l, l_stride, lut, mask_bits - RELOC_ARGS h - SETUP_PIC - %define m12 m5 -%endif - sub lq, 4 - shl l_strideq, 2 -%if ARCH_X86_64 - lea stride3q, [strideq*3] - lea l_stride3q, [l_strideq*3] -%else - mov l_stridem, l_strided -%endif - mov mask_bitsd, 0xf - mova m12, [PIC_sym(pd_mask)] - XCHG_PIC_REG 0 - movu m0, [maskq] - pxor m4, m4 - movd m3, [lutq+136] - pshufb m3, m4 - pshufd m2, m0, q2222 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - por m1, m2 - por m0, m1 - mova [rsp+22*16], m0 - mova [rsp+23*16], m1 - mova [rsp+24*16], m2 - mova [rsp+25*16], m3 - -%define maskmem [esp+37*16] -%define mask0 [rsp+22*16] -%define mask1 [rsp+23*16] -%define mask2 [rsp+24*16] -%define minlvl [rsp+25*16] - -.loop: - test [maskq+8], mask_bitsd ; vmask[2] - je .no_flat16 - -%if ARCH_X86_32 - XCHG_PIC_REG 1 - mov [esp+38*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 16, h - jmp .end - -.no_flat16: - test [maskq+4], mask_bitsd ; vmask[1] - je .no_flat - -%if ARCH_X86_32 - XCHG_PIC_REG 1 - mov [esp+38*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 8, h - jmp .end - -.no_flat: - test [maskq+0], mask_bitsd ; vmask[0] - XCHG_PIC_REG 1 - je .no_filter - -%if ARCH_X86_32 - mov [esp+38*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 4, h - jmp .end - -.no_filter: - lea dstq, [dstq+strideq*8] - lea dstq, [dstq+strideq*8] -%if ARCH_X86_32 - jmp .end_noload -.end: - mova m12, maskmem - mov l_strideq, l_stridem - mov mask_bitsd, [esp+38*16] -.end_noload: -%else -.end: -%endif - lea lq, [lq+l_strideq*4] - pslld m12, 4 - shl mask_bitsd, 4 -%if ARCH_X86_64 - sub hd, 4 -%else - sub dword hm, 4 -%endif - XCHG_PIC_REG 0 - jg .loop - RET - -INIT_XMM ssse3 -%if ARCH_X86_64 -cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \ - dst, stride, mask, l, l_stride, lut, \ - w, stride3, mstride, tmp, mask_bits -%else -cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \ - dst, stride, mask, l, l_stride, lut, mask_bits - RELOC_ARGS w - SETUP_PIC - %define m12 m4 -%endif - shl l_strideq, 2 - sub lq, l_strideq -%if ARCH_X86_64 - mov mstrideq, strideq - neg mstrideq - lea stride3q, [strideq*3] -%else - mov l_stridem, l_strided -%endif - mov mask_bitsd, 0xf - mova m12, [PIC_sym(pd_mask)] - XCHG_PIC_REG 0 - movq m0, 
[maskq] - pxor m3, m3 - movd m2, [lutq+136] - pshufb m2, m3 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - por m0, m1 - mova [rsp+0*16], m0 - mova [rsp+1*16], m1 - mova [rsp+2*16], m2 - -%define maskmem [esp+7*16] -%define mask0 [rsp+0*16] -%define mask1 [rsp+1*16] -%define minlvl [rsp+2*16] - -.loop: - test [maskq+4], mask_bitsd ; vmask[1] - je .no_flat - -%if ARCH_X86_32 - XCHG_PIC_REG 1 - mov [esp+11*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 6, v - jmp .end - -.no_flat: - test [maskq+0], mask_bitsd ; vmask[1] - XCHG_PIC_REG 1 - je .no_filter - -%if ARCH_X86_32 - mov [esp+11*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 4, v - -.end: -%if ARCH_X86_32 - mova m12, maskmem - mov mask_bitsd, [esp+11*16] -%endif -.no_filter: - pslld m12, 4 - shl mask_bitsd, 4 - add lq, 16 - add dstq, 16 -%if ARCH_X86_64 - sub wd, 4 -%else - sub dword wm, 4 -%endif - XCHG_PIC_REG 0 - jg .loop - RET - -INIT_XMM ssse3 -%if ARCH_X86_64 -cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \ - dst, stride, mask, l, l_stride, lut, \ - h, stride3, l_stride3, tmp, mask_bits -%else -cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \ - dst, stride, mask, l, l_stride, lut, mask_bits - RELOC_ARGS h - SETUP_PIC - %define m12 m4 -%endif - sub lq, 4 - shl l_strideq, 2 -%if ARCH_X86_64 - lea stride3q, [strideq*3] - lea l_stride3q, [l_strideq*3] -%else - mov l_stridem, l_strided -%endif - mov mask_bitsd, 0xf - mova m12, [PIC_sym(pd_mask)] - XCHG_PIC_REG 0 - movq m0, [maskq] - pxor m3, m3 - movd m2, [lutq+136] - pshufb m2, m3 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - por m0, m1 - mova [rsp+0*16], m0 - mova [rsp+1*16], m1 - mova [rsp+2*16], m2 - -%define maskmem [esp+7*16] -%define mask0 [rsp+0*16] -%define mask1 [rsp+1*16] -%define minlvl [rsp+2*16] - -.loop: - test [maskq+4], mask_bitsd ; vmask[1] - je .no_flat - -%if ARCH_X86_32 - XCHG_PIC_REG 1 - mov [esp+12*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 6, h - jmp .end - -.no_flat: - test [maskq+0], mask_bitsd ; vmask[1] - XCHG_PIC_REG 1 - je .no_filter - -%if ARCH_X86_32 - mov [esp+12*16], mask_bitsd - mova maskmem, m12 -%endif - FILTER 4, h - jmp .end - -.no_filter: - lea dstq, [dstq+strideq*8] - lea dstq, [dstq+strideq*8] -%if ARCH_X86_32 - jmp .end_noload -.end: - mova m12, maskmem - mov l_strided, l_stridem - mov mask_bitsd, [esp+12*16] -.end_noload: -%else -.end: -%endif - lea lq, [lq+l_strideq*4] - pslld m12, 4 - shl mask_bitsd, 4 -%if ARCH_X86_64 - sub hd, 4 -%else - sub dword hm, 4 -%endif - XCHG_PIC_REG 0 - jg .loop - RET diff -Nru dav1d-0.7.1/src/x86/looprestoration16_avx2.asm dav1d-0.9.1/src/x86/looprestoration16_avx2.asm --- dav1d-0.7.1/src/x86/looprestoration16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/looprestoration16_avx2.asm 2021-07-28 21:38:28.909852300 +0000 @@ -0,0 +1,2581 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + +wiener_hshift: dw 4, 4, 1, 1 +wiener_vshift: dw 1024, 1024, 4096, 4096 +wiener_round: dd 1049600, 1048832 + +pb_m10_m9: times 2 db -10, -9 +pb_m6_m5: times 2 db -6, -5 +pb_m2_m1: times 2 db -2, -1 +pb_2_3: times 2 db 2, 3 +pb_6_7: times 2 db 6, 7 +pw_1023: times 2 dw 1023 +pd_8: dd 8 +pd_25: dd 25 +pd_4096: dd 4096 +pd_34816: dd 34816 +pd_m262128: dd -262128 +pd_0xf00800a4: dd 0xf00800a4 +pd_0xf00801c7: dd 0xf00801c7 + +%define pw_256 sgr_lshuf5 + +cextern sgr_x_by_x_avx2 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers + +INIT_YMM avx2 +cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h +%define base t4-wiener_hshift + mov fltq, fltmp + mov edged, r8m + movifnidn wd, wm + mov hd, r6m + mov t3d, r9m ; pixel_max + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastd m12, [fltq+ 0] ; x0 x1 + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufB] + add wd, wd + vpbroadcastd m13, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vbroadcasti128 m8, [wiener_shufC] + add lpfq, wq + vbroadcasti128 m9, [wiener_shufD] + lea t1, [rsp+wq+16] + vpbroadcastd m14, [fltq+16] ; y0 y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + vpbroadcastd m10, [base+wiener_round+t3*4] + vpbroadcastd m11, [base+wiener_vshift+t3*4] + pmullw m12, m0 ; upshift filter coefs to make the + pmullw m13, m0 ; horizontal downshift constant + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t3, t1 + mov t2, t1 + dec 
hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp+8*0] + call .hv_bottom + add lpfq, [rsp+8*1] + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.extend_right: + movd xm1, r10d + vpbroadcastd m0, [pb_6_7] + movu m2, [pb_0to31] + vpbroadcastb m1, xm1 + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + vpbroadcastd m0, [pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + vpbroadcastd m0, [pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm3, [leftq] + vpblendd m3, [lpfq+r10-8], 0xfc + add leftq, 8 + jmp .h_main +.h_extend_left: + vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + shufpd m3, m4, 0x05 + pshufb m3, [wiener_lshuf7] + jmp .h_main2 +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-8] +.h_main: + mova m4, [lpfq+r10+0] +.h_main2: + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq xm3, [leftq] + vpblendd m3, [lpfq+r10-8], 0xfc + add leftq, 8 + jmp .hv_main +.hv_extend_left: + movu m3, [lpfq+r10-8] + pshufb m3, [wiener_lshuf7] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-8] +.hv_main: + mova m4, [lpfq+r10+0] + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -36 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + vpbroadcastd m2, [pd_m262128] + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + mova m2, [t4+r10] + paddw m2, [t2+r10] + mova m5, [t3+r10] + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova m4, [t5+r10] + paddw m4, [t1+r10] + psraw m0, 1 + paddw m3, m0, [t6+r10] + mova [t0+r10], m0 + 
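wiener_filter7_16bpc keeps up to seven horizontally filtered rows in the stack buffer and rotates the t0-t6 pointers between rows instead of copying data; the vertical pass then combines those rows with the symmetric y0-y3 coefficients read from fltq. A simplified scalar sketch of the ring rotation and the vertical 7-tap combine (which row pairs with which and the final rounding shift are illustrative; the real code bakes bitdepth-dependent rounding into the coefficients and buffers):

#include <stdint.h>

/* Ring of pointers to horizontally filtered rows: t[0] newest,
 * t[6] oldest (a sketch of the t0-t6 rotation). */
typedef struct { int16_t *t[7]; } WienerRows;

static void push_row(WienerRows *r, int16_t *newest)
{
    for (int i = 6; i > 0; i--)
        r->t[i] = r->t[i - 1];   /* the oldest pointer drops off the end */
    r->t[0] = newest;
}

/* Symmetric 7-tap vertical combine for one output pixel, y[3] being
 * the centre tap; the +1024 >> 11 rounding is illustrative only. */
static int wiener_v_7tap(const WienerRows *r, int x, const int16_t y[4],
                         int pixel_max)
{
    int sum = (r->t[0][x] + r->t[6][x]) * y[0] +
              (r->t[1][x] + r->t[5][x]) * y[1] +
              (r->t[2][x] + r->t[4][x]) * y[2] +
               r->t[3][x]               * y[3];
    sum = (sum + 1024) >> 11;
    return sum < 0 ? 0 : sum > pixel_max ? pixel_max : sum;
}

Rotating pointers is what keeps the per-row cost at one horizontal pass plus one vertical pass regardless of the filter height; wiener_filter5_16bpc below uses the same scheme with five rows.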
punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 5 + psrad m2, 5 + packusdw m0, m2 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, dst_strideq + ret +.v: + mov r10, wq +.v_loop: + mova m1, [t4+r10] + paddw m1, [t2+r10] + mova m2, [t3+r10] + mova m4, [t1+r10] + paddw m3, m4, [t6+r10] + paddw m4, [t5+r10] + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 5 + psrad m1, 5 + packusdw m0, m1 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq + ret +cglobal wiener_filter5_16bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h +%define base t4-wiener_hshift + mov fltq, fltmp + mov edged, r8m + movifnidn wd, wm + mov hd, r6m + mov t3d, r9m ; pixel_max + vbroadcasti128 m5, [wiener_shufE] + vpbroadcastw m11, [fltq+ 2] ; x1 + vbroadcasti128 m6, [wiener_shufB] + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufD] + add wd, wd + vpbroadcastd m12, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + lea t1, [rsp+wq+16] + vpbroadcastw m13, [fltq+18] ; y1 + add dstq, wq + vpbroadcastd m14, [fltq+20] ; y2 y3 + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + vpbroadcastd m9, [base+wiener_round+t3*4] + vpbroadcastd m10, [base+wiener_vshift+t3*4] + movu xm15, [wiener_lshuf5] + pmullw m11, m0 + vinserti128 m15, [pb_0to31], 1 + pmullw m12, m0 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp+8*0] + call .hv_bottom + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq +.v1: + call .v + jmp .end +.extend_right: + movd xm2, r10d + vpbroadcastd m0, [pb_2_3] + vpbroadcastd m1, [pb_m6_m5] + vpbroadcastb m2, xm2 + psubb m0, m2 + psubb m1, m2 + movu m2, [pb_0to31] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm3, [leftq+4] + vpblendd m3, [lpfq+r10-4], 0xfe + add leftq, 8 + jmp .h_main +.h_extend_left: + vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located + mova m3, [lpfq+r10] ; before the start of the buffer + palignr m3, m4, 12 + pshufb 
m3, m15 + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-4] +.h_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm3, [leftq+4] + vpblendd m3, [lpfq+r10-4], 0xfe + add leftq, 8 + jmp .hv_main +.hv_extend_left: + movu m3, [lpfq+r10-4] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-4] +.hv_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + mova m2, [t3+r10] + paddw m2, [t1+r10] + paddd m1, m3 + mova m4, [t2+r10] + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 + mova m4, [t4+r10] + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+r10], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, dst_strideq + ret +.v: + mov r10, wq +.v_loop: + mova m0, [t1+r10] + paddw m2, m0, [t3+r10] + mova m1, [t2+r10] + mova m4, [t4+r10] + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + ret + +cglobal sgr_filter_5x5_16bpc, 5, 14, 16, 400*24+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + mov hd, r6m + add wd, wd + vpbroadcastw m7, [paramsq+8] ; w0 + add lpfq, wq + vpbroadcastd m8, [pd_8] + lea t1, [rsp+wq+20] + vpbroadcastd m9, [pd_25] + add dstq, wq + vpbroadcastd m10, [paramsq+0] ; s0 + lea t3, [rsp+wq*2+400*12+16] + vpbroadcastd m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + vpbroadcastd m12, [pw_256] + neg wq + vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + vpbroadcastd m14, [pw_1023] + psllw m7, 4 + mova xm15, [sgr_lshuf5] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + add r10, lpf_strideq + mov [rsp+8*0], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + 
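These restoration functions receive an edge bitmask (see the LR_HAVE_* comments: bit 0 = left, bit 1 = right, bit 2 = top, bit 3 = bottom) that says which neighbouring pixels actually exist; when a side is missing, the .extend_right / *_extend_left paths replicate the outermost valid pixel by clamping the pshufb byte indices with pminub. A scalar sketch of the same policy (illustrative; when LR_HAVE_LEFT is set the real code reads a separate left-column buffer rather than indexing before the unit):

#include <stdint.h>

enum {                       /* values match the bit tests in the asm */
    LR_HAVE_LEFT   = 1 << 0,
    LR_HAVE_RIGHT  = 1 << 1,
    LR_HAVE_TOP    = 1 << 2,
    LR_HAVE_BOTTOM = 1 << 3,
};

/* Clamp-to-edge horizontal read: x may run a few pixels past either
 * side of [0, w); row points at the first pixel of the unit's row. */
static uint16_t lr_read(const uint16_t *row, int x, int w, unsigned edges)
{
    if (x < 0 && !(edges & LR_HAVE_LEFT))
        x = 0;       /* replicate the leftmost pixel */
    if (x >= w && !(edges & LR_HAVE_RIGHT))
        x = w - 1;   /* replicate the rightmost pixel */
    return row[x];
}

The top and bottom flags are handled one level up in the same spirit: without LR_HAVE_TOP the topmost available row is reused for the missing rows above, and likewise at the bottom.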
call .h +.main: + add lpfq, dst_strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + test hd, hd + jz .odd_height + call .h + add lpfq, dst_strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp+8*0] + call .h_top + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov [rsp+8*0], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 0] + movu m2, [r13+r10+16] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm15 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10+400*0] + paddd m1, [t1+r10+400*2] + paddd m2, [t1+r10+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+r10+400*0], m0 + mova [t1+r10+400*2], m1 + mova [t1+r10+400*4], m2 + add r10, 32 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-4] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm15 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv_main +.hv_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10- 2] +.hv_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -36 + jl .hv_have_right + call .extend_right +.hv_have_right: + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + 
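The .h/.hv code accumulates, for every position, the 5-wide box sum and sum of squares (with the previous row's totals folded in by the hv step, and the first row doubled by .top_fixup); the arithmetic that follows converts them into the per-pixel (a, b) pair of the self-guided filter, as the comments annotate: a*25, b*b, p*s, the sgr_x_by_x lookup and the *164 normalisation. A scalar sketch of that per-pixel step (names and fixed-point layout are illustrative, not dav1d's C API):

#include <stdint.h>

/* One pixel of the 5x5 self-guided a/b computation: sum and sumsq are
 * the box totals, s the s0 strength parameter, x_by_x a 256-entry
 * lookup table. */
static void sgr_ab_5x5(uint32_t sum, uint32_t sumsq, uint32_t s,
                       const uint8_t x_by_x[256], int *a, int *b)
{
    const uint32_t aa = (sumsq + 8) >> 4;          /* "(a + 8) >> 4" */
    const uint32_t bb = (sum + 2) >> 2;            /* "(b + 2) >> 2" */
    const uint32_t p  = aa * 25 > bb * bb ?        /* box variance, >= 0 */
                        aa * 25 - bb * bb : 0;
    const uint32_t z  = (p * s + (1 << 19)) >> 20; /* rounded and capped */
    const int x = x_by_x[z > 255 ? 255 : z];
    *a = 256 - x;                                  /* stored to the t4 rows */
    /* 164 ~= 4096 / 25 folds the 1/n normalisation into the fixed point */
    *b = (int)((x * (uint64_t)sum * 164 + (1 << 11) + (1 << 15)) >> 12);
}

The saturating paddusw with pd_0xf00800a4 is how the asm gets both the rounding and the min(z, 255) clamp in a single instruction before the vpgatherdd table lookup.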
punpckhwd m3, m1 + pmaddwd m3, m3 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10+400*0] ; hv sum + paddd m4, [t2+r10+400*2] ; hv sumsq + paddd m5, [t2+r10+400*4] + mova [t0+r10+400*0], m0 + mova [t0+r10+400*2], m2 + mova [t0+r10+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + psubw m2, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10+400*0], m1 + paddw m1, m0 + mova [t1+r10+400*2], m4 + paddd m4, m2 + mova [t1+r10+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-4] +.v_loop: + mova m0, [t1+r10+400*0] + mova m2, [t1+r10+400*2] + mova m3, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + psubw m2, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw 
m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+r10*1+400*2+ 0] + paddd m4, m1, [t3+r10*2+400*4+ 0] + paddd m5, m2, [t3+r10*2+400*4+32] + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m0, [dstq+r10] + mova m3, [t4+r10*1+400*2+ 0] + mova m4, [t3+r10*2+400*4+ 0] + mova m5, [t3+r10*2+400*4+32] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 <<7) + paddd m3, m4 + psrld m2, 8 + psrld m3, 8 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, dst_strideq + ret + +cglobal sgr_filter_3x3_16bpc, 5, 14, 15, 400*42+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + add wd, wd + mov hd, r6m + add lpfq, wq + vpbroadcastw m7, [paramsq+10] ; w1 + lea t1, [rsp+wq+12] + vpbroadcastd m8, [pd_8] + add dstq, wq + vpbroadcastd m9, [paramsq+ 4] ; s1 + lea t3, [rsp+wq*2+400*12+8] + vpbroadcastd m10, [pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + vpbroadcastd m11, [pd_34816] + neg wq + vpbroadcastd m12, [pw_256] + pxor m6, m6 + vpbroadcastd m13, [pw_1023] + psllw m7, 4 + mova xm14, [sgr_lshuf3] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + 
call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 2] + movu m2, [r13+r10+18] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10+ 0] +.h_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10+ 0] +.hv0_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -34 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 
; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm14 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10+ 0] +.hv1_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -34 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m2 + mova [t2+r10+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2 +4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld 
m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psubw m2, m12, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + mova xm0, [t4+r10*1+400*0+0] + paddw xm0, [t4+r10*1+400*0+4] + paddw xm2, xm0, [t4+r10*1+400*0+2] + mova m1, [t3+r10*2+400*0+0] + paddd m1, [t3+r10*2+400*0+8] + paddd m3, m1, [t3+r10*2+400*0+4] + psllw xm2, 2 ; a[-1] 444 + pslld m3, 2 ; b[-1] 444 + psubw xm2, xm0 ; a[-1] 343 + psubd m3, m1 ; b[-1] 343 + mova [t4+r10*1+400* 4], xm2 + mova [t3+r10*2+400* 8], m3 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, [t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a[ 0] 444 + pslld m3, 2 ; b[ 0] 444 + mova [t4+r10*1+400* 6], xm2 + mova [t3+r10*2+400*12], m3 + psubw xm2, xm0 ; a[ 0] 343 + psubd m3, m1 ; b[ 0] 343 + mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + mova m3, [t4+r10*1+400*0+0] + paddw m3, [t4+r10*1+400*0+4] + paddw m1, m3, [t4+r10*1+400*0+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*4] + paddw m3, [t4+r10*1+400*6] + mova [t4+r10*1+400*4], m2 + mova [t4+r10*1+400*6], m1 + mova m4, [t3+r10*2+400*0+0] + paddd m4, [t3+r10*2+400*0+8] + paddd m1, m4, [t3+r10*2+400*0+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400* 8+ 0] + paddd m4, [t3+r10*2+400*12+ 0] + mova [t3+r10*2+400* 8+ 0], m2 + mova [t3+r10*2+400*12+ 0], m1 + mova m5, [t3+r10*2+400*0+32] + paddd m5, [t3+r10*2+400*0+40] + paddd m1, m5, [t3+r10*2+400*0+36] + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+r10*2+400* 8+32] + paddd m5, [t3+r10*2+400*12+32] + mova [t3+r10*2+400* 8+32], m2 + mova [t3+r10*2+400*12+32], m1 + mova m0, 
[dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t4+r10*1+400*2+0] + paddw m3, [t4+r10*1+400*2+4] + paddw m1, m3, [t4+r10*1+400*2+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*6] + paddw m3, [t4+r10*1+400*8] + mova [t4+r10*1+400*6], m1 + mova [t4+r10*1+400*8], m2 + mova m4, [t3+r10*2+400*4+0] + paddd m4, [t3+r10*2+400*4+8] + paddd m1, m4, [t3+r10*2+400*4+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400*12+ 0] + paddd m4, [t3+r10*2+400*16+ 0] + mova [t3+r10*2+400*12+ 0], m1 + mova [t3+r10*2+400*16+ 0], m2 + mova m5, [t3+r10*2+400*4+32] + paddd m5, [t3+r10*2+400*4+40] + paddd m1, m5, [t3+r10*2+400*4+36] + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+r10*2+400*12+32] + paddd m5, [t3+r10*2+400*16+32] + mova [t3+r10*2+400*12+32], m1 + mova [t3+r10*2+400*16+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + paddd m2, m1 ; a * src + b + (1 << 8) + paddd m3, m4 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, dst_strideq + ret + +cglobal sgr_filter_mix_16bpc, 5, 14, 16, 400*66+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h + movifnidn wd, wm + mov paramsq, paramsmp + lea r13, [sgr_x_by_x_avx2+256*4] + mov edged, r8m + add wd, wd + mov hd, r6m + add lpfq, wq + vpbroadcastd m9, [pd_8] + lea t1, [rsp+wq+12] + vpbroadcastd m10, [pd_34816] + add dstq, wq + vpbroadcastd m11, [pw_256] + lea t3, [rsp+wq*2+400*24+8] + vpbroadcastd m12, [pd_0xf00801c7] + lea t4, [rsp+wq+400*52+8] + vpbroadcastd m15, [paramsq+8] ; w0 w1 + neg wq + vpbroadcastd m13, [paramsq+0] ; s0 + pxor m7, m7 + vpbroadcastd m14, [paramsq+4] ; s1 + psllw m15, 2 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, 
[r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+r10+400* 0] + mova m1, [t1+r10+400* 2] + mova m2, [t1+r10+400* 4] + paddw m0, m0 + mova m3, [t1+r10+400* 6] + paddd m1, m1 + mova m4, [t1+r10+400* 8] + paddd m2, m2 + mova m5, [t1+r10+400*10] + mova [t2+r10+400* 0], m0 + mova [t2+r10+400* 2], m1 + mova [t2+r10+400* 4], m2 + mova [t2+r10+400* 6], m3 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10- 2] +.hv0_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -36 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; h sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m5, m2 ; h sumsq5 + paddd m6, m3 + mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4? 
+ mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*2+400*0+40], m6 + paddw m8, [t1+r10+400* 0] + paddd m5, [t1+r10+400* 2] + paddd m6, [t1+r10+400* 4] + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + paddw m0, m1, [t1+r10+400* 6] + paddd m4, m2, [t1+r10+400* 8] + paddd m5, m3, [t1+r10+400*10] + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10- 2] +.hv1_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -36 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv1_have_right: + palignr m6, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m6, m3 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m7 + pmaddwd m1, m1 + punpckhwd m3, m7 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + shufpd m1, m4, m5, 0x05 + punpckhwd m5, m4, m1 + paddw m8, m4, m1 + pmaddwd m5, m5 + punpcklwd m4, m1 + pmaddwd m4, m4 + paddd m6, m3 + paddw m1, m2, [t2+r10+400* 6] + mova [t2+r10+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 8], m0 + mova [t2+r10+400*10], m6 + paddd m4, m0 ; h sumsq5 + paddd m5, m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m6, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m6 + psrlw m6, m1, 1 + pavgw m6, m7 ; (b3 + 2) >> 2 + punpcklwd m0, m6, m7 + pmaddwd m0, m0 + punpckhwd m6, m7 + pmaddwd m6, m6 + pmaxud m2, m0 + psubd m2, m0 ; p3 + pmaxud m3, m6 + psubd m3, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmulld m2, m14 ; p3 * s1 + pmulld m3, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + 
paddusw m2, m12 + paddusw m3, m12 + psrad m7, m2, 20 ; min(z3, 255) - 256 + vpgatherdd m6, [r13+m7*4], m2 + psrad m2, m3, 20 + vpgatherdd m7, [r13+m2*4], m3 + pmulld m0, m6 + packssdw m6, m7 + pmulld m7, m1 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m7, m10 + psubw m6, m11, m6 + psrld m0, 12 + psrld m7, 12 + paddw m1, m8, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m8 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + mova [t4+r10*1+400*4 +4], m6 + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm7 + vextracti128 [t3+r10*2+400*8+56], m7, 1 + vpbroadcastd m4, [pd_25] + pxor m7, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m4, m11, m4 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400* 6] + mova m4, [t1+r10+400* 8] + mova m5, [t1+r10+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + mova [t3+r10*2+400*8+ 8], m3 + mova [t3+r10*2+400*0+ 8], m4 + mova [t3+r10*2+400*0+40], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+r10+400*0], m3 + mova [t1+r10+400*2], m4 + mova [t1+r10+400*4], m5 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 
+ add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m4, [t1+r10+400* 6] + mova m5, [t1+r10+400* 8] + mova m6, [t1+r10+400*10] + paddw m1, m4, [t2+r10+400* 6] + paddd m2, m5, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 6], m4 + mova [t2+r10+400* 8], m5 + mova [t2+r10+400*10], m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m2, m11, m2 + psrld m0, 12 + psrld m8, m1, 12 + mova [t4+r10*1+400*4+4], m2 + mova m4, [t3+r10*2+400*8+ 8] + mova m5, [t3+r10*2+400*0+ 8] + mova m6, [t3+r10*2+400*0+40] + paddw m1, m4, [t2+r10+400*0] + paddd m2, m5, [t2+r10+400*2] + paddd m3, m6, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m4 + mova [t2+r10+400*2], m5 + mova [t2+r10+400*4], m6 + vpbroadcastd m4, [pd_25] + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm8 + vextracti128 [t3+r10*2+400*8+56], m8, 1 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psubw m4, m11, m4 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu xm0, [t4+r10*1+400*0+2] + paddw xm2, xm0, [t4+r10*1+400*0+0] + paddw xm2, [t4+r10*1+400*0+4] + movu m1, [t3+r10*2+400*0+4] + paddd m3, m1, [t3+r10*2+400*0+0] + paddd m3, [t3+r10*2+400*0+8] + paddw xm0, xm2 + paddd m1, m3 + psllw xm2, 2 + pslld m3, 2 + paddw xm0, xm2 ; a5 565 + paddd m1, m3 ; b5 565 + mova [t4+r10*1+400* 6], xm0 + mova [t3+r10*2+400*12], m1 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, [t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a3[-1] 444 + pslld m3, 2 ; b3[-1] 444 + psubw xm2, xm0 ; a3[-1] 343 + psubd m3, m1 ; b3[-1] 343 + 
mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + mova xm0, [t4+r10*1+400*4+0] + paddw xm0, [t4+r10*1+400*4+4] + paddw xm2, xm0, [t4+r10*1+400*4+2] + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m3, m1, [t3+r10*2+400*8+4] + psllw xm2, 2 ; a3[ 0] 444 + pslld m3, 2 ; b3[ 0] 444 + mova [t4+r10*1+400*10], xm2 + mova [t3+r10*2+400*20], m3 + psubw xm2, xm0 ; a3[ 0] 343 + psubd m3, m1 ; b3[ 0] 343 + mova [t4+r10*1+400*12], xm2 + mova [t3+r10*2+400*24], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu xm2, [t4+r10*1+2] + paddw xm0, xm2, [t4+r10*1+0] + paddw xm0, [t4+r10*1+4] + paddw xm2, xm0 + psllw xm0, 2 + paddw xm0, xm2 ; a5 + movu m1, [t3+r10*2+4] + paddd m4, m1, [t3+r10*2+0] + paddd m4, [t3+r10*2+8] + paddd m1, m4 + pslld m4, 2 + paddd m4, m1 ; b5 + paddw xm2, xm0, [t4+r10*1+400* 6] + mova [t4+r10*1+400* 6], xm0 + paddd m0, m4, [t3+r10*2+400*12] + mova [t3+r10*2+400*12], m4 + mova xm3, [t4+r10*1+400*2+0] + paddw xm3, [t4+r10*1+400*2+4] + paddw xm5, xm3, [t4+r10*1+400*2+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400* 8] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400* 8], xm4 + mova [t4+r10*1+400*10], xm5 + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m5, m1, [t3+r10*2+400*4+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*16] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*16], m4 + mova [t3+r10*2+400*20], m5 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, xm2 ; a5 + pmovzxwd m3, xm3 ; a3 + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 13 + psubd m0, m4 + psubd m1, m4 + paddd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + vpbroadcastd m1, [pd_4096] + paddd m4, m1 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova xm3, [t4+r10*1+400*4+0] + paddw xm3, [t4+r10*1+400*4+4] + paddw xm5, xm3, [t4+r10*1+400*4+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400*12] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400*10], xm5 + mova [t4+r10*1+400*12], xm4 + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m5, m1, [t3+r10*2+400*8+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*24] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*20], m5 + mova [t3+r10*2+400*24], m4 + pmovzxwd m4, [dstq+r10] + pmovzxwd m0, [t4+r10*1+400* 6] + pmovzxwd m3, xm3 + pmaddwd m0, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 12 + psubd m2, m4, [t3+r10*2+400*12] + paddd m4, m4 + psubd m1, m4 + psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m3 ; a3 * src + b3 + (1 << 8) - (src << 13) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + vpbroadcastd m1, [pd_4096] + paddd m4, m1 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, dst_strideq + ret + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/looprestoration16_sse.asm dav1d-0.9.1/src/x86/looprestoration16_sse.asm --- 
dav1d-0.7.1/src/x86/looprestoration16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/looprestoration16_sse.asm 2021-07-28 21:38:28.909852300 +0000 @@ -0,0 +1,3778 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +pb_m14_m13: times 8 db -14,-13 +pb_m10_m9: times 8 db -10, -9 +pb_m6_m5: times 8 db -6, -5 +pb_m2_m1: times 8 db -2, -1 +pb_2_3: times 8 db 2, 3 +pb_6_7: times 8 db 6, 7 +pw_25: times 8 dw 25 +pw_256: times 8 dw 256 +pw_1023: times 8 dw 1023 +pd_8: times 4 dd 8 +pd_4096: times 4 dd 4096 +pd_34816: times 4 dd 34816 +pd_m262128: times 4 dd -262128 +pd_0xffff: times 4 dd 0xffff +pd_0xf00800a4: times 4 dd 0xf00800a4 +pd_0xf00801c7: times 4 dd 0xf00801c7 + +wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 +wiener_round: dd 1049600, 1048832 + +cextern sgr_x_by_x + +SECTION .text + +%macro movif64 2 ; dst, src + %if ARCH_X86_64 + mov %1, %2 + %endif +%endmacro + +%macro movif32 2 ; dst, src + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +DECLARE_REG_TMP 4, 6 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 14*16 + %else + %assign extra_stack 12*16 + %endif +cglobal wiener_filter7_16bpc, 5, 7, 8, -384*12-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+16*12+ 0] + %define lpf_stridem dword [esp+calloff+16*12+ 4] + %define wm dword [esp+calloff+16*12+ 8] + %define hd 
dword [esp+calloff+16*12+12] + %define edgeb byte [esp+calloff+16*12+16] + %define edged dword [esp+calloff+16*12+16] + %else + %define hd dword r6m + %define edgeb byte r8m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t5m dword [esp+calloff+4*6] + %define t6m dword [esp+calloff+4*7] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define t5 t5m + %define t6 t6m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define r10 r5 + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+24] + mov lpf_stridem, lpf_strideq + mov wm, wd + mov r4, [rstk+stack_offset+28] + mov hd, r4 + mov r4, [rstk+stack_offset+36] + mov edged, r4 ; edge + %endif +%else +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers +cglobal wiener_filter7_16bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, fltmp + mov edged, r8m + mov hd, r6m + mov t3d, r9m ; pixel_max + movq m13, [fltq] + movq m15, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+32] + mov t1, [rstk+stack_offset+40] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + LEA t0, wiener_shifts + mov fltq, r7m + movq m1, [fltq] + movq m3, [fltq+16] + mov t1, r9m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m6, [base+wiener_shufA] + mova m7, [base+wiener_shufB] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + pshufd m12, m13, q0000 ; x0 x1 + pshufd m13, m13, q1111 ; x2 x3 + pshufd m14, m15, q0000 ; y0 y1 + pshufd m15, m15, q1111 ; y2 y3 + mova m8, [wiener_shufC] + mova m9, [wiener_shufD] + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + shr t3d, 11 + %define base t4-wiener_shifts + movd m10, [base+wiener_round+t3*4] + movq m11, [base+wiener_shifts+t3*8] + pshufd m10, m10, q0000 + pshufd m0, m11, q0000 + pshufd m11, m11, q1111 + pmullw m12, m0 ; upshift filter coefs to make the + pmullw m13, m0 ; horizontal downshift constant + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] + %define base + %define wiener_lshuf7_mem [wiener_lshuf7] + %define pd_m262128_mem [pd_m262128] +%else + add wd, wd + mova m4, [base+wiener_shufC] + mova m5, [base+wiener_shufD] + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pshufd m2, m3, q0000 + pshufd m3, m3, q1111 + mova m8, m4 + mova m9, m5 + mova m14, m2 + mova m15, m3 + shr t1, 11 + add lpfq, wq + mova m3, [base+pd_m262128] + movd m4, [base+wiener_round+t1*4] + movq m5, [base+wiener_shifts+t1*8] + lea t1, [esp+extra_stack+wq+16] + add dstq, wq + neg wq + pshufd m4, m4, q0000 + pshufd m2, m5, q0000 + pshufd m5, m5, q1111 + mov wm, wq + pmullw m0, m2 + pmullw m1, m2 + mova m2, [base+wiener_lshuf7] + %define pd_m262128_mem [esp+calloff+16*10] + mova pd_m262128_mem, m3 + mova m10, m4 + mova m11, m5 + mova m12, m0 + mova m13, m1 + %define wiener_lshuf7_mem [esp+calloff+16*11] + mova wiener_lshuf7_mem, m2 
+%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + add lpfq, lpf_stridem +%endif + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, lpfm + call .hv_bottom + add lpfq, lpf_stridem + call .hv_bottom +.v1: + call .v + RET +.no_top: + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v + movif32 wq, wm +.v2: + call .v + movif32 wq, wm + jmp .v1 +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m0, m0 + movd m1, wd + mova m2, [base+pb_0to15] + pshufb m1, m0 + mova m0, [base+pb_6_7] + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + mova m0, [base+pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + mova m0, [base+pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + movif32 t0, t0m + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r5 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, wiener_lshuf7_mem ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r5 + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-8] +.h_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, wiener_lshuf7_mem + jmp .hv_main +.hv_bottom: + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-8] +.hv_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz 
.hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t4m + movif32 t0, t2m + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 +%if ARCH_X86_64 + mova m2, [t4+wq] + paddw m2, [t2+wq] + mova m5, [t3+wq] +%else + mova m2, [t1+wq] + paddw m2, [t0+wq] + mov t1, t3m + mov t0, t5m + mova m5, [t1+wq] + mov t1, t1m +%endif + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 +%if ARCH_X86_64 + mova m4, [t5+wq] + paddw m4, [t1+wq] + psraw m0, 1 + paddw m3, m0, [t6+wq] +%else + mova m4, [t0+wq] + paddw m4, [t1+wq] + mov t0, t0m + mov t1, t6m + psraw m0, 1 + paddw m3, m0, [t1+wq] +%endif + mova [t0+wq], m0 + punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + mov r5, t5m + mov t1, t4m + mov t6m, r5 + mov t5m, t1 + mov r5, t3m + mov t1, t2m + mov t4m, r5 + mov t3m, t1 + mov r5, t1m + mov t1, t0 + mov t2m, r5 + mov t0, t6m + mov wq, wm +%endif + add dstq, dst_strideq + ret +.v: + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m1, [t4+wq] + paddw m1, [t2+wq] + mova m2, [t3+wq] + mova m4, [t1+wq] + paddw m3, m4, [t6+wq] + paddw m4, [t5+wq] +%else + mov t0, t4m + mov t1, t2m + mova m1, [t0+wq] + paddw m1, [t1+wq] + mov t0, t3m + mov t1, t1m + mova m2, [t0+wq] + mova m4, [t1+wq] + mov t0, t6m + mov t1, t5m + paddw m3, m4, [t0+wq] + paddw m4, [t1+wq] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .v_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t5m + mov t1, t4m + mov r5, t3m + mov t6m, t0 + mov t5m, t1 + mov t4m, r5 + mov r5, t2m + mov t1, t1m + mov t0, t0m + mov t3m, r5 + mov t2m, t1 +%endif + add dstq, dst_strideq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign stack_size 12*16+384*8 + %else + %assign stack_size 11*16+384*8 + %endif +cglobal wiener_filter5_16bpc, 5, 7, 8, -stack_size, dst, dst_stride, left, \ + lpf, lpf_stride, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+4*6] + %define lpf_stridem dword [esp+calloff+4*7] + %define wm dword [esp+calloff+16*10+0] + %define hd dword [esp+calloff+16*10+4] + %define edgeb byte [esp+calloff+16*10+8] + %define edged dword [esp+calloff+16*10+8] + %else + %define hd dword r6m + %define edgeb byte r8m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] 
+ %define t2 t2m + %define t3 t3m + %define t4 t4m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+24] + mov lpf_stridem, lpf_strideq + mov wm, wd + mov r4, [rstk+stack_offset+28] + mov hd, r4 + mov r4, [rstk+stack_offset+36] + mov edged, r4 ; edge + %endif +%else +cglobal wiener_filter5_16bpc, 5, 14, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, fltmp + mov edged, r8m + mov hd, r6m + mov t3d, r9m ; pixel_max + movq m12, [fltq] + movq m14, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+32] + mov t1, [rstk+stack_offset+40] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + LEA t0, wiener_shifts + mov fltq, r7m + movq m1, [fltq] + movq m3, [fltq+16] + mov t1, r9m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m5, [base+wiener_shufE] + mova m6, [base+wiener_shufB] + mova m7, [base+wiener_shufD] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + punpcklwd m11, m12, m12 + pshufd m11, m11, q1111 ; x1 + pshufd m12, m12, q1111 ; x2 x3 + punpcklwd m13, m14, m14 + pshufd m13, m13, q1111 ; y1 + pshufd m14, m14, q1111 ; y2 y3 + shr t3d, 11 + mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + %define base t4-wiener_shifts + movd m9, [base+wiener_round+t3*4] + movq m10, [base+wiener_shifts+t3*8] + pshufd m9, m9, q0000 + pshufd m0, m10, q0000 + pshufd m10, m10, q1111 + mova m15, [wiener_lshuf5] + pmullw m11, m0 + pmullw m12, m0 + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] + %define base +%else + add wd, wd + punpcklwd m0, m1, m1 + pshufd m0, m0, q1111 ; x1 + pshufd m1, m1, q1111 ; x2 x3 + punpcklwd m2, m3, m3 + pshufd m2, m2, q1111 ; y1 + pshufd m3, m3, q1111 ; y2 y3 + mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) + mova m13, m2 + mova m14, m3 + mova m8, m4 + shr t1, 11 + add lpfq, wq + movd m2, [base+wiener_round+t1*4] + movq m3, [base+wiener_shifts+t1*8] + %if STACK_ALIGNMENT < 16 + lea t1, [esp+16*11+wq+16] + %else + lea t1, [esp+16*10+wq+16] + %endif + add dstq, wq + neg wq + pshufd m2, m2, q0000 + pshufd m4, m3, q0000 + pshufd m3, m3, q1111 + mov wm, wq + pmullw m0, m4 + pmullw m1, m4 + mova m4, [base+wiener_lshuf5] + mova m9, m2 + mova m10, m3 + mova m11, m0 + mova m12, m1 + mova m15, m4 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + add lpfq, lpf_stridem +%endif + mov t4, t1 + add t1, 384*2 + call .h_top + movif32 lpf_strideq, lpf_stridem + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, lpfm + call .hv_bottom + add lpfq, lpf_stridem + call .hv_bottom +.end: + RET +.no_top: + movif32 lpf_strideq, lpf_stridem + lea 
r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t3m + mov r5, t2m + mov t1, t1m + mov t4m, t0 + mov t3m, r5 + mov t2m, t1 + mov wq, wm +%endif + add dstq, dst_strideq +.v1: + call .v + jmp .end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m1, m1 + movd m2, wd + mova m0, [base+pb_2_3] + pshufb m2, m1 + mova m1, [base+pb_m6_m5] + psubb m0, m2 + psubb m1, m2 + mova m2, [base+pb_0to15] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r5 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, m15 ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r5 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-4] +.h_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: + movif64 wq, r5 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-4] +.hv_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t1m + movif32 t0, t3m + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 +%if ARCH_X86_64 + mova m2, [t3+wq] + paddw m2, [t1+wq] + paddd m1, m3 + mova m4, [t2+wq] +%else + mova m2, [t0+wq] + mov t0, t2m + paddw m2, [t1+wq] + mov t1, t4m + paddd m1, m3 + mova m4, [t0+wq] + mov t0, t0m +%endif + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 +%if ARCH_X86_64 + mova m4, [t4+wq] +%else + mova m4, [t1+wq] +%endif + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+wq], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 
+ pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 +%else + mov r5, t3m + mov t1, t2m + mov t4m, r5 + mov t3m, t1 + mov r5, t1m + mov t1, t0 + mov t2m, r5 + mov t0, t4m + mov wq, wm +%endif + add dstq, dst_strideq + ret +.v: + movif64 wq, r5 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m0, [t1+wq] + paddw m2, m0, [t3+wq] + mova m1, [t2+wq] + mova m4, [t4+wq] +%else + mov t0, t3m + mova m0, [t1+wq] + mov t1, t2m + paddw m2, m0, [t0+wq] + mov t0, t4m + mova m1, [t1+wq] + mova m4, [t0+wq] +%endif + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 +%if ARCH_X86_64 + jl .v_loop +%else + jge .v_end + mov t1, t1m + jmp .v_loop +.v_end: +%endif + ret + +%macro GATHERDD 3 ; dst, src, tmp + movd %3d, %2 + %if ARCH_X86_64 + movd %1, [r13+%3] + pextrw %3d, %2, 2 + pinsrw %1, [r13+%3+2], 3 + pextrw %3d, %2, 4 + pinsrw %1, [r13+%3+2], 5 + pextrw %3d, %2, 6 + pinsrw %1, [r13+%3+2], 7 + %else + movd %1, [base+sgr_x_by_x-0xf03+%3] + pextrw %3, %2, 2 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 + pextrw %3, %2, 4 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 + pextrw %3, %2, 6 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 + %endif +%endmacro + +%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore + %if ARCH_X86_64 + %define tmp r14 + %else + %define tmp %4 + %endif + GATHERDD %1, %2, tmp + GATHERDD %2, %3, tmp + movif32 %4, %5 + psrld %1, 24 + psrld %2, 24 + packssdw %1, %2 +%endmacro + +%macro MAXSD 3-4 0 ; dst, src, restore_tmp + pcmpgtd %3, %1, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 + %if %4 == 1 + pxor %3, %3 + %endif +%endmacro + +%macro MULLD 3 ; dst, src, tmp + pmulhuw %3, %1, %2 + pmullw %1, %2 + pslld %3, 16 + paddd %1, %3 +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 0, 1, 2, 3, 4 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 5*16 + %else + %assign extra_stack 3*16 + %endif +cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*0+4*6] + %define dst_stridemp dword [esp+calloff+16*3+4*7] + %define leftm dword [esp+calloff+16*3+4*0] + %define lpfm dword [esp+calloff+16*3+4*1] + %define lpf_stridem dword [esp+calloff+16*3+4*2] + %define w0m dword [esp+calloff+16*3+4*3] + %define hd dword [esp+calloff+16*3+4*4] + %define edgeb byte [esp+calloff+16*3+4*5] + %define edged dword [esp+calloff+16*3+4*5] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t0m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define m8 [base+pd_8] + %define m9 [base+pw_25] + %define m10 [esp+calloff+16*2] + %define m11 [base+pd_0xf00800a4] + %define m12 [base+pw_256] + %define m13 [base+pd_34816] + %define m14 [base+pw_1023] + %define m15 [base+sgr_lshuf5] + %define r10 r5 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov 
lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_5x5_16bpc, 5, 15, 16, -400*24-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + movu m10, [paramsq] + mova m12, [pw_256] + add lpfq, wq + mova m8, [pd_8] + lea t1, [rsp+wq+20] + mova m9, [pw_25] + add dstq, wq + lea t3, [rsp+wq*2+400*12+16] + mova m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + pshufhw m7, m10, q0000 + pshufb m10, m12 ; s0 + punpckhqdq m7, m7 ; w0 + neg wq + mova m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + mova m14, [pw_1023] + psllw m7, 4 + mova m15, [sgr_lshuf5] + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp+0] + %define lpf_stridem [rsp+8] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + movu m1, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*20+16] + mov t3m, t3 + pshufhw m7, m1, q0000 + mov t4m, t4 + pshufb m1, m12 ; s0 + punpckhqdq m7, m7 ; w0 + psllw m7, 4 + neg wq + mova m10, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + movif32 t2m, t1 + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t0m, t2 + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, dst_stridemp + movif32 t4, t4m + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp +%if ARCH_X86_64 + test hb, hb +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + call .h + add lpfq, dst_stridemp + call .hv + movif32 dstq, dstm + call .n0 + call .n1 + sub hd, 2 + movif32 t0, t0m + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .h_top + add lpfq, lpf_stridem + call .hv_bottom +.end: + movif32 dstq, dstm + call .n0 + call .n1 +.end2: + RET +.height1: + movif32 t4, t4m + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + movif32 dstq, dstm + call .n0 + call .n1 +.odd_height_end: + call .v + movif32 dstq, dstm + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + movif64 lpf_stridem, lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h + lea t2, [t1+400*6] + movif32 t2m, t2 + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + movif32 t0m, t0 + jmp .main +.no_top_height1: + movif32 t3, t3m + movif32 t4, t4m + call .v + call .prep_n + jmp .odd_height_end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m1, wd + mova m3, [base+pb_m14_m13] + mova m0, [base+pb_0to15] + pshufb m1, m6 + psubb m2, m12, m1 + psubb m3, m1 + movd m1, [lpfq-2] + pcmpgtb m2, m0 + pcmpgtb 
m3, m0 + pshufb m1, m12 + pand m4, m2 + pand m5, m3 + pandn m2, m1 + pandn m3, m1 + por m4, m2 + por m5, m3 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m15 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+wq+400*0] + paddd m1, [t1+wq+400*2] + paddd m2, [t1+wq+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+wq+400*0], m0 + mova [t1+wq+400*2], m1 + mova [t1+wq+400*4], m2 + add wq, 16 + jl .h_loop + ret +.top_fixup: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m15 + jmp .hv_main +.hv_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv_loop_start +%endif +.hv_loop: + movif32 lpfq, hvsrcm +.hv_loop_start: + movu m4, [lpfq+wq- 2] +.hv_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -20 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t3, hd + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] +%if ARCH_X86_64 + test hd, hd +%else + test t3, t3 +%endif + jz .hv_last_row +.hv_main2: + paddw m1, [t2+wq+400*0] ; hv sum + paddd m4, [t2+wq+400*2] ; hv sumsq + paddd m5, [t2+wq+400*4] + mova 
[t0+wq+400*0], m0 + mova [t0+wq+400*2], m2 + mova [t0+wq+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + MULLD m4, m9, m0 ; a * 25 + MULLD m5, m9, m0 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, t2, t2m + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m5 + MULLD m1, m4, m5 + psubw m5, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m5 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + movif32 t2m, t2 + movif32 t0m, t0 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+wq+400*0], m1 + paddw m1, m0 + mova [t1+wq+400*2], m4 + paddd m4, m2 + mova [t1+wq+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v_loop: + mova m0, [t1+wq+400*0] + mova m2, [t1+wq+400*2] + mova m3, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + MULLD m4, m9, m0 ; a * 25 + MULLD m5, m9, m0 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, t2, t2m + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m5 + MULLD m1, m4, m5 + psubw m5, m12, m2 ; a + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m5 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 
2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+wq*1+400*2+ 0] + paddd m4, m1, [t3+wq*2+400*4+ 0] + paddd m5, m2, [t3+wq*2+400*4+16] + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + mova m0, [dstq+wq] + mova m3, [t4+wq*1+400*2+ 0] + mova m4, [t3+wq*2+400*4+ 0] + mova m5, [t3+wq*2+400*4+16] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 7) + paddd m3, m5 + psrld m2, 8 + psrld m3, 8 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 4*16 + %else + %assign extra_stack 2*16 + %endif +cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*2+4*0] + %define dst_stridemp dword [esp+calloff+16*2+4*1] + %define leftm dword [esp+calloff+16*2+4*2] + %define lpfm dword [esp+calloff+16*2+4*3] + %define lpf_stridem dword [esp+calloff+16*2+4*4] + %define w0m dword [esp+calloff+16*2+4*5] + %define hd dword [esp+calloff+16*2+4*6] + %define edgeb byte [esp+calloff+16*2+4*7] + %define edged dword [esp+calloff+16*2+4*7] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %define m8 [base+pd_8] + %define m9 [esp+calloff+16*1] + %define m10 [base+pd_0xf00801c7] + %define m11 [base+pd_34816] + %define m12 [base+pw_256] + %define m13 [base+pw_1023] + %define m14 [base+sgr_lshuf3] + %define m15 m6 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_3x3_16bpc, 5, 15, 16, 400*42+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + movq m9, [paramsq+4] + mova m12, [pw_256] + add lpfq, wq + lea t1, [rsp+wq+12] + mova m8, [pd_8] + add dstq, wq + lea t3, [rsp+wq*2+400*12+8] + mova m10, 
[pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + mova m11, [pd_34816] + pshuflw m7, m9, q3333 + pshufb m9, m12 ; s1 + punpcklqdq m7, m7 ; w1 + neg wq + pxor m6, m6 + mova m13, [pw_1023] + psllw m7, 4 + mova m14, [sgr_lshuf3] + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + movq m1, [r1+4] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*32+16] + mov t3m, t3 + pshuflw m7, m1, q3333 + mov t4m, t4 + pshufb m1, m12 ; s1 + punpcklqdq m7, m7 ; w1 + psllw m7, 4 + neg wq + mova m9, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv0 +%if ARCH_X86_64 + test hb, hb +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + mov lpfq, hvsrcm + add lpfq, lpf_stridem +%endif + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m1, wd + mova m2, [base+pb_m2_m1] + mova m3, [base+pb_0to15] + movd m5, [lpfq-2] + pshufb m1, m6 + pshufb m5, m12 + psubb m2, m1 + pcmpgtb m2, m3 + pand m4, m2 + pandn m2, m5 + por m4, m2 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq+ 0] +.h_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + 
pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq+ 0] +.hv0_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -18 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq+4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m14 + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq+ 0] +.hv1_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz 
.hv1_have_right + cmp wd, -18 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m2 + mova [t2+wq+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2 +4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*0+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd 
m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m15 + MAXSD m5, m3, m15 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m15 ; p * s + MULLD m5, m9, m15 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m3, m4, 20 ; min(z, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m15 + MULLD m1, m4, m15 + psubw m5, m12, m2 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 4] + movu m1, [t3+wq*2+400*0+ 8] + movu m2, [t3+wq*2+400*0+24] + movu m3, [t4+wq*1+400*0+ 2] + movu m4, [t3+wq*2+400*0+ 4] + movu m5, [t3+wq*2+400*0+20] + paddw m0, [t4+wq*1+400*0+ 0] + paddd m1, [t3+wq*2+400*0+ 0] + paddd m2, [t3+wq*2+400*0+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[-1] 444 + pslld m4, 2 ; b[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a[-1] 343 + psubd m4, m1 ; b[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400*4], m3 + mova [t3+wq*2+400*8+ 0], m4 + mova [t3+wq*2+400*8+16], m5 + movu m0, [t4+wq*1+400*2+ 4] + movu m1, [t3+wq*2+400*4+ 8] + movu m2, [t3+wq*2+400*4+24] + movu m3, [t4+wq*1+400*2+ 2] + movu m4, [t3+wq*2+400*4+ 4] + movu m5, [t3+wq*2+400*4+20] + paddw m0, [t4+wq*1+400*2+ 0] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[ 0] 444 + pslld m4, 2 ; b[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400* 6], m3 + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + psubw m3, m0 ; a[ 0] 343 + psubd m4, m1 ; b[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m3, [t4+wq*1+400*0+4] + movu m1, [t4+wq*1+400*0+2] + paddw m3, [t4+wq*1+400*0+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*4] + paddw m3, [t4+wq*1+400*6] + mova [t4+wq*1+400*4], m2 + mova [t4+wq*1+400*6], m1 + movu m4, [t3+wq*2+400*0+8] + movu m1, [t3+wq*2+400*0+4] + paddd m4, [t3+wq*2+400*0+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400* 8+ 0] + paddd m4, [t3+wq*2+400*12+ 0] + mova [t3+wq*2+400* 8+ 0], m2 + mova [t3+wq*2+400*12+ 0], m1 + movu m5, [t3+wq*2+400*0+24] + movu m1, [t3+wq*2+400*0+20] + paddd m5, [t3+wq*2+400*0+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400* 8+16] + paddd m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400* 8+16], m2 + mova [t3+wq*2+400*12+16], m1 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 
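The "444" and "343" buffers maintained by .prep_n, .n0 and .n1 here implement the 3x3 self-guided neighbour weighting: each row's a (and b) values are first combined horizontally as either 4*(l+c+r) ("444") or 3*l+4*c+3*r ("343", obtained by subtracting the outer pair from the 444 sum, exactly what the psllw 2 / psubw pair does), and each output pixel then only adds three stored row sums, giving 3-4-3 / 4-4-4 / 3-4-3 weights over the 3x3 neighbourhood (total weight 32). A scalar sketch of that shape; the function and array names are illustrative, and the asm rotates scratch buffers instead of recomputing the row sums:

#include <stdio.h>

/* Horizontal row sums as precomputed in .prep_n/.n0/.n1:
 *   w444 = 4*(l + c + r)
 *   w343 = w444 - (l + r) = 3*l + 4*c + 3*r              */
static int w444(const int *row, int x) { return 4 * (row[x-1] + row[x] + row[x+1]); }
static int w343(const int *row, int x) { return w444(row, x) - (row[x-1] + row[x+1]); }

/* Per-pixel neighbour total: 343 from the rows above and below,
 * 444 from the centre row (weights 3,4,3 / 4,4,4 / 3,4,3).      */
static int neighbor_sum(const int *above, const int *cur, const int *below, int x)
{
    return w343(above, x) + w444(cur, x) + w343(below, x);
}

int main(void)
{
    const int above[3] = { 1, 1, 1 }, cur[3] = { 1, 1, 1 }, below[3] = { 1, 1, 1 };
    printf("%d\n", neighbor_sum(above, cur, below, 1));  /* 32: the weights sum to 32 */
    return 0;
}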
+ movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*2+4] + movu m1, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*6] + paddw m3, [t4+wq*1+400*8] + mova [t4+wq*1+400*6], m1 + mova [t4+wq*1+400*8], m2 + movu m4, [t3+wq*2+400*4+8] + movu m1, [t3+wq*2+400*4+4] + paddd m4, [t3+wq*2+400*4+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400*12+ 0] + paddd m4, [t3+wq*2+400*16+ 0] + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*16+ 0], m2 + movu m5, [t3+wq*2+400*4+24] + movu m1, [t3+wq*2+400*4+20] + paddd m5, [t3+wq*2+400*4+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400*12+16] + paddd m5, [t3+wq*2+400*16+16] + mova [t3+wq*2+400*12+16], m1 + mova [t3+wq*2+400*16+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + paddd m2, m4 ; a * src + b + (1 << 8) + paddd m3, m5 + psrld m2, 9 + psrld m3, 9 + packssdw m2, m3 + psllw m1, m0, 4 + psubw m2, m1 + pmulhrsw m2, m7 + paddw m0, m2 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 10*16 + %else + %assign extra_stack 8*16 + %endif +cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ + dst, dst_stride, left, lpf, lpf_stride, w, params, h + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*8+4*0] + %define dst_stridemp dword [esp+calloff+16*8+4*1] + %define leftm dword [esp+calloff+16*8+4*2] + %define lpfm dword [esp+calloff+16*8+4*3] + %define lpf_stridem dword [esp+calloff+16*8+4*4] + %define w0m dword [esp+calloff+16*8+4*5] + %define hd dword [esp+calloff+16*8+4*6] + %define edgeb byte [esp+calloff+16*8+4*7] + %define edged dword [esp+calloff+16*8+4*7] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r6m + %define edgeb byte r8m + %define edged dword r8m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %xdefine m8 m6 + %define m9 [base+pd_8] + %define m10 [base+pd_34816] + %define m11 [base+pd_0xf00801c7] + %define m12 [base+pw_256] + %define m13 [esp+calloff+16*4] + %define m14 [esp+calloff+16*5] + %define m15 [esp+calloff+16*6] + %define m6 [esp+calloff+16*7] + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov dst_strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov lpf_strideq, [rstk+stack_offset+20] + mov wd, [rstk+stack_offset+24] + mov dstm, dstq + mov dst_stridemp, dst_strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+28] + mov r2, [rstk+stack_offset+36] + mov lpfm, lpfq + mov lpf_stridem, lpf_strideq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_mix_16bpc, 5, 15, 16, -400*66-40, dst, dst_stride, left, \ + lpf, lpf_stride, w, edge, \ + params, h +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, paramsmp + lea r13, [sgr_x_by_x-0xf03] + mov edged, r8m + add wd, wd + mov hd, r6m + mova m15, [paramsq] + add lpfq, wq + mova m9, [pd_8] + lea t1, [rsp+wq+44] + mova m10, [pd_34816] + add dstq, wq + mova m12, [pw_256] + lea t3, [rsp+wq*2+400*24+40] + mova m11, [pd_0xf00801c7] + lea t4, 
[rsp+wq+400*52+40] + neg wq + pshuflw m13, m15, q0000 + pshuflw m14, m15, q2222 + pshufhw m15, m15, q1010 + punpcklqdq m13, m13 ; s0 + punpcklqdq m14, m14 ; s1 + punpckhqdq m15, m15 ; w0 w1 + pxor m6, m6 + psllw m15, 2 + DEFINE_ARGS dst, dst_stride, left, lpf, lpf_stride, _, edge, _, h, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+32] ; params + LEA r6, $$ + add wd, wd + mova m2, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+52] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*24+48] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*52+48] + mov t3m, t3 + mov t4m, t4 + neg wq + pshuflw m0, m2, q0000 + pshuflw m1, m2, q2222 + pshufhw m2, m2, q1010 + punpcklqdq m0, m0 ; s0 + punpcklqdq m1, m1 ; s1 + punpckhqdq m2, m2 ; w0 w1 + mov w1m, wd + pxor m3, m3 + psllw m2, 2 + mova m13, m0 + mova m14, m1 + sub wd, 4 + mova m15, m2 + mova m6, m3 + mov lpfq, lpfm + mov lpf_strideq, lpf_stridem + mov w0m, wd +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 +%if ARCH_X86_64 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup +%else + mov wq, w0m + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop +%endif + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv0 +%if ARCH_X86_64 + test hd, hd +%else + mov r5, hd + test r5, r5 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, dst_stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom +%if ARCH_X86_64 + add lpfq, lpf_strideq +%else + mov lpfq, hvsrcm + add lpfq, lpf_stridem +%endif + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+wq+400* 0] + mova m1, [t1+wq+400* 2] + mova m2, [t1+wq+400* 4] + paddw m0, m0 + mova m3, [t1+wq+400* 6] + paddd m1, m1 + mova m4, [t1+wq+400* 8] + paddd m2, m2 + mova m5, [t1+wq+400*10] + mova [t2+wq+400* 0], m0 + mova [t2+wq+400* 2], m1 + mova [t2+wq+400* 4], m2 + mova [t2+wq+400* 6], m3 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.h: ; horizontal boxsum +%assign stack_offset stack_offset+4 +%assign calloff 4 +%if ARCH_X86_64 + lea wq, [r5-4] +%else + %define leftq r5 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r5-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] 
+.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m8, m1 ; sum5 + paddd m7, m2 ; sumsq5 + paddd m5, m3 + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq- 2] +.hv0_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -20 + jl .hv0_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + movif32 t3, t3m + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; h sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m7, m2 ; h sumsq5 + paddd m5, m3 + mova [t3+wq*2+400*8+ 8], m8 + mova [t3+wq*2+400*0+ 8], m7 + mova [t3+wq*2+400*0+24], m5 + paddw m8, [t1+wq+400* 0] + paddd m7, [t1+wq+400* 2] + paddd m5, [t1+wq+400* 4] + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + paddw m0, m1, [t1+wq+400* 6] + paddd m4, m2, [t1+wq+400* 8] + paddd m5, m3, [t1+wq+400*10] + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 
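For reference, the per-pixel a/b computation performed by the surrounding .hv0/.hv1/.v0/.v1 blocks can be written out in scalar form by following the inline comments; the 5x5 and 3x3 variants share the shape and differ only in the constants (n = 25 with multiplier 164, n = 9 with 455, where 164 and 455 are roughly 2^12/n). This is a transcription of those comments, not dav1d's bit-exact C reference: the saturating-add rounding tricks and the GATHER_X_BY_X table addressing are glossed over, and the table contents are a placeholder here.

#include <stdint.h>

static uint8_t sgr_x_by_x[256];  /* real values: dav1d_sgr_x_by_x in src/tables.c */

static int imax(int a, int b) { return a > b ? a : b; }
static unsigned umin(unsigned a, unsigned b) { return a < b ? a : b; }

/* sum   = box sum of pixels      ("b" in the comments)
 * sumsq = box sum of squares     ("a" in the comments)
 * n     = 25 or 9, mult = 164 or 455, s = the sgr strength parameter */
static void sgr_ab(int *a_out, int *b_out,
                   int sum, int sumsq, int n, int mult, unsigned s)
{
    const int aa = (sumsq + 8) >> 4;                   /* "(a + 8) >> 4"         */
    const int bb = (sum   + 2) >> 2;                   /* "(b + 2) >> 2"         */
    const int p  = imax(aa * n - bb * bb, 0);          /* "p"                    */
    const unsigned z =                                 /* "min(z, 255)"          */
        umin((unsigned)(((uint64_t)p * s) >> 20), 255);
    const int x  = sgr_x_by_x[z];                      /* GATHER_X_BY_X lookup   */
    *a_out = 256 - x;                                  /* psubw from pw_256, "a" */
    /* "x * b * 164 + (1 << 11) + (1 << 15)" (or * 455), then psrld 12 */
    *b_out = (x * bb * mult + (1 << 11) + (1 << 15)) >> 12;
}

int main(void) { int a, b; sgr_ab(&a, &b, 0, 0, 25, 164, 0); return (a + b) != 264; }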
+ pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq- 2] +.hv1_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -20 + jl .hv1_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv1_have_right: + palignr m7, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m7, m3 + punpcklwd m0, m7, m3 + pmaddwd m0, m0 + punpckhwd m7, m3 + pmaddwd m7, m7 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m6 + pmaddwd m1, m1 + punpckhwd m3, m6 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + palignr m5, m4, 8 + punpckhwd m1, m4, m5 + paddw m8, m4, m5 + pmaddwd m1, m1 + punpcklwd m4, m5 + pmaddwd m4, m4 + paddd m7, m3 + paddw m5, m2, [t2+wq+400* 6] + mova [t2+wq+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 8], m0 + mova [t2+wq+400*10], m7 + paddd m4, m0 ; h sumsq5 + paddd m1, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m7, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m7 + psrlw m7, m5, 1 + pavgw m7, m6 ; (b3 + 2) >> 2 + punpcklwd m0, m7, m6 + pmaddwd m0, m0 + punpckhwd m7, m6 + pmaddwd m7, m7 +%if ARCH_X86_32 + mova [esp+20], m8 +%else + SWAP m8, m6 +%endif + MAXSD m2, m0, m8 + MAXSD m3, m7, m8 + pxor m8, m8 + psubd m2, m0 ; p3 + psubd m3, m7 + punpcklwd m0, m5, m8 ; b3 + punpckhwd m5, m8 + MULLD m2, m14, m8 ; p3 * s1 + MULLD m3, m14, m8 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m5, m11 + paddusw m2, m11 + paddusw m3, m11 + psrld m8, m2, 20 ; min(z3, 255) + movif32 t3, t3m + psrld m2, m3, 20 + GATHER_X_BY_X m7, m8, m2, r0, dstm + punpcklwd m2, m7, m7 + punpckhwd m8, m7, m7 + MULLD m0, m2, m3 + MULLD m5, m8, m3 + psubw m3, m12, m7 +%if ARCH_X86_32 + mova m8, [esp+20] +%endif + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m5, m10 + psrld m0, 12 + psrld m5, 12 + mova [t4+wq*1+400*4+4], m3 + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m5 +%if ARCH_X86_64 + SWAP m6, m8 + pxor m6, m6 +%endif + paddw m5, m8, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m1, [t2+wq+400*4] + paddw m5, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m8 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m1 + mova m4, [base+pw_25] + paddd m2, m9 + 
paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 + MULLD m3, m4, m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m1, m5, 1 + pavgw m1, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m1, m7 + pmaddwd m4, m4 + punpckhwd m1, m7 + pmaddwd m1, m1 + punpcklwd m0, m5, m7 ; b5 + punpckhwd m5, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + mova m4, [base+pd_0xf00800a4] + MAXSD m3, m1, m7 + psubd m3, m1 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m5, m4 + paddusw m2, m4 + paddusw m3, m4 + psrld m1, m2, 20 ; min(z5, 255) + psrld m2, m3, 20 + GATHER_X_BY_X m4, m1, m2, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + psubw m1, m12, m4 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m5, m10 + mova [t4+wq*1+400*0+ 4], m1 + psrld m0, 12 + psrld m5, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m5 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400* 6] + mova m4, [t1+wq+400* 8] + mova m5, [t1+wq+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+4], m5 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + mova [t3+wq*2+400*8+ 8], m3 + mova [t3+wq*2+400*0+ 8], m4 + mova [t3+wq*2+400*0+24], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+wq+400*0], m3 + mova [t1+wq+400*2], m4 + mova [t1+wq+400*4], m5 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r5-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m4, [t1+wq+400* 6] + mova m5, [t1+wq+400* 8] + mova m7, [t1+wq+400*10] + paddw m1, m4, [t2+wq+400* 6] + paddd m2, m5, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 6], m4 + mova [t2+wq+400* 8], m5 + mova [t2+wq+400*10], m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 
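The MULLD and MAXSD helpers used throughout these loops (defined just before sgr_filter_5x5_16bpc) exist because pmulld and pmaxsd are SSE4.1 instructions and this is the SSSE3 path: MAXSD builds a signed 32-bit max out of pcmpgtd/pand/pandn/por, and MULLD reassembles a 32-bit low multiply from pmullw/pmulhuw partial products, which is valid when the multiplier is a 16-bit value broadcast into both halves of each dword (as pw_25 and the broadcast s0/s1 words are). A scalar check of the MULLD decomposition, with illustrative names:

#include <assert.h>
#include <stdint.h>

/* (a * b) mod 2^32 rebuilt from 16-bit partial products, mirroring
 * MULLD: pmullw keeps the low 16 bits of each word product, pmulhuw
 * the high 16 bits, and the pslld 16 discards the part that would
 * land above bit 31. */
static uint32_t mulld_emul(uint32_t a, uint16_t b)
{
    const uint32_t lo = ((a & 0xffff) * b & 0xffff)             /* pmullw, low word   */
                      | ((a >> 16)    * b & 0xffff) << 16;      /* pmullw, high word  */
    const uint32_t hi = ((a & 0xffff) * b >> 16) << 16;         /* pmulhuw + pslld 16 */
    return lo + hi;                                             /* paddd              */
}

int main(void)
{
    assert(mulld_emul(0x12345678u, 455) == 0x12345678u * 455u);
    assert(mulld_emul(0xdeadbeefu, 164) == 0xdeadbeefu * 164u);
    assert(mulld_emul(0xffffffffu,  25) == 0xffffffffu *  25u);
    return 0;
}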
+ punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m3, m4, 20 ; min(z3, 255) + psrld m4, m5, 20 + GATHER_X_BY_X m2, m3, m4, r0, dstm + punpcklwd m3, m2, m2 + punpckhwd m4, m2, m2 + MULLD m0, m3, m7 + MULLD m1, m4, m7 + psubw m5, m12, m2 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*4+4], m5 + psrld m0, 12 + psrld m8, m1, 12 + mova m4, [t3+wq*2+400*8+ 8] + mova m5, [t3+wq*2+400*0+ 8] + mova m7, [t3+wq*2+400*0+24] + paddw m1, m4, [t2+wq+400*0] + paddd m2, m5, [t2+wq+400*2] + paddd m3, m7, [t2+wq+400*4] + paddw m1, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m4 + mova [t2+wq+400*2], m5 + mova [t2+wq+400*4], m7 + mova m4, [base+pw_25] + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m8 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + MULLD m2, m4, m7 ; ((a5 + 8) >> 4) * 25 + MULLD m3, m4, m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + mova m4, [base+pd_0xf00800a4] + MAXSD m3, m5, m7 + psubd m3, m5 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrld m5, m2, 20 ; min(z5, 255) + psrld m2, m3, 20 + GATHER_X_BY_X m4, m5, m2, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + psubw m5, m12, m4 + MULLD m0, m2, m7 + MULLD m1, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*0+ 4], m5 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r5 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 2] + movu m1, [t3+wq*2+400*0+ 4] + movu m2, [t3+wq*2+400*0+20] + movu m7, [t4+wq*1+400*0+ 4] + movu m8, [t3+wq*2+400*0+ 8] + paddw m3, m0, [t4+wq*1+400*0+ 0] + paddd m4, m1, [t3+wq*2+400*0+ 0] + paddd m5, m2, [t3+wq*2+400*0+16] + paddw m3, m7 + paddd m4, m8 + movu m7, [t3+wq*2+400*0+24] + paddw m0, m3 + paddd m1, m4 + psllw m3, 2 + pslld m4, 2 + paddd m5, m7 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a5 565 + paddd m1, m4 ; b5 565 + paddd m2, m5 + mova [t4+wq*1+400* 6+ 0], m0 + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*12+16], m2 + movu m0, [t4+wq*1+400*2+ 4] + movu m3, [t4+wq*1+400*2+ 2] + paddw m0, [t4+wq*1+400*2+ 0] + movu m1, [t3+wq*2+400*4+ 8] + movu m4, [t3+wq*2+400*4+ 4] + paddd m1, [t3+wq*2+400*4+ 0] + movu m2, [t3+wq*2+400*4+24] + movu m5, [t3+wq*2+400*4+20] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[-1] 444 + pslld m4, 2 ; b3[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a3[-1] 343 + psubd m4, m1 ; b3[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8+ 0], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + movu m0, [t4+wq*1+400*4+ 4] + movu m3, [t4+wq*1+400*4+ 2] + paddw m0, [t4+wq*1+400*4+ 0] + movu m1, [t3+wq*2+400*8+ 8] + movu m4, [t3+wq*2+400*8+ 4] + paddd m1, [t3+wq*2+400*8+ 0] + movu m2, 
[t3+wq*2+400*8+24] + movu m5, [t3+wq*2+400*8+20] + paddd m2, [t3+wq*2+400*8+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[ 0] 444 + pslld m4, 2 ; b3[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400*10+ 0], m3 + mova [t3+wq*2+400*20+ 0], m4 + mova [t3+wq*2+400*20+16], m5 + psubw m3, m0 ; a3[ 0] 343 + psubd m4, m1 ; b3[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400*12+ 0], m3 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r5 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 4] + movu m2, [t4+wq*1+ 2] + paddw m0, [t4+wq*1+ 0] + paddw m0, m2 + paddw m2, m0 + psllw m0, 2 + paddw m0, m2 ; a5 + movu m4, [t3+wq*2+ 8] + movu m5, [t3+wq*2+24] + movu m1, [t3+wq*2+ 4] + movu m3, [t3+wq*2+20] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddd m4, m1 + paddd m5, m3 + paddd m1, m4 + paddd m3, m5 + pslld m4, 2 + pslld m5, 2 + paddd m4, m1 ; b5 + paddd m5, m3 + movu m2, [t4+wq*1+400* 6] + paddw m2, m0 + mova [t4+wq*1+400* 6], m0 + paddd m0, m4, [t3+wq*2+400*12+ 0] + paddd m1, m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + mova [rsp+16+ARCH_X86_32*4], m1 + movu m3, [t4+wq*1+400*2+4] + movu m5, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + movu m3, [t4+wq*1+400* 8] + paddw m3, [t4+wq*1+400*10] + paddw m3, m4 + mova [t4+wq*1+400* 8], m4 + mova [t4+wq*1+400*10], m5 + movu m1, [t3+wq*2+400*4+ 8] + movu m5, [t3+wq*2+400*4+ 4] + movu m7, [t3+wq*2+400*4+24] + movu m8, [t3+wq*2+400*4+20] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m7, [t3+wq*2+400*4+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 +%if ARCH_X86_32 + mova [esp+52], m8 + psubd m8, m7 +%else + psubd m6, m8, m7 + SWAP m8, m6 +%endif + paddd m1, m4, [t3+wq*2+400*16+ 0] + paddd m7, m8, [t3+wq*2+400*16+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m8 + mova [t3+wq*2+400*20+ 0], m5 +%if ARCH_X86_32 + mova m8, [esp+52] +%else + SWAP m8, m6 + pxor m6, m6 +%endif + mova [t3+wq*2+400*20+16], m8 + mova [rsp+32+ARCH_X86_32*4], m7 + movu m4, [dstq+wq] + punpcklwd m7, m2, m6 + punpckhwd m2, m6 + punpcklwd m8, m3, m6 + punpckhwd m3, m6 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + pmaddwd m7, m5 ; a5 * src + pmaddwd m8, m5 ; a3 * src + pmaddwd m2, m4 + pmaddwd m3, m4 + pslld m5, 13 + pslld m4, 13 + psubd m0, m5 + psubd m1, m5 + paddd m0, m7 ; a5 * src + b5 + (1 << 8) - (src << 13) + paddd m1, m8 ; a3 * src + b3 + (1 << 8) - (src << 13) + mova m7, [base+pd_0xffff] + psrld m0, 9 + pslld m1, 7 + pand m0, m7 + pandn m8, m7, m1 + por m0, m8 + psubd m1, m4, [rsp+16+ARCH_X86_32*4] + psubd m8, m4, [rsp+32+ARCH_X86_32*4] + psubd m2, m1 + psubd m3, m8 + mova m1, [base+pd_4096] + psrld m2, 9 + pslld m3, 7 + pand m2, m7 + pandn m7, m3 + por m2, m7 + pmaddwd m0, m15 + pmaddwd m2, m15 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + paddd m5, m1 + paddd m4, m1 + paddd m0, m5 + paddd m2, m4 + psrad m0, 8 + psrad m2, 8 + packssdw m0, m2 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, dst_stridemp + ret +%if ARCH_X86_64 + SWAP m6, m7 +%endif +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r5 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*4+4] + movu m5, [t4+wq*1+400*4+2] + paddw m3, [t4+wq*1+400*4+0] + 
paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + paddw m3, m4, [t4+wq*1+400*12] + paddw m3, [t4+wq*1+400*10] + mova [t4+wq*1+400*10], m5 + mova [t4+wq*1+400*12], m4 + movu m1, [t3+wq*2+400*8+ 8] + movu m5, [t3+wq*2+400*8+ 4] + movu m7, [t3+wq*2+400*8+24] + movu m8, [t3+wq*2+400*8+20] + paddd m1, [t3+wq*2+400*8+ 0] + paddd m7, [t3+wq*2+400*8+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 + psubd m0, m8, m7 + paddd m1, m4, [t3+wq*2+400*24+ 0] + paddd m7, m0, [t3+wq*2+400*24+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*20+ 0], m5 + mova [t3+wq*2+400*20+16], m8 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m0 + mova m5, [dstq+wq] + mova m8, [t4+wq*1+400* 6] + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m0, m8, m6 + punpckhwd m8, m6 + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + pmaddwd m0, m4 ; a5 * src + pmaddwd m2, m4 ; a3 * src + pmaddwd m8, m5 + pmaddwd m3, m5 + paddd m1, m2 ; a3 * src + b3 + (1 << 8) - (src << 13) + pslld m4, 12 + pslld m5, 12 + psubd m2, m4, [t3+wq*2+400*12+ 0] + psubd m0, m2 ; a5 * src + b5 + (1 << 8) - (src << 13) + psubd m2, m5, [t3+wq*2+400*12+16] + psubd m8, m2 + paddd m4, m4 + paddd m5, m5 + paddd m7, m3 + mova m2, [base+pd_0xffff] + psubd m1, m4 + psubd m7, m5 + psrld m0, 8 + psrld m8, 8 + pslld m1, 7 + pslld m7, 7 + pand m0, m2 + pand m8, m2 + pandn m3, m2, m1 + pandn m2, m7 + por m0, m3 + por m8, m2 + mova m1, [base+pd_4096] + pmaddwd m0, m15 + pmaddwd m8, m15 +%if ARCH_X86_64 + pxor m6, m6 + SWAP m7, m6 +%else + pxor m7, m7 +%endif + paddd m4, m1 + paddd m5, m1 + paddd m0, m4 + paddd m8, m5 + psrad m0, 8 + psrad m8, 8 + packssdw m0, m8 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, dst_stridemp + movif32 dstm, dstq + ret diff -Nru dav1d-0.7.1/src/x86/looprestoration.asm dav1d-0.9.1/src/x86/looprestoration.asm --- dav1d-0.7.1/src/x86/looprestoration.asm 2020-06-21 11:48:55.028126500 +0000 +++ dav1d-0.9.1/src/x86/looprestoration.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1157 +0,0 @@ -; Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
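As the "sum3"/"sumsq3"/"sum5"/"sumsq5" comments in sgr_filter_mix's .h pass indicate, the mixed filter derives both box sums from a single pass over each row: the 5-tap sums reuse the 3-tap ones and only add the two outer pixels ("paddw m8, m1 ; sum5"). A scalar sketch of that shape, assuming an edge-padded row as produced by the extend_left/extend_right handling; the names and exact tap alignment are illustrative:

#include <stdio.h>
#include <stdint.h>

/* One pass producing both the 3-tap and 5-tap row sums and sums of
 * squares; the 5-tap values just add the two outer pixels on top of
 * the 3-tap ones.  32-bit sums are enough for 10/12-bit input. */
static void boxsum_row(const uint16_t *p, int w,
                       int32_t *sum3, int32_t *sumsq3,
                       int32_t *sum5, int32_t *sumsq5)
{
    for (int x = 0; x < w; x++) {
        const int32_t s3 = p[x-1] + p[x] + p[x+1];
        const int32_t q3 = p[x-1]*p[x-1] + p[x]*p[x] + p[x+1]*p[x+1];
        sum3[x]   = s3;
        sumsq3[x] = q3;
        sum5[x]   = s3 + p[x-2] + p[x+2];                    /* "sum5"   */
        sumsq5[x] = q3 + p[x-2]*p[x-2] + p[x+2]*p[x+2];      /* "sumsq5" */
    }
}

int main(void)
{
    /* 4-pixel row with 2 pixels of edge padding on each side */
    const uint16_t row[8] = { 1, 1, 1, 2, 3, 4, 4, 4 };
    int32_t s3[4], q3[4], s5[4], q5[4];
    boxsum_row(row + 2, 4, s3, q3, s5, q5);
    for (int x = 0; x < 4; x++)
        printf("x=%d sum3=%d sum5=%d\n", x, s3[x], s5[x]);
    return 0;
}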
- -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 32 -pb_right_ext_mask: times 32 db 0xff - times 32 db 0 -pb_14x0_1_2: times 14 db 0 - db 1, 2 -pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 - db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 -pb_15: times 16 db 15 -pw_16: times 2 dw 16 -pw_256: times 2 dw 256 -pw_2048: times 2 dw 2048 -pw_16380: times 2 dw 16380 -pw_0_128: dw 0, 128 -pw_5_6: dw 5, 6 -pd_6: dd 6 -pd_1024: dd 1024 -pd_0xf0080029: dd 0xf0080029 -pd_0xf00801c7: dd 0xf00801c7 - -cextern sgr_x_by_x - -SECTION .text - -INIT_YMM avx2 -cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge - mov edged, edgem - vpbroadcastb m15, [fhq+0] - movifnidn wd, wm - vpbroadcastb m14, [fhq+2] - mov hd, hm - vpbroadcastb m13, [fhq+4] - vpbroadcastw m12, [fhq+6] - vpbroadcastd m11, [pw_2048] - vpbroadcastd m10, [pw_16380] - lea r11, [pb_right_ext_mask] - - DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim - - ; if (edge & has_right) align_w_to_32 - ; else w -= 32, and use that as limit in x loop - test edgeb, 2 ; has_right - jnz .align - mov xlimq, -3 - jmp .loop -.align: - add wd, 31 - and wd, ~31 - xor xlimd, xlimd - - ; main y loop for vertical filter -.loop: - mov srcptrq, srcq - mov dstptrq, dstq - lea xq, [wq+xlimq] - - ; load left edge pixels - test edgeb, 1 ; have_left - jz .emu_left - test leftq, leftq ; left == NULL for the edge-extended bottom/top - jz .load_left_combined - movd xm0, [leftq] - add leftq, 4 - pinsrd xm0, [srcq], 1 - pslldq xm0, 9 - jmp .left_load_done -.load_left_combined: - movq xm0, [srcq-3] - pslldq xm0, 10 - jmp .left_load_done -.emu_left: - movd xm0, [srcq] - pshufb xm0, [pb_14x0_1_2] - - ; load right edge pixels -.left_load_done: - cmp xd, 32 - jg .main_load - test xd, xd - jg .load_and_splat - je .splat_right - - ; for very small images (w=[1-2]), edge-extend the original cache, - ; ugly, but only runs in very odd cases - add wd, wd - pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] - shr wd, 1 - - ; main x loop, mostly this starts in .main_load -.splat_right: - ; no need to load new pixels, just extend them from the (possibly previously - ; extended) previous load into m0 - pshufb xm1, xm0, [pb_15] - jmp .main_loop -.load_and_splat: - ; load new pixels and extend edge for right-most - movu m1, [srcptrq+3] - sub r11, xq - movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32] - add r11, xq - vpbroadcastb m3, [srcptrq+2+xq] - pand m1, m2 - pandn m3, m2, m3 - por m1, m3 - jmp .main_loop -.main_load: - ; load subsequent line - movu m1, [srcptrq+3] -.main_loop: - vinserti128 m0, xm1, 1 - - palignr m2, m1, m0, 10 - palignr m3, m1, m0, 11 - palignr m4, m1, m0, 12 - palignr m5, m1, m0, 13 - palignr m6, m1, m0, 14 - palignr m7, m1, m0, 15 - - punpcklbw m0, m2, m1 - punpckhbw m2, m1 - punpcklbw m8, m3, m7 - punpckhbw m3, m7 - punpcklbw m7, m4, m6 - punpckhbw m4, m6 - pxor m9, m9 - punpcklbw m6, m5, m9 - punpckhbw m5, m9 - - pmaddubsw m0, m15 - pmaddubsw m2, m15 - pmaddubsw m8, m14 - pmaddubsw m3, m14 - pmaddubsw m7, m13 - pmaddubsw m4, m13 - paddw m0, m8 - paddw m2, m3 - psllw m8, m6, 7 - psllw m3, m5, 7 - psubw m8, m10 - psubw m3, m10 - pmullw m6, m12 - pmullw m5, m12 - paddw m0, m7 - paddw m2, m4 - paddw m0, m6 - paddw m2, m5 - ; for a signed overflow to happen we need filter and pixels as follow: - ; filter => -5,-23,-17,90,-17,-23,-5 - ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0 - ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6] - ; m8 
would fall in the range [-3FFC;+3F84] = [C004;3F84] - ; 32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A] - ; => signed 16-bit overflow occurs - paddsw m0, m8 ; paddsw clips this range to [-8000;+7FFF] - paddsw m2, m3 - psraw m0, 3 ; shift changes the range to [-1000;+FFF] - psraw m2, 3 - paddw m0, m11 ; adding back 800 (removed in m8) changes the - paddw m2, m11 ; range to [-800;+17FF] as defined in the spec - mova [dstptrq], xm0 ; (note that adding another 800 would give us - mova [dstptrq+16], xm2; the same range as in the C code => [0;1FFF]) - vextracti128 [dstptrq+32], m0, 1 - vextracti128 [dstptrq+48], m2, 1 - vextracti128 xm0, m1, 1 - add srcptrq, 32 - add dstptrq, 64 - sub xq, 32 - cmp xd, 32 - jg .main_load - test xd, xd - jg .load_and_splat - cmp xd, xlimd - jg .splat_right - - add srcq, strideq - add dstq, 384*2 - dec hd - jg .loop - RET - -cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge - movifnidn fvq, fvmp - mov edged, edgem - movifnidn hd, hm - vpbroadcastd m10, [fvq] - vpbroadcastd m11, [fvq+4] - vpbroadcastd m0, [pw_0_128] - vpbroadcastd m12, [pd_1024] - - DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr - rorx ylimd, edged, 2 - paddw m11, m0 - and ylimd, 2 ; have_bottom - sub ylimd, 3 - - ; main x loop for vertical filter, does one column of 16 pixels -.loop_x: - mova m3, [midq] ; middle line - - ; load top pixels - test edgeb, 4 ; have_top - jz .emu_top - mova m0, [midq-384*4] - mova m2, [midq-384*2] - mova m1, m0 - jmp .load_bottom_pixels -.emu_top: - mova m0, m3 - mova m1, m3 - mova m2, m3 - - ; load bottom pixels -.load_bottom_pixels: - mov yd, hd - mov mptrq, midq - mov dstptrq, dstq - add yd, ylimd - jg .load_threelines - - ; the remainder here is somewhat messy but only runs in very weird - ; circumstances at the bottom of the image in very small blocks (h=[1-3]), - ; so performance is not terribly important here... - je .load_twolines - cmp yd, -1 - je .load_oneline - ; h == 1 case - mova m5, m3 - mova m4, m3 - mova m6, m3 - jmp .loop -.load_oneline: - ; h == 2 case - mova m4, [midq+384*2] - mova m5, m4 - mova m6, m4 - jmp .loop -.load_twolines: - ; h == 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - mova m6, m5 - jmp .loop -.load_threelines: - ; h > 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - ; third line loaded in main loop below - - ; main y loop for vertical filter -.loop_load: - ; load one line into m6. if that pixel is no longer available, do - ; nothing, since m6 still has the data from the previous line in it. 
We - ; try to structure the loop so that the common case is evaluated fastest - mova m6, [mptrq+384*6] -.loop: - paddw m0, m6 - paddw m7, m1, m5 - paddw m8, m2, m4 - punpcklwd m9, m0, m7 - punpckhwd m0, m7 - punpcklwd m7, m8, m3 - punpckhwd m8, m3 - pmaddwd m9, m10 - pmaddwd m0, m10 - pmaddwd m7, m11 - pmaddwd m8, m11 - add mptrq, 384*2 - paddd m7, m9 - paddd m0, m8 - paddd m7, m12 - paddd m0, m12 - psrad m7, 11 - psrad m0, 11 - packssdw m7, m0 - vextracti128 xm0, m7, 1 - packuswb xm7, xm0 - mova [dstptrq], xm7 - ; shift pixels one position - mova m0, m1 - mova m1, m2 - mova m2, m3 - mova m3, m4 - mova m4, m5 - mova m5, m6 - add dstptrq, strideq - dec yd - jg .loop_load - ; for the bottom pixels, continue using m6 (as extended edge) - cmp yd, ylimd - jg .loop - add midq, 32 - add dstq, 16 - sub wd, 16 - jg .loop_x - RET - -INIT_YMM avx2 -cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov xlimd, edgem - movifnidn wd, wm - mov hd, hm - mov edged, xlimd - and xlimd, 2 ; have_right - jz .no_right - add wd, 2+15 - and wd, ~15 -.no_right: - lea r10, [pb_right_ext_mask+32] - xor xlimd, 2 ; 2*!have_right - pxor m1, m1 - add srcq, wq - lea sumq, [sumq+wq*2-2] - lea sumsqq, [sumsqq+wq*4-4] - neg wq -.loop_y: - mov xq, wq - - ; load left - test edgeb, 1 ; have_left - jz .no_left - test leftq, leftq - jz .load_left_from_main - vpbroadcastw xm0, [leftq+2] - add leftq, 4 - jmp .expand_x -.no_left: - vpbroadcastb xm0, [srcq+xq] - jmp .expand_x -.load_left_from_main: - vpbroadcastw xm0, [srcq+xq-2] -.expand_x: - punpckhbw xm0, xm1 - - ; when we reach this, xm0 contains left two px in highest words - cmp xd, -16 - jle .loop_x -.partial_load_and_extend: - vpbroadcastb m3, [srcq-1] - pmovzxbw m2, [srcq+xq] - movu m4, [r10+xq*2] - punpcklbw m3, m1 - pand m2, m4 - pandn m4, m3 - por m2, m4 - jmp .loop_x_noload -.right_extend: - psrldq xm2, xm0, 14 - vpbroadcastw m2, xm2 - jmp .loop_x_noload - -.loop_x: - pmovzxbw m2, [srcq+xq] -.loop_x_noload: - vinserti128 m0, xm2, 1 - palignr m3, m2, m0, 12 - palignr m4, m2, m0, 14 - - punpcklwd m5, m3, m2 - punpckhwd m6, m3, m2 - paddw m3, m4 - punpcklwd m0, m4, m1 - punpckhwd m4, m1 - pmaddwd m5, m5 - pmaddwd m6, m6 - pmaddwd m0, m0 - pmaddwd m4, m4 - paddw m3, m2 - paddd m5, m0 - vextracti128 xm0, m2, 1 - paddd m6, m4 - movu [sumq+xq*2], m3 - movu [sumsqq+xq*4+ 0], xm5 - movu [sumsqq+xq*4+16], xm6 - vextracti128 [sumsqq+xq*4+32], m5, 1 - vextracti128 [sumsqq+xq*4+48], m6, 1 - add xq, 16 - - ; if x <= -16 we can reload more pixels - ; else if x < 0 we reload and extend (this implies have_right=0) - ; else if x < xlimd we extend from previous load (this implies have_right=0) - ; else we are done - - cmp xd, -16 - jle .loop_x - test xd, xd - jl .partial_load_and_extend - cmp xd, xlimd - jl .right_extend - - add sumsqq, (384+16)*4 - add sumq, (384+16)*2 - add srcq, strideq - dec hd - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov xq, -2 - rorx ylimd, edged, 2 - and ylimd, 2 ; have_bottom - sub ylimd, 2 ; -2 if have_bottom=0, else 0 -.loop_x: - lea yd, [hq+ylimq+2] - lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edgeb, 4 ; have_top - jnz .load_top - movu m0, [sumsq_ptrq+(384+16)*4*1] - movu m1, [sumsq_ptrq+(384+16)*4*1+32] - movu m6, [sum_ptrq+(384+16)*2*1] - mova m2, m0 - mova m3, m1 - mova m4, m0 - mova m5, m1 - mova m7, m6 - mova m8, m6 - jmp .loop_y_noload -.load_top: - movu m0, 
[sumsq_ptrq-(384+16)*4*1] ; l2sq [left] - movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l2sq [right] - movu m2, [sumsq_ptrq-(384+16)*4*0] ; l1sq [left] - movu m3, [sumsq_ptrq-(384+16)*4*0+32] ; l1sq [right] - movu m6, [sum_ptrq-(384+16)*2*1] ; l2 - movu m7, [sum_ptrq-(384+16)*2*0] ; l1 -.loop_y: - movu m4, [sumsq_ptrq+(384+16)*4*1] ; l0sq [left] - movu m5, [sumsq_ptrq+(384+16)*4*1+32] ; l0sq [right] - movu m8, [sum_ptrq+(384+16)*2*1] ; l0 -.loop_y_noload: - paddd m0, m2 - paddd m1, m3 - paddw m6, m7 - paddd m0, m4 - paddd m1, m5 - paddw m6, m8 - movu [sumsq_ptrq+ 0], m0 - movu [sumsq_ptrq+32], m1 - movu [sum_ptrq], m6 - - ; shift position down by one - mova m0, m2 - mova m1, m3 - mova m2, m4 - mova m3, m5 - mova m6, m7 - mova m7, m8 - add sumsq_ptrq, (384+16)*4 - add sum_ptrq, (384+16)*2 - dec yd - jg .loop_y - cmp yd, ylimd - jg .loop_y_noload - add xd, 16 - cmp xd, wd - jl .loop_x - RET - -INIT_YMM avx2 -cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 - lea r5, [sgr_x_by_x-0xf03] -%ifidn sd, sm - movd xm6, sd - vpbroadcastd m6, xm6 -%else - vpbroadcastd m6, sm -%endif - vpbroadcastd m8, [pd_0xf00801c7] - vpbroadcastd m9, [pw_256] - pcmpeqb m7, m7 - psrld m10, m9, 13 ; pd_2048 - DEFINE_ARGS a, b, w, h, x - -.loop_y: - mov xq, -2 -.loop_x: - pmovzxwd m0, [bq+xq*2] - pmovzxwd m1, [bq+xq*2+(384+16)*2] - movu m2, [aq+xq*4] - movu m3, [aq+xq*4+(384+16)*4] - pslld m4, m2, 3 - pslld m5, m3, 3 - paddd m2, m4 ; aa * 9 - paddd m3, m5 - pmaddwd m4, m0, m0 - pmaddwd m5, m1, m1 - pmaddwd m0, m8 - pmaddwd m1, m8 - psubd m2, m4 ; p = aa * 9 - bb * bb - psubd m3, m5 - pmulld m2, m6 - pmulld m3, m6 - paddusw m2, m8 - paddusw m3, m8 - psrld m2, 20 ; z - psrld m3, 20 - mova m5, m7 - vpgatherdd m4, [r5+m2], m5 ; xx - mova m5, m7 - vpgatherdd m2, [r5+m3], m5 - psrld m4, 24 - psrld m2, 24 - pmulld m0, m4 - pmulld m1, m2 - packssdw m4, m2 - psubw m4, m9, m4 - vpermq m4, m4, q3120 - paddd m0, m10 - paddd m1, m10 - psrld m0, 12 - psrld m1, 12 - movu [bq+xq*2], xm4 - vextracti128 [bq+xq*2+(384+16)*2], m4, 1 - movu [aq+xq*4], m0 - movu [aq+xq*4+(384+16)*4], m1 - add xd, 8 - cmp xd, wd - jl .loop_x - add aq, (384+16)*4*2 - add bq, (384+16)*2*2 - sub hd, 2 - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ - tmp_ptr, src_ptr, a_ptr, b_ptr, x, y - movifnidn wd, wm - mov hd, hm - vpbroadcastd m15, [pw_16] - xor xd, xd -.loop_x: - lea tmp_ptrq, [tq+xq*2] - lea src_ptrq, [srcq+xq*1] - lea a_ptrq, [aq+xq*4+(384+16)*4] - lea b_ptrq, [bq+xq*2+(384+16)*2] - movu m0, [aq+xq*4-(384+16)*4-4] - movu m2, [aq+xq*4-(384+16)*4+4] - mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] - paddd m0, m2 ; a:tl+tr [first half] - movu m2, [aq+xq*4-(384+16)*4-4+32] - movu m4, [aq+xq*4-(384+16)*4+4+32] - mova m3, [aq+xq*4-(384+16)*4+32] ; a:top [second half] - paddd m2, m4 ; a:tl+tr [second half] - movu m4, [aq+xq*4-4] - movu m5, [aq+xq*4+4] - paddd m1, [aq+xq*4] ; a:top+ctr [first half] - paddd m4, m5 ; a:l+r [first half] - movu m5, [aq+xq*4+32-4] - movu m6, [aq+xq*4+32+4] - paddd m3, [aq+xq*4+32] ; a:top+ctr [second half] - paddd m5, m6 ; a:l+r [second half] - - movu m6, [bq+xq*2-(384+16)*2-2] - movu m8, [bq+xq*2-(384+16)*2+2] - mova m7, [bq+xq*2-(384+16)*2] ; b:top - paddw m6, m8 ; b:tl+tr - movu m8, [bq+xq*2-2] - movu m9, [bq+xq*2+2] - paddw m7, [bq+xq*2] ; b:top+ctr - paddw m8, m9 ; b:l+r - mov yd, hd -.loop_y: - movu m9, [b_ptrq-2] - movu m10, [b_ptrq+2] - paddw m7, [b_ptrq] ; b:top+ctr+bottom - paddw m9, m10 ; b:bl+br - paddw m10, 
m7, m8 ; b:top+ctr+bottom+l+r - paddw m6, m9 ; b:tl+tr+bl+br - psubw m7, [b_ptrq-(384+16)*2*2] ; b:ctr+bottom - paddw m10, m6 - psllw m10, 2 - psubw m10, m6 ; aa - pmovzxbw m12, [src_ptrq] - punpcklwd m6, m10, m15 - punpckhwd m10, m15 - punpcklwd m13, m12, m15 - punpckhwd m12, m15 - pmaddwd m6, m13 ; aa*src[x]+256 [first half] - pmaddwd m10, m12 ; aa*src[x]+256 [second half] - - movu m11, [a_ptrq-4] - movu m12, [a_ptrq+4] - paddd m1, [a_ptrq] ; a:top+ctr+bottom [first half] - paddd m11, m12 ; a:bl+br [first half] - movu m12, [a_ptrq+32-4] - movu m13, [a_ptrq+32+4] - paddd m3, [a_ptrq+32] ; a:top+ctr+bottom [second half] - paddd m12, m13 ; a:bl+br [second half] - paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] - paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] - paddd m0, m11 ; a:tl+tr+bl+br [first half] - paddd m2, m12 ; a:tl+tr+bl+br [second half] - paddd m13, m0 - paddd m14, m2 - pslld m13, 2 - pslld m14, 2 - psubd m13, m0 ; bb [first half] - psubd m14, m2 ; bb [second half] - vperm2i128 m0, m13, m14, 0x31 - vinserti128 m13, xm14, 1 - psubd m1, [a_ptrq-(384+16)*4*2] ; a:ctr+bottom [first half] - psubd m3, [a_ptrq-(384+16)*4*2+32] ; a:ctr+bottom [second half] - - paddd m6, m13 - paddd m10, m0 - psrad m6, 9 - psrad m10, 9 - packssdw m6, m10 - mova [tmp_ptrq], m6 - - ; shift to next row - mova m0, m4 - mova m2, m5 - mova m4, m11 - mova m5, m12 - mova m6, m8 - mova m8, m9 - - add a_ptrq, (384+16)*4 - add b_ptrq, (384+16)*2 - add tmp_ptrq, 384*2 - add src_ptrq, strideq - dec yd - jg .loop_y - add xd, 16 - cmp xd, wd - jl .loop_x - RET - -INIT_YMM avx2 -cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt -%ifidn wtd, wtm - shl wtd, 4 - movd xm5, wtd - vpbroadcastw m5, xm5 -%else - vpbroadcastw m5, wtm - mov hd, hm - psllw m5, 4 -%endif - DEFINE_ARGS dst, stride, t, w, h, idx -.loop_y: - xor idxd, idxd -.loop_x: - mova m0, [tq+idxq*2+ 0] - mova m1, [tq+idxq*2+32] - pmovzxbw m2, [dstq+idxq+ 0] - pmovzxbw m3, [dstq+idxq+16] - psllw m4, m2, 4 - psubw m0, m4 - psllw m4, m3, 4 - psubw m1, m4 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - paddw m0, m2 - paddw m1, m3 - packuswb m0, m1 - vpermq m0, m0, q3120 - mova [dstq+idxq], m0 - add idxd, 32 - cmp idxd, wd - jl .loop_x - add tq, 384*2 - add dstq, strideq - dec hd - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov edged, edgem - movifnidn wd, wm - mov hd, hm - test edgeb, 2 ; have_right - jz .no_right - xor xlimd, xlimd - add wd, 2+15 - and wd, ~15 - jmp .right_done -.no_right: - mov xlimd, 3 - sub wd, 1 -.right_done: - lea r10, [pb_right_ext_mask+32] - pxor m1, m1 - lea srcq, [srcq+wq+1] - lea sumq, [sumq+wq*2-2] - lea sumsqq, [sumsqq+wq*4-4] - neg wq -.loop_y: - mov xq, wq - - ; load left - test edgeb, 1 ; have_left - jz .no_left - test leftq, leftq - jz .load_left_from_main - vpbroadcastd xm2, [leftq] - movd xm0, [srcq+xq-1] - add leftq, 4 - palignr xm0, xm2, 1 - jmp .expand_x -.no_left: - vpbroadcastb xm0, [srcq+xq-1] - jmp .expand_x -.load_left_from_main: - vpbroadcastd xm0, [srcq+xq-4] -.expand_x: - punpckhbw xm0, xm1 - - ; when we reach this, xm0 contains left two px in highest words - cmp xd, -16 - jle .loop_x - test xd, xd - jge .right_extend -.partial_load_and_extend: - vpbroadcastb m3, [srcq-1] - pmovzxbw m2, [srcq+xq] - movu m4, [r10+xq*2] - punpcklbw m3, m1 - pand m2, m4 - pandn m4, m3 - por m2, m4 - jmp .loop_x_noload -.right_extend: - psrldq xm2, xm0, 14 - vpbroadcastw m2, xm2 - jmp .loop_x_noload - -.loop_x: - pmovzxbw m2, [srcq+xq] -.loop_x_noload: - 
vinserti128 m0, xm2, 1 - palignr m3, m2, m0, 8 - palignr m4, m2, m0, 10 - palignr m5, m2, m0, 12 - palignr m6, m2, m0, 14 - - paddw m0, m3, m2 - punpcklwd m7, m3, m2 - punpckhwd m3, m2 - paddw m0, m4 - punpcklwd m8, m4, m5 - punpckhwd m4, m5 - paddw m0, m5 - punpcklwd m9, m6, m1 - punpckhwd m5, m6, m1 - paddw m0, m6 - pmaddwd m7, m7 - pmaddwd m3, m3 - pmaddwd m8, m8 - pmaddwd m4, m4 - pmaddwd m9, m9 - pmaddwd m5, m5 - paddd m7, m8 - paddd m3, m4 - paddd m7, m9 - paddd m3, m5 - movu [sumq+xq*2], m0 - movu [sumsqq+xq*4+ 0], xm7 - movu [sumsqq+xq*4+16], xm3 - vextracti128 [sumsqq+xq*4+32], m7, 1 - vextracti128 [sumsqq+xq*4+48], m3, 1 - - vextracti128 xm0, m2, 1 - add xq, 16 - - ; if x <= -16 we can reload more pixels - ; else if x < 0 we reload and extend (this implies have_right=0) - ; else if x < xlimd we extend from previous load (this implies have_right=0) - ; else we are done - - cmp xd, -16 - jle .loop_x - test xd, xd - jl .partial_load_and_extend - cmp xd, xlimd - jl .right_extend - - add srcq, strideq - add sumsqq, (384+16)*4 - add sumq, (384+16)*2 - dec hd - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov xq, -2 - rorx ylimd, edged, 2 - and ylimd, 2 ; have_bottom - sub ylimd, 3 ; -3 if have_bottom=0, else -1 -.loop_x: - lea yd, [hq+ylimq+2] - lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edgeb, 4 ; have_top - jnz .load_top - movu m0, [sumsq_ptrq+(384+16)*4*1] - movu m1, [sumsq_ptrq+(384+16)*4*1+32] - movu m10, [sum_ptrq+(384+16)*2*1] - mova m2, m0 - mova m3, m1 - mova m4, m0 - mova m5, m1 - mova m6, m0 - mova m7, m1 - mova m11, m10 - mova m12, m10 - mova m13, m10 - jmp .loop_y_second_load -.load_top: - movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] - movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right] - movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] - movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right] - movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 - movu m12, [sum_ptrq-(384+16)*2*0] ; l2 - mova m2, m0 - mova m3, m1 - mova m11, m10 -.loop_y: - movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] - movu m7, [sumsq_ptrq+(384+16)*4*1+32] ; l1sq [right] - movu m13, [sum_ptrq+(384+16)*2*1] ; l1 -.loop_y_second_load: - test yd, yd - jle .emulate_second_load - movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] - movu m9, [sumsq_ptrq+(384+16)*4*2+32] ; l0sq [right] - movu m14, [sum_ptrq+(384+16)*2*2] ; l0 -.loop_y_noload: - paddd m0, m2 - paddd m1, m3 - paddw m10, m11 - paddd m0, m4 - paddd m1, m5 - paddw m10, m12 - paddd m0, m6 - paddd m1, m7 - paddw m10, m13 - paddd m0, m8 - paddd m1, m9 - paddw m10, m14 - movu [sumsq_ptrq+ 0], m0 - movu [sumsq_ptrq+32], m1 - movu [sum_ptrq], m10 - - ; shift position down by one - mova m0, m4 - mova m1, m5 - mova m2, m6 - mova m3, m7 - mova m4, m8 - mova m5, m9 - mova m10, m12 - mova m11, m13 - mova m12, m14 - add sumsq_ptrq, (384+16)*4*2 - add sum_ptrq, (384+16)*2*2 - sub yd, 2 - jge .loop_y - ; l1 = l0 - mova m6, m8 - mova m7, m9 - mova m13, m14 - cmp yd, ylimd - jg .loop_y_noload - add xd, 16 - cmp xd, wd - jl .loop_x - RET -.emulate_second_load: - mova m8, m6 - mova m9, m7 - mova m14, m13 - jmp .loop_y_noload - -INIT_YMM avx2 -cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 - lea r5, [sgr_x_by_x-0xf03] -%ifidn sd, sm - movd xm6, sd - vpbroadcastd m6, xm6 -%else - vpbroadcastd m6, sm -%endif - vpbroadcastd m8, [pd_0xf0080029] - vpbroadcastd m9, 
[pw_256] - pcmpeqb m7, m7 - psrld m10, m9, 15 ; pd_512 - DEFINE_ARGS a, b, w, h, x -.loop_y: - mov xq, -2 -.loop_x: - pmovzxwd m0, [bq+xq*2+ 0] - pmovzxwd m1, [bq+xq*2+16] - movu m2, [aq+xq*4+ 0] - movu m3, [aq+xq*4+32] - pslld m4, m2, 3 ; aa * 8 - pslld m5, m3, 3 - paddd m2, m4 ; aa * 9 - paddd m3, m5 - paddd m4, m4 ; aa * 16 - paddd m5, m5 - paddd m2, m4 ; aa * 25 - paddd m3, m5 - pmaddwd m4, m0, m0 - pmaddwd m5, m1, m1 - psubd m2, m4 ; p = aa * 25 - bb * bb - psubd m3, m5 - pmulld m2, m6 - pmulld m3, m6 - paddusw m2, m8 - paddusw m3, m8 - psrld m2, 20 ; z - psrld m3, 20 - mova m5, m7 - vpgatherdd m4, [r5+m2], m5 ; xx - mova m5, m7 - vpgatherdd m2, [r5+m3], m5 - psrld m4, 24 - psrld m2, 24 - packssdw m3, m4, m2 - pmullw m4, m8 - pmullw m2, m8 - psubw m3, m9, m3 - vpermq m3, m3, q3120 - pmaddwd m0, m4 - pmaddwd m1, m2 - paddd m0, m10 - paddd m1, m10 - psrld m0, 10 - psrld m1, 10 - movu [bq+xq*2], m3 - movu [aq+xq*4+ 0], m0 - movu [aq+xq*4+32], m1 - add xd, 16 - cmp xd, wd - jl .loop_x - add aq, (384+16)*4*2 - add bq, (384+16)*2*2 - sub hd, 2 - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ - tmp_ptr, src_ptr, a_ptr, b_ptr, x, y - movifnidn wd, wm - mov hd, hm - vpbroadcastd m9, [pw_5_6] - vpbroadcastd m12, [pw_256] - psrlw m11, m12, 1 ; pw_128 - psrlw m10, m12, 8 ; pw_1 - xor xd, xd -.loop_x: - lea tmp_ptrq, [tq+xq*2] - lea src_ptrq, [srcq+xq*1] - lea a_ptrq, [aq+xq*4+(384+16)*4] - lea b_ptrq, [bq+xq*2+(384+16)*2] - movu m0, [aq+xq*4-(384+16)*4-4] - mova m1, [aq+xq*4-(384+16)*4] - movu m2, [aq+xq*4-(384+16)*4+4] - movu m3, [aq+xq*4-(384+16)*4-4+32] - mova m4, [aq+xq*4-(384+16)*4+32] - movu m5, [aq+xq*4-(384+16)*4+4+32] - paddd m0, m2 - paddd m3, m5 - paddd m0, m1 - paddd m3, m4 - pslld m2, m0, 2 - pslld m5, m3, 2 - paddd m2, m0 - paddd m5, m3 - paddd m0, m2, m1 ; prev_odd_b [first half] - paddd m1, m5, m4 ; prev_odd_b [second half] - movu m3, [bq+xq*2-(384+16)*2-2] - mova m4, [bq+xq*2-(384+16)*2] - movu m5, [bq+xq*2-(384+16)*2+2] - paddw m3, m5 - punpcklwd m5, m3, m4 - punpckhwd m3, m4 - pmaddwd m5, m9 - pmaddwd m3, m9 - packssdw m2, m5, m3 ; prev_odd_a - mov yd, hd -.loop_y: - movu m3, [a_ptrq-4] - mova m4, [a_ptrq] - movu m5, [a_ptrq+4] - movu m6, [a_ptrq+32-4] - mova m7, [a_ptrq+32] - movu m8, [a_ptrq+32+4] - paddd m3, m5 - paddd m6, m8 - paddd m3, m4 - paddd m6, m7 - pslld m5, m3, 2 - pslld m8, m6, 2 - paddd m5, m3 - paddd m8, m6 - paddd m3, m5, m4 ; cur_odd_b [first half] - paddd m4, m8, m7 ; cur_odd_b [second half] - movu m5, [b_ptrq-2] - mova m6, [b_ptrq] - movu m7, [b_ptrq+2] - paddw m5, m7 - punpcklwd m7, m5, m6 - punpckhwd m5, m6 - pmaddwd m7, m9 - pmaddwd m5, m9 - packssdw m5, m7, m5 ; cur_odd_a - - paddd m0, m3 ; cur_even_b [first half] - paddd m1, m4 ; cur_even_b [second half] - paddw m2, m5 ; cur_even_a - - pmovzxbw m6, [src_ptrq] - vperm2i128 m8, m0, m1, 0x31 - vinserti128 m0, xm1, 1 - punpcklwd m7, m6, m10 - punpckhwd m6, m10 - punpcklwd m1, m2, m12 - punpckhwd m2, m12 - pmaddwd m7, m1 - pmaddwd m6, m2 - paddd m7, m0 - paddd m6, m8 - psrad m7, 9 - psrad m6, 9 - - pmovzxbw m8, [src_ptrq+strideq] - punpcklwd m0, m8, m10 - punpckhwd m8, m10 - punpcklwd m1, m5, m11 - punpckhwd m2, m5, m11 - pmaddwd m0, m1 - pmaddwd m8, m2 - vinserti128 m2, m3, xm4, 1 - vperm2i128 m1, m3, m4, 0x31 - paddd m0, m2 - paddd m8, m1 - psrad m0, 8 - psrad m8, 8 - - packssdw m7, m6 - packssdw m0, m8 - mova [tmp_ptrq+384*2*0], m7 - mova [tmp_ptrq+384*2*1], m0 - - mova m0, m3 - mova m1, m4 - mova m2, m5 - add a_ptrq, (384+16)*4*2 - add b_ptrq, 
(384+16)*2*2 - add tmp_ptrq, 384*2*2 - lea src_ptrq, [src_ptrq+strideq*2] - sub yd, 2 - jg .loop_y - add xd, 16 - cmp xd, wd - jl .loop_x - RET - -INIT_YMM avx2 -cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt - movifnidn wd, wm - movifnidn hd, hm - vpbroadcastd m0, wtm - vpbroadcastd m10, [pd_1024] - DEFINE_ARGS dst, stride, t1, t2, w, h, idx -.loop_y: - xor idxd, idxd -.loop_x: - mova m1, [t1q+idxq*2+ 0] - mova m2, [t1q+idxq*2+32] - mova m3, [t2q+idxq*2+ 0] - mova m4, [t2q+idxq*2+32] - pmovzxbw m5, [dstq+idxq+ 0] - pmovzxbw m6, [dstq+idxq+16] - psllw m7, m5, 4 - psllw m8, m6, 4 - psubw m1, m7 - psubw m2, m8 - psubw m3, m7 - psubw m4, m8 - punpcklwd m9, m1, m3 - punpckhwd m1, m3 - punpcklwd m3, m2, m4 - punpckhwd m2, m4 - pmaddwd m9, m0 - pmaddwd m1, m0 - pmaddwd m3, m0 - pmaddwd m2, m0 - paddd m9, m10 - paddd m1, m10 - paddd m3, m10 - paddd m2, m10 - psrad m9, 11 - psrad m1, 11 - psrad m3, 11 - psrad m2, 11 - packssdw m1, m9, m1 - packssdw m2, m3, m2 - paddw m1, m5 - paddw m2, m6 - packuswb m1, m2 - vpermq m1, m1, q3120 - mova [dstq+idxq], m1 - add idxd, 32 - cmp idxd, wd - jl .loop_x - add dstq, strideq - add t1q, 384 * 2 - add t2q, 384 * 2 - dec hd - jg .loop_y - RET -%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/looprestoration_avx2.asm dav1d-0.9.1/src/x86/looprestoration_avx2.asm --- dav1d-0.7.1/src/x86/looprestoration_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/looprestoration_avx2.asm 2021-07-28 21:38:28.909852300 +0000 @@ -0,0 +1,2270 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
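The wiener_filter_h removed above documents its worst-case ranges in a comment: with taps -5,-23,-17,90,-17,-23,-5 against all-255/all-0 pixels, the sum of the two partial results m0 and m8 can exceed the signed 16-bit range, paddsw saturates, and the following >>3 and +0x800 land in the [-0x800, +0x17FF] range the comment quotes. A small scalar check of that arithmetic (the constants come from the comment, the helper name is ours; psraw is an arithmetic shift, which >> matches on common compilers):

    #include <stdio.h>
    #include <stdint.h>

    static int16_t paddsw(int a, int b) {   /* saturating 16-bit add, like the asm */
        const int s = a + b;
        return (int16_t)(s > 32767 ? 32767 : s < -32768 ? -32768 : s);
    }

    int main(void) {
        /* worst cases quoted in the comment: m0 in [-0x59A6, +0x59A6] (90 * 255),
         * m8 = (px << 7) - 16380 in [-0x3FFC, +0x3F84] */
        const int lo = (paddsw(-0x59A6, -0x3FFC) >> 3) + 0x800; /* saturates at -0x8000 */
        const int hi = (paddsw( 0x59A6,  0x3F84) >> 3) + 0x800; /* saturates at +0x7FFF */
        printf("[%d, %d]\n", lo, hi);  /* [-2048, 6143], i.e. [-0x800, +0x17FF] */
        return 0;
    }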
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +sgr_r_ext: times 16 db 1 + times 16 db 9 + +; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of +; cache but eliminates some shifts in the inner sgr loop which is overall a win +const sgr_x_by_x_avx2 + dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 + dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 + dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 + dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + + times 4 db -1 ; needed for 16-bit sgr +pb_m5: times 4 db -5 +pb_3: times 4 db 3 +pw_5_6: dw 5, 6 + +sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 + db 9, -1, 10, -1, 11, -1, 12, -1 + +pw_256: times 2 dw 256 +pw_2056: times 2 dw 2056 +pw_m16380: times 2 dw -16380 +pd_25: dd 25 +pd_34816: dd 34816 +pd_m4096: dd -4096 +pd_0xf00801c7: dd 0xf00801c7 +pd_0xf00800a4: dd 0xf00800a4 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers + +INIT_YMM avx2 +cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastb m11, [fltq+ 0] ; x0 x0 + vbroadcasti128 m7, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m8, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m9, [sgr_shuf+6] + add lpfq, wq + vpbroadcastd m10, [pw_m16380] + lea t1, [rsp+wq*2+16] + vpbroadcastd m14, [fltq+16] ; y0 y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + neg wq + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp+8*0] + call .hv_bottom + add lpfq, 
[rsp+8*1] + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.extend_right: + movd xm2, r10d + vpbroadcastd m0, [pb_3] + vpbroadcastd m1, [pb_m5] + vpbroadcastb m2, xm2 + movu m3, [pb_0to31] + psubb m0, m2 + psubb m1, m2 + pminub m0, m3 + pminub m1, m3 + pshufb m4, m0 + pshufb m5, m1 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, [wiener_l_shuf] + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + vpbroadcastd m2, [pw_2056] + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m2 + paddw m1, m2 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, [wiener_l_shuf] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + paddw m1, m3 + mova m2, [t4+r10*2] + paddw m2, [t2+r10*2] + mova m3, [t3+r10*2] + paddsw m0, m4 + vpbroadcastd m4, [pw_2056] + paddsw m1, m5 + mova m5, [t5+r10*2] + paddw m5, [t1+r10*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m4 + paddw m1, m4 + paddw m4, m0, [t6+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + 
paddd m0, m3 + paddd m4, m2 + mova m2, [t4+r10*2+32] + paddw m2, [t2+r10*2+32] + mova m3, [t3+r10*2+32] + mova m5, [t5+r10*2+32] + paddw m5, [t1+r10*2+32] + psrad m0, 11 + psrad m4, 11 + packssdw m0, m4 + paddw m4, m1, [t6+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, dst_strideq + ret +.v: + mov r10, wq +.v_loop: + mova m2, [t4+r10*2+ 0] + paddw m2, [t2+r10*2+ 0] + mova m4, [t3+r10*2+ 0] + mova m6, [t1+r10*2+ 0] + paddw m8, m6, [t6+r10*2+ 0] + paddw m6, [t5+r10*2+ 0] + mova m3, [t4+r10*2+32] + paddw m3, [t2+r10*2+32] + mova m5, [t3+r10*2+32] + mova m7, [t1+r10*2+32] + paddw m9, m7, [t6+r10*2+32] + paddw m7, [t5+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m4, m8, m6 + pmaddwd m4, m14 + punpckhwd m6, m8, m6 + pmaddwd m6, m14 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m9, m7 + pmaddwd m5, m14 + punpckhwd m7, m9, m7 + pmaddwd m7, m14 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + REPX {psrad x, 11}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq + ret + +cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + vbroadcasti128 m6, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m7, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m8, [sgr_shuf+6] + add lpfq, wq + vpbroadcastd m9, [pw_m16380] + vpbroadcastd m10, [pw_2056] + lea t1, [rsp+wq*2+16] + mova m11, [wiener_l_shuf] + vpbroadcastd m14, [fltq+16] ; __ y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + neg wq + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp+8*0] + call .hv_bottom + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq +.v1: + call .v + jmp .end +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova 
m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, m11 + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, m11 + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + mova m2, [t3+r10*2] + paddw m2, [t1+r10*2] + mova m3, [t2+r10*2] + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + paddw m4, m0, [t4+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m0, m3 + paddd m4, m2 + mova m2, [t3+r10*2+32] + paddw m2, [t1+r10*2+32] + mova m3, [t2+r10*2+32] + psrad m0, 11 + psrad m4, 11 + packssdw m0, m4 + paddw m4, m1, [t4+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, dst_strideq + ret +.v: + mov r10, wq + psrld m13, m14, 16 ; y1 __ +.v_loop: + mova m6, [t1+r10*2+ 0] + paddw m2, m6, [t3+r10*2+ 0] + mova m4, [t2+r10*2+ 0] + mova m7, [t1+r10*2+32] + paddw m3, m7, [t3+r10*2+32] + mova m5, [t2+r10*2+32] + paddw m6, [t4+r10*2+ 0] + paddw m7, [t4+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m7, m6 + pmaddwd m4, m5, m14 + punpckhwd m7, m6 + pmaddwd m6, m7, m14 + pmaddwd m5, m13 + pmaddwd m7, m13 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + REPX {psrad x, 11}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + 
packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + ret + +cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] + mov paramsq, paramsmp + mov wd, wm + mov edged, r8m + mov hd, r6m + vbroadcasti128 m8, [base+sgr_shuf+0] + add lpfq, wq + vbroadcasti128 m9, [base+sgr_shuf+8] + lea t1, [rsp+wq*2+20] + vbroadcasti128 m10, [base+sgr_shuf+2] + add dstq, wq + vbroadcasti128 m11, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m12, [paramsq+0] ; s0 + neg wq + vpbroadcastd m13, [base+pd_0xf00800a4] + pxor m6, m6 + vpbroadcastw m7, [paramsq+8] ; w0 + vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) + psllw m7, 4 + vpbroadcastd m15, [base+pd_m4096] + lea r10, [lpfq+lpf_strideq*4] + mov [rsp+8*1], lpf_strideq + add r10, lpf_strideq + mov [rsp+8*0], r10 ; below + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + add r10, lpf_strideq + mov [rsp+8*0], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, dst_strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + test hd, hd + jz .odd_height + call .h + add lpfq, dst_strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp+8*0] + call .h_top + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov [rsp+8*0], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + movd xm2, r10d + mova m0, [sgr_r_ext] + vpbroadcastb m2, xm2 + psubb m0, m2 + pminub m0, [pb_0to31] + pshufb m5, m0 + ret +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m3, m5, m8 + pmullw m4, m3, m3 + pshufb m2, m5, m9 + paddw m0, m3, m2 + shufps m3, m2, q2121 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + punpcklwd m3, m4, m6 + paddd m1, m3 + punpckhwd m4, m6 + paddd m2, m4 + pshufb m4, m5, m10 + paddw m0, m4 + pshufb m5, m11 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 + pmaddwd m4, m4 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10*2+400*0] + paddd m1, [t1+r10*2+400*2] + paddd m2, [t1+r10*2+400*4] +.h_loop_end: + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*0], m0 + 
mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add r10, 16 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-2] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10*2+400*0] + mova m1, [t1+r10*2+400*2] + mova m2, [t1+r10*2+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10*2+400*0], m0 + mova [t2+r10*2+400*2], m1 + mova [t2+r10*2+400*4], m2 + add r10, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m1, m5, m8 + pmullw m4, m1, m1 + pshufb m3, m5, m9 + paddw m0, m1, m3 + shufps m1, m3, q2121 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + punpcklwd m1, m4, m6 + paddd m2, m1 + punpckhwd m4, m6 + paddd m3, m4 + pshufb m1, m5, m10 + paddw m0, m1 + pshufb m5, m11 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + paddw m1, m0, [t1+r10*2+400*0] + paddd m2, m4 ; h sumsq + paddd m3, m5 + paddd m4, m2, [t1+r10*2+400*2] + paddd m5, m3, [t1+r10*2+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10*2+400*0] ; hv sum + paddd m4, [t2+r10*2+400*2] ; hv sumsq + paddd m5, [t2+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + mova [t0+r10*2+400*2], m2 + mova [t0+r10*2+400*4], m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 ; The neighbor calculations requires + vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. + mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but + vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way. 
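The stores above pack the gathered coefficient and the scaled b term into a single dword, as the "a | (b << 12)" comment notes, by masking the b term with pd_m4096 (~0xfff) and OR-ing the coefficient into the low 12 bits. A sketch of that layout in the comment's own notation (helper names are assumptions, not dav1d's):

    #include <stdint.h>

    /* a (the gathered coefficient, < 2^12) in the low 12 bits,
     * the scaled b in the upper 20 bits */
    static inline uint32_t pack_ab(uint32_t a, uint32_t b) {
        return (b << 12) | (a & 0xfff);
    }
    static inline uint32_t unpack_a(uint32_t ab) { return ab & 0xfff; }
    static inline uint32_t unpack_b(uint32_t ab) { return ab >> 12; }

As the comment says, a would ideally get 13 bits and b 21, so the 12+20 split trades away a little headroom.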
+ add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10*2+400*0], m1 + paddw m1, m0 + mova [t1+r10*2+400*2], m4 + paddd m4, m2 + mova [t1+r10*2+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m0, [t1+r10*2+400*0] + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddw m1, m0, [t2+r10*2+400*0] + paddd m4, m2, [t2+r10*2+400*2] + paddd m5, m3, [t2+r10*2+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 ; ab 565 + paddd m3, m1 + ; a = 4096 - (ab & 4095) = -(ab | ~4095), so by + ; using OR instead of AND for the masking we get + ; the subtraction for free (with a negated result) + por m0, m15, m2 ; -a + psrld m2, 12 ; b + por m1, m15, m3 + psrld m3, 12 + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*4+32], m1 + mova [t3+r10*4+400*8+32], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 + paddd m3, m1 + por m0, m15, m2 + psrld m2, 12 + por m1, m15, m3 + psrld m3, 12 + paddd m4, m0, [t3+r10*4+400*4+ 0] ; -a + paddd m5, m1, [t3+r10*4+400*4+32] + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*4+32], m1 + paddd m0, m2, [t3+r10*4+400*8+ 0] ; b + paddd m1, m3, [t3+r10*4+400*8+32] + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*8+32], m3 + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; -a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; a * src + b + (1 << 8) + psubd m1, m5 + psrld m0, 9 + psrld m1, 9 + packssdw m0, m1 + psllw m1, m2, 4 + psubw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; -a * src + pmaddwd m5, m3, [t3+r10*4+400*4+32] + mova m0, [t3+r10*4+400*8+ 0] ; b + mova m1, [t3+r10*4+400*8+32] + packssdw m2, m3 + psubd m0, m4 ; a 
* src + b + (1 << 7) + psubd m1, m5 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + psllw m1, m2, 4 + psubw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, dst_strideq + ret + +cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%define base r14-sgr_x_by_x_avx2-256*4 + mov paramsq, paramsmp + mov edged, r8m + mov wd, wm + mov hd, r6m + lea r14, [sgr_x_by_x_avx2+256*4] + vbroadcasti128 m8, [base+sgr_shuf+2] + add lpfq, wq + vbroadcasti128 m9, [base+sgr_shuf+4] + lea t1, [rsp+wq*2+20] + vbroadcasti128 m10, [base+sgr_shuf+6] + add dstq, wq + vpbroadcastd m11, [paramsq+ 4] ; s1 + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m12, [base+pd_0xf00801c7] + neg wq + vpbroadcastw m7, [paramsq+10] ; w1 + pxor m6, m6 + vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) + psllw m7, 4 + vpbroadcastd m14, [base+pd_m4096] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea t4, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + add t4, lpf_strideq + mov [rsp+8*0], t4 ; below + mov t0, t2 + call .hv +.main: + mov t5, t3 + add t3, 400*4 + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv + call .prep_n + dec hd + jz .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv + call .n + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp+8*0] + call .hv_bottom + call .n + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + call .n + RET +.height1: + call .v + call .prep_n + mov t2, t1 + call .v + jmp .end +.extend_bottom: + call .v + call .n + mov t2, t1 + call .v + jmp .end +.no_top: + lea t4, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea t4, [t4+lpf_strideq*2] + mov [rsp+8*0], t4 + call .h + lea t0, [t1+400*6] + mov t2, t1 + call .v + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -17 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.h_have_right: + pshufb m0, m5, m8 + pmullw m2, m0, m0 + pshufb m4, m5, m9 + paddw m0, m4 + pshufb m5, m10 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 + pmaddwd m4, m4 + punpcklwd m1, m2, m6 + punpckhwd m2, m6 + mova [t1+r10*2+400*0], m0 + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz 
.hv_have_right + cmp r10d, -17 + jl .hv_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv_have_right: + pshufb m0, m5, m8 + pmullw m3, m0, m0 + pshufb m1, m5, m9 + paddw m0, m1 + pshufb m5, m10 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + paddw m1, m0, [t2+r10*2+400*0] + paddw m1, [t1+r10*2+400*0] ; hv sum + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + paddd m4, m2 ; h sumsq + paddd m5, m3 + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddd m2, [t1+r10*2+400*2] ; hv sumsq + paddd m3, [t1+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + mova [t0+r10*2+400*2], m4 + pslld m4, m2, 3 + mova [t0+r10*2+400*4], m5 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m1, [t1+r10*2+400*0] + paddw m1, m1 + paddw m1, [t2+r10*2+400*0] ; hv sum + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddd m2, m2 + paddd m3, m3 + paddd m2, [t2+r10*2+400*2] ; hv sumsq + paddd m3, [t2+r10*2+400*4] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq + mov t4, t3 + add t3, 400*4 +.prep_n_loop: + mova m2, [t5+r10*4+0] + mova m3, [t4+r10*4+0] + paddd m2, [t5+r10*4+8] + paddd m3, [t4+r10*4+8] + paddd m0, m2, [t5+r10*4+4] + paddd m1, m3, [t4+r10*4+4] + pslld m0, 2 + paddd m1, m1 ; ab[ 0] 222 + psubd m0, m2 ; ab[-1] 343 + mova [t3+r10*4+400*4], m1 + paddd m1, m1 + mova [t5+r10*4], m0 + psubd m1, m3 ; ab[ 0] 343 + mova [t4+r10*4], m1 + add r10, 8 + jl .prep_n_loop + ret +; a+b are packed together in a single dword, but we can't do the +; full neighbor calculations before splitting them since we don't +; have sufficient precision. The solution is to do the calculations +; in two equal halves and split a and b before doing the final sum. 
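The .prep_n comment further up relies on the two's-complement identity 4096 - (ab & 4095) = -(ab | ~4095), which is what lets a single por with pd_m4096 deliver the negated a term for free. A short check of the identity over every 12-bit value (assuming the usual 32-bit two's-complement int32_t):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        for (uint32_t lo = 0; lo < 4096; lo++) {
            /* high bits are forced to 1 by the OR and cleared by the AND,
             * so any value works here */
            const uint32_t ab = 0xabcd0000u | lo;
            const int32_t via_and = 4096 - (int32_t)(ab & 4095);
            const int32_t via_or  = -(int32_t)(ab | ~4095u);
            if (via_and != via_or) {
                printf("mismatch at %u\n", lo);
                return 1;
            }
        }
        puts("4096 - (ab & 4095) == -(ab | ~4095) for every 12-bit value");
        return 0;
    }

The comment directly above is the other side of the same trade-off: the packed ab dwords are summed as they are, and a is only separated from b (the por / psrld-by-12 pair in .n below) right before the final multiply-add.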
+ALIGN function_align +.n: ; neighbor + output + mov r10, wq +.n_loop: + mova m4, [t3+r10*4+ 0] + paddd m4, [t3+r10*4+ 8] + paddd m5, m4, [t3+r10*4+ 4] + paddd m5, m5 ; ab[+1] 222 + mova m2, [t3+r10*4+400*4+ 0] + paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 + mova m3, [t3+r10*4+400*4+32] + paddd m1, m3, [t5+r10*4+32] + mova [t3+r10*4+400*4+ 0], m5 + paddd m5, m5 + psubd m5, m4 ; ab[+1] 343 + mova [t5+r10*4+ 0], m5 + paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 + mova m4, [t3+r10*4+32] + paddd m4, [t3+r10*4+40] + paddd m5, m4, [t3+r10*4+36] + paddd m5, m5 + mova [t3+r10*4+400*4+32], m5 + paddd m5, m5 + psubd m5, m4 + mova [t5+r10*4+32], m5 + por m4, m14, m0 + psrld m0, 12 + paddd m3, m5 + por m5, m14, m2 + psrld m2, 12 + paddd m4, m5 ; -a + por m5, m14, m1 + psrld m1, 12 + paddd m0, m2 ; b + (1 << 8) + por m2, m14, m3 + psrld m3, 12 + paddd m5, m2 + pmovzxbd m2, [dstq+r10+0] + paddd m1, m3 + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; -a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; a * src + b + (1 << 8) + psubd m1, m5 + psrld m0, 9 + psrld m1, 9 + packssdw m0, m1 + psllw m1, m2, 4 + psubw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n_loop + mov r10, t5 + mov t5, t4 + mov t4, r10 + add dstq, dst_strideq + ret + +cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] + mov paramsq, paramsmp + mov wd, wm + mov edged, r8m + mov hd, r6m + vbroadcasti128 m9, [base+sgr_shuf+0] + add lpfq, wq + vbroadcasti128 m10, [base+sgr_shuf+8] + lea t1, [rsp+wq*2+12] + vbroadcasti128 m11, [base+sgr_shuf+2] + add dstq, wq + vbroadcasti128 m12, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+400*24+8] + vpbroadcastd m15, [paramsq+8] ; w0 w1 + neg wq + vpbroadcastd m13, [paramsq+0] ; s0 + pxor m7, m7 + vpbroadcastd m14, [paramsq+4] ; s1 + psllw m15, 2 ; to reuse existing pd_m4096 register for rounding + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+400*12] + lea r10, [wq-2] +.top_fixup_loop: + mova m0, [t1+r10*2+400* 0] + mova m1, [t1+r10*2+400* 2] + mova m2, [t1+r10*2+400* 4] + paddw m0, m0 + mova m3, [t1+r10*2+400* 6] + paddd m1, m1 + mova m4, [t1+r10*2+400* 8] + paddd m2, m2 + mova m5, [t1+r10*2+400*10] + mova [t2+r10*2+400* 0], m0 + mova [t2+r10*2+400* 2], m1 + mova [t2+r10*2+400* 4], m2 + mova [t2+r10*2+400* 6], m3 + 
mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + add r10, 16 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsums + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.h_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m0, m6, m4, q2121 + pmullw m3, m0, m0 + pshufb m2, m5, m11 + paddw m0, m2 + pshufb m5, m12 + paddw m0, m5 ; sum3 + punpcklwd m1, m2, m5 + pmaddwd m1, m1 + punpckhwd m2, m5 + pmaddwd m2, m2 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m3, m7 + paddd m1, m4 ; sumsq3 + punpckhwd m3, m7 + paddd m2, m3 + mova [t1+r10*2+400* 6], m0 + mova [t1+r10*2+400* 8], m1 + mova [t1+r10*2+400*10], m2 + paddw m8, m0 ; sum5 + paddd m5, m1 ; sumsq5 + paddd m6, m2 + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv0_main +.hv0_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu xm5, [lpfq+r10-2] +.hv0_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -18 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv0_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m1, m6, m4, q2121 + pmullw m0, m1, m1 + pshufb m3, m5, m11 + paddw m1, m3 + pshufb m5, m12 + paddw m1, m5 ; sum3 + punpcklwd m2, m3, m5 + pmaddwd m2, m2 + punpckhwd m3, m5 + pmaddwd m3, m3 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m0, m7 + paddd m2, m4 ; sumsq3 + punpckhwd m0, m7 + paddd m3, m0 + paddw m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row + mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*4+400*0+40], m6 + paddw m8, [t1+r10*2+400* 0] + paddd m5, [t1+r10*2+400* 2] + paddd m6, [t1+r10*2+400* 4] + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + paddw m0, m1, [t1+r10*2+400* 6] + paddd m4, m2, [t1+r10*2+400* 8] + paddd m5, m3, [t1+r10*2+400*10] + mova [t1+r10*2+400* 6], m1 + mova [t1+r10*2+400* 8], m2 + mova [t1+r10*2+400*10], m3 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld 
m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m5, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m5 + pand m1, m5 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv1_main +.hv1_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu xm5, [lpfq+r10-2] +.hv1_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -18 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv1_have_right: + pshufb m6, m5, m9 + pshufb m3, m5, m10 + paddw m8, m6, m3 + shufps m2, m6, m3, q2121 + pmullw m1, m2, m2 + pshufb m0, m5, m11 + paddw m2, m0 + pshufb m5, m12 + paddw m2, m5 ; sum3 + punpcklwd m4, m5, m0 + pmaddwd m4, m4 + punpckhwd m5, m0 + pmaddwd m5, m5 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + punpcklwd m3, m1, m7 + paddd m4, m3 ; sumsq3 + punpckhwd m1, m7 + paddd m5, m1 + paddw m1, m2, [t2+r10*2+400* 6] + mova [t2+r10*2+400* 6], m2 + paddw m8, m2 ; sum5 + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + paddd m4, m0 ; sumsq5 + paddd m5, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m6, m2, 3 + pslld m7, m3, 3 + paddd m6, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m7, m3 + pmaddwd m3, m1, m1 + psubd m6, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m7, m3 + pmulld m6, m14 ; p3 * s1 + pmulld m7, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m6, m2 + paddusw m7, m2 + psrad m3, m6, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m6 + psrad m6, m7, 20 + vpgatherdd m3, [r12+m6*4], m7 + vpbroadcastd m6, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m7, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m7 + pand m7, m1 + por m0, m2 ; a3 | (b3 << 12) + por m7, m3 + paddw m1, m8, [t2+r10*2+400*0] + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] + mova [t2+r10*2+400*0], m8 + mova [t2+r10*2+400*2], m4 + mova [t2+r10*2+400*4], m5 + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm7 + vextracti128 [t3+r10*4+400*8+56], m7, 1 + vpbroadcastd m4, [base+pd_25] + pxor m7, m7 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + 
paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r12+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-2] + vpbroadcastd m6, [base+pd_34816] + vpbroadcastd m8, [base+pd_m4096] +.v0_loop: + mova m0, [t1+r10*2+400* 6] + mova m4, [t1+r10*2+400* 8] + mova m5, [t1+r10*2+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m8 + pand m1, m8 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova m2, [t1+r10*2+400*0] + mova m3, [t1+r10*2+400*2] + mova m4, [t1+r10*2+400*4] + mova [t3+r10*4+400*8+ 8], m2 + mova [t3+r10*4+400*0+ 8], m3 + mova [t3+r10*4+400*0+40], m4 + paddw m2, m2 ; cc5 + paddd m3, m3 + paddd m4, m4 + mova [t1+r10*2+400*0], m2 + mova [t1+r10*2+400*2], m3 + mova [t1+r10*2+400*4], m4 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-2] +.v1_loop: + mova m4, [t1+r10*2+400* 6] + mova m5, [t1+r10*2+400* 8] + mova m6, [t1+r10*2+400*10] + paddw m1, m4, [t2+r10*2+400* 6] + paddd m2, m5, [t2+r10*2+400* 8] + paddd m3, m6, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m4 + mova [t2+r10*2+400* 8], m5 + mova [t2+r10*2+400*10], m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m8, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m8 + pand m8, m1 + por m0, m2 ; a3 | (b3 << 12) + por m8, m3 + mova m4, [t3+r10*4+400*8+ 8] + mova m5, [t3+r10*4+400*0+ 8] + mova m6, [t3+r10*4+400*0+40] + paddw m1, m4, [t2+r10*2+400*0] + paddd m2, m5, [t2+r10*2+400*2] + paddd m3, m6, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, 
[t1+r10*2+400*4] + mova [t2+r10*2+400*0], m4 + mova [t2+r10*2+400*2], m5 + mova [t2+r10*2+400*4], m6 + vpbroadcastd m4, [base+pd_25] + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm8 + vextracti128 [t3+r10*4+400*8+56], m8, 1 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r12+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + vpbroadcastd m6, [base+pd_34816] + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+400*0+4] + paddd m1, m0, [t3+r10*4+400*0+0] + mova m4, [t3+r10*4+400*4+0] + paddd m1, [t3+r10*4+400*0+8] + mova m5, [t3+r10*4+400*8+0] + paddd m4, [t3+r10*4+400*4+8] + paddd m5, [t3+r10*4+400*8+8] + paddd m2, m4, [t3+r10*4+400*4+4] + paddd m3, m5, [t3+r10*4+400*8+4] + paddd m0, m1 + pslld m1, 2 + pslld m2, 2 + paddd m1, m0 ; ab5 565 + paddd m3, m3 ; ab3[ 0] 222 + psubd m2, m4 ; ab3[-1] 343 + mova [t3+r10*4+400*20], m3 + por m0, m6, m1 ; a5 565 + mova [t3+r10*4+400*24], m2 + psrld m1, 12 ; b5 565 + mova [t3+r10*4+400*12], m0 + paddd m3, m3 + mova [t3+r10*4+400*16], m1 + psubd m3, m5 ; ab3[ 0] 343 + mova [t3+r10*4+400*28], m3 + add r10, 8 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+4] + paddd m4, m0, [t3+r10*4+0] + paddd m4, [t3+r10*4+8] + paddd m0, m4 + pslld m4, 2 + paddd m4, m0 + por m0, m6, m4 + psrld m4, 12 + paddd m2, m0, [t3+r10*4+400*12] ; -a5 + mova [t3+r10*4+400*12], m0 + paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) + mova [t3+r10*4+400*16], m4 + mova m3, [t3+r10*4+400*4+0] + paddd m3, [t3+r10*4+400*4+8] + paddd m5, m3, [t3+r10*4+400*4+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*24], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + por m3, m6, m1 + psrld m1, 12 + por m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; -a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4 ; -a5 * src + pmaddwd m3, m4 ; -a3 * src + pslld m4, 13 + psubd m0, m4 + psubd m1, m4 + psubd m0, m2 ; a5 * src + b5 + (1 << 8) + psubd m1, m3 ; a3 * src + b3 + (1 << 8) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m4, m6 + paddd m0, m4 + psrad m0, 13 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t3+r10*4+400*8+0] + paddd m3, [t3+r10*4+400*8+8] + paddd m5, m3, [t3+r10*4+400*8+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, 
[t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*28], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + por m3, m6, m1 + psrld m1, 12 + por m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; -a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src + mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) + pmaddwd m3, m4 ; -a3 * src + pslld m4, 12 + psubd m0, m4 + paddd m4, m4 + psubd m1, m4 + psubd m0, m2 ; a5 * src + b5 + (1 << 7) + psubd m1, m3 ; a3 * src + b3 + (1 << 8) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m4, m6 + paddd m0, m4 + psrad m0, 13 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n1_loop + add dstq, dst_strideq + ret +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/looprestoration_init_tmpl.c dav1d-0.9.1/src/x86/looprestoration_init_tmpl.c --- dav1d-0.7.1/src/x86/looprestoration_init_tmpl.c 2020-06-21 11:48:55.028126500 +0000 +++ dav1d-0.9.1/src/x86/looprestoration_init_tmpl.c 2021-07-28 21:38:28.909852300 +0000 @@ -29,205 +29,198 @@ #include "src/looprestoration.h" #include "common/intops.h" -#include "src/tables.h" -// Future potential optimizations: -// - special chroma versions which don't filter [0]/[6]; -// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top -// to bottom) instead of scanline-ordered should be faster since then the -// if (have_left) and similar conditions run only once instead of per line; -// - filter_v_avx2 currently runs 16 pixels per iteration, it should be possible -// to run 32 (like filter_h_avx2), and then all vpermqs can go; -// - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2, -// since then the have_left condition can be inlined; -// - consider having the wrapper (wiener_filter_${ext}) also in hand-written -// assembly, so the setup overhead is minimized. 
- -#define WIENER_FILTER(ext) \ -\ -void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \ - const pixel *src, ptrdiff_t stride, \ - const int16_t fh[7], const intptr_t w, \ - int h, enum LrEdgeFlags edges); \ -void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \ - const int16_t *mid, int w, int h, \ - const int16_t fv[7], enum LrEdgeFlags edges); \ -\ -static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ - const pixel (*const left)[4], \ - const pixel *lpf, const ptrdiff_t lpf_stride, \ - const int w, const int h, const int16_t fh[7], \ - const int16_t fv[7], const enum LrEdgeFlags edges) \ -{ \ - ALIGN_STK_32(int16_t, mid, 68 * 384,); \ -\ - /* horizontal filter */ \ - dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \ - fh, w, h, edges); \ - if (edges & LR_HAVE_TOP) \ - dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \ - fh, w, 2, edges); \ - if (edges & LR_HAVE_BOTTOM) \ - dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \ - lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, \ - fh, w, 2, edges); \ -\ - dav1d_wiener_filter_v_##ext(dst, dst_stride, &mid[2*384], w, h, fv, edges); \ -} - -#define SGR_FILTER(ext) \ -\ -void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \ - const pixel (*left)[4], \ - const pixel *src, const ptrdiff_t stride, \ - const int w, const int h, \ - const enum LrEdgeFlags edges); \ -void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \ - const int w, const int h, \ - const enum LrEdgeFlags edges); \ -void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \ - const int w, const int h, const int strength); \ -void dav1d_sgr_finish_filter1_##ext(coef *tmp, \ - const pixel *src, const ptrdiff_t stride, \ - const int32_t *a, const int16_t *b, \ - const int w, const int h); \ +#define decl_wiener_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ +decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) + +#define decl_sgr_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext)) + +/* FIXME: Replace with a port of the AVX2 code */ +#define SGR_FILTER_OLD(ext) \ +void BF(dav1d_sgr_box3_h, ext)(int32_t *sumsq, int16_t *sum, \ + const pixel (*left)[4], \ + const pixel *src, const ptrdiff_t stride, \ + const int w, const int h, \ + const enum LrEdgeFlags edges); \ +void BF(dav1d_sgr_box3_v, ext)(int32_t *sumsq, int16_t *sum, \ + const int w, const int h, \ + const enum LrEdgeFlags edges); \ +void BF(dav1d_sgr_calc_ab1, ext)(int32_t *a, int16_t *b, \ + const int w, const int h, const unsigned s); \ +void BF(dav1d_sgr_finish_filter1, ext)(int16_t *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const int32_t *a, const int16_t *b, \ + const int w, const int h); \ \ /* filter with a 3x3 box (radius=1) */ \ -static void dav1d_sgr_filter1_##ext(coef *tmp, \ - const pixel *src, const ptrdiff_t stride, \ - const pixel (*left)[4], \ - const pixel *lpf, const ptrdiff_t lpf_stride, \ - const int w, const int h, const int strength, \ - const enum LrEdgeFlags edges) \ +static void BF(dav1d_sgr_filter1, ext)(int16_t *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const pixel (*left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, const int strength, \ + const enum LrEdgeFlags edges) \ { \ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ 
ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ \ - dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ + BF(dav1d_sgr_box3_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ if (edges & LR_HAVE_TOP) \ - dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ - NULL, lpf, lpf_stride, w, 2, edges); \ + BF(dav1d_sgr_box3_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ + NULL, lpf, lpf_stride, w, 2, edges); \ \ if (edges & LR_HAVE_BOTTOM) \ - dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ - NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ - lpf_stride, w, 2, edges); \ -\ - dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \ - dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \ - dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \ + BF(dav1d_sgr_box3_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ + NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ + lpf_stride, w, 2, edges); \ +\ + BF(dav1d_sgr_box3_v, ext)(sumsq, sum, w, h, edges); \ + BF(dav1d_sgr_calc_ab1, ext)(a, b, w, h, strength); \ + BF(dav1d_sgr_finish_filter1, ext)(tmp, src, stride, a, b, w, h); \ } \ \ -void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \ - const pixel (*left)[4], \ - const pixel *src, const ptrdiff_t stride, \ - const int w, const int h, \ - const enum LrEdgeFlags edges); \ -void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \ - const int w, const int h, \ - const enum LrEdgeFlags edges); \ -void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \ - const int w, const int h, const int strength); \ -void dav1d_sgr_finish_filter2_##ext(coef *tmp, \ - const pixel *src, const ptrdiff_t stride, \ - const int32_t *a, const int16_t *b, \ - const int w, const int h); \ +void BF(dav1d_sgr_box5_h, ext)(int32_t *sumsq, int16_t *sum, \ + const pixel (*left)[4], \ + const pixel *src, const ptrdiff_t stride, \ + const int w, const int h, \ + const enum LrEdgeFlags edges); \ +void BF(dav1d_sgr_box5_v, ext)(int32_t *sumsq, int16_t *sum, \ + const int w, const int h, \ + const enum LrEdgeFlags edges); \ +void BF(dav1d_sgr_calc_ab2, ext)(int32_t *a, int16_t *b, \ + const int w, const int h, const int strength); \ +void BF(dav1d_sgr_finish_filter2, ext)(int16_t *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const int32_t *a, const int16_t *b, \ + const int w, const int h); \ \ /* filter with a 5x5 box (radius=2) */ \ -static void dav1d_sgr_filter2_##ext(coef *tmp, \ - const pixel *src, const ptrdiff_t stride, \ - const pixel (*left)[4], \ - const pixel *lpf, const ptrdiff_t lpf_stride, \ - const int w, const int h, const int strength, \ - const enum LrEdgeFlags edges) \ +static void BF(dav1d_sgr_filter2, ext)(int16_t *tmp, \ + const pixel *src, const ptrdiff_t stride, \ + const pixel (*left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, const int strength, \ + const enum LrEdgeFlags edges) \ { \ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ \ - dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ + BF(dav1d_sgr_box5_h, ext)(sumsq, sum, left, src, stride, w, h, edges); \ if (edges & LR_HAVE_TOP) \ - dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ - NULL, lpf, lpf_stride, w, 2, 
edges); \ + BF(dav1d_sgr_box5_h, ext)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ + NULL, lpf, lpf_stride, w, 2, edges); \ \ if (edges & LR_HAVE_BOTTOM) \ - dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ - NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ - lpf_stride, w, 2, edges); \ -\ - dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \ - dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \ - dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \ + BF(dav1d_sgr_box5_h, ext)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ + NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ + lpf_stride, w, 2, edges); \ +\ + BF(dav1d_sgr_box5_v, ext)(sumsq, sum, w, h, edges); \ + BF(dav1d_sgr_calc_ab2, ext)(a, b, w, h, strength); \ + BF(dav1d_sgr_finish_filter2, ext)(tmp, src, stride, a, b, w, h); \ } \ \ -void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ - const coef *t1, const int w, const int h, \ - const int wt); \ -void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ - const coef *t1, const coef *t2, \ - const int w, const int h, \ - const uint32_t wt); \ +void BF(dav1d_sgr_weighted1, ext)(pixel *dst, const ptrdiff_t stride, \ + const int16_t *t1, const int w, const int h, \ + const int wt); \ +void BF(dav1d_sgr_weighted2, ext)(pixel *dst, const ptrdiff_t stride, \ + const int16_t *t1, const int16_t *t2, \ + const int w, const int h, \ + const uint32_t wt); \ \ -static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ - const pixel (*const left)[4], \ - const pixel *lpf, const ptrdiff_t lpf_stride, \ - const int w, const int h, const int sgr_idx, \ - const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \ +static void BF(sgr_filter_5x5, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ + const pixel (*const left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, \ + const LooprestorationParams *const params, \ + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \ { \ - if (!dav1d_sgr_params[sgr_idx][0]) { \ - ALIGN_STK_32(coef, tmp, 64 * 384,); \ - dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ - w, h, dav1d_sgr_params[sgr_idx][3], edges); \ - dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \ - } else if (!dav1d_sgr_params[sgr_idx][1]) { \ - ALIGN_STK_32(coef, tmp, 64 * 384,); \ - dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ - w, h, dav1d_sgr_params[sgr_idx][2], edges); \ - dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \ - } else { \ - ALIGN_STK_32(coef, tmp1, 64 * 384,); \ - ALIGN_STK_32(coef, tmp2, 64 * 384,); \ - dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ - w, h, dav1d_sgr_params[sgr_idx][2], edges); \ - dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ - w, h, dav1d_sgr_params[sgr_idx][3], edges); \ - const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \ - dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ - } \ + ALIGN_STK_32(int16_t, tmp, 64 * 384,); \ + BF(dav1d_sgr_filter2, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, params->sgr.s0, edges); \ + BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w0); \ +} \ +static void BF(sgr_filter_3x3, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ + const pixel (*const left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, \ + const LooprestorationParams *const params, \ + const enum 
LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \ +{ \ + ALIGN_STK_32(int16_t, tmp, 64 * 384,); \ + BF(dav1d_sgr_filter1, ext)(tmp, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, params->sgr.s1, edges); \ + BF(dav1d_sgr_weighted1, ext)(dst, dst_stride, tmp, w, h, params->sgr.w1); \ +} \ +static void BF(sgr_filter_mix, ext)(pixel *const dst, const ptrdiff_t dst_stride, \ + const pixel (*const left)[4], \ + const pixel *lpf, const ptrdiff_t lpf_stride, \ + const int w, const int h, \ + const LooprestorationParams *const params, \ + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) \ +{ \ + ALIGN_STK_32(int16_t, tmp1, 64 * 384,); \ + ALIGN_STK_32(int16_t, tmp2, 64 * 384,); \ + BF(dav1d_sgr_filter2, ext)(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, params->sgr.s0, edges); \ + BF(dav1d_sgr_filter1, ext)(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ + w, h, params->sgr.s1, edges); \ + const uint32_t wt = (params->sgr.w1 << 16) | (uint16_t) params->sgr.w0; \ + BF(dav1d_sgr_weighted2, ext)(dst, dst_stride, tmp1, tmp2, w, h, wt); \ } -#define DEF_LR_FILTERS(ext) \ -WIENER_FILTER(ext) \ -SGR_FILTER(ext) +decl_wiener_filter_fns(sse2); +decl_wiener_filter_fns(ssse3); +decl_wiener_filter_fns(avx2); +decl_sgr_filter_fns(ssse3); +decl_sgr_filter_fns(avx2); #if BITDEPTH == 8 -WIENER_FILTER(sse2) -DEF_LR_FILTERS(ssse3) -# if ARCH_X86_64 -DEF_LR_FILTERS(avx2) -# endif +SGR_FILTER_OLD(ssse3) #endif -COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { +COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c, + const int bpc) +{ const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; #if BITDEPTH == 8 - c->wiener = wiener_filter_sse2; + c->wiener[0] = BF(dav1d_wiener_filter7, sse2); + c->wiener[1] = BF(dav1d_wiener_filter5, sse2); #endif if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); + c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); #if BITDEPTH == 8 - c->wiener = wiener_filter_ssse3; - c->selfguided = sgr_filter_ssse3; + c->sgr[0] = BF(sgr_filter_5x5, ssse3); + c->sgr[1] = BF(sgr_filter_3x3, ssse3); + c->sgr[2] = BF(sgr_filter_mix, ssse3); +#else + if (bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3); + c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3); + } #endif +#if ARCH_X86_64 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if BITDEPTH == 8 && ARCH_X86_64 - c->wiener = wiener_filter_avx2; - c->selfguided = sgr_filter_avx2; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx2); + c->wiener[1] = BF(dav1d_wiener_filter5, avx2); + if (bpc <= 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); + } #endif } diff -Nru dav1d-0.7.1/src/x86/looprestoration_sse.asm dav1d-0.9.1/src/x86/looprestoration_sse.asm --- dav1d-0.7.1/src/x86/looprestoration_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/looprestoration_sse.asm 2021-07-28 21:38:28.909852300 +0000 @@ -0,0 +1,2448 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; Copyright © 2018, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4 +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1 +wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 + +pb_right_ext_mask: times 24 db 0xff + times 8 db 0 +pb_0: times 16 db 0 +pb_3: times 16 db 3 +pb_15: times 16 db 15 +pb_0_1: times 8 db 0, 1 +pb_14_15: times 8 db 14, 15 +pw_1: times 8 dw 1 +pw_16: times 8 dw 16 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 +pw_2056: times 8 dw 2056 +pw_m16380: times 8 dw -16380 +pw_5_6: times 4 dw 5, 6 +pd_1024: times 4 dd 1024 +%if ARCH_X86_32 +pd_512: times 4 dd 512 +pd_2048: times 4 dd 2048 +%endif +pd_0xF0080029: times 4 dd 0xF0080029 +pd_0xF00801C7: times 4 dd 0XF00801C7 + +cextern sgr_x_by_x + +SECTION .text + +%if ARCH_X86_32 + %define PIC_base_offset $$ + + %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg + %assign pic_reg_stk_off 4 + %xdefine PIC_reg %1 + %if %2 == 1 + mov [esp], %1 + %endif + LEA PIC_reg, PIC_base_offset + %if %3 == 1 + XCHG_PIC_REG + %endif + %endmacro + + %macro XCHG_PIC_REG 0 + mov [esp+pic_reg_stk_off], PIC_reg + %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8 + mov PIC_reg, [esp+pic_reg_stk_off] + %endmacro + + %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) + +%else + %macro XCHG_PIC_REG 0 + %endmacro + + %define PIC_sym(sym) (sym) +%endif + +%macro WIENER 0 +%if ARCH_X86_64 +DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers +cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h, x + %define base 0 + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + movq m14, [fltq] + add lpfq, wq + lea t1, [rsp+wq*2+16] + mova m15, [pw_2056] + add dstq, wq + movq m7, [fltq+16] + neg wq +%if cpuflag(ssse3) + pshufb m14, [wiener_init] + mova m8, [wiener_shufA] + pshufd m12, m14, q2222 ; x0 x0 + mova m9, [wiener_shufB] + pshufd m13, m14, q3333 ; x1 x2 + mova m10, [wiener_shufC] + punpcklqdq m14, m14 ; x3 + mova m11, [wiener_shufD] +%else + 
mova m10, [pw_m16380] + punpcklwd m14, m14 + pshufd m11, m14, q0000 ; x0 + pshufd m12, m14, q1111 ; x1 + pshufd m13, m14, q2222 ; x2 + pshufd m14, m14, q3333 ; x3 +%endif +%else +DECLARE_REG_TMP 4, 0, _, 5 +%if cpuflag(ssse3) + %define m10 [base+wiener_shufC] + %define m11 [base+wiener_shufD] + %define stk_off 96 +%else + %define m10 [base+pw_m16380] + %define m11 [stk+96] + %define stk_off 112 +%endif +cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + %define base r6-pb_right_ext_mask-21 + %define stk esp + %define dstq leftq + %define edgeb byte edged + %define edged [stk+ 8] + %define dstmp [stk+12] + %define hd dword [stk+16] + %define wq [stk+20] + %define dst_strideq [stk+24] + %define leftmp [stk+28] + %define t2 [stk+32] + %define t4 [stk+36] + %define t5 [stk+40] + %define t6 [stk+44] + %define m8 [base+wiener_shufA] + %define m9 [base+wiener_shufB] + %define m12 [stk+48] + %define m13 [stk+64] + %define m14 [stk+80] + %define m15 [base+pw_2056] + mov r1, r7m ; flt + mov r0, r0m ; dst + mov r5, r5m ; w + mov lpfq, lpfm + mov r2, r8m ; edge + mov r4, r6m ; h + movq m3, [r1+ 0] + movq m7, [r1+16] + add r0, r5 + mov r1, r1m ; dst_stride + add lpfq, r5 + mov edged, r2 + mov r2, r2m ; left + mov dstmp, r0 + lea t1, [rsp+r5*2+stk_off] + mov hd, r4 + neg r5 + mov lpf_strideq, lpf_stridem + LEA r6, pb_right_ext_mask+21 + mov wq, r5 + mov dst_strideq, r1 + mov leftmp, r2 +%if cpuflag(ssse3) + pshufb m3, [base+wiener_init] + pshufd m1, m3, q2222 + pshufd m2, m3, q3333 + punpcklqdq m3, m3 +%else + punpcklwd m3, m3 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m11, m0 +%endif + mova m12, m1 + mova m13, m2 + mova m14, m3 +%endif + pshufd m6, m7, q0000 ; y0 y1 + pshufd m7, m7, q1111 ; y2 y3 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea t3, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov [rsp+gprsize*1], lpf_strideq + add t3, lpf_strideq + mov [rsp+gprsize*0], t3 ; below + mov t4, t1 + add t1, 384*2 + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp+gprsize*0] + call .hv_bottom + add lpfq, [rsp+gprsize*1] + call .hv_bottom +.v1: + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + RET +.no_top: + lea t3, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov [rsp+gprsize*1], lpf_strideq + lea t3, [t3+lpf_strideq*2] + mov [rsp+gprsize*0], t3 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v +.v2: + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + jmp .v1 +.extend_right: + movd m2, [lpfq-4] +%if ARCH_X86_64 + push r0 + lea r0, [pb_right_ext_mask+21] + movu m0, [r0+xq+0] + movu m1, [r0+xq+8] + pop r0 +%else + movu m0, [r6+xq+0] + movu m1, [r6+xq+8] +%endif +%if cpuflag(ssse3) + pshufb m2, [base+pb_3] +%else + punpcklbw m2, m2 + pshuflw m2, m2, q3333 + punpcklqdq m2, m2 
+%endif + pand m4, m0 + pand m5, m1 + pandn m0, m2 + pandn m1, m2 + por m4, m0 + por m5, m1 + ret +.h: + %define stk esp+4 ; offset due to call + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .h_main +.h_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, [base+wiener_l_shuf] +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .h_main +.h_top: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+xq-4] +.h_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -18 + jl .h_have_right + call .extend_right +.h_have_right: +%macro %%h7 0 +%if cpuflag(ssse3) + pshufb m0, m4, m8 + pmaddubsw m0, m12 + pshufb m1, m5, m8 + pmaddubsw m1, m12 + pshufb m2, m4, m9 + pmaddubsw m2, m13 + pshufb m3, m5, m9 + pmaddubsw m3, m13 + paddw m0, m2 + pshufb m2, m4, m10 + pmaddubsw m2, m13 + paddw m1, m3 + pshufb m3, m5, m10 + pmaddubsw m3, m13 + pshufb m4, m11 + paddw m0, m2 + pmullw m2, m14, m4 + pshufb m5, m11 + paddw m1, m3 + pmullw m3, m14, m5 + psllw m4, 7 + psllw m5, 7 + paddw m0, m2 + mova m2, [base+pw_m16380] + paddw m1, m3 + paddw m4, m2 + paddw m5, m2 + paddsw m0, m4 + paddsw m1, m5 +%else + psrldq m0, m4, 1 + pslldq m1, m4, 1 + pxor m3, m3 + punpcklbw m0, m3 + punpckhbw m1, m3 + paddw m0, m1 + pmullw m0, m11 + psrldq m1, m4, 2 + pslldq m2, m4, 2 + punpcklbw m1, m3 + punpckhbw m2, m3 + paddw m1, m2 + pmullw m1, m12 + paddw m0, m1 + pshufd m2, m4, q0321 + punpcklbw m2, m3 + pmullw m1, m14, m2 + paddw m0, m1 + psrldq m1, m4, 3 + pslldq m4, 3 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m13 + paddw m0, m1 + psllw m2, 7 + paddw m2, m10 + paddsw m0, m2 + psrldq m1, m5, 1 + pslldq m2, m5, 1 + punpcklbw m1, m3 + punpckhbw m2, m3 + paddw m1, m2 + pmullw m1, m11 + psrldq m2, m5, 2 + pslldq m4, m5, 2 + punpcklbw m2, m3 + punpckhbw m4, m3 + paddw m2, m4 + pmullw m2, m12 + paddw m1, m2 + pshufd m4, m5, q0321 + punpcklbw m4, m3 + pmullw m2, m14, m4 + paddw m1, m2 + psrldq m2, m5, 3 + pslldq m5, 3 + punpcklbw m2, m3 + punpckhbw m5, m3 + paddw m2, m5 + pmullw m2, m13 + paddw m1, m2 + psllw m4, 7 + paddw m4, m10 + paddsw m1, m4 +%endif +%endmacro + %%h7 + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 + mova [t1+xq*2+ 0], m0 + mova [t1+xq*2+16], m1 + add xq, 16 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .hv_main +.hv_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, [base+wiener_l_shuf] +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .hv_main +.hv_bottom: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+xq-4] +.hv_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp xd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + %%h7 +%if ARCH_X86_64 + mova m2, [t4+xq*2] + paddw m2, [t2+xq*2] +%else + mov r2, t4 + mova m2, [r2+xq*2] + mov r2, t2 + paddw m2, [r2+xq*2] + mov r2, t5 +%endif + mova m3, [t3+xq*2] +%if ARCH_X86_64 + mova m5, [t5+xq*2] +%else + mova m5, [r2+xq*2] + mov r2, t6 
+%endif + paddw m5, [t1+xq*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 +%if ARCH_X86_64 + paddw m4, m0, [t6+xq*2] +%else + paddw m4, m0, [r2+xq*2] + mov r2, t4 +%endif + mova [t0+xq*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m0, m3 + mova m3, [t3+xq*2+16] + paddd m4, m2 +%if ARCH_X86_64 + mova m2, [t4+xq*2+16] + paddw m2, [t2+xq*2+16] + mova m5, [t5+xq*2+16] +%else + mova m2, [r2+xq*2+16] + mov r2, t2 + paddw m2, [r2+xq*2+16] + mov r2, t5 + mova m5, [r2+xq*2+16] + mov r2, t6 +%endif + paddw m5, [t1+xq*2+16] + psrad m0, 11 + psrad m4, 11 + packssdw m0, m4 +%if ARCH_X86_64 + paddw m4, m1, [t6+xq*2+16] +%else + paddw m4, m1, [r2+xq*2+16] + mov dstq, dstmp +%endif + mova [t0+xq*2+16], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .hv_loop + add dstq, dst_strideq +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + mov dstmp, dstq + mov r1, t5 + mov r2, t4 + mov t6, r1 + mov t5, r2 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, r1 +%endif + ret +%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code +.v: + mov xq, wq +.v_loop: +%if ARCH_X86_64 + mova m1, [t4+xq*2] + paddw m1, [t2+xq*2] +%else + mov r2, t4 + mova m1, [r2+xq*2] + mov r2, t2 + paddw m1, [r2+xq*2] + mov r2, t6 +%endif + mova m2, [t3+xq*2] + mova m4, [t1+xq*2] +%if ARCH_X86_64 + paddw m3, m4, [t6+xq*2] + paddw m4, [t5+xq*2] +%else + paddw m3, m4, [r2+xq*2] + mov r2, t5 + paddw m4, [r2+xq*2] + mov r2, t4 +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m3, m4 + pmaddwd m2, m6 + punpckhwd m3, m4 + pmaddwd m3, m6 + paddd m0, m2 + paddd m1, m3 +%if ARCH_X86_64 + mova m2, [t4+xq*2+16] + paddw m2, [t2+xq*2+16] +%else + mova m2, [r2+xq*2+16] + mov r2, t2 + paddw m2, [r2+xq*2+16] + mov r2, t6 +%endif + mova m3, [t3+xq*2+16] + mova m5, [t1+xq*2+16] +%if ARCH_X86_64 + paddw m4, m5, [t6+xq*2+16] + paddw m5, [t5+xq*2+16] +%else + paddw m4, m5, [r2+xq*2+16] + mov r2, t5 + paddw m5, [r2+xq*2+16] + movifnidn dstq, dstmp +%endif + psrad m0, 11 + psrad m1, 11 + packssdw m0, m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .v_loop + add dstq, dst_strideq +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 +%else + mov dstmp, dstq + mov r1, t5 + mov r2, t4 + mov t6, r1 + mov t5, r2 +%endif + mov t4, t3 + mov t3, t2 + mov t2, t1 + ret +%endif + +%if ARCH_X86_64 +cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h, x + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + movq m14, [fltq] + add lpfq, wq + mova m8, [pw_m16380] + lea t1, [rsp+wq*2+16] + mova m15, [pw_2056] + add dstq, wq + movq m7, [fltq+16] + neg wq +%if cpuflag(ssse3) + pshufb m14, [wiener_init] + mova m9, [wiener_shufB] + pshufd m13, m14, q3333 ; x1 x2 + mova m10, [wiener_shufC] + punpcklqdq m14, m14 ; x3 + mova m11, [wiener_shufD] + mova m12, [wiener_l_shuf] +%else + punpcklwd 
m14, m14 + pshufd m11, m14, q1111 ; x1 + pshufd m13, m14, q2222 ; x2 + pshufd m14, m14, q3333 ; x3 +%endif +%else +%if cpuflag(ssse3) + %define stk_off 80 +%else + %define m11 [stk+80] + %define stk_off 96 +%endif +cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + %define stk esp + %define leftmp [stk+28] + %define m8 [base+pw_m16380] + %define m12 [base+wiener_l_shuf] + %define m14 [stk+48] + mov r1, r7m ; flt + mov r0, r0m ; dst + mov r5, r5m ; w + mov lpfq, lpfm + mov r2, r8m ; edge + mov r4, r6m ; h + movq m2, [r1+ 0] + movq m7, [r1+16] + add r0, r5 + mov r1, r1m ; dst_stride + add lpfq, r5 + mov edged, r2 + mov r2, r2m ; left + mov dstmp, r0 + lea t1, [rsp+r5*2+stk_off] + mov hd, r4 + neg r5 + mov lpf_strideq, lpf_stridem + LEA r6, pb_right_ext_mask+21 + mov wq, r5 + mov dst_strideq, r1 + mov leftmp, r2 +%if cpuflag(ssse3) + pshufb m2, [base+wiener_init] + pshufd m1, m2, q3333 + punpcklqdq m2, m2 +%else + punpcklwd m2, m2 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m11, m0 +%endif + mova m13, m1 + mova m14, m2 +%endif + pshufd m6, m7, q0000 ; __ y1 + pshufd m7, m7, q1111 ; y2 y3 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea xq, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov t3, t1 + add t1, 384*2 + mov [rsp+gprsize*1], lpf_strideq + add xq, lpf_strideq + mov [rsp+gprsize*0], xq ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp+gprsize*0] + call .hv_bottom + add lpfq, [rsp+gprsize*1] + call .hv_bottom +.end: + RET +.no_top: + lea t3, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov [rsp+gprsize*1], lpf_strideq + lea t3, [t3+lpf_strideq*2] + mov [rsp+gprsize*0], t3 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + add dstq, dst_strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + movifnidn dstmp, dstq +.v1: + call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + jmp .end +.h: + %define stk esp+4 + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .h_main +.h_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, m12 +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .h_main +.h_top: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+xq-4] +.h_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -17 + jl .h_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right +.h_have_right: +%macro %%h5 0 +%if cpuflag(ssse3) + pshufb m0, m4, m9 + pmaddubsw m0, m13 + pshufb m1, m5, m9 + pmaddubsw m1, m13 + pshufb m2, m4, m10 + pmaddubsw m2, m13 + pshufb m3, m5, m10 + pmaddubsw m3, m13 + pshufb m4, m11 + paddw m0, m2 + pmullw m2, m14, m4 + pshufb m5, m11 + paddw m1, m3 + pmullw m3, m14, m5 + psllw m4, 7 + psllw m5, 7 + paddw m4, m8 + paddw m5, m8 + paddw m0, m2 + paddw m1, m3 + 
paddsw m0, m4 + paddsw m1, m5 +%else + psrldq m0, m4, 2 + pslldq m1, m4, 2 + pxor m3, m3 + punpcklbw m0, m3 + punpckhbw m1, m3 + paddw m0, m1 + pmullw m0, m11 + pshufd m2, m4, q0321 + punpcklbw m2, m3 + pmullw m1, m14, m2 + paddw m0, m1 + psrldq m1, m4, 3 + pslldq m4, 3 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m13 + paddw m0, m1 + psllw m2, 7 + paddw m2, m8 + paddsw m0, m2 + psrldq m1, m5, 2 + pslldq m4, m5, 2 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m11 + pshufd m4, m5, q0321 + punpcklbw m4, m3 + pmullw m2, m14, m4 + paddw m1, m2 + psrldq m2, m5, 3 + pslldq m5, 3 + punpcklbw m2, m3 + punpckhbw m5, m3 + paddw m2, m5 + pmullw m2, m13 + paddw m1, m2 + psllw m4, 7 + paddw m4, m8 + paddsw m1, m4 +%endif +%endmacro + %%h5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 + mova [t1+xq*2+ 0], m0 + mova [t1+xq*2+16], m1 + add xq, 16 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .hv_main +.hv_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, m12 +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .hv_main +.hv_bottom: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+xq-4] +.hv_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp xd, -17 + jl .hv_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right +.hv_have_right: + %%h5 + mova m2, [t3+xq*2] + paddw m2, [t1+xq*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 +%if ARCH_X86_64 + mova m3, [t2+xq*2] + paddw m4, m0, [t4+xq*2] +%else + mov r2, t2 + mova m3, [r2+xq*2] + mov r2, t4 + paddw m4, m0, [r2+xq*2] +%endif + mova [t0+xq*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m0, m3 + paddd m4, m2 + mova m2, [t3+xq*2+16] + paddw m2, [t1+xq*2+16] + psrad m0, 11 + psrad m4, 11 + packssdw m0, m4 +%if ARCH_X86_64 + mova m3, [t2+xq*2+16] + paddw m4, m1, [t4+xq*2+16] +%else + paddw m4, m1, [r2+xq*2+16] + mov r2, t2 + mova m3, [r2+xq*2+16] + mov dstq, dstmp +%endif + mova [t0+xq*2+16], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .hv_loop + add dstq, dst_strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + movifnidn dstmp, dstq + ret +%if cpuflag(ssse3) +.v: + mov xq, wq +.v_loop: + mova m3, [t1+xq*2] + paddw m1, m3, [t3+xq*2] +%if ARCH_X86_64 + mova m2, [t2+xq*2] + paddw m3, [t4+xq*2] +%else + mov r2, t2 + mova m2, [r2+xq*2] + mov r2, t4 + paddw m3, [r2+xq*2] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m3 + pmaddwd m2, m6 + punpckhwd m3, m3 + pmaddwd m3, m6 + paddd m0, m2 + paddd m1, m3 + mova m4, [t1+xq*2+16] + paddw m2, m4, [t3+xq*2+16] +%if ARCH_X86_64 + mova m3, [t2+xq*2+16] + paddw m4, [t4+xq*2+16] +%else + paddw m4, [r2+xq*2+16] + mov r2, t2 + mova m3, [r2+xq*2+16] + mov dstq, dstmp +%endif + psrad m0, 11 + psrad m1, 11 + 
packssdw m0, m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .v_loop + ret +%endif +%endmacro + +INIT_XMM sse2 +WIENER + +INIT_XMM ssse3 +WIENER + +;;;;;;;;;;;;;;;;;;;;;;;;;; +;; self-guided ;; +;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro MULLD 2 + pmulhuw m5, %1, %2 + pmullw %1, %2 + pslld m5, 16 + paddd %1, m5 +%endmacro + +%macro GATHERDD 2 + mova m5, m7 + movd r6d, %2 + %if ARCH_X86_64 + movd %1, [r5+r6] + pextrw r6d, %2, 2 + pinsrw m5, [r5+r6+2], 3 + pextrw r6d, %2, 4 + pinsrw %1, [r5+r6+2], 5 + pextrw r6d, %2, 6 + pinsrw m5, [r5+r6+2], 7 + %else + movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6] + pextrw r6d, %2, 2 + pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3 + pextrw r6d, %2, 4 + pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5 + pextrw r6d, %2, 6 + pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7 + %endif + por %1, m5 +%endmacro + +%if ARCH_X86_64 +cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + mov xlimd, edgem + movifnidn xd, xm + mov hd, hm + mov edged, xlimd + and xlimd, 2 ; have_right + add xd, xlimd + xor xlimd, 2 ; 2*!have_right +%else +cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim + %define wq r0m + %define xlimd r1m + %define hd hmp + %define edgeb byte edgem + + mov r6, edgem + and r6, 2 ; have_right + add xd, r6 + xor r6, 2 ; 2*!have_right + mov xlimd, r6 + SETUP_PIC r6, 0 +%endif + + jnz .no_right + add xd, 7 + and xd, ~7 +.no_right: + pxor m1, m1 + lea srcq, [srcq+xq] + lea sumq, [sumq+xq*2-2] + lea sumsqq, [sumsqq+xq*4-4] + neg xq + mov wq, xq +%if ARCH_X86_64 + lea r10, [pb_right_ext_mask+24] +%endif +.loop_y: + mov xq, wq + + ; load left + test edgeb, 1 ; have_left + jz .no_left + test leftq, leftq + jz .load_left_from_main + movd m0, [leftq] + pslldq m0, 12 + add leftq, 4 + jmp .expand_x +.no_left: + movd m0, [srcq+xq] + pshufb m0, [PIC_sym(pb_0)] + jmp .expand_x +.load_left_from_main: + movd m0, [srcq+xq-2] + pslldq m0, 14 +.expand_x: + punpckhbw xm0, xm1 + + ; when we reach this, m0 contains left two px in highest words + cmp xd, -8 + jle .loop_x +.partial_load_and_extend: + movd m3, [srcq-4] + pshufb m3, [PIC_sym(pb_3)] + movq m2, [srcq+xq] + punpcklbw m2, m1 + punpcklbw m3, m1 +%if ARCH_X86_64 + movu m4, [r10+xq*2] +%else + movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24] +%endif + pand m2, m4 + pandn m4, m3 + por m2, m4 + jmp .loop_x_noload +.right_extend: + pshufb m2, m0, [PIC_sym(pb_14_15)] + jmp .loop_x_noload + +.loop_x: + movq m2, [srcq+xq] + punpcklbw m2, m1 +.loop_x_noload: + palignr m3, m2, m0, 12 + palignr m4, m2, m0, 14 + + punpcklwd m5, m3, m2 + punpckhwd m6, m3, m2 + paddw m3, m4 + punpcklwd m7, m4, m1 + punpckhwd m4, m1 + pmaddwd m5, m5 + pmaddwd m6, m6 + pmaddwd m7, m7 + pmaddwd m4, m4 + paddd m5, m7 + paddd m6, m4 + paddw m3, m2 + movu [sumq+xq*2], m3 + movu [sumsqq+xq*4+ 0], m5 + movu [sumsqq+xq*4+16], m6 + + mova m0, m2 + add xq, 8 + + ; if x <= -8 we can reload more pixels + ; else if x < 0 we reload and extend (this implies have_right=0) + ; else if x < xlimd we extend from previous load (this implies have_right=0) + ; else we are done + + cmp xd, -8 + jle .loop_x + test xd, xd + jl .partial_load_and_extend + cmp xd, xlimd + jl .right_extend + + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + add srcq, strideq + dec hd + jg .loop_y + RET + +%if 
ARCH_X86_64 +cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim + movifnidn edged, edgem +%else +cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y + %define sumsq_baseq dword [esp+0] + %define sum_baseq dword [esp+4] + %define ylimd dword [esp+8] + %define m8 [esp+12] + mov edged, r4m + mov hd, r3m +%endif + mov xq, -2 +%if ARCH_X86_64 + mov ylimd, edged + and ylimd, 8 ; have_bottom + shr ylimd, 2 + sub ylimd, 2 ; -2 if have_bottom=0, else 0 + mov sumsq_baseq, sumsqq + mov sum_baseq, sumq +.loop_x: + mov sumsqq, sumsq_baseq + mov sumq, sum_baseq + lea yd, [hq+ylimq+2] +%else + mov yd, edged + and yd, 8 ; have_bottom + shr yd, 2 + sub yd, 2 ; -2 if have_bottom=0, else 0 + mov sumsq_baseq, sumsqq + mov sum_baseq, sumq + mov ylimd, yd +.loop_x: + mov sumsqd, sumsq_baseq + mov sumd, sum_baseq + lea yd, [hq+2] + add yd, ylimd +%endif + lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] + lea sumq, [sumq+xq*2+2-(384+16)*2] + test edgeb, 4 ; have_top + jnz .load_top + movu m0, [sumsqq+(384+16)*4*1] + movu m1, [sumsqq+(384+16)*4*1+16] + mova m2, m0 + mova m3, m1 + mova m4, m0 + mova m5, m1 + movu m6, [sumq+(384+16)*2*1] + mova m7, m6 + mova m8, m6 + jmp .loop_y_noload +.load_top: + movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left] + movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right] + movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left] + movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right] + movu m6, [sumq-(384+16)*2*1] ; l2 + movu m7, [sumq-(384+16)*2*0] ; l1 +.loop_y: +%if ARCH_X86_64 + movu m8, [sumq+(384+16)*2*1] ; l0 +%else + movu m4, [sumq+(384+16)*2*1] ; l0 + mova m8, m4 +%endif + movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left] + movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right] +.loop_y_noload: + paddd m0, m2 + paddd m1, m3 + paddw m6, m7 + paddd m0, m4 + paddd m1, m5 + paddw m6, m8 + movu [sumsqq+ 0], m0 + movu [sumsqq+16], m1 + movu [sumq], m6 + + ; shift position down by one + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + mova m6, m7 + mova m7, m8 + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + dec yd + jg .loop_y + cmp yd, ylimd + jg .loop_y_noload + add xd, 8 + cmp xd, wd + jl .loop_x + RET + +cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +%if ARCH_X86_64 + LEA r5, sgr_x_by_x-0xF03 +%else + SETUP_PIC r5, 0 +%endif + movd m6, sd + pshuflw m6, m6, q0000 + punpcklqdq m6, m6 + pxor m7, m7 + DEFINE_ARGS a, b, w, h, x +%if ARCH_X86_64 + mova m8, [pd_0xF00801C7] + mova m9, [pw_256] + psrld m10, m9, 13 ; pd_2048 + mova m11, [pb_unpcklwdw] +%else + %define m8 [PIC_sym(pd_0xF00801C7)] + %define m9 [PIC_sym(pw_256)] + %define m10 [PIC_sym(pd_2048)] + %define m11 [PIC_sym(pb_unpcklwdw)] +%endif +.loop_y: + mov xq, -2 +.loop_x: + movq m0, [bq+xq*2] + movq m1, [bq+xq*2+(384+16)*2] + punpcklwd m0, m7 + punpcklwd m1, m7 + movu m2, [aq+xq*4] + movu m3, [aq+xq*4+(384+16)*4] + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m2, m4 ; aa * 9 + paddd m3, m5 + pmaddwd m4, m0, m0 + pmaddwd m5, m1, m1 + pmaddwd m0, m8 + pmaddwd m1, m8 + psubd m2, m4 ; p = aa * 9 - bb * bb + psubd m3, m5 + MULLD m2, m6 + MULLD m3, m6 + paddusw m2, m8 + paddusw m3, m8 + psrld m2, 20 ; z + psrld m3, 20 + GATHERDD m4, m2 ; xx + GATHERDD m2, m3 + psrld m4, 24 + psrld m2, 24 + packssdw m3, m4, m2 + pshufb m4, m11 + MULLD m0, m4 + pshufb m2, m11 + MULLD m1, m2 + psubw m5, m9, m3 + paddd m0, m10 + paddd m1, m10 + psrld m0, 12 + psrld m1, 12 + movq [bq+xq*2], m5 + psrldq m5, 8 + movq [bq+xq*2+(384+16)*2], m5 + movu 
[aq+xq*4], m0 + movu [aq+xq*4+(384+16)*4], m1 + add xd, 4 + cmp xd, wd + jl .loop_x + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + sub hd, 2 + jg .loop_y + RET + +%if ARCH_X86_64 +cglobal sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ + tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm + mova m15, [pw_16] + mov tmp_baseq, tq + mov src_baseq, srcq + mov a_baseq, aq + mov b_baseq, bq + xor xd, xd +%else +cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y + %define tmp_baseq [esp+8] + %define src_baseq [esp+12] + %define a_baseq [esp+16] + %define b_baseq [esp+20] + %define wd [esp+24] + %define hd [esp+28] + mov tmp_baseq, tq + mov src_baseq, srcq + mov a_baseq, aq + mov b_baseq, bq + mov wd, xd + mov hd, yd + xor xd, xd + SETUP_PIC yd, 1, 1 + jmp .loop_start +%endif + +.loop_x: + mov tq, tmp_baseq + mov srcq, src_baseq + mov aq, a_baseq + mov bq, b_baseq +%if ARCH_X86_32 +.loop_start: + movu m0, [bq+xq*2-(384+16)*2-2] + movu m2, [bq+xq*2-(384+16)*2+2] + mova m1, [bq+xq*2-(384+16)*2] ; b:top + paddw m0, m2 ; b:tl+tr + movu m2, [bq+xq*2-2] + movu m3, [bq+xq*2+2] + paddw m1, [bq+xq*2] ; b:top+ctr + paddw m2, m3 ; b:l+r + mova [esp+0x80], m0 + mova [esp+0x70], m1 + mova [esp+0x60], m2 +%endif + movu m0, [aq+xq*4-(384+16)*4-4] + movu m2, [aq+xq*4-(384+16)*4+4] + mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] + paddd m0, m2 ; a:tl+tr [first half] + movu m2, [aq+xq*4-(384+16)*4-4+16] + movu m4, [aq+xq*4-(384+16)*4+4+16] + mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half] + paddd m2, m4 ; a:tl+tr [second half] + movu m4, [aq+xq*4-4] + movu m5, [aq+xq*4+4] + paddd m1, [aq+xq*4] ; a:top+ctr [first half] + paddd m4, m5 ; a:l+r [first half] + movu m5, [aq+xq*4+16-4] + movu m6, [aq+xq*4+16+4] + paddd m3, [aq+xq*4+16] ; a:top+ctr [second half] + paddd m5, m6 ; a:l+r [second half] +%if ARCH_X86_64 + movu m6, [bq+xq*2-(384+16)*2-2] + movu m8, [bq+xq*2-(384+16)*2+2] + mova m7, [bq+xq*2-(384+16)*2] ; b:top + paddw m6, m8 ; b:tl+tr + movu m8, [bq+xq*2-2] + movu m9, [bq+xq*2+2] + paddw m7, [bq+xq*2] ; b:top+ctr + paddw m8, m9 ; b:l+r +%endif + + lea tq, [tq+xq*2] + lea srcq, [srcq+xq*1] + lea aq, [aq+xq*4+(384+16)*4] + lea bq, [bq+xq*2+(384+16)*2] + mov yd, hd +.loop_y: +%if ARCH_X86_64 + movu m9, [bq-2] + movu m10, [bq+2] + paddw m7, [bq] ; b:top+ctr+bottom + paddw m9, m10 ; b:bl+br + paddw m10, m7, m8 ; b:top+ctr+bottom+l+r + paddw m6, m9 ; b:tl+tr+bl+br + psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom + paddw m10, m6 + psllw m10, 2 + psubw m10, m6 ; aa + pxor m14, m14 + movq m12, [srcq] + punpcklbw m12, m14 + punpcklwd m6, m10, m15 + punpckhwd m10, m15 + punpcklwd m13, m12, m15 + punpckhwd m12, m15 + pmaddwd m6, m13 ; aa*src[x]+256 [first half] + pmaddwd m10, m12 ; aa*src[x]+256 [second half] +%else + paddd m1, [aq] ; a:top+ctr+bottom [first half] + paddd m3, [aq+16] ; a:top+ctr+bottom [second half] + mova [esp+0x50], m1 + mova [esp+0x40], m3 + mova [esp+0x30], m4 + movu m6, [aq-4] + movu m7, [aq+4] + paddd m1, m4 ; a:top+ctr+bottom+l+r [first half] + paddd m3, m5 ; a:top+ctr+bottom+l+r [second half] + paddd m6, m7 ; a:bl+br [first half] + movu m7, [aq+16-4] + movu m4, [aq+16+4] + paddd m7, m4 ; a:bl+br [second half] + paddd m0, m6 ; a:tl+tr+bl+br [first half] + paddd m2, m7 ; a:tl+tr+bl+br [second half] + paddd m1, m0 + paddd m3, m2 + pslld m1, 2 + pslld m3, 2 + psubd m1, m0 ; bb [first half] + psubd m3, m2 ; bb [second half] +%endif + +%if ARCH_X86_64 + movu m11, [aq-4] + movu m12, [aq+4] + paddd m1, [aq] ; a:top+ctr+bottom [first 
half] + paddd m11, m12 ; a:bl+br [first half] + movu m12, [aq+16-4] + movu m13, [aq+16+4] + paddd m3, [aq+16] ; a:top+ctr+bottom [second half] + paddd m12, m13 ; a:bl+br [second half] + paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] + paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] + paddd m0, m11 ; a:tl+tr+bl+br [first half] + paddd m2, m12 ; a:tl+tr+bl+br [second half] + paddd m13, m0 + paddd m14, m2 + pslld m13, 2 + pslld m14, 2 + psubd m13, m0 ; bb [first half] + psubd m14, m2 ; bb [second half] + psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] + psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] +%else + mova m4, [esp+0x80] + mova [esp+0x80], m5 + mova m5, [esp+0x70] + mova [esp+0x70], m6 + mova m6, [esp+0x60] + mova [esp+0x60], m7 + mova [esp+0x20], m1 + movu m7, [bq-2] + movu m1, [bq+2] + paddw m5, [bq] ; b:top+ctr+bottom + paddw m7, m1 + paddw m1, m5, m6 ; b:top+ctr+bottom+l+r + paddw m4, m7 ; b:tl+tr+bl+br + psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom + paddw m1, m4 + psllw m1, 2 + psubw m1, m4 ; aa + movq m0, [srcq] + XCHG_PIC_REG + punpcklbw m0, [PIC_sym(pb_0)] + punpcklwd m4, m1, [PIC_sym(pw_16)] + punpckhwd m1, [PIC_sym(pw_16)] + punpcklwd m2, m0, [PIC_sym(pw_16)] + punpckhwd m0, [PIC_sym(pw_16)] + XCHG_PIC_REG + pmaddwd m4, m2 ; aa*src[x]+256 [first half] + pmaddwd m1, m0 ; aa*src[x]+256 [second half] +%endif + +%if ARCH_X86_64 + paddd m6, m13 + paddd m10, m14 + psrad m6, 9 + psrad m10, 9 + packssdw m6, m10 + mova [tq], m6 +%else + paddd m4, [esp+0x20] + paddd m1, m3 + psrad m4, 9 + psrad m1, 9 + packssdw m4, m1 + mova [tq], m4 +%endif + + ; shift to next row +%if ARCH_X86_64 + mova m0, m4 + mova m2, m5 + mova m4, m11 + mova m5, m12 + mova m6, m8 + mova m8, m9 +%else + mova m1, [esp+0x50] + mova m3, [esp+0x40] + mova m0, [esp+0x30] + mova m2, [esp+0x80] + mova m4, [esp+0x70] + mova [esp+0x70], m5 + mova m5, [esp+0x60] + mova [esp+0x80], m6 + mova [esp+0x60], m7 + psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] + psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] +%endif + + add srcq, strideq + add aq, (384+16)*4 + add bq, (384+16)*2 + add tq, 384*2 + dec yd + jg .loop_y + add xd, 8 + cmp xd, wd + jl .loop_x + RET + +cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt + movifnidn hd, hm +%if ARCH_X86_32 + SETUP_PIC r6, 0 +%endif + movd m0, wtm + pshufb m0, [PIC_sym(pb_0_1)] + psllw m0, 4 + pxor m7, m7 + DEFINE_ARGS dst, stride, t, w, h, idx +.loop_y: + xor idxd, idxd +.loop_x: + mova m1, [tq+idxq*2+ 0] + mova m4, [tq+idxq*2+16] + mova m5, [dstq+idxq] + punpcklbw m2, m5, m7 + punpckhbw m5, m7 + psllw m3, m2, 4 + psllw m6, m5, 4 + psubw m1, m3 + psubw m4, m6 + pmulhrsw m1, m0 + pmulhrsw m4, m0 + paddw m1, m2 + paddw m4, m5 + packuswb m1, m4 + mova [dstq+idxq], m1 + add idxd, 16 + cmp idxd, wd + jl .loop_x + add dstq, strideq + add tq, 384 * 2 + dec hd + jg .loop_y + RET + +%if ARCH_X86_64 +cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm + mova m10, [pb_0] + mova m11, [pb_0_1] +%else +cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge + %define edgeb byte edgem + %define wd xd + %define wq wd + %define wm r5m + %define strideq r4m + SUB esp, 8 + SETUP_PIC sumsqd, 1, 1 + + %define m10 [PIC_sym(pb_0)] + %define m11 [PIC_sym(pb_0_1)] +%endif + + test edgeb, 2 ; have_right + jz .no_right + xor xlimd, xlimd + add wd, 2 + add wd, 15 + and wd, ~15 + jmp .right_done +.no_right: + mov xlimd, 3 + dec wd +.right_done: + 
pxor m1, m1 + lea srcq, [srcq+wq+1] + lea sumq, [sumq+wq*2-2] + lea sumsqq, [sumsqq+wq*4-4] + neg wq +%if ARCH_X86_64 + lea r10, [pb_right_ext_mask+24] +%else + mov wm, xd + %define wq wm +%endif + +.loop_y: + mov xq, wq + ; load left + test edgeb, 1 ; have_left + jz .no_left + test leftq, leftq + jz .load_left_from_main + movd m0, [leftq] + movd m2, [srcq+xq-1] + pslldq m2, 4 + por m0, m2 + pslldq m0, 11 + add leftq, 4 + jmp .expand_x +.no_left: + movd m0, [srcq+xq-1] + XCHG_PIC_REG + pshufb m0, m10 + XCHG_PIC_REG + jmp .expand_x +.load_left_from_main: + movd m0, [srcq+xq-4] + pslldq m0, 12 +.expand_x: + punpckhbw m0, m1 + + ; when we reach this, m0 contains left two px in highest words + cmp xd, -8 + jle .loop_x + test xd, xd + jge .right_extend +.partial_load_and_extend: + XCHG_PIC_REG + movd m3, [srcq-1] + movq m2, [srcq+xq] + pshufb m3, m10 + punpcklbw m3, m1 + punpcklbw m2, m1 +%if ARCH_X86_64 + movu m4, [r10+xq*2] +%else + movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24] + XCHG_PIC_REG +%endif + pand m2, m4 + pandn m4, m3 + por m2, m4 + jmp .loop_x_noload +.right_extend: + psrldq m2, m0, 14 + XCHG_PIC_REG + pshufb m2, m11 + XCHG_PIC_REG + jmp .loop_x_noload + +.loop_x: + movq m2, [srcq+xq] + punpcklbw m2, m1 +.loop_x_noload: + palignr m3, m2, m0, 8 + palignr m4, m2, m0, 10 + palignr m5, m2, m0, 12 + palignr m6, m2, m0, 14 + +%if ARCH_X86_64 + paddw m0, m3, m2 + punpcklwd m7, m3, m2 + punpckhwd m3, m2 + paddw m0, m4 + punpcklwd m8, m4, m5 + punpckhwd m4, m5 + paddw m0, m5 + punpcklwd m9, m6, m1 + punpckhwd m5, m6, m1 + paddw m0, m6 + pmaddwd m7, m7 + pmaddwd m3, m3 + pmaddwd m8, m8 + pmaddwd m4, m4 + pmaddwd m9, m9 + pmaddwd m5, m5 + paddd m7, m8 + paddd m3, m4 + paddd m7, m9 + paddd m3, m5 + movu [sumq+xq*2], m0 + movu [sumsqq+xq*4+ 0], m7 + movu [sumsqq+xq*4+16], m3 +%else + paddw m0, m3, m2 + paddw m0, m4 + paddw m0, m5 + paddw m0, m6 + movu [sumq+xq*2], m0 + punpcklwd m7, m3, m2 + punpckhwd m3, m2 + punpcklwd m0, m4, m5 + punpckhwd m4, m5 + punpckhwd m5, m6, m1 + pmaddwd m7, m7 + pmaddwd m3, m3 + pmaddwd m0, m0 + pmaddwd m4, m4 + pmaddwd m5, m5 + paddd m7, m0 + paddd m3, m4 + paddd m3, m5 + punpcklwd m0, m6, m1 + pmaddwd m0, m0 + paddd m7, m0 + movu [sumsqq+xq*4+ 0], m7 + movu [sumsqq+xq*4+16], m3 +%endif + + mova m0, m2 + add xq, 8 + + ; if x <= -8 we can reload more pixels + ; else if x < 0 we reload and extend (this implies have_right=0) + ; else if x < xlimd we extend from previous load (this implies have_right=0) + ; else we are done + + cmp xd, -8 + jle .loop_x + test xd, xd + jl .partial_load_and_extend + cmp xd, xlimd + jl .right_extend + + add srcq, strideq + add sumsqq, (384+16)*4 + add sumq, (384+16)*2 + dec hd + jg .loop_y +%if ARCH_X86_32 + ADD esp, 8 +%endif + RET + +%if ARCH_X86_64 +cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem + mov ylimd, edged +%else +cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr + %define wm [esp+0] + %define hm [esp+4] + %define edgem [esp+8] + mov wm, xd + mov hm, yd + mov edgem, ylimd +%endif + + and ylimd, 8 ; have_bottom + shr ylimd, 2 + sub ylimd, 3 ; -3 if have_bottom=0, else -1 + mov xq, -2 +%if ARCH_X86_64 +.loop_x: + lea yd, [hd+ylimd+2] + lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] + lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] + test edgeb, 4 ; have_top + jnz .load_top + movu m0, [sumsq_ptrq+(384+16)*4*1] + movu m1, [sumsq_ptrq+(384+16)*4*1+16] + mova m2, m0 + mova m3, m1 + mova m4, m0 + mova m5, m1 + mova m6, m0 + mova m7, m1 + movu 
m10, [sum_ptrq+(384+16)*2*1] + mova m11, m10 + mova m12, m10 + mova m13, m10 + jmp .loop_y_second_load +.load_top: + movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] + movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] + movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] + movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] + mova m2, m0 + mova m3, m1 + movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 + movu m12, [sum_ptrq-(384+16)*2*0] ; l2 + mova m11, m10 +.loop_y: + movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] + movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] + movu m13, [sum_ptrq+(384+16)*2*1] ; l1 +.loop_y_second_load: + test yd, yd + jle .emulate_second_load + movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] + movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] + movu m14, [sum_ptrq+(384+16)*2*2] ; l0 +.loop_y_noload: + paddd m0, m2 + paddd m1, m3 + paddw m10, m11 + paddd m0, m4 + paddd m1, m5 + paddw m10, m12 + paddd m0, m6 + paddd m1, m7 + paddw m10, m13 + paddd m0, m8 + paddd m1, m9 + paddw m10, m14 + movu [sumsq_ptrq+ 0], m0 + movu [sumsq_ptrq+16], m1 + movu [sum_ptrq], m10 + + ; shift position down by one + mova m0, m4 + mova m1, m5 + mova m2, m6 + mova m3, m7 + mova m4, m8 + mova m5, m9 + mova m10, m12 + mova m11, m13 + mova m12, m14 + add sumsq_ptrq, (384+16)*4*2 + add sum_ptrq, (384+16)*2*2 + sub yd, 2 + jge .loop_y + ; l1 = l0 + mova m6, m8 + mova m7, m9 + mova m13, m14 + cmp yd, ylimd + jg .loop_y_noload + add xd, 8 + cmp xd, wd + jl .loop_x + RET +.emulate_second_load: + mova m8, m6 + mova m9, m7 + mova m14, m13 + jmp .loop_y_noload +%else +.sumsq_loop_x: + lea yd, [ylimd+2] + add yd, hm + lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] + test byte edgem, 4 ; have_top + jnz .sumsq_load_top + movu m0, [sumsq_ptrq+(384+16)*4*1] + movu m1, [sumsq_ptrq+(384+16)*4*1+16] + mova m4, m0 + mova m5, m1 + mova m6, m0 + mova m7, m1 + mova [esp+0x1c], m0 + mova [esp+0x0c], m1 + jmp .sumsq_loop_y_second_load +.sumsq_load_top: + movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] + movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] + movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] + movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] + mova [esp+0x1c], m0 + mova [esp+0x0c], m1 +.sumsq_loop_y: + movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] + movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] +.sumsq_loop_y_second_load: + test yd, yd + jle .sumsq_emulate_second_load + movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] + movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] +.sumsq_loop_y_noload: + paddd m0, [esp+0x1c] + paddd m1, [esp+0x0c] + paddd m0, m4 + paddd m1, m5 + paddd m0, m6 + paddd m1, m7 + paddd m0, m2 + paddd m1, m3 + movu [sumsq_ptrq+ 0], m0 + movu [sumsq_ptrq+16], m1 + + ; shift position down by one + mova m0, m4 + mova m1, m5 + mova m4, m2 + mova m5, m3 + mova [esp+0x1c], m6 + mova [esp+0x0c], m7 + add sumsq_ptrq, (384+16)*4*2 + sub yd, 2 + jge .sumsq_loop_y + ; l1 = l0 + mova m6, m2 + mova m7, m3 + cmp yd, ylimd + jg .sumsq_loop_y_noload + add xd, 8 + cmp xd, wm + jl .sumsq_loop_x + + mov xd, -2 +.sum_loop_x: + lea yd, [ylimd+2] + add yd, hm + lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] + test byte edgem, 4 ; have_top + jnz .sum_load_top + movu m0, [sum_ptrq+(384+16)*2*1] + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp .sum_loop_y_second_load +.sum_load_top: + movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4 + movu m2, [sum_ptrq-(384+16)*2*0] ; l2 + mova m1, m0 +.sum_loop_y: + movu m3, [sum_ptrq+(384+16)*2*1] ; l1 +.sum_loop_y_second_load: + test yd, yd + jle 
.sum_emulate_second_load + movu m4, [sum_ptrq+(384+16)*2*2] ; l0 +.sum_loop_y_noload: + paddw m0, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movu [sum_ptrq], m0 + + ; shift position down by one + mova m0, m2 + mova m1, m3 + mova m2, m4 + add sum_ptrq, (384+16)*2*2 + sub yd, 2 + jge .sum_loop_y + ; l1 = l0 + mova m3, m4 + cmp yd, ylimd + jg .sum_loop_y_noload + add xd, 8 + cmp xd, wm + jl .sum_loop_x + RET +.sumsq_emulate_second_load: + mova m2, m6 + mova m3, m7 + jmp .sumsq_loop_y_noload +.sum_emulate_second_load: + mova m4, m3 + jmp .sum_loop_y_noload +%endif + +cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s + movifnidn sd, sm + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +%if ARCH_X86_64 + LEA r5, sgr_x_by_x-0xF03 +%else + SETUP_PIC r5, 0 +%endif + movd m6, sd + pshuflw m6, m6, q0000 + punpcklqdq m6, m6 + pxor m7, m7 + DEFINE_ARGS a, b, w, h, x +%if ARCH_X86_64 + mova m8, [pd_0xF0080029] + mova m9, [pw_256] + psrld m10, m9, 15 ; pd_512 +%else + %define m8 [PIC_sym(pd_0xF0080029)] + %define m9 [PIC_sym(pw_256)] + %define m10 [PIC_sym(pd_512)] +%endif +.loop_y: + mov xq, -2 +.loop_x: + movq m0, [bq+xq*2+0] + movq m1, [bq+xq*2+8] + punpcklwd m0, m7 + punpcklwd m1, m7 + movu m2, [aq+xq*4+ 0] + movu m3, [aq+xq*4+16] + pslld m4, m2, 3 ; aa * 8 + pslld m5, m3, 3 + paddd m2, m4 ; aa * 9 + paddd m3, m5 + paddd m4, m4 ; aa * 16 + paddd m5, m5 + paddd m2, m4 ; aa * 25 + paddd m3, m5 + pmaddwd m4, m0, m0 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p = aa * 25 - bb * bb + psubd m3, m5 + MULLD m2, m6 + MULLD m3, m6 + paddusw m2, m8 + paddusw m3, m8 + psrld m2, 20 ; z + psrld m3, 20 + GATHERDD m4, m2 ; xx + GATHERDD m2, m3 + psrld m4, 24 + psrld m2, 24 + packssdw m3, m4, m2 + pmullw m4, m8 + pmullw m2, m8 + psubw m5, m9, m3 + pmaddwd m0, m4 + pmaddwd m1, m2 + paddd m0, m10 + paddd m1, m10 + psrld m0, 10 + psrld m1, 10 + movu [bq+xq*2], m5 + movu [aq+xq*4+ 0], m0 + movu [aq+xq*4+16], m1 + add xd, 8 + cmp xd, wd + jl .loop_x + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + sub hd, 2 + jg .loop_y + RET + +%if ARCH_X86_64 +cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \ + tmp_base, src_base, a_base, b_base, x, y + movifnidn wd, wm + mov hd, hm + mov tmp_baseq, tq + mov src_baseq, srcq + mov a_baseq, aq + mov b_baseq, bq + mova m9, [pw_5_6] + mova m12, [pw_256] + psrlw m10, m12, 8 ; pw_1 + psrlw m11, m12, 1 ; pw_128 + pxor m13, m13 +%else +cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y + %define tmp_baseq r0m + %define src_baseq r1m + %define a_baseq r3m + %define b_baseq r4m + %define wd r5m + %define hd r6m + + SUB esp, 8 + SETUP_PIC yd + + %define m8 m5 + %define m9 [PIC_sym(pw_5_6)] + %define m10 [PIC_sym(pw_1)] + %define m11 [PIC_sym(pw_128)] + %define m12 [PIC_sym(pw_256)] + %define m13 m0 +%endif + xor xd, xd +.loop_x: + mov tq, tmp_baseq + mov srcq, src_baseq + mov aq, a_baseq + mov bq, b_baseq + movu m0, [aq+xq*4-(384+16)*4-4] + mova m1, [aq+xq*4-(384+16)*4] + movu m2, [aq+xq*4-(384+16)*4+4] + movu m3, [aq+xq*4-(384+16)*4-4+16] + mova m4, [aq+xq*4-(384+16)*4+16] + movu m5, [aq+xq*4-(384+16)*4+4+16] + paddd m0, m2 + paddd m3, m5 + paddd m0, m1 + paddd m3, m4 + pslld m2, m0, 2 + pslld m5, m3, 2 + paddd m2, m0 + paddd m5, m3 + paddd m0, m2, m1 ; prev_odd_b [first half] + paddd m1, m5, m4 ; prev_odd_b [second half] + movu m3, [bq+xq*2-(384+16)*2-2] + mova m4, [bq+xq*2-(384+16)*2] + movu m5, [bq+xq*2-(384+16)*2+2] + paddw m3, m5 + punpcklwd m5, m3, m4 + punpckhwd m3, m4 + pmaddwd m5, m9 + pmaddwd m3, m9 + mova m2, m5 + packssdw m2, m3 ; 
prev_odd_a + lea tq, [tq+xq*2] + lea srcq, [srcq+xq*1] + lea aq, [aq+xq*4+(384+16)*4] + lea bq, [bq+xq*2+(384+16)*2] +%if ARCH_X86_32 + mov [esp], PIC_reg +%endif + mov yd, hd + XCHG_PIC_REG +.loop_y: + movu m3, [aq-4] + mova m4, [aq] + movu m5, [aq+4] + paddd m3, m5 + paddd m3, m4 + pslld m5, m3, 2 + paddd m5, m3 + paddd m5, m4 ; cur_odd_b [first half] + movu m3, [aq+16-4] + mova m6, [aq+16] + movu m7, [aq+16+4] + paddd m3, m7 + paddd m3, m6 + pslld m7, m3, 2 + paddd m7, m3 + paddd m4, m7, m6 ; cur_odd_b [second half] + movu m3, [bq-2] + mova m6, [bq] + movu m7, [bq+2] + paddw m3, m7 + punpcklwd m7, m3, m6 + punpckhwd m3, m6 + pmaddwd m7, m9 + pmaddwd m3, m9 + packssdw m6, m7, m3 ; cur_odd_a + + paddd m0, m5 ; cur_even_b [first half] + paddd m1, m4 ; cur_even_b [second half] + paddw m2, m6 ; cur_even_a + + movq m3, [srcq] +%if ARCH_X86_64 + punpcklbw m3, m13 +%else + mova [td], m5 + pxor m7, m7 + punpcklbw m3, m7 +%endif + punpcklwd m7, m3, m10 + punpckhwd m3, m10 + punpcklwd m8, m2, m12 + punpckhwd m2, m12 + pmaddwd m7, m8 + pmaddwd m3, m2 + paddd m7, m0 + paddd m3, m1 + psrad m7, 9 + psrad m3, 9 + +%if ARCH_X86_32 + pxor m13, m13 +%endif + movq m8, [srcq+strideq] + punpcklbw m8, m13 + punpcklwd m0, m8, m10 + punpckhwd m8, m10 + punpcklwd m1, m6, m11 + punpckhwd m2, m6, m11 + pmaddwd m0, m1 + pmaddwd m8, m2 +%if ARCH_X86_64 + paddd m0, m5 +%else + paddd m0, [td] +%endif + paddd m8, m4 + psrad m0, 8 + psrad m8, 8 + + packssdw m7, m3 + packssdw m0, m8 +%if ARCH_X86_32 + mova m5, [td] +%endif + mova [tq+384*2*0], m7 + mova [tq+384*2*1], m0 + + mova m0, m5 + mova m1, m4 + mova m2, m6 + add aq, (384+16)*4*2 + add bq, (384+16)*2*2 + add tq, 384*2*2 + lea srcq, [srcq+strideq*2] +%if ARCH_X86_64 + sub yd, 2 +%else + sub dword [esp+4], 2 +%endif + jg .loop_y + add xd, 8 + cmp xd, wd + jl .loop_x +%if ARCH_X86_32 + ADD esp, 8 +%endif + RET + +%undef t2 +cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movd m0, wtm +%if ARCH_X86_64 + movifnidn hd, hm + mova m10, [pd_1024] + pxor m11, m11 +%else + SETUP_PIC hd, 0 + %define m10 [PIC_sym(pd_1024)] + %define m11 m7 +%endif + pshufd m0, m0, 0 + DEFINE_ARGS dst, stride, t1, t2, w, h, idx +%if ARCH_X86_32 + %define hd hmp +%endif + +.loop_y: + xor idxd, idxd +.loop_x: + mova m1, [t1q+idxq*2+ 0] + mova m2, [t1q+idxq*2+16] + mova m3, [t2q+idxq*2+ 0] + mova m4, [t2q+idxq*2+16] + mova m6, [dstq+idxq] +%if ARCH_X86_32 + pxor m11, m11 +%endif + punpcklbw m5, m6, m11 + punpckhbw m6, m11 + psllw m7, m5, 4 + psubw m1, m7 + psubw m3, m7 + psllw m7, m6, 4 + psubw m2, m7 + psubw m4, m7 + punpcklwd m7, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + pmaddwd m7, m0 + pmaddwd m1, m0 + pmaddwd m3, m0 + pmaddwd m2, m0 + paddd m7, m10 + paddd m1, m10 + paddd m3, m10 + paddd m2, m10 + psrad m7, 11 + psrad m1, 11 + psrad m3, 11 + psrad m2, 11 + packssdw m7, m1 + packssdw m3, m2 + paddw m7, m5 + paddw m3, m6 + packuswb m7, m3 + mova [dstq+idxq], m7 + add idxd, 16 + cmp idxd, wd + jl .loop_x + add dstq, strideq + add t1q, 384 * 2 + add t2q, 384 * 2 + dec hd + jg .loop_y + RET diff -Nru dav1d-0.7.1/src/x86/looprestoration_ssse3.asm dav1d-0.9.1/src/x86/looprestoration_ssse3.asm --- dav1d-0.7.1/src/x86/looprestoration_ssse3.asm 2020-06-21 11:48:55.032126400 +0000 +++ dav1d-0.9.1/src/x86/looprestoration_ssse3.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1952 +0,0 @@ -; Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; Copyright © 2018, VideoLabs -; All rights reserved. 
-; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "ext/x86/x86inc.asm" - -SECTION_RODATA 16 - -pb_right_ext_mask: times 16 db 0xff - times 16 db 0 -pb_14x0_1_2: times 14 db 0 - db 1, 2 -pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 - db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 -pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 -pb_0: times 16 db 0 -pb_2: times 16 db 2 -pb_3: times 16 db 3 -pb_4: times 16 db 4 -pb_15: times 16 db 15 -pb_0_1: times 8 db 0, 1 -pb_6_7: times 8 db 6, 7 -pb_14_15: times 8 db 14, 15 -pw_1: times 8 dw 1 -pw_16: times 8 dw 16 -pw_128: times 8 dw 128 -pw_255: times 8 dw 255 -pw_256: times 8 dw 256 -pw_2048: times 8 dw 2048 -pw_16380: times 8 dw 16380 -pw_5_6: times 4 dw 5, 6 -pw_0_128: times 4 dw 0, 128 -pd_1024: times 4 dd 1024 -%if ARCH_X86_32 -pd_256: times 4 dd 256 -pd_512: times 4 dd 512 -pd_2048: times 4 dd 2048 -%endif -pd_0xF0080029: times 4 dd 0xF0080029 -pd_0xF00801C7: times 4 dd 0XF00801C7 - -cextern sgr_x_by_x - -SECTION .text - -%if ARCH_X86_32 - %define PIC_base_offset $$ - - %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg - %assign pic_reg_stk_off 4 - %xdefine PIC_reg %1 - %if %2 == 1 - mov [esp], %1 - %endif - LEA PIC_reg, PIC_base_offset - %if %3 == 1 - XCHG_PIC_REG - %endif - %endmacro - - %macro XCHG_PIC_REG 0 - mov [esp+pic_reg_stk_off], PIC_reg - %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8 - mov PIC_reg, [esp+pic_reg_stk_off] - %endmacro - - %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) - -%else - %macro XCHG_PIC_REG 0 - %endmacro - - %define PIC_sym(sym) (sym) -%endif - -%macro PALIGNR 4 ; dst, src1, src2, shift - %if cpuflag(ssse3) - palignr %1, %2, %3, %4 - %else - %assign %%i regnumof%+%1 + 1 - %define %%tmp m %+ %%i - psrldq %1, %3, %4 - pslldq %%tmp, %2, 16-%4 - por %1, %%tmp - %endif -%endmacro - -%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero - %if cpuflag(ssse3) - pmaddubsw %1, %2 - %else - %if %5 == 1 - pxor %3, %3 - %endif - punpckhbw %4, %1, %3 - punpcklbw %1, %3 - pmaddwd %4, %2 - pmaddwd %1, %2 - packssdw %1, %4 - %endif -%endmacro - -;;;;;;;;;;;;;;;;;;;;;; -;; wiener ;; -;;;;;;;;;;;;;;;;;;;;;; - -%macro WIENER_H 0 -%if ARCH_X86_64 -cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge - 
mov edged, edgem - movifnidn wd, wm - mov hd, hm -%else -cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge - mov r5, edgem - mov [esp+12], r5 - mov wd, wm - mov hd, hm - SETUP_PIC hd - %define m15 m0 - %define m14 m1 - %define m13 m2 - %define m12 m3 -%endif - - movq m15, [fhq] -%if cpuflag(ssse3) - pshufb m12, m15, [PIC_sym(pb_6_7)] - pshufb m13, m15, [PIC_sym(pb_4)] - pshufb m14, m15, [PIC_sym(pb_2)] - pshufb m15, m15, [PIC_sym(pb_0)] -%else - pshuflw m12, m15, q3333 - punpcklbw m15, m15 - pshufhw m13, m15, q0000 - pshuflw m14, m15, q2222 - pshuflw m15, m15, q0000 - punpcklqdq m12, m12 - punpckhqdq m13, m13 - punpcklqdq m14, m14 - punpcklqdq m15, m15 - psraw m13, 8 - psraw m14, 8 - psraw m15, 8 -%endif - -%if ARCH_X86_64 - mova m11, [pw_2048] - mova m10, [pw_16380] - lea r11, [pb_right_ext_mask] - - DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim -%else - %define m10 [PIC_sym(pw_16380)] - %define m11 [PIC_sym(pw_2048)] - %define m12 [esp+0x14] - %define m13 [esp+0x24] - %define m14 [esp+0x34] - %define m15 [esp+0x44] - mova m12, m3 - mova m13, m2 - mova m14, m1 - mova m15, m0 - - DEFINE_ARGS dst, left, src, stride, x, w, h, edge - %define srcptrq srcq - %define dstptrq dstq - %define hd dword [esp+ 0] - %define edgeb byte [esp+12] - %define xlimd dword [esp+16] -%endif - - ; if (edge & has_right) align_w_to_16 - ; else w -= 3, and use that as limit in x loop - test edgeb, 2 ; has_right - jnz .align - mov xlimd, -3 - jmp .loop -.align: - add wd, 15 - and wd, ~15 -%if ARCH_X86_64 - xor xlimd, xlimd -%else - mov xlimd, 0 -%endif - - ; main y loop for vertical filter -.loop: -%if ARCH_X86_64 - mov srcptrq, srcq - mov dstptrq, dstq - lea xd, [wq+xlimq] -%else - mov [esp+8], srcq - mov [esp+4], dstq - mov xd, xlimd - add xd, wd -%endif - - ; load left edge pixels - test edgeb, 1 ; have_left - jz .emu_left - test leftq, leftq ; left == NULL for the edge-extended bottom/top - jz .load_left_combined - movd m0, [leftq] - movd m1, [srcq] - punpckldq m0, m1 - pslldq m0, 9 - add leftq, 4 - jmp .left_load_done -.load_left_combined: - movq m0, [srcq-3] - pslldq m0, 10 - jmp .left_load_done -.emu_left: - movd m0, [srcq] -%if cpuflag(ssse3) - pshufb m0, [PIC_sym(pb_14x0_1_2)] -%else - pslldq m1, m0, 13 - punpcklbw m0, m0 - pshuflw m0, m0, q0000 - punpcklqdq m0, m0 - psrldq m0, 2 - por m0, m1 -%endif - - ; load right edge pixels -.left_load_done: - cmp xd, 16 - jg .main_load - test xd, xd - jg .load_and_splat - je .splat_right - - ; for very small images (w=[1-2]), edge-extend the original cache, - ; ugly, but only runs in very odd cases -%if cpuflag(ssse3) - add wd, wd - %if ARCH_X86_64 - pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] - %else - pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16] - %endif - shr wd, 1 -%else - shl wd, 4 - pcmpeqd m2, m2 - movd m3, wd - psrldq m2, 2 - punpckhbw m1, m0, m0 - pshufhw m1, m1, q1122 - psllq m1, m3 - pand m0, m2 - pandn m2, m1 - por m0, m2 - shr wd, 4 -%endif - - ; main x loop, mostly this starts in .main_load -.splat_right: - ; no need to load new pixels, just extend them from the (possibly previously - ; extended) previous load into m0 -%if cpuflag(ssse3) - pshufb m1, m0, [PIC_sym(pb_15)] -%else - punpckhbw m1, m0, m0 - pshufhw m1, m1, q3333 - punpckhqdq m1, m1 -%endif - jmp .main_loop -.load_and_splat: - ; load new pixels and extend edge for right-most - movu m1, [srcptrq+3] -%if ARCH_X86_64 - sub r11, xq - movu m2, [r11+16] - add r11, xq -%else - sub PIC_reg, xd - movu m2, 
[PIC_sym(pb_right_ext_mask)+16] - add PIC_reg, xd -%endif - movd m3, [srcptrq+2+xq] -%if cpuflag(ssse3) - pshufb m3, [PIC_sym(pb_0)] -%else - punpcklbw m3, m3 - pshuflw m3, m3, q0000 - punpcklqdq m3, m3 -%endif - pand m1, m2 - pxor m2, [PIC_sym(pb_right_ext_mask)] - pand m3, m2 - pxor m2, [PIC_sym(pb_right_ext_mask)] - por m1, m3 - jmp .main_loop -.main_load: - ; load subsequent line - movu m1, [srcptrq+3] -.main_loop: -%if ARCH_X86_64 - PALIGNR m2, m1, m0, 10 - PALIGNR m3, m1, m0, 11 - PALIGNR m4, m1, m0, 12 - PALIGNR m5, m1, m0, 13 - PALIGNR m6, m1, m0, 14 - PALIGNR m7, m1, m0, 15 - - punpcklbw m0, m2, m1 - punpckhbw m2, m1 - punpcklbw m8, m3, m7 - punpckhbw m3, m7 - punpcklbw m7, m4, m6 - punpckhbw m4, m6 - PMADDUBSW m0, m15, m6, m9, 1 - PMADDUBSW m2, m15, m6, m9, 0 - PMADDUBSW m8, m14, m6, m9, 0 - PMADDUBSW m3, m14, m6, m9, 0 - PMADDUBSW m7, m13, m6, m9, 0 - PMADDUBSW m4, m13, m6, m9, 0 - paddw m0, m8 - paddw m2, m3 - %if cpuflag(ssse3) - pxor m6, m6 - %endif - punpcklbw m3, m5, m6 - punpckhbw m5, m6 - psllw m8, m3, 7 - psllw m6, m5, 7 - psubw m8, m10 - psubw m6, m10 - pmullw m3, m12 - pmullw m5, m12 - paddw m0, m7 - paddw m2, m4 - paddw m0, m3 - paddw m2, m5 - paddsw m0, m8 ; see the avx2 for an explanation - paddsw m2, m6 ; of how the clipping works here - psraw m0, 3 - psraw m2, 3 - paddw m0, m11 - paddw m2, m11 - mova [dstptrq+ 0], m0 - mova [dstptrq+16], m2 -%else - PALIGNR m2, m1, m0, 10 - punpcklbw m3, m2, m1 - punpckhbw m2, m1 - PMADDUBSW m3, m15, m4, m5, 1 - PMADDUBSW m2, m15, m4, m5, 0 - PALIGNR m4, m1, m0, 11 - PALIGNR m5, m1, m0, 15 - punpcklbw m6, m4, m5 - punpckhbw m4, m5 - PMADDUBSW m6, m14, m5, m7, 1 - PMADDUBSW m4, m14, m5, m7, 0 - paddw m3, m6 - paddw m2, m4 - PALIGNR m4, m1, m0, 12 - PALIGNR m5, m1, m0, 14 - punpcklbw m6, m4, m5 - punpckhbw m4, m5 - PMADDUBSW m6, m13, m5, m7, 1 - PMADDUBSW m4, m13, m5, m7, 0 - paddw m3, m6 - paddw m2, m4 - PALIGNR m6, m1, m0, 13 - %if cpuflag(ssse3) - pxor m5, m5 - %endif - punpcklbw m4, m6, m5 - punpckhbw m6, m5 - psllw m5, m4, 7 - psllw m7, m6, 7 - psubw m5, m10 - psubw m7, m10 - pmullw m4, m12 - pmullw m6, m12 - paddw m3, m4 - paddw m2, m6 - paddsw m3, m5 - paddsw m2, m7 - psraw m3, 3 - psraw m2, 3 - paddw m3, m11 - paddw m2, m11 - mova [dstptrq+ 0], m3 - mova [dstptrq+16], m2 -%endif - - mova m0, m1 - add srcptrq, 16 - add dstptrq, 32 - sub xd, 16 - cmp xd, 16 - jg .main_load - test xd, xd - jg .load_and_splat - cmp xd, xlimd - jg .splat_right - -%if ARCH_X86_32 - mov srcq, [esp+8] - mov dstq, [esp+4] -%endif - add srcq, strideq - add dstq, 384*2 - dec hd - jg .loop - RET -%endmacro - -%macro WIENER_V 0 -%if ARCH_X86_64 -cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge - mov edged, edgem - movifnidn fvq, fvmp - movifnidn hd, hm - movq m15, [fvq] - pshufd m14, m15, q1111 - pshufd m15, m15, q0000 - paddw m14, [pw_0_128] - mova m12, [pd_1024] - - DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr - - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 - sub ylimd, 3 -%else -cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge - %define ylimd [esp+12] - - mov r5d, edgem - and r5d, 8 - shr r5d, 2 - sub r5d, 3 - mov ylimd, r5d - mov fvq, fvmp - mov edged, edgem - - SETUP_PIC edged - - movq m0, [fvq] - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - paddw m1, [PIC_sym(pw_0_128)] - mova [esp+0x50], m0 - mova [esp+0x40], m1 - - DEFINE_ARGS dst, stride, mid, w, h, y, edge - %define mptrq midq - %define dstptrq dstq - %define edgeb byte [esp] -%endif - - ; main x loop for vertical 
filter, does one column of 16 pixels -.loop_x: - mova m3, [midq] ; middle line - - ; load top pixels - test edgeb, 4 ; have_top - jz .emu_top - mova m0, [midq-384*4] - mova m2, [midq-384*2] - mova m1, m0 - jmp .load_bottom_pixels -.emu_top: - mova m0, m3 - mova m1, m3 - mova m2, m3 - - ; load bottom pixels -.load_bottom_pixels: - mov yd, hd -%if ARCH_X86_64 - mov mptrq, midq - mov dstptrq, dstq - add yd, ylimd -%else - mov [esp+8], midq - mov [esp+4], dstq - add yd, ylimd -%endif - jg .load_threelines - - ; the remainder here is somewhat messy but only runs in very weird - ; circumstances at the bottom of the image in very small blocks (h=[1-3]), - ; so performance is not terribly important here... - je .load_twolines - cmp yd, -1 - je .load_oneline - ; h == 1 case - mova m5, m3 - mova m4, m3 - mova m6, m3 - jmp .loop -.load_oneline: - ; h == 2 case - mova m4, [midq+384*2] - mova m5, m4 - mova m6, m4 - jmp .loop -.load_twolines: - ; h == 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - mova m6, m5 - jmp .loop -.load_threelines: - ; h > 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - ; third line loaded in main loop below - - ; main y loop for vertical filter -.loop_load: - ; load one line into m6. if that pixel is no longer available, do - ; nothing, since m6 still has the data from the previous line in it. We - ; try to structure the loop so that the common case is evaluated fastest - mova m6, [mptrq+384*6] -.loop: -%if ARCH_X86_64 - paddw m7, m0, m6 - paddw m8, m1, m5 - paddw m9, m2, m4 - punpcklwd m10, m7, m8 - punpckhwd m7, m8 - punpcklwd m11, m9, m3 - punpckhwd m9, m3 - pmaddwd m10, m15 - pmaddwd m7, m15 - pmaddwd m11, m14 - pmaddwd m9, m14 - paddd m10, m12 - paddd m7, m12 - paddd m10, m11 - paddd m7, m9 - psrad m10, 11 - psrad m7, 11 - packssdw m10, m7 - packuswb m10, m10 - movq [dstptrq], m10 -%else - mova [esp+0x30], m1 - mova [esp+0x20], m2 - mova [esp+0x10], m3 - paddw m0, m6 - paddw m1, m5 - paddw m2, m4 - punpcklwd m7, m2, m3 - punpckhwd m2, m3 - punpcklwd m3, m0, m1 - punpckhwd m0, m1 - mova m1, [esp+0x50] - pmaddwd m3, m1 - pmaddwd m0, m1 - mova m1, [esp+0x40] - pmaddwd m7, m1 - pmaddwd m2, m1 - paddd m3, [PIC_sym(pd_1024)] - paddd m0, [PIC_sym(pd_1024)] - paddd m3, m7 - paddd m0, m2 - psrad m3, 11 - psrad m0, 11 - packssdw m3, m0 - packuswb m3, m3 - movq [dstq], m3 - mova m1, [esp+0x30] - mova m2, [esp+0x20] - mova m3, [esp+0x10] -%endif - ; shift pixels one position - mova m0, m1 - mova m1, m2 - mova m2, m3 - mova m3, m4 - mova m4, m5 - mova m5, m6 - add mptrq, 384*2 - add dstptrq, strideq - dec yd - jg .loop_load - ; for the bottom pixels, continue using m6 (as extended edge) - cmp yd, ylimd - jg .loop - -%if ARCH_X86_32 - mov midq, [esp+8] - mov dstq, [esp+4] -%endif - add midq, 16 - add dstq, 8 - sub wd, 8 - jg .loop_x - RET -%endmacro - -INIT_XMM sse2 -WIENER_H -WIENER_V - -INIT_XMM ssse3 -WIENER_H -WIENER_V - -;;;;;;;;;;;;;;;;;;;;;;;;;; -;; self-guided ;; -;;;;;;;;;;;;;;;;;;;;;;;;;; - -%macro MULLD 2 - pmulhuw m5, %1, %2 - pmullw %1, %2 - pslld m5, 16 - paddd %1, m5 -%endmacro - -%macro GATHERDD 2 - mova m5, m7 - movd r6d, %2 - %if ARCH_X86_64 - movd %1, [r5+r6] - pextrw r6d, %2, 2 - pinsrw m5, [r5+r6+2], 3 - pextrw r6d, %2, 4 - pinsrw %1, [r5+r6+2], 5 - pextrw r6d, %2, 6 - pinsrw m5, [r5+r6+2], 7 - %else - movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6] - pextrw r6d, %2, 2 - pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3 - pextrw r6d, %2, 4 - pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5 - pextrw r6d, %2, 6 - pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7 - 
%endif - por %1, m5 -%endmacro - -%if ARCH_X86_64 -cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - mov xlimd, edgem - movifnidn xd, xm - mov hd, hm - mov edged, xlimd - and xlimd, 2 ; have_right - add xd, xlimd - xor xlimd, 2 ; 2*!have_right -%else -cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim - %define wq r0m - %define xlimd r1m - %define hd hmp - %define edgeb byte edgem - - mov r6, edgem - and r6, 2 ; have_right - add xd, r6 - xor r6, 2 ; 2*!have_right - mov xlimd, r6 - SETUP_PIC r6, 0 -%endif - - jnz .no_right - add xd, 7 - and xd, ~7 -.no_right: - pxor m1, m1 - lea srcq, [srcq+xq] - lea sumq, [sumq+xq*2-2] - lea sumsqq, [sumsqq+xq*4-4] - neg xq - mov wq, xq -%if ARCH_X86_64 - lea r10, [pb_right_ext_mask+16] -%endif -.loop_y: - mov xq, wq - - ; load left - test edgeb, 1 ; have_left - jz .no_left - test leftq, leftq - jz .load_left_from_main - movd m0, [leftq] - pslldq m0, 12 - add leftq, 4 - jmp .expand_x -.no_left: - movd m0, [srcq+xq] - pshufb m0, [PIC_sym(pb_0)] - jmp .expand_x -.load_left_from_main: - movd m0, [srcq+xq-2] - pslldq m0, 14 -.expand_x: - punpckhbw xm0, xm1 - - ; when we reach this, m0 contains left two px in highest words - cmp xd, -8 - jle .loop_x -.partial_load_and_extend: - movd m3, [srcq-4] - pshufb m3, [PIC_sym(pb_3)] - movq m2, [srcq+xq] - punpcklbw m2, m1 - punpcklbw m3, m1 -%if ARCH_X86_64 - movu m4, [r10+xq*2] -%else - movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] -%endif - pand m2, m4 - pandn m4, m3 - por m2, m4 - jmp .loop_x_noload -.right_extend: - pshufb m2, m0, [PIC_sym(pb_14_15)] - jmp .loop_x_noload - -.loop_x: - movq m2, [srcq+xq] - punpcklbw m2, m1 -.loop_x_noload: - palignr m3, m2, m0, 12 - palignr m4, m2, m0, 14 - - punpcklwd m5, m3, m2 - punpckhwd m6, m3, m2 - paddw m3, m4 - punpcklwd m7, m4, m1 - punpckhwd m4, m1 - pmaddwd m5, m5 - pmaddwd m6, m6 - pmaddwd m7, m7 - pmaddwd m4, m4 - paddd m5, m7 - paddd m6, m4 - paddw m3, m2 - movu [sumq+xq*2], m3 - movu [sumsqq+xq*4+ 0], m5 - movu [sumsqq+xq*4+16], m6 - - mova m0, m2 - add xq, 8 - - ; if x <= -8 we can reload more pixels - ; else if x < 0 we reload and extend (this implies have_right=0) - ; else if x < xlimd we extend from previous load (this implies have_right=0) - ; else we are done - - cmp xd, -8 - jle .loop_x - test xd, xd - jl .partial_load_and_extend - cmp xd, xlimd - jl .right_extend - - add sumsqq, (384+16)*4 - add sumq, (384+16)*2 - add srcq, strideq - dec hd - jg .loop_y - RET - -%if ARCH_X86_64 -cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim - movifnidn edged, edgem -%else -cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y - %define sumsq_baseq dword [esp+0] - %define sum_baseq dword [esp+4] - %define ylimd dword [esp+8] - %define m8 [esp+12] - mov edged, r4m - mov hd, r3m -%endif - mov xq, -2 -%if ARCH_X86_64 - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 - sub ylimd, 2 ; -2 if have_bottom=0, else 0 - mov sumsq_baseq, sumsqq - mov sum_baseq, sumq -.loop_x: - mov sumsqq, sumsq_baseq - mov sumq, sum_baseq - lea yd, [hq+ylimq+2] -%else - mov yd, edged - and yd, 8 ; have_bottom - shr yd, 2 - sub yd, 2 ; -2 if have_bottom=0, else 0 - mov sumsq_baseq, sumsqq - mov sum_baseq, sumq - mov ylimd, yd -.loop_x: - mov sumsqd, sumsq_baseq - mov sumd, sum_baseq - lea yd, [hq+2] - add yd, ylimd -%endif - lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] - lea sumq, [sumq+xq*2+2-(384+16)*2] - test edgeb, 4 ; have_top - jnz .load_top - movu m0, [sumsqq+(384+16)*4*1] - movu m1, 
[sumsqq+(384+16)*4*1+16] - mova m2, m0 - mova m3, m1 - mova m4, m0 - mova m5, m1 - movu m6, [sumq+(384+16)*2*1] - mova m7, m6 - mova m8, m6 - jmp .loop_y_noload -.load_top: - movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left] - movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right] - movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left] - movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right] - movu m6, [sumq-(384+16)*2*1] ; l2 - movu m7, [sumq-(384+16)*2*0] ; l1 -.loop_y: -%if ARCH_X86_64 - movu m8, [sumq+(384+16)*2*1] ; l0 -%else - movu m4, [sumq+(384+16)*2*1] ; l0 - mova m8, m4 -%endif - movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left] - movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right] -.loop_y_noload: - paddd m0, m2 - paddd m1, m3 - paddw m6, m7 - paddd m0, m4 - paddd m1, m5 - paddw m6, m8 - movu [sumsqq+ 0], m0 - movu [sumsqq+16], m1 - movu [sumq], m6 - - ; shift position down by one - mova m0, m2 - mova m1, m3 - mova m2, m4 - mova m3, m5 - mova m6, m7 - mova m7, m8 - add sumsqq, (384+16)*4 - add sumq, (384+16)*2 - dec yd - jg .loop_y - cmp yd, ylimd - jg .loop_y_noload - add xd, 8 - cmp xd, wd - jl .loop_x - RET - -cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s - movifnidn sd, sm - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 -%if ARCH_X86_64 - LEA r5, sgr_x_by_x-0xF03 -%else - SETUP_PIC r5, 0 -%endif - movd m6, sd - pshuflw m6, m6, q0000 - punpcklqdq m6, m6 - pxor m7, m7 - DEFINE_ARGS a, b, w, h, x -%if ARCH_X86_64 - mova m8, [pd_0xF00801C7] - mova m9, [pw_256] - psrld m10, m9, 13 ; pd_2048 - mova m11, [pb_unpcklwdw] -%else - %define m8 [PIC_sym(pd_0xF00801C7)] - %define m9 [PIC_sym(pw_256)] - %define m10 [PIC_sym(pd_2048)] - %define m11 [PIC_sym(pb_unpcklwdw)] -%endif -.loop_y: - mov xq, -2 -.loop_x: - movq m0, [bq+xq*2] - movq m1, [bq+xq*2+(384+16)*2] - punpcklwd m0, m7 - punpcklwd m1, m7 - movu m2, [aq+xq*4] - movu m3, [aq+xq*4+(384+16)*4] - pslld m4, m2, 3 - pslld m5, m3, 3 - paddd m2, m4 ; aa * 9 - paddd m3, m5 - pmaddwd m4, m0, m0 - pmaddwd m5, m1, m1 - pmaddwd m0, m8 - pmaddwd m1, m8 - psubd m2, m4 ; p = aa * 9 - bb * bb - psubd m3, m5 - MULLD m2, m6 - MULLD m3, m6 - paddusw m2, m8 - paddusw m3, m8 - psrld m2, 20 ; z - psrld m3, 20 - GATHERDD m4, m2 ; xx - GATHERDD m2, m3 - psrld m4, 24 - psrld m2, 24 - packssdw m3, m4, m2 - pshufb m4, m11 - MULLD m0, m4 - pshufb m2, m11 - MULLD m1, m2 - psubw m5, m9, m3 - paddd m0, m10 - paddd m1, m10 - psrld m0, 12 - psrld m1, 12 - movq [bq+xq*2], m5 - psrldq m5, 8 - movq [bq+xq*2+(384+16)*2], m5 - movu [aq+xq*4], m0 - movu [aq+xq*4+(384+16)*4], m1 - add xd, 4 - cmp xd, wd - jl .loop_x - add aq, (384+16)*4*2 - add bq, (384+16)*2*2 - sub hd, 2 - jg .loop_y - RET - -%if ARCH_X86_64 -cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ - tmp_base, src_base, a_base, b_base, x, y - movifnidn wd, wm - mov hd, hm - mova m15, [pw_16] - mov tmp_baseq, tq - mov src_baseq, srcq - mov a_baseq, aq - mov b_baseq, bq - xor xd, xd -%else -cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y - %define tmp_baseq [esp+8] - %define src_baseq [esp+12] - %define a_baseq [esp+16] - %define b_baseq [esp+20] - %define wd [esp+24] - %define hd [esp+28] - mov tmp_baseq, tq - mov src_baseq, srcq - mov a_baseq, aq - mov b_baseq, bq - mov wd, xd - mov hd, yd - xor xd, xd - SETUP_PIC yd, 1, 1 - jmp .loop_start -%endif - -.loop_x: - mov tq, tmp_baseq - mov srcq, src_baseq - mov aq, a_baseq - mov bq, b_baseq -%if ARCH_X86_32 -.loop_start: - movu m0, [bq+xq*2-(384+16)*2-2] - movu m2, [bq+xq*2-(384+16)*2+2] - mova m1, [bq+xq*2-(384+16)*2] ; b:top - paddw m0, m2 ; 
b:tl+tr - movu m2, [bq+xq*2-2] - movu m3, [bq+xq*2+2] - paddw m1, [bq+xq*2] ; b:top+ctr - paddw m2, m3 ; b:l+r - mova [esp+0x80], m0 - mova [esp+0x70], m1 - mova [esp+0x60], m2 -%endif - movu m0, [aq+xq*4-(384+16)*4-4] - movu m2, [aq+xq*4-(384+16)*4+4] - mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] - paddd m0, m2 ; a:tl+tr [first half] - movu m2, [aq+xq*4-(384+16)*4-4+16] - movu m4, [aq+xq*4-(384+16)*4+4+16] - mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half] - paddd m2, m4 ; a:tl+tr [second half] - movu m4, [aq+xq*4-4] - movu m5, [aq+xq*4+4] - paddd m1, [aq+xq*4] ; a:top+ctr [first half] - paddd m4, m5 ; a:l+r [first half] - movu m5, [aq+xq*4+16-4] - movu m6, [aq+xq*4+16+4] - paddd m3, [aq+xq*4+16] ; a:top+ctr [second half] - paddd m5, m6 ; a:l+r [second half] -%if ARCH_X86_64 - movu m6, [bq+xq*2-(384+16)*2-2] - movu m8, [bq+xq*2-(384+16)*2+2] - mova m7, [bq+xq*2-(384+16)*2] ; b:top - paddw m6, m8 ; b:tl+tr - movu m8, [bq+xq*2-2] - movu m9, [bq+xq*2+2] - paddw m7, [bq+xq*2] ; b:top+ctr - paddw m8, m9 ; b:l+r -%endif - - lea tq, [tq+xq*2] - lea srcq, [srcq+xq*1] - lea aq, [aq+xq*4+(384+16)*4] - lea bq, [bq+xq*2+(384+16)*2] - mov yd, hd -.loop_y: -%if ARCH_X86_64 - movu m9, [bq-2] - movu m10, [bq+2] - paddw m7, [bq] ; b:top+ctr+bottom - paddw m9, m10 ; b:bl+br - paddw m10, m7, m8 ; b:top+ctr+bottom+l+r - paddw m6, m9 ; b:tl+tr+bl+br - psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom - paddw m10, m6 - psllw m10, 2 - psubw m10, m6 ; aa - pxor m14, m14 - movq m12, [srcq] - punpcklbw m12, m14 - punpcklwd m6, m10, m15 - punpckhwd m10, m15 - punpcklwd m13, m12, m15 - punpckhwd m12, m15 - pmaddwd m6, m13 ; aa*src[x]+256 [first half] - pmaddwd m10, m12 ; aa*src[x]+256 [second half] -%else - paddd m1, [aq] ; a:top+ctr+bottom [first half] - paddd m3, [aq+16] ; a:top+ctr+bottom [second half] - mova [esp+0x50], m1 - mova [esp+0x40], m3 - mova [esp+0x30], m4 - movu m6, [aq-4] - movu m7, [aq+4] - paddd m1, m4 ; a:top+ctr+bottom+l+r [first half] - paddd m3, m5 ; a:top+ctr+bottom+l+r [second half] - paddd m6, m7 ; a:bl+br [first half] - movu m7, [aq+16-4] - movu m4, [aq+16+4] - paddd m7, m4 ; a:bl+br [second half] - paddd m0, m6 ; a:tl+tr+bl+br [first half] - paddd m2, m7 ; a:tl+tr+bl+br [second half] - paddd m1, m0 - paddd m3, m2 - pslld m1, 2 - pslld m3, 2 - psubd m1, m0 ; bb [first half] - psubd m3, m2 ; bb [second half] -%endif - -%if ARCH_X86_64 - movu m11, [aq-4] - movu m12, [aq+4] - paddd m1, [aq] ; a:top+ctr+bottom [first half] - paddd m11, m12 ; a:bl+br [first half] - movu m12, [aq+16-4] - movu m13, [aq+16+4] - paddd m3, [aq+16] ; a:top+ctr+bottom [second half] - paddd m12, m13 ; a:bl+br [second half] - paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] - paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] - paddd m0, m11 ; a:tl+tr+bl+br [first half] - paddd m2, m12 ; a:tl+tr+bl+br [second half] - paddd m13, m0 - paddd m14, m2 - pslld m13, 2 - pslld m14, 2 - psubd m13, m0 ; bb [first half] - psubd m14, m2 ; bb [second half] - psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] - psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] -%else - mova m4, [esp+0x80] - mova [esp+0x80], m5 - mova m5, [esp+0x70] - mova [esp+0x70], m6 - mova m6, [esp+0x60] - mova [esp+0x60], m7 - mova [esp+0x20], m1 - movu m7, [bq-2] - movu m1, [bq+2] - paddw m5, [bq] ; b:top+ctr+bottom - paddw m7, m1 - paddw m1, m5, m6 ; b:top+ctr+bottom+l+r - paddw m4, m7 ; b:tl+tr+bl+br - psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom - paddw m1, m4 - psllw m1, 2 - psubw m1, m4 ; aa - movq m0, [srcq] - XCHG_PIC_REG - 
punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16] - punpcklwd m4, m1, [PIC_sym(pw_16)] - punpckhwd m1, [PIC_sym(pw_16)] - punpcklwd m2, m0, [PIC_sym(pw_16)] - punpckhwd m0, [PIC_sym(pw_16)] - XCHG_PIC_REG - pmaddwd m4, m2 ; aa*src[x]+256 [first half] - pmaddwd m1, m0 ; aa*src[x]+256 [second half] -%endif - -%if ARCH_X86_64 - paddd m6, m13 - paddd m10, m14 - psrad m6, 9 - psrad m10, 9 - packssdw m6, m10 - mova [tq], m6 -%else - paddd m4, [esp+0x20] - paddd m1, m3 - psrad m4, 9 - psrad m1, 9 - packssdw m4, m1 - mova [tq], m4 -%endif - - ; shift to next row -%if ARCH_X86_64 - mova m0, m4 - mova m2, m5 - mova m4, m11 - mova m5, m12 - mova m6, m8 - mova m8, m9 -%else - mova m1, [esp+0x50] - mova m3, [esp+0x40] - mova m0, [esp+0x30] - mova m2, [esp+0x80] - mova m4, [esp+0x70] - mova [esp+0x70], m5 - mova m5, [esp+0x60] - mova [esp+0x80], m6 - mova [esp+0x60], m7 - psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] - psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] -%endif - - add srcq, strideq - add aq, (384+16)*4 - add bq, (384+16)*2 - add tq, 384*2 - dec yd - jg .loop_y - add xd, 8 - cmp xd, wd - jl .loop_x - RET - -cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt - movifnidn hd, hm -%if ARCH_X86_32 - SETUP_PIC r6, 0 -%endif - movd m0, wtm - pshufb m0, [PIC_sym(pb_0_1)] - psllw m0, 4 - pxor m7, m7 - DEFINE_ARGS dst, stride, t, w, h, idx -.loop_y: - xor idxd, idxd -.loop_x: - mova m1, [tq+idxq*2+ 0] - mova m4, [tq+idxq*2+16] - mova m5, [dstq+idxq] - punpcklbw m2, m5, m7 - punpckhbw m5, m7 - psllw m3, m2, 4 - psllw m6, m5, 4 - psubw m1, m3 - psubw m4, m6 - pmulhrsw m1, m0 - pmulhrsw m4, m0 - paddw m1, m2 - paddw m4, m5 - packuswb m1, m4 - mova [dstq+idxq], m1 - add idxd, 16 - cmp idxd, wd - jl .loop_x - add dstq, strideq - add tq, 384 * 2 - dec hd - jg .loop_y - RET - -%if ARCH_X86_64 -cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov edged, edgem - movifnidn wd, wm - mov hd, hm - mova m10, [pb_0] - mova m11, [pb_0_1] -%else -cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge - %define edgeb byte edgem - %define wd xd - %define wq wd - %define wm r5m - %define strideq r4m - SUB esp, 8 - SETUP_PIC sumsqd, 1, 1 - - %define m10 [PIC_sym(pb_0)] - %define m11 [PIC_sym(pb_0_1)] -%endif - - test edgeb, 2 ; have_right - jz .no_right - xor xlimd, xlimd - add wd, 2 - add wd, 15 - and wd, ~15 - jmp .right_done -.no_right: - mov xlimd, 3 - dec wd -.right_done: - pxor m1, m1 - lea srcq, [srcq+wq+1] - lea sumq, [sumq+wq*2-2] - lea sumsqq, [sumsqq+wq*4-4] - neg wq -%if ARCH_X86_64 - lea r10, [pb_right_ext_mask+16] -%else - mov wm, xd - %define wq wm -%endif - -.loop_y: - mov xq, wq - ; load left - test edgeb, 1 ; have_left - jz .no_left - test leftq, leftq - jz .load_left_from_main - movd m0, [leftq] - movd m2, [srcq+xq-1] - pslldq m2, 4 - por m0, m2 - pslldq m0, 11 - add leftq, 4 - jmp .expand_x -.no_left: - movd m0, [srcq+xq-1] - XCHG_PIC_REG - pshufb m0, m10 - XCHG_PIC_REG - jmp .expand_x -.load_left_from_main: - movd m0, [srcq+xq-4] - pslldq m0, 12 -.expand_x: - punpckhbw m0, m1 - - ; when we reach this, m0 contains left two px in highest words - cmp xd, -8 - jle .loop_x - test xd, xd - jge .right_extend -.partial_load_and_extend: - XCHG_PIC_REG - movd m3, [srcq-1] - movq m2, [srcq+xq] - pshufb m3, m10 - punpcklbw m3, m1 - punpcklbw m2, m1 -%if ARCH_X86_64 - movu m4, [r10+xq*2] -%else - movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] - XCHG_PIC_REG -%endif - pand m2, m4 - pandn m4, m3 - por m2, m4 - jmp .loop_x_noload -.right_extend: 
- psrldq m2, m0, 14 - XCHG_PIC_REG - pshufb m2, m11 - XCHG_PIC_REG - jmp .loop_x_noload - -.loop_x: - movq m2, [srcq+xq] - punpcklbw m2, m1 -.loop_x_noload: - palignr m3, m2, m0, 8 - palignr m4, m2, m0, 10 - palignr m5, m2, m0, 12 - palignr m6, m2, m0, 14 - -%if ARCH_X86_64 - paddw m0, m3, m2 - punpcklwd m7, m3, m2 - punpckhwd m3, m2 - paddw m0, m4 - punpcklwd m8, m4, m5 - punpckhwd m4, m5 - paddw m0, m5 - punpcklwd m9, m6, m1 - punpckhwd m5, m6, m1 - paddw m0, m6 - pmaddwd m7, m7 - pmaddwd m3, m3 - pmaddwd m8, m8 - pmaddwd m4, m4 - pmaddwd m9, m9 - pmaddwd m5, m5 - paddd m7, m8 - paddd m3, m4 - paddd m7, m9 - paddd m3, m5 - movu [sumq+xq*2], m0 - movu [sumsqq+xq*4+ 0], m7 - movu [sumsqq+xq*4+16], m3 -%else - paddw m0, m3, m2 - paddw m0, m4 - paddw m0, m5 - paddw m0, m6 - movu [sumq+xq*2], m0 - punpcklwd m7, m3, m2 - punpckhwd m3, m2 - punpcklwd m0, m4, m5 - punpckhwd m4, m5 - punpckhwd m5, m6, m1 - pmaddwd m7, m7 - pmaddwd m3, m3 - pmaddwd m0, m0 - pmaddwd m4, m4 - pmaddwd m5, m5 - paddd m7, m0 - paddd m3, m4 - paddd m3, m5 - punpcklwd m0, m6, m1 - pmaddwd m0, m0 - paddd m7, m0 - movu [sumsqq+xq*4+ 0], m7 - movu [sumsqq+xq*4+16], m3 -%endif - - mova m0, m2 - add xq, 8 - - ; if x <= -8 we can reload more pixels - ; else if x < 0 we reload and extend (this implies have_right=0) - ; else if x < xlimd we extend from previous load (this implies have_right=0) - ; else we are done - - cmp xd, -8 - jle .loop_x - test xd, xd - jl .partial_load_and_extend - cmp xd, xlimd - jl .right_extend - - add srcq, strideq - add sumsqq, (384+16)*4 - add sumq, (384+16)*2 - dec hd - jg .loop_y -%if ARCH_X86_32 - ADD esp, 8 -%endif - RET - -%if ARCH_X86_64 -cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov ylimd, edged -%else -cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr - %define wm [esp+0] - %define hm [esp+4] - %define edgem [esp+8] - mov wm, xd - mov hm, yd - mov edgem, ylimd -%endif - - and ylimd, 8 ; have_bottom - shr ylimd, 2 - sub ylimd, 3 ; -3 if have_bottom=0, else -1 - mov xq, -2 -%if ARCH_X86_64 -.loop_x: - lea yd, [hd+ylimd+2] - lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] - test edgeb, 4 ; have_top - jnz .load_top - movu m0, [sumsq_ptrq+(384+16)*4*1] - movu m1, [sumsq_ptrq+(384+16)*4*1+16] - mova m2, m0 - mova m3, m1 - mova m4, m0 - mova m5, m1 - mova m6, m0 - mova m7, m1 - movu m10, [sum_ptrq+(384+16)*2*1] - mova m11, m10 - mova m12, m10 - mova m13, m10 - jmp .loop_y_second_load -.load_top: - movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] - movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] - movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] - movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] - mova m2, m0 - mova m3, m1 - movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 - movu m12, [sum_ptrq-(384+16)*2*0] ; l2 - mova m11, m10 -.loop_y: - movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] - movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] - movu m13, [sum_ptrq+(384+16)*2*1] ; l1 -.loop_y_second_load: - test yd, yd - jle .emulate_second_load - movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] - movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] - movu m14, [sum_ptrq+(384+16)*2*2] ; l0 -.loop_y_noload: - paddd m0, m2 - paddd m1, m3 - paddw m10, m11 - paddd m0, m4 - paddd m1, m5 - paddw m10, m12 - paddd m0, m6 - paddd m1, m7 - paddw m10, m13 - paddd m0, m8 - paddd m1, m9 - paddw m10, m14 - movu [sumsq_ptrq+ 0], m0 - movu [sumsq_ptrq+16], m1 - movu [sum_ptrq], 
m10 - - ; shift position down by one - mova m0, m4 - mova m1, m5 - mova m2, m6 - mova m3, m7 - mova m4, m8 - mova m5, m9 - mova m10, m12 - mova m11, m13 - mova m12, m14 - add sumsq_ptrq, (384+16)*4*2 - add sum_ptrq, (384+16)*2*2 - sub yd, 2 - jge .loop_y - ; l1 = l0 - mova m6, m8 - mova m7, m9 - mova m13, m14 - cmp yd, ylimd - jg .loop_y_noload - add xd, 8 - cmp xd, wd - jl .loop_x - RET -.emulate_second_load: - mova m8, m6 - mova m9, m7 - mova m14, m13 - jmp .loop_y_noload -%else -.sumsq_loop_x: - lea yd, [ylimd+2] - add yd, hm - lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - test byte edgem, 4 ; have_top - jnz .sumsq_load_top - movu m0, [sumsq_ptrq+(384+16)*4*1] - movu m1, [sumsq_ptrq+(384+16)*4*1+16] - mova m4, m0 - mova m5, m1 - mova m6, m0 - mova m7, m1 - mova [esp+0x1c], m0 - mova [esp+0x0c], m1 - jmp .sumsq_loop_y_second_load -.sumsq_load_top: - movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] - movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] - movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] - movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] - mova [esp+0x1c], m0 - mova [esp+0x0c], m1 -.sumsq_loop_y: - movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] - movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] -.sumsq_loop_y_second_load: - test yd, yd - jle .sumsq_emulate_second_load - movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] - movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] -.sumsq_loop_y_noload: - paddd m0, [esp+0x1c] - paddd m1, [esp+0x0c] - paddd m0, m4 - paddd m1, m5 - paddd m0, m6 - paddd m1, m7 - paddd m0, m2 - paddd m1, m3 - movu [sumsq_ptrq+ 0], m0 - movu [sumsq_ptrq+16], m1 - - ; shift position down by one - mova m0, m4 - mova m1, m5 - mova m4, m2 - mova m5, m3 - mova [esp+0x1c], m6 - mova [esp+0x0c], m7 - add sumsq_ptrq, (384+16)*4*2 - sub yd, 2 - jge .sumsq_loop_y - ; l1 = l0 - mova m6, m2 - mova m7, m3 - cmp yd, ylimd - jg .sumsq_loop_y_noload - add xd, 8 - cmp xd, wm - jl .sumsq_loop_x - - mov xd, -2 -.sum_loop_x: - lea yd, [ylimd+2] - add yd, hm - lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test byte edgem, 4 ; have_top - jnz .sum_load_top - movu m0, [sum_ptrq+(384+16)*2*1] - mova m1, m0 - mova m2, m0 - mova m3, m0 - jmp .sum_loop_y_second_load -.sum_load_top: - movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4 - movu m2, [sum_ptrq-(384+16)*2*0] ; l2 - mova m1, m0 -.sum_loop_y: - movu m3, [sum_ptrq+(384+16)*2*1] ; l1 -.sum_loop_y_second_load: - test yd, yd - jle .sum_emulate_second_load - movu m4, [sum_ptrq+(384+16)*2*2] ; l0 -.sum_loop_y_noload: - paddw m0, m1 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movu [sum_ptrq], m0 - - ; shift position down by one - mova m0, m2 - mova m1, m3 - mova m2, m4 - add sum_ptrq, (384+16)*2*2 - sub yd, 2 - jge .sum_loop_y - ; l1 = l0 - mova m3, m4 - cmp yd, ylimd - jg .sum_loop_y_noload - add xd, 8 - cmp xd, wm - jl .sum_loop_x - RET -.sumsq_emulate_second_load: - mova m2, m6 - mova m3, m7 - jmp .sumsq_loop_y_noload -.sum_emulate_second_load: - mova m4, m3 - jmp .sum_loop_y_noload -%endif - -cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s - movifnidn sd, sm - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 -%if ARCH_X86_64 - LEA r5, sgr_x_by_x-0xF03 -%else - SETUP_PIC r5, 0 -%endif - movd m6, sd - pshuflw m6, m6, q0000 - punpcklqdq m6, m6 - pxor m7, m7 - DEFINE_ARGS a, b, w, h, x -%if ARCH_X86_64 - mova m8, [pd_0xF0080029] - mova m9, [pw_256] - psrld m10, m9, 15 ; pd_512 -%else - %define m8 [PIC_sym(pd_0xF0080029)] - %define m9 [PIC_sym(pw_256)] - %define m10 [PIC_sym(pd_512)] -%endif -.loop_y: - mov xq, -2 -.loop_x: - 
movq m0, [bq+xq*2+0] - movq m1, [bq+xq*2+8] - punpcklwd m0, m7 - punpcklwd m1, m7 - movu m2, [aq+xq*4+ 0] - movu m3, [aq+xq*4+16] - pslld m4, m2, 3 ; aa * 8 - pslld m5, m3, 3 - paddd m2, m4 ; aa * 9 - paddd m3, m5 - paddd m4, m4 ; aa * 16 - paddd m5, m5 - paddd m2, m4 ; aa * 25 - paddd m3, m5 - pmaddwd m4, m0, m0 - pmaddwd m5, m1, m1 - psubd m2, m4 ; p = aa * 25 - bb * bb - psubd m3, m5 - MULLD m2, m6 - MULLD m3, m6 - paddusw m2, m8 - paddusw m3, m8 - psrld m2, 20 ; z - psrld m3, 20 - GATHERDD m4, m2 ; xx - GATHERDD m2, m3 - psrld m4, 24 - psrld m2, 24 - packssdw m3, m4, m2 - pmullw m4, m8 - pmullw m2, m8 - psubw m5, m9, m3 - pmaddwd m0, m4 - pmaddwd m1, m2 - paddd m0, m10 - paddd m1, m10 - psrld m0, 10 - psrld m1, 10 - movu [bq+xq*2], m5 - movu [aq+xq*4+ 0], m0 - movu [aq+xq*4+16], m1 - add xd, 8 - cmp xd, wd - jl .loop_x - add aq, (384+16)*4*2 - add bq, (384+16)*2*2 - sub hd, 2 - jg .loop_y - RET - -%if ARCH_X86_64 -cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ - tmp_base, src_base, a_base, b_base, x, y - movifnidn wd, wm - mov hd, hm - mov tmp_baseq, tq - mov src_baseq, srcq - mov a_baseq, aq - mov b_baseq, bq - mova m9, [pw_5_6] - mova m12, [pw_256] - psrlw m10, m12, 8 ; pw_1 - psrlw m11, m12, 1 ; pw_128 - pxor m13, m13 -%else -cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y - %define tmp_baseq r0m - %define src_baseq r1m - %define a_baseq r3m - %define b_baseq r4m - %define wd r5m - %define hd r6m - - SUB esp, 8 - SETUP_PIC yd - - %define m8 m5 - %define m9 [PIC_sym(pw_5_6)] - %define m10 [PIC_sym(pw_1)] - %define m11 [PIC_sym(pw_128)] - %define m12 [PIC_sym(pw_256)] - %define m13 m0 -%endif - xor xd, xd -.loop_x: - mov tq, tmp_baseq - mov srcq, src_baseq - mov aq, a_baseq - mov bq, b_baseq - movu m0, [aq+xq*4-(384+16)*4-4] - mova m1, [aq+xq*4-(384+16)*4] - movu m2, [aq+xq*4-(384+16)*4+4] - movu m3, [aq+xq*4-(384+16)*4-4+16] - mova m4, [aq+xq*4-(384+16)*4+16] - movu m5, [aq+xq*4-(384+16)*4+4+16] - paddd m0, m2 - paddd m3, m5 - paddd m0, m1 - paddd m3, m4 - pslld m2, m0, 2 - pslld m5, m3, 2 - paddd m2, m0 - paddd m5, m3 - paddd m0, m2, m1 ; prev_odd_b [first half] - paddd m1, m5, m4 ; prev_odd_b [second half] - movu m3, [bq+xq*2-(384+16)*2-2] - mova m4, [bq+xq*2-(384+16)*2] - movu m5, [bq+xq*2-(384+16)*2+2] - paddw m3, m5 - punpcklwd m5, m3, m4 - punpckhwd m3, m4 - pmaddwd m5, m9 - pmaddwd m3, m9 - mova m2, m5 - packssdw m2, m3 ; prev_odd_a - lea tq, [tq+xq*2] - lea srcq, [srcq+xq*1] - lea aq, [aq+xq*4+(384+16)*4] - lea bq, [bq+xq*2+(384+16)*2] -%if ARCH_X86_32 - mov [esp], PIC_reg -%endif - mov yd, hd - XCHG_PIC_REG -.loop_y: - movu m3, [aq-4] - mova m4, [aq] - movu m5, [aq+4] - paddd m3, m5 - paddd m3, m4 - pslld m5, m3, 2 - paddd m5, m3 - paddd m5, m4 ; cur_odd_b [first half] - movu m3, [aq+16-4] - mova m6, [aq+16] - movu m7, [aq+16+4] - paddd m3, m7 - paddd m3, m6 - pslld m7, m3, 2 - paddd m7, m3 - paddd m4, m7, m6 ; cur_odd_b [second half] - movu m3, [bq-2] - mova m6, [bq] - movu m7, [bq+2] - paddw m3, m7 - punpcklwd m7, m3, m6 - punpckhwd m3, m6 - pmaddwd m7, m9 - pmaddwd m3, m9 - packssdw m6, m7, m3 ; cur_odd_a - - paddd m0, m5 ; cur_even_b [first half] - paddd m1, m4 ; cur_even_b [second half] - paddw m2, m6 ; cur_even_a - - movq m3, [srcq] -%if ARCH_X86_64 - punpcklbw m3, m13 -%else - mova [td], m5 - pxor m7, m7 - punpcklbw m3, m7 -%endif - punpcklwd m7, m3, m10 - punpckhwd m3, m10 - punpcklwd m8, m2, m12 - punpckhwd m2, m12 - pmaddwd m7, m8 - pmaddwd m3, m2 - paddd m7, m0 - paddd m3, m1 - psrad m7, 9 - psrad m3, 9 - -%if ARCH_X86_32 - 
pxor m13, m13 -%endif - movq m8, [srcq+strideq] - punpcklbw m8, m13 - punpcklwd m0, m8, m10 - punpckhwd m8, m10 - punpcklwd m1, m6, m11 - punpckhwd m2, m6, m11 - pmaddwd m0, m1 - pmaddwd m8, m2 -%if ARCH_X86_64 - paddd m0, m5 -%else - paddd m0, [td] -%endif - paddd m8, m4 - psrad m0, 8 - psrad m8, 8 - - packssdw m7, m3 - packssdw m0, m8 -%if ARCH_X86_32 - mova m5, [td] -%endif - mova [tq+384*2*0], m7 - mova [tq+384*2*1], m0 - - mova m0, m5 - mova m1, m4 - mova m2, m6 - add aq, (384+16)*4*2 - add bq, (384+16)*2*2 - add tq, 384*2*2 - lea srcq, [srcq+strideq*2] -%if ARCH_X86_64 - sub yd, 2 -%else - sub dword [esp+4], 2 -%endif - jg .loop_y - add xd, 8 - cmp xd, wd - jl .loop_x -%if ARCH_X86_32 - ADD esp, 8 -%endif - RET - -cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt - movifnidn wd, wm - movd m0, wtm -%if ARCH_X86_64 - movifnidn hd, hm - mova m10, [pd_1024] - pxor m11, m11 -%else - SETUP_PIC hd, 0 - %define m10 [PIC_sym(pd_1024)] - %define m11 m7 -%endif - pshufd m0, m0, 0 - DEFINE_ARGS dst, stride, t1, t2, w, h, idx -%if ARCH_X86_32 - %define hd hmp -%endif - -.loop_y: - xor idxd, idxd -.loop_x: - mova m1, [t1q+idxq*2+ 0] - mova m2, [t1q+idxq*2+16] - mova m3, [t2q+idxq*2+ 0] - mova m4, [t2q+idxq*2+16] - mova m6, [dstq+idxq] -%if ARCH_X86_32 - pxor m11, m11 -%endif - punpcklbw m5, m6, m11 - punpckhbw m6, m11 - psllw m7, m5, 4 - psubw m1, m7 - psubw m3, m7 - psllw m7, m6, 4 - psubw m2, m7 - psubw m4, m7 - punpcklwd m7, m1, m3 - punpckhwd m1, m3 - punpcklwd m3, m2, m4 - punpckhwd m2, m4 - pmaddwd m7, m0 - pmaddwd m1, m0 - pmaddwd m3, m0 - pmaddwd m2, m0 - paddd m7, m10 - paddd m1, m10 - paddd m3, m10 - paddd m2, m10 - psrad m7, 11 - psrad m1, 11 - psrad m3, 11 - psrad m2, 11 - packssdw m7, m1 - packssdw m3, m2 - paddw m7, m5 - paddw m3, m6 - packuswb m7, m3 - mova [dstq+idxq], m7 - add idxd, 16 - cmp idxd, wd - jl .loop_x - add dstq, strideq - add t1q, 384 * 2 - add t2q, 384 * 2 - dec hd - jg .loop_y - RET diff -Nru dav1d-0.7.1/src/x86/mc16_avx2.asm dav1d-0.9.1/src/x86/mc16_avx2.asm --- dav1d-0.7.1/src/x86/mc16_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/mc16_avx2.asm 2021-07-28 21:38:28.909852300 +0000 @@ -0,0 +1,4006 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +; dav1d_obmc_masks[] * -512 +obmc_masks: dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 + dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0 + dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120 + dw -4096, -3072, -2048, -1536, 0, 0, 0, 0 + dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240 + dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608 + dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024 + +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + +put_bilin_h_rnd: dw 8, 8, 10, 10 +prep_mul: dw 16, 16, 4, 4 +put_8tap_h_rnd: dd 34, 40 +prep_8tap_1d_rnd: dd 8 - (8192 << 4) +prep_8tap_2d_rnd: dd 32 - (8192 << 5) +warp8x8t_rnd: dd 16384 - (8192 << 15) +warp8x8_shift: dd 5, 3 +warp8x8_rnd: dw 4096, 4096, 16384, 16384 +bidir_rnd: dw -16400, -16400, -16388, -16388 +bidir_mul: dw 2048, 2048, 8192, 8192 + +%define pw_16 prep_mul + +pw_2: times 2 dw 2 +pw_64: times 2 dw 64 +pw_2048: times 2 dw 2048 +pw_8192: times 2 dw 8192 +pw_27615: times 2 dw 27615 +pw_32766: times 2 dw 32766 +pw_m512: times 2 dw -512 +pd_32: dd 32 +pd_512: dd 512 +pd_32768: dd 32768 +pd_65538: dd 65538 + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put) +%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep) + +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - 
%%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter + +SECTION .text + +INIT_XMM avx2 +cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy + mov mxyd, r6m ; mx + lea r7, [put_avx2] +%if UNIX64 + DECLARE_REG_TMP 8 + %define org_w r8d + mov r8d, wd +%else + DECLARE_REG_TMP 7 + %define org_w wm +%endif + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +INIT_YMM avx2 +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*0+32*1], m1 + mova [dstq+dsq*1+32*0], m2 + mova [dstq+dsq*1+32*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + add srcq, ssq + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + movu m0, [srcq+32*4] + movu m1, [srcq+32*5] + movu m2, [srcq+32*6] + movu m3, [srcq+32*7] + add srcq, ssq + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + movd xm5, mxyd + mov mxyd, r7m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + mov r6d, r8m ; bitdepth_max + add wq, r7 + shr r6d, 11 + vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] + jmp wq +.h_w2: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw xm0, xm4, xm1 + psrlq xm1, 16 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 4 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + movq xm1, [srcq+ssq*0+2] + movhps xm1, 
[srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw xm0, xm4 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 4 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + movu xm1, [srcq+ssq*0+2] + vinserti128 m1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + pmullw m0, m4, [srcq+ssq*0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+ssq*1] + pmullw m2, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m5, [srcq+32*0+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+32*1] + pmullw m2, m5, [srcq+32*1+2] + add srcq, ssq + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: +.h_w128: + movifnidn t0d, org_w +.h_w64_loop0: + mov r6d, t0d +.h_w64_loop: + pmullw m0, m4, [srcq+r6*2-32*1] + pmullw m1, m5, [srcq+r6*2-32*1+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+r6*2-32*2] + pmullw m2, m5, [srcq+r6*2-32*2+2] + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2-32*1], m0 + mova [dstq+r6*2-32*2], m1 + sub r6d, 32 + jg .h_w64_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w64_loop0 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + shl mxyd, 11 + movd xm5, mxyd + add wq, r7 + vpbroadcastw m5, xm5 + jmp wq +.v_w2: + movd xm0, [srcq+ssq*0] +.v_w2_loop: + movd xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq xm2, xm0, xm1 + movd xm0, [srcq+ssq*0] + punpckldq xm1, xm0 + psubw xm1, xm2 + pmulhrsw xm1, xm5 + paddw xm1, xm2 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm0, [srcq+ssq*0] +.v_w4_loop: + movq xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq xm2, xm0, xm1 + movq xm0, [srcq+ssq*0] + punpcklqdq xm1, xm0 + psubw xm1, xm2 + pmulhrsw xm1, xm5 + paddw xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+ssq*0] +.v_w8_loop: + vbroadcasti128 m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m2, m0, m1, 0xf0 + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m1, m0, 0xf0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w32: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] +.v_w32_loop: + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + psubw m4, m2, m0 + pmulhrsw m4, m5 + paddw m4, m0 + movu m0, [srcq+ssq*0+32*0] + mova [dstq+dsq*0+32*0], m4 + psubw m4, m3, m1 + pmulhrsw m4, m5 + paddw m4, m1 + movu m1, [srcq+ssq*0+32*1] + mova [dstq+dsq*0+32*1], m4 + psubw m4, m0, m2 + pmulhrsw m4, m5 + paddw m4, m2 + mova [dstq+dsq*1+32*0], m4 + psubw m4, m1, m3 + pmulhrsw m4, m5 + paddw m4, m3 + mova [dstq+dsq*1+32*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w16: +.v_w64: +.v_w128: + movifnidn t0d, org_w + add t0d, t0d + 
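; Wide vertical blocks are handled 16 pixels (32 bytes) per column pass:
; r6d packs the remaining column count in its upper bits and the row count
; in its low byte (restored with "movzx hd, r6b", advanced with "sub r6d, 1<<8"),
; while r4/r7 hold the per-column src/dst base pointers.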
mov r4, srcq + lea r6d, [hq+t0*8-256] + mov r7, dstq +.v_w16_loop0: + movu m0, [srcq+ssq*0] +.v_w16_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 + vpbroadcastd m3, [pw_2] + movd xm6, mxyd + vpbroadcastd m7, [pw_8192] + add wq, r7 + vpbroadcastw m6, xm6 + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + vpbroadcastd m7, [pw_2048] +.hv_12bpc: + jmp wq +.hv_w2: + vpbroadcastq xm1, [srcq+ssq*0] + pmullw xm0, xm4, xm1 + psrlq xm1, 16 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 +.hv_w2_loop: + movq xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm2, [srcq+ssq*0] + pmullw xm1, xm4, xm2 + psrlq xm2, 16 + pmullw xm2, xm5 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 ; 1 _ 2 _ + shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ + mova xm0, xm1 + psubw xm1, xm2 + paddw xm1, xm1 + pmulhw xm1, xm6 + paddw xm1, xm2 + pmulhrsw xm1, xm7 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + pmullw xm0, xm4, [srcq+ssq*0-8] + pmullw xm1, xm5, [srcq+ssq*0-6] + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 +.hv_w4_loop: + movq xm1, [srcq+ssq*1] + movq xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps xm1, [srcq+ssq*0] + movhps xm2, [srcq+ssq*0+2] + pmullw xm1, xm4 + pmullw xm2, xm5 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 ; 1 2 + shufpd xm2, xm0, xm1, 0x01 ; 0 1 + mova xm0, xm1 + psubw xm1, xm2 + paddw xm1, xm1 + pmulhw xm1, xm6 + paddw xm1, xm2 + pmulhrsw xm1, xm7 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xm0, xm4, [srcq+ssq*0] + pmullw xm1, xm5, [srcq+ssq*0+2] + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 + vinserti128 m0, xm0, 1 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + movu xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + vinserti128 m1, [srcq+ssq*0], 1 + vinserti128 m2, [srcq+ssq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +.hv_w32: +.hv_w64: +.hv_w128: +%if UNIX64 + lea r6d, [r8*2-32] +%else + mov r6d, wm + lea r6d, [r6*2-32] +%endif + mov r4, srcq + lea r6d, [hq+r6*8] + mov r7, dstq +.hv_w16_loop0: + pmullw m0, m4, [srcq+ssq*0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w16_loop: + pmullw m1, m4, [srcq+ssq*1] + pmullw m2, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + pmullw m0, m4, [srcq+ssq*0] + pmullw m2, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + 
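; same column-loop scheme as .v: reload the height from the low byte,
; step src/dst to the next 32-byte column, count columns in the high bits.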
mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w16_loop0 + RET + +cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep_avx2] +%if UNIX64 + DECLARE_REG_TMP 7 + %define org_w r7d +%else + DECLARE_REG_TMP 6 + %define org_w r5m +%endif + mov org_w, wd + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [r6-prep_avx2+pw_8192] + add wq, r6 + shr r5d, 11 + vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + vpbroadcastq m1, [srcq+strideq*2] + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + pmullw m0, m4 + psubw m0, m5 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*2] + vinserti128 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m4, [srcq+strideq*2] + pmullw m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmullw m0, m4, [srcq+strideq*0+32*0] + pmullw m1, m4, [srcq+strideq*0+32*1] + pmullw m2, m4, [srcq+strideq*1+32*0] + pmullw m3, m4, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .prep_w32 + RET +.prep_w64: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .prep_w64 + RET +.prep_w128: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + pmullw m0, m4, [srcq+32*4] + pmullw m1, m4, [srcq+32*5] + pmullw m2, m4, [srcq+32*6] + pmullw m3, m4, [srcq+32*7] + add tmpq, 32*8 + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd xm5, mxyd + mov mxyd, r6m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + psubw m4, m5 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m4, 2 + psllw m5, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti128 m2, [srcq+stride3q ], 1 + lea 
srcq, [srcq+strideq*4] + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + pmullw m0, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*0+2] + vinserti128 m1, [srcq+strideq*1+2], 1 + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: +.h_w64: +.h_w128: + movifnidn t0d, org_w +.h_w32_loop0: + mov r3d, t0d +.h_w32_loop: + pmullw m0, m4, [srcq+r3*2-32*1] + pmullw m1, m5, [srcq+r3*2-32*1+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+r3*2-32*2] + pmullw m2, m5, [srcq+r3*2-32*2+2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+r3*2-32*1], m0 + mova [tmpq+r3*2-32*2], m1 + sub r3d, 32 + jg .h_w32_loop + add srcq, strideq + lea tmpq, [tmpq+t0*2] + dec hd + jg .h_w32_loop0 + RET +.v: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + movd xm5, mxyd + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + add wq, r6 + lea stride3q, [strideq*3] + psubw m4, m5 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m4, 2 + psllw m5, 2 +.v_12bpc: + jmp wq +.v_w4: + movq xm0, [srcq+strideq*0] +.v_w4_loop: + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq xm1, [srcq+strideq*1] + vpblendd m2, m0, 0x03 ; 0 2 2 2 + vpbroadcastq m0, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m0, 0xf0 ; 1 1 3 3 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m1, m2, 0x33 ; 0 1 2 3 + vpblendd m0, m2, 0x0c ; 4 2 4 4 + punpckhqdq m2, m1, m0 ; 1 2 3 4 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+strideq*0] +.v_w8_loop: + vbroadcasti128 m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m1, m0, m2, 0xf0 ; 0 1 + vbroadcasti128 m0, [srcq+strideq*0] + vpblendd m2, m0, 0xf0 ; 1 2 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+32*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.v_w32_loop0: + movu m0, [srcq+strideq*0] +.v_w32_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+r7*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+r7*1], m1 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .v_w32_loop + add r3, 
32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + POP r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + movd xm6, mxyd + add wq, r6 + lea stride3q, [strideq*3] + vpbroadcastw m6, xm6 + jmp wq +.hv_w4: + movu xm1, [srcq+strideq*0] +%if WIN64 + movaps [rsp+24], xmm7 +%endif + pmullw xm0, xm4, xm1 + psrldq xm1, 2 + pmullw xm1, xm5 + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vpbroadcastq m0, xm0 +.hv_w4_loop: + movu xm1, [srcq+strideq*1] + vinserti128 m1, [srcq+stride3q ], 1 + movu xm2, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vinserti128 m2, [srcq+strideq*0], 1 + punpcklqdq m7, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + pmullw m7, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m7, m3 + paddw m1, m7 + psraw m1, 2 ; 1 2 3 4 + vpblendd m0, m1, 0x3f + vpermq m2, m0, q2103 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop +%if WIN64 + movaps xmm7, [rsp+24] +%endif + RET +.hv_w8: + pmullw xm0, xm4, [srcq+strideq*0] + pmullw xm1, xm5, [srcq+strideq*0+2] + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vinserti128 m0, xm0, 1 +.hv_w8_loop: + movu xm1, [srcq+strideq*1] + movu xm2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + vinserti128 m1, [srcq+strideq*0], 1 + vinserti128 m2, [srcq+strideq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +.hv_w32: +.hv_w64: +.hv_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.hv_w16_loop0: + pmullw m0, m4, [srcq] + pmullw m1, m5, [srcq+2] + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+r7*0], m2 + pmullw m0, m4, [srcq+strideq*0] + pmullw m2, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+r7*1], m2 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .hv_w16_loop + add r3, 32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .hv_w16_loop0 +%if WIN64 + POP r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v +cglobal %1_8tap_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%endif +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +MC_8TAP_FN put, sharp, SHARP, SHARP +MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN put, smooth, SMOOTH, SMOOTH +MC_8TAP_FN put, sharp_regular, SHARP, REGULAR +MC_8TAP_FN put, regular_sharp, REGULAR, SHARP +MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN put, regular, REGULAR, REGULAR + +cglobal put_8tap_16bpc, 4, 9, 0, 
dst, ds, src, ss, w, h, mx, my +%define base r8-put_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 +%if WIN64 + pop r8 +%endif + jmp wq +.h_w2: + movzx mxd, mxb + sub srcq, 2 + mova xm2, [subpel_h_shuf2] + vpbroadcastd xm3, [base+subpel_filters+mxq*8+2] + pmovsxbw xm3, xm3 +.h_w2_loop: + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm2 + pshufb xm1, xm2 + pmaddwd xm0, xm3 + pmaddwd xm1, xm3 + phaddd xm0, xm1 + paddd xm0, xm4 + psrad xm0, 6 + packusdw xm0, xm0 + pminsw xm0, xm5 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm3, [base+subpel_filters+mxq*8] + WIN64_SPILL_XMM 8 + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + pshufd xm3, xm3, q2211 + vpbroadcastq m2, xm3 + vpermq m3, m3, q1111 +.h_w4_loop: + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 + pminsw xm0, xm5 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + mov r7d, r8m + vpbroadcastw m5, r8m + shr r7d, 11 + vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] + cmp wd, 4 + je .h_w4 + jl .h_w2 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 13 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m4 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m4 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 6 + psrad m%2, 6 + packusdw m%1, m%2 + pminsw m%1, m5 +%endmacro + movu xm0, [srcq+ssq*0+ 0] + vinserti128 m0, [srcq+ssq*1+ 0], 1 + movu xm2, [srcq+ssq*0+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + shufpd m1, m0, m2, 0x05 + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6*2-32] + movu m1, [srcq+r6*2-24] + movu m2, [srcq+r6*2-16] + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+r6*2-32], m0 + sub r6d, 16 + jg .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m0, 
[base+subpel_filters+myq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastd m6, [pd_32] + vpbroadcastw m7, r8m + lea r6, [ssq*3] + sub srcq, r6 + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd xm2, [srcq+ssq*0] + pinsrd xm2, [srcq+ssq*1], 1 + pinsrd xm2, [srcq+ssq*2], 2 + pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*4] + movd xm3, [srcq+ssq*0] + vpbroadcastd xm1, [srcq+ssq*1] + vpbroadcastd xm0, [srcq+ssq*2] + add srcq, r6 + vpblendd xm3, xm1, 0x02 ; 4 5 + vpblendd xm1, xm0, 0x02 ; 5 6 + palignr xm4, xm3, xm2, 4 ; 1 2 3 4 + punpcklwd xm3, xm1 ; 45 56 + punpcklwd xm1, xm2, xm4 ; 01 12 + punpckhwd xm2, xm4 ; 23 34 +.v_w2_loop: + vpbroadcastd xm4, [srcq+ssq*0] + pmaddwd xm5, xm8, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm9 ; a1 b1 + paddd xm5, xm6 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm10 ; a2 b2 + paddd xm5, xm3 + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm4, xm0, 0x02 ; 7 8 + punpcklwd xm3, xm4 ; 67 78 + pmaddwd xm4, xm11, xm3 ; a3 b3 + paddd xm5, xm4 + psrad xm5, 6 + packusdw xm5, xm5 + pminsw xm5, xm7 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm1, [srcq+ssq*0] + vpbroadcastq m0, [srcq+ssq*1] + vpbroadcastq m2, [srcq+ssq*2] + vpbroadcastq m4, [srcq+r6 ] + lea srcq, [srcq+ssq*4] + vpbroadcastq m3, [srcq+ssq*0] + vpbroadcastq m5, [srcq+ssq*1] + vpblendd m1, m0, 0x30 + vpblendd m0, m2, 0x30 + punpcklwd m1, m0 ; 01 12 + vpbroadcastq m0, [srcq+ssq*2] + add srcq, r6 + vpblendd m2, m4, 0x30 + vpblendd m4, m3, 0x30 + punpcklwd m2, m4 ; 23 34 + vpblendd m3, m5, 0x30 + vpblendd m5, m0, 0x30 + punpcklwd m3, m5 ; 45 56 +.v_w4_loop: + vpbroadcastq m4, [srcq+ssq*0] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m6 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m4, m0, 0x30 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 6 + vextracti128 xm4, m5, 1 + packusdw xm5, xm4 + pminsw xm5, xm7 + movq [dstq+dsq*0], xm5 + movhps [dstq+dsq*1], xm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + shl wd, 5 + mov r7, srcq + mov r8, dstq + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m4, [srcq+ssq*0] + vbroadcasti128 m5, [srcq+ssq*1] + vbroadcasti128 m0, [srcq+r6 ] + vbroadcasti128 m6, [srcq+ssq*2] + lea srcq, [srcq+ssq*4] + vbroadcasti128 m1, [srcq+ssq*0] + vbroadcasti128 m2, [srcq+ssq*1] + vbroadcasti128 m3, [srcq+ssq*2] + add srcq, r6 + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + shufpd m0, m3, 0x0c + punpcklwd m3, m6, m0 ; 23 + punpckhwd m6, m0 ; 56 +.v_w8_loop: + vbroadcasti128 m14, [srcq+ssq*0] + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + vbroadcasti128 m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + paddd m13, m6 + shufpd m6, m0, m14, 0x0d + shufpd m0, m14, m5, 0x0c + punpcklwd m5, m6, m0 ; 67 + punpckhwd m6, m0 ; 78 + 
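; final tap pair (rows 6-7 / 7-8): the 32-bit accumulators are shifted
; right by 5, packed with unsigned saturation, rounded by pavgw with zero
; ((x+1)>>1), clipped to pixel_max in m7 and stored as two rows.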
pmaddwd m14, m11, m5 ; a3 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packusdw m12, m13 + pxor m13, m13 + pavgw m12, m13 + pminsw m12, m7 + vpermq m12, m12, q3120 + mova [dstq+dsq*0], xm12 + vextracti128 [dstq+dsq*1], m12, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + add r7, 16 + add r8, 16 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .v_w8_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vpbroadcastw m15, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + vpbroadcastd m6, [pd_512] + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_10bit + psraw m7, 2 + psllw m1, 2 +.hv_10bit: + pshufd m11, m1, q0000 + pshufd m12, m1, q1111 + pshufd m13, m1, q2222 + pshufd m14, m1, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m9, [subpel_h_shuf2] + vbroadcasti128 m1, [srcq+r6 ] ; 3 3 + movu xm3, [srcq+ssq*2] + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*4] + vinserti128 m3, [srcq+ssq*0], 1 ; 2 4 + vinserti128 m0, [srcq+ssq*1], 1 ; 0 5 + vinserti128 m2, [srcq+ssq*2], 1 ; 1 6 + add srcq, r6 + pshufb m1, m9 + pshufb m3, m9 + pshufb m0, m9 + pshufb m2, m9 + pmaddwd m1, m7 + pmaddwd m3, m7 + pmaddwd m0, m7 + pmaddwd m2, m7 + phaddd m1, m3 + phaddd m0, m2 + paddd m1, m6 + paddd m0, m6 + psrad m1, 10 + psrad m0, 10 + packssdw m1, m0 ; 3 2 0 1 + vextracti128 xm0, m1, 1 ; 3 4 5 6 + pshufd xm2, xm1, q1301 ; 2 3 1 2 + pshufd xm3, xm0, q2121 ; 4 5 4 5 + punpckhwd xm1, xm2 ; 01 12 + punpcklwd xm2, xm0 ; 23 34 + punpckhwd xm3, xm0 ; 45 56 +.hv_w2_loop: + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm4, xm9 + pshufb xm5, xm9 + pmaddwd xm4, xm7 + pmaddwd xm5, xm7 + phaddd xm4, xm5 + pmaddwd xm5, xm11, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm12 ; a1 b1 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm13 ; a2 b2 + paddd xm5, xm3 + paddd xm4, xm6 + psrad xm4, 10 + packssdw xm4, xm4 + palignr xm3, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm3, xm0 ; 67 78 + pmaddwd xm4, xm14, xm3 ; a3 b3 + paddd xm5, xm6 + paddd xm5, xm4 + psrad xm5, 10 + packusdw xm5, xm5 + pminsw xm5, xm15 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + vbroadcasti128 m9, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + pshufd m8, m7, q1111 + pshufd m7, m7, q0000 + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*1], 1 ; 0 1 + vbroadcasti128 m0, [srcq+r6 ] + vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3 + lea srcq, [srcq+ssq*4] + vinserti128 m0, [srcq+ssq*0], 1 ; 3 4 + movu xm3, [srcq+ssq*1] + vinserti128 m3, [srcq+ssq*2], 1 ; 5 6 + add srcq, r6 + pshufb m4, m1, m9 + pshufb m1, m10 + pmaddwd m4, m7 + pmaddwd m1, m8 + pshufb m5, m2, m9 + pshufb m2, m10 + pmaddwd m5, m7 + pmaddwd m2, m8 + paddd m4, m6 + paddd m1, m4 + pshufb m4, m0, m9 + pshufb m0, m10 + pmaddwd m4, m7 + pmaddwd m0, m8 + paddd m5, m6 + paddd m2, m5 + pshufb m5, m3, m9 + pshufb m3, m10 + pmaddwd m5, m7 + pmaddwd m3, m8 + paddd m4, m6 + paddd m4, m0 + paddd m5, m6 + paddd m5, m3 + vperm2i128 m0, m1, m2, 0x21 + psrld m1, 10 + psrld m2, 10 + vperm2i128 m3, m4, m5, 0x21 + pslld m4, 6 + pslld m5, 6 + pblendw m2, m4, 0xaa ; 23 34 + pslld m0, 6 + pblendw m1, m0, 
0xaa ; 01 12 + psrld m3, 10 + pblendw m3, m5, 0xaa ; 45 56 + psrad m0, m5, 16 +.hv_w4_loop: + movu xm4, [srcq+ssq*0] + vinserti128 m4, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m6 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + pshufb m3, m4, m9 + pshufb m4, m10 + pmaddwd m3, m7 + pmaddwd m4, m8 + paddd m3, m6 + paddd m4, m3 + psrad m4, 10 + packssdw m0, m4 ; _ 7 6 8 + vpermq m3, m0, q1122 ; _ 6 _ 7 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m4, m5 + psrad m4, 10 + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, xm15 + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + pmovsxbw xm1, [base+subpel_filters+myq*8] + shl wd, 5 + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + pxor m0, m0 + punpcklbw m0, m2 + mov r7, srcq + mov r8, dstq + lea wd, [hq+wq-256] + test dword r8m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 + psllw xm1, 2 +.hv_w8_10bit: + pshufd m11, m0, q0000 + pshufd m12, m0, q1111 + pshufd m13, m0, q2222 + pshufd m14, m0, q3333 +%if WIN64 + %define v_mul (rsp+stack_offset+40) ; r4m +%else + %define v_mul (rsp-24) ; red zone +%endif + mova [v_mul], xm1 +.hv_w8_loop0: +%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 + pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 + pmaddwd m3, m12, m2 + pmaddwd m%1, m11 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m3, m10 + paddd m%1, m3 + pmaddwd m3, m14, m%2 + paddd m%1, m3 + pmaddwd m3, m13, m2 + pshufb m%3, m9 ; a b b c c d d e + pmaddwd m2, m11 + paddd m%1, m3 + pmaddwd m3, m12, m%2 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m14 + pmaddwd m%2, m13 + paddd m2, m10 + paddd m2, m3 + paddd m%3, m2 + paddd m%2, m%3 + psrad m%1, 10 + psrad m%2, 10 + packssdw m%1, m%2 +%endmacro + movu xm4, [srcq+r6 *1+ 0] + vbroadcasti128 m8, [subpel_h_shufA] + movu xm6, [srcq+r6 *1+ 8] + vbroadcasti128 m9, [subpel_h_shufB] + movu xm0, [srcq+r6 *1+16] + vpbroadcastd m10, [pd_512] + movu xm5, [srcq+ssq*0+ 0] + vinserti128 m5, [srcq+ssq*4+ 0], 1 + movu xm1, [srcq+ssq*0+16] + vinserti128 m1, [srcq+ssq*4+16], 1 + shufpd m7, m5, m1, 0x05 + INIT_XMM avx2 + PUT_8TAP_HV_H 4, 6, 0 ; 3 + INIT_YMM avx2 + PUT_8TAP_HV_H 5, 7, 1 ; 0 4 + movu xm0, [srcq+ssq*2+ 0] + vinserti128 m0, [srcq+r6 *2+ 0], 1 + movu xm1, [srcq+ssq*2+16] + vinserti128 m1, [srcq+r6 *2+16], 1 + shufpd m7, m0, m1, 0x05 + PUT_8TAP_HV_H 0, 7, 1 ; 2 6 + movu xm6, [srcq+ssq*1+ 0] + movu xm1, [srcq+ssq*1+16] + lea srcq, [srcq+ssq*4] + vinserti128 m6, [srcq+ssq*1+ 0], 1 + vinserti128 m1, [srcq+ssq*1+16], 1 + add srcq, r6 + shufpd m7, m6, m1, 0x05 + PUT_8TAP_HV_H 6, 7, 1 ; 1 5 + vpermq m4, m4, q1100 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + vpermq m7, m0, q3120 + punpcklwd m3, m7, m4 ; 23 + punpckhwd m4, m5 ; 34 + punpcklwd m1, m5, m6 ; 01 + punpckhwd m5, m6 ; 45 + punpcklwd m2, m6, m7 ; 12 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vpbroadcastd m9, [v_mul+4*0] + vpbroadcastd m7, [v_mul+4*1] + vpbroadcastd m10, [v_mul+4*2] + pmaddwd m8, m9, m1 ; a0 + pmaddwd m9, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m7 ; a1 + pmaddwd m4, m7 ; b1 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m8, m5 + paddd m9, m6 + movu xm5, [srcq+ssq*0] + 
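; with the first three vertical tap pairs accumulated in m8/m9, the next
; two source rows are loaded and run through the horizontal 8-tap filter
; before the last vertical tap, the 10-bit rounding shift and the clip to
; pixel_max in m15.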
vinserti128 m5, [srcq+ssq*1], 1 + vbroadcasti128 m7, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + movu xm6, [srcq+ssq*0+16] + vinserti128 m6, [srcq+ssq*1+16], 1 + vextracti128 [dstq], m0, 1 + pshufb m0, m5, m7 ; 01 + pshufb m5, m10 ; 23 + pmaddwd m0, m11 + pmaddwd m5, m12 + paddd m0, m5 + pshufb m5, m6, m7 ; 89 + pshufb m6, m10 ; ab + pmaddwd m5, m13 + pmaddwd m6, m14 + paddd m6, m5 + movu xm5, [srcq+ssq*0+8] + vinserti128 m5, [srcq+ssq*1+8], 1 + lea srcq, [srcq+ssq*2] + pshufb m7, m5, m7 + pshufb m5, m10 + pmaddwd m10, m13, m7 + pmaddwd m7, m11 + paddd m0, m10 + vpbroadcastd m10, [pd_512] + paddd m6, m7 + pmaddwd m7, m14, m5 + pmaddwd m5, m12 + paddd m0, m7 + paddd m5, m6 + vbroadcasti128 m6, [dstq] + paddd m8, m10 + paddd m9, m10 + paddd m0, m10 + paddd m5, m10 + vpbroadcastd m10, [v_mul+4*3] + psrad m0, 10 + psrad m5, 10 + packssdw m0, m5 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m10, m5 ; a3 + pmaddwd m10, m6 ; b3 + paddd m7, m8 + paddd m9, m10 + psrad m7, 10 + psrad m9, 10 + packusdw m7, m9 + pminsw m7, m15 + vpermq m7, m7, q3120 + mova [dstq+dsq*0], xm7 + vextracti128 [dstq+dsq*1], m7, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add r7, 16 + add r8, 16 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .hv_w8_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +MC_8TAP_FN prep, sharp, SHARP, SHARP +MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH +MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR +MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP +MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN prep, regular, REGULAR, REGULAR + +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my +%define base r7-prep_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx2] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov r6d, r7m ; bitdepth_max + movzx wd, word [r7+wq*2+table_offset(prep,)] + vpbroadcastd m5, [r7-prep_avx2+pw_8192] + shr r6d, 11 + add wq, r7 + vpbroadcastd m4, [base+prep_mul+r6*4] + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm0, [base+subpel_filters+mxq*8] + vbroadcasti128 m3, [subpel_h_shufA] + vbroadcasti128 m4, [subpel_h_shufB] + WIN64_SPILL_XMM 8 + pshufd xm0, xm0, q2211 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw xm0, 2 +.h_w4_12bpc: + vpbroadcastq m6, xm0 + vpermq m7, m0, q1111 +.h_w4_loop: + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti128 m2, [srcq+r6 ], 1 + lea srcq, [srcq+strideq*4] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) + lea r6, [strideq*3] + cmp wd, 4 + je .h_w4 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 
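; horizontal prep setup: the packed int8 taps are sign-extended to words
; (punpcklbw + psraw 8), left-shifted by 2 for 10-bit input, and split into
; the four tap-pair registers m8-m11 consumed by PREP_8TAP_H.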
+ vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .h_12bpc + psllw m0, 2 +.h_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m5 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m5 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 4 + psrad m%2, 4 + packssdw m%1, m%2 +%endmacro + movu xm0, [srcq+strideq*0+ 0] + vinserti128 m0, [srcq+strideq*1+ 0], 1 + movu xm2, [srcq+strideq*0+16] + vinserti128 m2, [srcq+strideq*1+16], 1 + lea srcq, [srcq+strideq*2] + shufpd m1, m0, m2, 0x05 + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add wd, wd +.h_w16_loop0: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6-32] + movu m1, [srcq+r6-24] + movu m2, [srcq+r6-16] + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq+r6-32], m0 + sub r6d, 32 + jg .h_w16_loop + add srcq, strideq + add tmpq, wq + dec hd + jg .h_w16_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastd m7, [prep_8tap_1d_rnd] + lea r6, [strideq*3] + sub srcq, r6 + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m0, 2 +.v_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 4 + jg .v_w8 +.v_w4: + movq xm1, [srcq+strideq*0] + vpbroadcastq m0, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m4, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpblendd m1, m0, 0x30 + vpblendd m0, m2, 0x30 + punpcklwd m1, m0 ; 01 12 + vpbroadcastq m0, [srcq+strideq*2] + add srcq, r6 + vpblendd m2, m4, 0x30 + vpblendd m4, m3, 0x30 + punpcklwd m2, m4 ; 23 34 + vpblendd m3, m5, 0x30 + vpblendd m5, m0, 0x30 + punpcklwd m3, m5 ; 45 56 +.v_w4_loop: + vpbroadcastq m4, [srcq+strideq*0] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m7 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m4, m0, 0x30 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 4 + vextracti128 xm4, m5, 1 + packssdw xm5, xm4 + mova [tmpq], xm5 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if WIN64 + push r8 +%endif + mov r8d, wd + shl wd, 5 + mov r5, srcq + mov r7, tmpq + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + vbroadcasti128 m0, [srcq+r6 ] + vbroadcasti128 m6, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m1, [srcq+strideq*0] + vbroadcasti128 m2, [srcq+strideq*1] + vbroadcasti128 m3, [srcq+strideq*2] + 
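; the seven rows just loaded are interleaved into word pairs 01/12/23/34/45/56
; so each pmaddwd in .v_w8_loop applies one vertical tap pair to two output
; rows at a time.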
add srcq, r6 + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + shufpd m0, m3, 0x0c + punpcklwd m3, m6, m0 ; 23 + punpckhwd m6, m0 ; 56 +.v_w8_loop: + vbroadcasti128 m14, [srcq+strideq*0] + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m7 + paddd m13, m7 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + vbroadcasti128 m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + paddd m13, m6 + shufpd m6, m0, m14, 0x0d + shufpd m0, m14, m5, 0x0c + punpcklwd m5, m6, m0 ; 67 + punpckhwd m6, m0 ; 78 + pmaddwd m14, m11, m5 ; a3 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + vpermq m12, m12, q3120 + mova [tmpq+r8*0], xm12 + vextracti128 [tmpq+r8*2], m12, 1 + lea tmpq, [tmpq+r8*4] + sub hd, 2 + jg .v_w8_loop + add r5, 16 + add r7, 16 + movzx hd, wb + mov srcq, r5 + mov tmpq, r7 + sub wd, 1<<8 + jg .v_w8_loop0 +%if WIN64 + pop r8 +%endif + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vpbroadcastd m15, [prep_8tap_2d_rnd] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + lea r6, [strideq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m7, 4 + psraw m1, 8 + test dword r7m, 0x800 + jz .hv_w4_10bit + psraw m7, 2 +.hv_w4_10bit: + pshufd m11, m1, q0000 + pshufd m12, m1, q1111 + pshufd m13, m1, q2222 + pshufd m14, m1, q3333 +.hv_w4: + vbroadcasti128 m9, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + pshufd m8, m7, q1111 + pshufd m7, m7, q0000 + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*1], 1 ; 0 1 + vbroadcasti128 m0, [srcq+r6 ] + vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3 + lea srcq, [srcq+strideq*4] + vinserti128 m0, [srcq+strideq*0], 1 ; 3 4 + movu xm3, [srcq+strideq*1] + vinserti128 m3, [srcq+strideq*2], 1 ; 5 6 + add srcq, r6 + pshufb m4, m1, m9 + pshufb m1, m10 + pmaddwd m4, m7 + pmaddwd m1, m8 + pshufb m5, m2, m9 + pshufb m2, m10 + pmaddwd m5, m7 + pmaddwd m2, m8 + paddd m4, m15 + paddd m1, m4 + pshufb m4, m0, m9 + pshufb m0, m10 + pmaddwd m4, m7 + pmaddwd m0, m8 + paddd m5, m15 + paddd m2, m5 + pshufb m5, m3, m9 + pshufb m3, m10 + pmaddwd m5, m7 + pmaddwd m3, m8 + paddd m4, m15 + paddd m4, m0 + paddd m5, m15 + paddd m5, m3 + vperm2i128 m0, m1, m2, 0x21 + psrld m1, 6 + psrld m2, 6 + vperm2i128 m3, m4, m5, 0x21 + pslld m4, 10 + pslld m5, 10 + pblendw m2, m4, 0xaa ; 23 34 + pslld m0, 10 + pblendw m1, m0, 0xaa ; 01 12 + psrld m3, 6 + pblendw m3, m5, 0xaa ; 45 56 + psrad m0, m5, 16 +.hv_w4_loop: + movu xm4, [srcq+strideq*0] + vinserti128 m4, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m15 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + pshufb m3, m4, m9 + pshufb m4, m10 + pmaddwd m3, m7 + pmaddwd m4, m8 + paddd m3, m15 + paddd m4, m3 + psrad m4, 6 + packssdw m0, m4 ; _ 7 6 8 + vpermq m3, m0, q1122 ; _ 6 _ 7 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m4, m5 + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, 16 + sub 
hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + pmovsxbw xm1, [base+subpel_filters+myq*8] +%if WIN64 + PUSH r8 +%endif + mov r8d, wd + shl wd, 5 + lea r6, [strideq*3] + sub srcq, 6 + sub srcq, r6 + mov r5, srcq + mov r7, tmpq + lea wd, [hq+wq-256] + pxor m0, m0 + punpcklbw m0, m2 + mova [v_mul], xm1 + psraw m0, 4 + test dword r7m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 +.hv_w8_10bit: + pshufd m11, m0, q0000 + pshufd m12, m0, q1111 + pshufd m13, m0, q2222 + pshufd m14, m0, q3333 +.hv_w8_loop0: +%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 + pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 + pmaddwd m3, m12, m2 + pmaddwd m%1, m11 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m3, m15 + paddd m%1, m3 + pmaddwd m3, m14, m%2 + paddd m%1, m3 + pmaddwd m3, m13, m2 + pshufb m%3, m9 ; a b b c c d d e + pmaddwd m2, m11 + paddd m%1, m3 + pmaddwd m3, m12, m%2 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m14 + pmaddwd m%2, m13 + paddd m2, m15 + paddd m2, m3 + paddd m2, m%3 + paddd m2, m%2 + psrad m%1, 6 + psrad m2, 6 + packssdw m%1, m2 +%endmacro + movu xm4, [srcq+r6 + 0] + vbroadcasti128 m8, [subpel_h_shufA] + movu xm6, [srcq+r6 + 8] + vbroadcasti128 m9, [subpel_h_shufB] + movu xm0, [srcq+r6 +16] + movu xm5, [srcq+strideq*0+ 0] + vinserti128 m5, [srcq+strideq*4+ 0], 1 + movu xm1, [srcq+strideq*0+16] + vinserti128 m1, [srcq+strideq*4+16], 1 + shufpd m7, m5, m1, 0x05 + INIT_XMM avx2 + PREP_8TAP_HV_H 4, 6, 0 ; 3 + INIT_YMM avx2 + PREP_8TAP_HV_H 5, 7, 1 ; 0 4 + movu xm0, [srcq+strideq*2+ 0] + vinserti128 m0, [srcq+r6 *2+ 0], 1 + movu xm1, [srcq+strideq*2+16] + vinserti128 m1, [srcq+r6 *2+16], 1 + shufpd m7, m0, m1, 0x05 + PREP_8TAP_HV_H 0, 7, 1 ; 2 6 + movu xm6, [srcq+strideq*1+ 0] + movu xm1, [srcq+strideq*1+16] + lea srcq, [srcq+strideq*4] + vinserti128 m6, [srcq+strideq*1+ 0], 1 + vinserti128 m1, [srcq+strideq*1+16], 1 + add srcq, r6 + shufpd m7, m6, m1, 0x05 + PREP_8TAP_HV_H 6, 7, 1 ; 1 5 + vpermq m4, m4, q1100 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + vpermq m7, m0, q3120 + punpcklwd m3, m7, m4 ; 23 + punpckhwd m4, m5 ; 34 + punpcklwd m1, m5, m6 ; 01 + punpckhwd m5, m6 ; 45 + punpcklwd m2, m6, m7 ; 12 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vpbroadcastd m9, [v_mul+4*0] + vpbroadcastd m7, [v_mul+4*1] + vpbroadcastd m10, [v_mul+4*2] + pmaddwd m8, m9, m1 ; a0 + pmaddwd m9, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m7 ; a1 + pmaddwd m4, m7 ; b1 + paddd m8, m15 + paddd m9, m15 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m8, m5 + paddd m9, m6 + movu xm5, [srcq+strideq*0] + vinserti128 m5, [srcq+strideq*1], 1 + vbroadcasti128 m7, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + movu xm6, [srcq+strideq*0+16] + vinserti128 m6, [srcq+strideq*1+16], 1 + vextracti128 [tmpq], m0, 1 + pshufb m0, m5, m7 ; 01 + pshufb m5, m10 ; 23 + pmaddwd m0, m11 + pmaddwd m5, m12 + paddd m0, m15 + paddd m0, m5 + pshufb m5, m6, m7 ; 89 + pshufb m6, m10 ; ab + pmaddwd m5, m13 + pmaddwd m6, m14 + paddd m5, m15 + paddd m6, m5 + movu xm5, [srcq+strideq*0+8] + vinserti128 m5, [srcq+strideq*1+8], 1 + lea srcq, [srcq+strideq*2] + pshufb m7, m5, m7 + pshufb m5, m10 + pmaddwd m10, m13, m7 + pmaddwd m7, m11 + paddd m0, m10 + paddd m6, m7 + pmaddwd m7, m14, m5 + pmaddwd m5, m12 + paddd m0, m7 + paddd m5, m6 + vbroadcasti128 m6, [tmpq] + vpbroadcastd m10, 
[v_mul+4*3] + psrad m0, 6 + psrad m5, 6 + packssdw m0, m5 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m10, m5 ; a3 + pmaddwd m10, m6 ; b3 + paddd m7, m8 + paddd m9, m10 + psrad m7, 6 + psrad m9, 6 + packssdw m7, m9 + vpermq m7, m7, q3120 + mova [tmpq+r8*0], xm7 + vextracti128 [tmpq+r8*2], m7, 1 + lea tmpq, [tmpq+r8*4] + sub hd, 2 + jg .hv_w8_loop + add r5, 16 + add r7, 16 + movzx hd, wb + mov srcq, r5 + mov tmpq, r7 + sub wd, 1<<8 + jg .hv_w8_loop0 +%if WIN64 + POP r8 +%endif + RET + +%macro WARP_V 5 ; dst, 01, 23, 45, 67 + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm8, [filterq+myq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 ; a e + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+deltaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; b f + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm9, [filterq+myq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 ; c g + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+gammaq] ; my += gamma + punpcklwd m8, m0 + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; d h + punpcklwd m0, m9, m0 + punpckldq m9, m8, m0 + punpckhdq m0, m8, m0 + punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + pmaddwd m%2, m8 + pmaddwd m9, m%3 + punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m8, m%4 + pmaddwd m0, m%5 + paddd m9, m%2 + mova m%2, m%3 + paddd m0, m8 + mova m%3, m%4 + mova m%4, m%5 + paddd m%1, m0, m9 +%endmacro + +cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts + mov r6d, r7m + lea r9, [$$] + shr r6d, 11 + vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] + vpbroadcastd m14, [warp8x8t_rnd] + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main + jmp .start +.loop: + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 + lea tmpq, [tmpq+tsq*4] +.start: + paddd m7, m14 + paddd m0, m14 + psrad m7, 15 + psrad m0, 15 + packssdw m7, m0 + vpermq m7, m7, q3120 + mova [tmpq+tsq*0], xm7 + vextracti128 [tmpq+tsq*2], m7, 1 + dec r4d + jg .loop +.end: + RET + +cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \ + alpha, beta, filter, tmp1, delta, \ + my, gamma + mov r6d, r7m + lea filterq, [$$] + shr r6d, 11 + vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] + vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] + vpbroadcastw m15, r7m ; pixel_max + call .main + jmp .start +.loop: + call .main2 + lea dstq, [dstq+dsq*2] +.start: + psrad m7, 16 + psrad m0, 16 + packusdw m7, m0 + pmulhrsw m7, m14 + pminsw m7, m15 + vpermq m7, m7, q3120 + mova [dstq+dsq*0], xm7 + vextracti128 [dstq+dsq*1], m7, 1 + dec r4d + jg .loop +.end: + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) 
due to call +%if WIN64 + mov abcdq, r5m + mov mxd, r6m +%endif + movsx alphad, word [abcdq+2*0] + movsx betad, word [abcdq+2*1] + vpbroadcastd m12, [pd_32768] + pxor m11, m11 + add filterq, mc_warp_filter-$$ + lea tmp1q, [ssq*3] + add mxd, 512+(64<<10) + lea tmp2d, [alphaq*3] + sub srcq, tmp1q ; src -= src_stride*3 + sub betad, tmp2d ; beta -= alpha*3 + mov myd, r7m + call .h + psrld m1, m0, 16 + call .h + pblendw m1, m0, 0xaa ; 01 + psrld m2, m0, 16 + call .h + pblendw m2, m0, 0xaa ; 12 + psrld m3, m0, 16 + call .h + pblendw m3, m0, 0xaa ; 23 + psrld m4, m0, 16 + call .h + pblendw m4, m0, 0xaa ; 34 + psrld m5, m0, 16 + call .h + pblendw m5, m0, 0xaa ; 45 + psrld m6, m0, 16 + call .h + pblendw m6, m0, 0xaa ; 56 + movsx deltad, word [abcdq+2*2] + movsx gammad, word [abcdq+2*3] + add myd, 512+(64<<10) + mov r4d, 4 + lea tmp1d, [deltaq*3] + sub gammad, tmp1d ; gamma -= delta*3 +.main2: + call .h + psrld m7, m6, 16 + pblendw m7, m0, 0xaa ; 67 + WARP_V 7, 1, 3, 5, 7 + call .h + psrld m10, m5, 16 + pblendw m10, m0, 0xaa ; 78 + WARP_V 0, 2, 4, 6, 10 + ret +ALIGN function_align +.h: + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + movu xm10, [srcq-6] + vinserti128 m10, [srcq+2], 1 + shr mxd, 10 ; 0 + shr tmp1d, 10 ; 4 + movq xm0, [filterq+mxq *8] + vinserti128 m0, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] + movu xm8, [srcq-4] + vinserti128 m8, [srcq+4], 1 + shr tmp2d, 10 ; 1 + shr tmp1d, 10 ; 5 + movq xm9, [filterq+tmp2q*8] + vinserti128 m9, [filterq+tmp1q*8], 1 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + shr mxd, 10 ; 2 + shr tmp1d, 10 ; 6 + punpcklbw m0, m11, m0 + pmaddwd m0, m10 + movu xm10, [srcq-2] + vinserti128 m10, [srcq+6], 1 + punpcklbw m9, m11, m9 + pmaddwd m9, m8 + movq xm8, [filterq+mxq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + phaddd m0, m9 ; 0 1 4 5 + movu xm9, [srcq+0] + vinserti128 m9, [srcq+8], 1 + shr tmp2d, 10 ; 3 + shr tmp1d, 10 ; 7 + punpcklbw m8, m11, m8 + pmaddwd m8, m10 + movq xm10, [filterq+tmp2q*8] + vinserti128 m10, [filterq+tmp1q*8], 1 + punpcklbw m10, m11, m10 + pmaddwd m9, m10 + add srcq, ssq + phaddd m8, m9 ; 2 3 6 7 + phaddd m0, m8 ; 0 1 2 3 4 5 6 7 + vpsllvd m0, m13 + paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword + ret + +%macro BIDIR_FN 0 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + movq [dstq ], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + cmp hd, 8 + je .ret + lea dstq, [dstq+strideq*4] + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.ret: + RET +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + cmp hd, 4 + jne .w8_loop_start + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 +.w8_loop_start: + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + 
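For reference, the per-column filter selection done by the warp `.h` helper above boils down to the following C sketch (hypothetical helper name; `mx` is assumed to already carry the 512 + (64 << 10) bias added in `.main`, so the shifted value indexes the warp filter table directly):

#include <stdint.h>

/* One row of the 8x8 warp: derive the eight horizontal filter indices from
 * mx and alpha (the "; 0" .. "; 7" column comments in .h), then step mx by
 * beta for the next row, as the "mx += beta" lea does. */
static void warp_h_filter_indices(int idx[8], int *const mx,
                                  const int alpha, const int beta)
{
    for (int k = 0; k < 8; k++)
        idx[k] = (*mx + k * alpha) >> 10; /* 10 fractional bits per column step */
    *mx += beta;
}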
vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + dec hd + jg .w128_loop + RET +%endmacro + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx2_table + lea r6, [avg_avx2_table] + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + vpbroadcastd m4, [base+bidir_rnd+t0*4] + vpbroadcastd m5, [base+bidir_mul+t0*4] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+32*0] + paddsw m0, [tmp2q+32*0] + mova m1, [tmp1q+32*1] + paddsw m1, [tmp2q+32*1] + mova m2, [tmp1q+32*2] + paddsw m2, [tmp2q+32*2] + mova m3, [tmp1q+32*3] + paddsw m3, [tmp2q+32*3] + add tmp1q, 32*4 + add tmp2q, 32*4 + pmaxsw m0, m4 + pmaxsw m1, m4 + pmaxsw m2, m4 + pmaxsw m3, m4 + psubsw m0, m4 + psubsw m1, m4 + psubsw m2, m4 + psubsw m3, m4 + pmulhw m0, m5 + pmulhw m1, m5 + pmulhw m2, m5 + pmulhw m3, m5 + ret + +cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3 + lea r6, [w_avg_avx2_table] + tzcnt wd, wm + mov t0d, r6m ; weight + vpbroadcastw m8, r7m ; pixel_max + vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538] + movsxd wq, [r6+wq*4] + paddw m7, m8 + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + pslld m7, 7 + rorx r6d, t0d, 30 ; << 2 + test dword r7m, 0x800 + cmovz r6d, t0d + movifnidn hd, hm + movd xm6, r6d + vpbroadcastd m6, xm6 + BIDIR_FN +ALIGN function_align +.main: + mova m4, [tmp1q+32*0] + mova m0, [tmp2q+32*0] + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + mova m4, [tmp1q+32*1] + mova m1, [tmp2q+32*1] + pmaddwd m5, m6 + pmaddwd m0, m6 + paddd m5, m7 + paddd m0, m7 + psrad m5, 8 + psrad m0, 8 + packusdw m0, m5 + punpckhwd m5, m1, m4 + punpcklwd m1, m4 + mova m4, [tmp1q+32*2] + mova m2, [tmp2q+32*2] + pmaddwd m5, m6 + pmaddwd m1, m6 + paddd m5, m7 + paddd m1, m7 + psrad m5, 8 + psrad m1, 8 + packusdw m1, m5 + punpckhwd m5, m2, m4 + punpcklwd m2, m4 + mova m4, [tmp1q+32*3] + mova m3, [tmp2q+32*3] + add tmp1q, 32*4 + add tmp2q, 32*4 + pmaddwd m5, m6 + pmaddwd m2, m6 + paddd m5, m7 + paddd m2, m7 + psrad m5, 8 + psrad m2, 8 + packusdw m2, m5 + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + pmaddwd m5, m6 + pmaddwd m3, m6 + paddd m5, m7 + paddd m3, m7 + psrad m5, 8 + psrad m3, 8 + packusdw m3, m5 + pminsw m0, m8 + pminsw m1, m8 + pminsw m2, m8 + pminsw m3, m8 + ret + +cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx2_table + lea r7, [mask_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd 
wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_64] + vpbroadcastd m9, [base+bidir_rnd+r6*4] + vpbroadcastd m10, [base+bidir_mul+r6*4] + mov maskq, maskmp + add wq, r7 + BIDIR_FN +ALIGN function_align +.main: +%macro MASK 1 + pmovzxbw m5, [maskq+16*%1] + mova m%1, [tmp1q+32*%1] + mova m6, [tmp2q+32*%1] + punpckhwd m4, m%1, m6 + punpcklwd m%1, m6 + psubw m7, m8, m5 + punpckhwd m6, m5, m7 ; m, 64-m + punpcklwd m5, m7 + pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m%1, m5 + psrad m4, 5 + psrad m%1, 5 + packssdw m%1, m4 + pmaxsw m%1, m9 + psubsw m%1, m9 + pmulhw m%1, m10 +%endmacro + MASK 0 + MASK 1 + MASK 2 + MASK 3 + add maskq, 16*4 + add tmp1q, 32*4 + add tmp2q, 32*4 + ret + +cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx2_table + lea r7, [w_mask_420_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd xm0, r7m ; sign + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+bidir_rnd+r6*4] + vpbroadcastd m13, [base+bidir_mul+r6*4] + movd xm14, [base+pw_2] + mov maskq, maskmp + psubw xm14, xm0 + vpbroadcastw m14, xm14 + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + phaddd m4, m5 + paddw m4, m14 + psrlw m4, 2 + packuswb m4, m4 + vextracti128 xm5, m4, 1 + punpcklwd xm4, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + mova [maskq], xm4 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8: + vperm2i128 m6, m4, m5, 0x21 + vpblendd m4, m5, 0xf0 + paddw m4, m14 + paddw m4, m6 + psrlw m4, 2 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + mova [maskq], xm4 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16: + punpcklqdq m6, m4, m5 + punpckhqdq m4, m5 + paddw m6, m14 + paddw m4, m6 + psrlw m4, 2 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + pshufd xm4, xm4, q3120 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + mova [maskq], xm4 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w32: + paddw m4, m14 + paddw m4, m5 + psrlw m15, m4, 2 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + call .main + mova m6, [deint_shuf] + paddw m4, m14 + paddw m4, m5 + psrlw m4, 2 + packuswb m15, m4 + vpermd m4, m6, m15 + mova 
[dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m2 + mova [dstq+stride3q +32*1], m3 + mova [maskq], m4 + sub hd, 4 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w64: + paddw m4, m14 + paddw m15, m14, m5 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [maskq], m4 ; no available registers + call .main + paddw m4, [maskq] + mova m6, [deint_shuf] + paddw m5, m15 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 ; 0 2 4 6 1 3 5 7 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + mova [maskq], m4 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 64 +.w128: + paddw m4, m14 + paddw m5, m14 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [maskq+32*0], m4 + mova [dstq+strideq], m5 + call .main + paddw m4, m14 + paddw m15, m14, m5 + mova [dstq+strideq*0+32*4], m0 + mova [dstq+strideq*0+32*5], m1 + mova [dstq+strideq*0+32*6], m2 + mova [dstq+strideq*0+32*7], m3 + mova [maskq+32*1], m4 + call .main + paddw m4, [maskq+32*0] + paddw m5, [dstq+strideq] + mova m6, [deint_shuf] + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + mova [maskq+32*0], m4 + call .main + paddw m4, [maskq+32*1] + mova m6, [deint_shuf] + paddw m5, m15 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*4], m0 + mova [dstq+strideq*1+32*5], m1 + mova [dstq+strideq*1+32*6], m2 + mova [dstq+strideq*1+32*7], m3 + mova [maskq+32*1], m4 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul + mova m%1, [tmp1q+32*%1] + mova m%2, [tmp2q+32*%1] + punpcklwd m8, m%2, m%1 + punpckhwd m9, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m7, m10, m%1 + psrlw m7, 10 ; 64-m + psubw m%2, m%3, m7 ; m + punpcklwd m%1, m7, m%2 + punpckhwd m7, m%2 + pmaddwd m%1, m8 + pmaddwd m7, m9 + psrad m%1, 5 + psrad m7, 5 + packssdw m%1, m7 + pmaxsw m%1, m%4 + psubsw m%1, m%4 + pmulhw m%1, m%5 +%endmacro + W_MASK 0, 4 + W_MASK 1, 5 + phaddw m4, m5 + W_MASK 2, 5 + W_MASK 3, 6 + phaddw m5, m6 + add tmp1q, 32*4 + add tmp2q, 32*4 + ret + +cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx2_table + lea r7, [w_mask_422_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + vpbroadcastb m14, r7m ; sign + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+bidir_rnd+r6*4] + vpbroadcastd m13, [base+bidir_mul+r6*4] + mova m15, [base+deint_shuf] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], 
xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 4 + W_MASK 1, 5 + phaddw m4, m5 + W_MASK 2, 5 + W_MASK 3, 6 + phaddw m5, m6 + add tmp1q, 32*4 + add tmp2q, 32*4 + packuswb m4, m5 + pxor m5, m5 + psubb m4, m14 + pavgb m4, m5 + vpermd m4, m15, m4 + mova [maskq], m4 + add maskq, 32 + ret + +cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx2_table + lea r7, [w_mask_444_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] + vpbroadcastd m4, [base+pw_64] + vpbroadcastd m5, [base+bidir_rnd+r6*4] + vpbroadcastd m6, [base+bidir_mul+r6*4] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + call .main + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, 
strideq +.w32: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + call .main + mova [dstq+32*6], m0 + mova [dstq+32*7], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2, 4, 5, 6 + W_MASK 1, 3, 4, 5, 6 + packuswb m2, m3 + vpermq m2, m2, q3120 + add tmp1q, 32*2 + add tmp2q, 32*2 + mova [maskq], m2 + add maskq, 32 + ret + +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). +; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx2_table + lea r6, [blend_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + vpbroadcastd m6, [base+pw_m512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + pmovzxbw m3, [maskq] + movq xm0, [dstq+dsq*0] + movhps xm0, [dstq+dsq*1] + vpbroadcastq m1, [dstq+dsq*2] + vpbroadcastq m2, [dstq+r6 ] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + psubw m1, m0, [tmpq] + add maskq, 16 + add tmpq, 32 + pmullw m3, m6 + pmulhrsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + movq [dstq+dsq*2], xm1 + movhps [dstq+r6 ], xm1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + RET +.w8: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + mova xm1, [dstq+dsq*2] + vinserti128 m1, [dstq+r6 ], 1 + psubw m2, m0, [tmpq+32*0] + psubw m3, m1, [tmpq+32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + mova [dstq+dsq*2], xm1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16 + RET +.w32: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova m0, [dstq+32*0] + psubw m2, m0, [tmpq+32*0] + mova m1, [dstq+32*1] + psubw m3, m1, [tmpq+32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .w32 + RET + +INIT_XMM avx2 +cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h +%define base r5-blend_v_avx2_table + lea r5, [blend_v_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + vpbroadcastd m2, [base+obmc_masks+2*2] +.w2_loop: + movd m0, [dstq+dsq*0] + pinsrd m0, [dstq+dsq*1], 1 + movq m1, [tmpq] + add tmpq, 4*2 + psubw m1, m0, m1 + pmulhrsw m1, m2 + paddw 
m0, m1 + movd [dstq+dsq*0], m0 + pextrd [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + vpbroadcastq m2, [base+obmc_masks+4*2] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + psubw m1, m0, [tmpq] + add tmpq, 8*2 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +INIT_YMM avx2 +.w8: + vbroadcasti128 m2, [base+obmc_masks+8*2] +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + psubw m1, m0, [tmpq] + add tmpq, 16*2 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks+16*2] +.w16_loop: + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add tmpq, 32*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+ 8], xmm6 + movaps [rsp+24], xmm7 +%endif + mova m6, [base+obmc_masks+32*2] + vbroadcasti128 m7, [base+obmc_masks+32*3] +.w32_loop: + mova m0, [dstq+dsq*0+32*0] + psubw m3, m0, [tmpq +32*0] + mova xm2, [dstq+dsq*0+32*1] + mova xm5, [tmpq +32*1] + mova m1, [dstq+dsq*1+32*0] + psubw m4, m1, [tmpq +32*2] + vinserti128 m2, [dstq+dsq*1+32*1], 1 + vinserti128 m5, [tmpq +32*3], 1 + add tmpq, 32*4 + psubw m5, m2, m5 + pmulhrsw m3, m6 + pmulhrsw m4, m6 + pmulhrsw m5, m7 + paddw m0, m3 + paddw m1, m4 + paddw m2, m5 + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*1+32*0], m1 + mova [dstq+dsq*0+32*1], xm2 + vextracti128 [dstq+dsq*1+32*1], m2, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+32*(%1+0)] + psubw m2, m0, [tmpq+32*(%2+0)] + mova m1, [dstq+32*(%1+1)] + psubw m3, m1, [tmpq+32*(%2+1)] +%if %3 + add tmpq, 32*%3 +%endif + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+32*(%1+0)], m0 + mova [dstq+32*(%1+1)], m1 +%endmacro + +INIT_XMM avx2 +cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_h_avx2_table + lea r5, [blend_h_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + pinsrd m0, [dstq+dsq*1], 1 + movd m2, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpcklwd m2, m2 + psubw m1, m0, m1 + pmulhrsw m1, m2 + paddw m0, m1 + movd [dstq+dsq*0], m0 + pextrd [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 8*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +INIT_YMM avx2 +.w8: + vbroadcasti128 m3, [blend_shuf] + shufpd m3, m3, 0x0c +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + vpbroadcastd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 16*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +.w16: + vpbroadcastw m4, 
[maskq+hq*2] + vpbroadcastw m5, [maskq+hq*2+2] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add tmpq, 32*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16 + RET +.w32: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 8 + BLEND_H_ROW 4, -4 + BLEND_H_ROW 6, -2 + add dstq, dsq + inc hq + jl .w128 + RET + +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + bottomext, rightext + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor r12d, r12d + lea r10, [ihq-1] + cmp yq, ihq + cmovs r10, yq + test yq, yq + cmovs r10, r12 + imul r10, sstrideq + add srcq, r10 + + ; ref += iclip(x, 0, iw - 1) + lea r10, [iwq-1] + cmp xq, iwq + cmovs r10, xq + test xq, xq + cmovs r10, r12 + lea srcq, [srcq+r10*2] + + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) + lea bottomextq, [yq+bhq] + sub bottomextq, ihq + lea r3, [bhq-1] + cmovs bottomextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, r12 + cmp bottomextq, bhq + cmovns bottomextq, r3 + cmp topextq, bhq + cmovg topextq, r3 + + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + lea rightextq, [xq+bwq] + sub rightextq, iwq + lea r2, [bwq-1] + cmovs rightextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, r12 + cmp rightextq, bwq + cmovns rightextq, r2 + cmp leftextq, bwq + cmovns leftextq, r2 + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ + dst, dstride, src, sstride, bottomext, rightext + + ; center_h = bh - top_ext - bottom_ext + lea r3, [bottomextq+topextq] + sub centerhq, r3 + + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq + imul r2, dstrideq + add dstq, r2 + mov r9m, dstq + + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq + lea r3, [rightextq+leftextq] + sub centerwq, r3 + +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix +.v_loop_%3: +%if %1 + ; left extension + xor r3, r3 + vpbroadcastw m0, [srcq] +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, 16 + cmp r3, leftextq + jl .left_loop_%3 + + ; body + lea r12, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + movu m0, [srcq+r3*2] +%if %1 + movu [r12+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, 16 + cmp r3, centerwq + jl .body_loop_%3 + +%if %2 + ; right extension +%if %1 + lea r12, [r12+centerwq*2] +%else + lea r12, [dstq+centerwq*2] +%endif + xor r3, r3 + vpbroadcastw m0, [srcq+centerwq*2-2] +.right_loop_%3: + movu [r12+r3*2], m0 + add r3, 16 + cmp r3, rightextq + jl .right_loop_%3 + +%endif + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 +%endmacro + + test leftextq, leftextq + jnz .need_left_ext + test rightextq, rightextq + jnz .need_right_ext + v_loop 0, 0, 0 + jmp .body_done + +.need_left_ext: + test rightextq, rightextq + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: 
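The extension sizes driving this dispatch are the iclip() expressions spelled out in the comments of the prologue above; a hypothetical scalar equivalent of that setup, for reference only:

static inline int iclip(const int v, const int min, const int max) {
    return v < min ? min : v > max ? max : v;
}

/* Extension sizes as computed by the scalar prologue (the asm keeps these in
 * registers; the struct is purely illustrative). */
typedef struct {
    int left, right, top, bottom, center_w, center_h;
} EdgeExt;

static EdgeExt emu_edge_sizes(const int bw, const int bh,
                              const int iw, const int ih,
                              const int x, const int y)
{
    EdgeExt e;
    e.bottom   = iclip(y + bh - ih, 0, bh - 1);
    e.top      = iclip(-y,          0, bh - 1);
    e.right    = iclip(x + bw - iw, 0, bw - 1);
    e.left     = iclip(-x,          0, bw - 1);
    e.center_h = bh - e.top  - e.bottom;
    e.center_w = bw - e.left - e.right;
    return e;
}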
+ v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: + ; bottom edge extension + test bottomextq, bottomextq + jz .top + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 +.bottom_x_loop: + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq +.bottom_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .bottom_y_loop + add r1, 16 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end + mov srcq, r9m + mov dstq, dstm + xor r1, r1 +.top_x_loop: + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, topextq +.top_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .top_y_loop + add r1, 16 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/mc16_sse.asm dav1d-0.9.1/src/x86/mc16_sse.asm --- dav1d-0.7.1/src/x86/mc16_sse.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/mc16_sse.asm 2021-07-28 21:38:28.913852200 +0000 @@ -0,0 +1,4544 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
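The pw_m512 constant used by blend_16bpc above (and declared again in the constants below) implements the rounding identity derived in the comment block before blend_16bpc: pmulhrsw supplies the (x*y + 16384) >> 15 step, and multiplying the mask by -512 avoids the int16_t overflow at m == 64. A small brute-force check of that identity, assuming the usual arithmetic right shift for negative values:

#include <assert.h>
#include <stdint.h>

int main(void) {
    for (int a = 0; a < 4096; a += 7)        /* sample of 12-bit pixel values */
        for (int b = 0; b < 4096; b += 13)
            for (int m = 0; m <= 64; m++) {
                const int ref = (a * (64 - m) + b * m + 32) >> 6;
                const int16_t w = (int16_t)(m * -512);   /* pw_m512 * mask */
                const int opt = (((a - b) * w + 16384) >> 15) + a;
                assert(ref == opt);
            }
    return 0;
}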
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +; dav1d_obmc_masks[] << 9 +obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 + dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0 + dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120 + dw 4096, 3072, 2048, 1536, 0, 0, 0, 0 + dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240 + dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608 + dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 + +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + +pw_2: times 8 dw 2 +pw_16: times 4 dw 16 +prep_mul: times 4 dw 16 + times 8 dw 4 +pw_64: times 8 dw 64 +pw_256: times 8 dw 256 +pw_2048: times 4 dw 2048 +bidir_mul: times 4 dw 2048 +pw_8192: times 8 dw 8192 +pw_27615: times 8 dw 27615 +pw_32766: times 8 dw 32766 +pw_m512: times 8 dw -512 +pd_512: times 4 dd 512 +pd_65538: times 2 dd 65538 + +put_bilin_h_rnd: times 4 dw 8 + times 4 dw 10 +bidir_rnd: times 4 dw -16400 + times 4 dw -16388 +put_8tap_h_rnd: dd 34, 34, 40, 40 +prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) +prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) + +warp8x8_shift: dd 11, 13 +warp8x8_rnd1: dd 1024, 1024, 4096, 4096 +warp8x8_rnd2: times 4 dw 4096 + times 4 dw 16384 +warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) +%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) + +BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +INIT_XMM ssse3 +cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy +%define base t0-put_ssse3 + mov mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn wd, wm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + add wq, t0 + movifnidn hd, hm + jmp wq +.put_w2: + mov r4d, [srcq+ssq*0] + mov r6d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4d + mov [dstq+dsq*1], r6d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, 
[srcq+ssq*2] + movq [dstq+dsq*0], m0 + movq [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0+16*0] + movu m1, [srcq+ssq*0+16*1] + movu m2, [srcq+ssq*1+16*0] + movu m3, [srcq+ssq*1+16*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+16*0], m0 + mova [dstq+dsq*0+16*1], m1 + mova [dstq+dsq*1+16*0], m2 + mova [dstq+dsq*1+16*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, ssq + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + add dstq, dsq + dec hd + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + add srcq, 16*8 + add dstq, 16*8 +.put_w128_loop: + movu m0, [srcq-16*8] + movu m1, [srcq-16*7] + movu m2, [srcq-16*6] + movu m3, [srcq-16*5] + mova [dstq-16*8], m0 + mova [dstq-16*7], m1 + mova [dstq-16*6], m2 + mova [dstq-16*5], m3 + movu m0, [srcq-16*4] + movu m1, [srcq-16*3] + movu m2, [srcq-16*2] + movu m3, [srcq-16*1] + mova [dstq-16*4], m0 + mova [dstq-16*3], m1 + mova [dstq-16*2], m2 + mova [dstq-16*1], m3 + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w128_loop + RET +.h: + movd m5, mxyd + mov mxyd, r7m ; my + mova m4, [base+pw_16] + pshufb m5, [base+pw_256] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + mov r6d, r8m ; bitdepth_max + shr r6d, 11 + movddup m3, [base+put_bilin_h_rnd+r6*8] + movifnidn hd, hm + sub wd, 8 + jg .h_w16 + je .h_w8 + jp .h_w4 +.h_w2: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw m0, m4, m1 + psrlq m1, 16 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movd [dstq+dsq*0], m0 + punpckhqdq m0, m0 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq m0, [srcq+ssq*0] + movhps m0, [srcq+ssq*1] + movq m1, [srcq+ssq*0+2] + movhps m1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + 
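The horizontal-only put path above applies 4-bit bilinear weights with a bitdepth-dependent rounding constant taken from put_bilin_h_rnd (8 when bitdepth_max is 1023, 10 when it is 4095, per the double-rounding note above). A hypothetical scalar equivalent for one output pixel:

#include <stdint.h>

/* a and b are the two neighbouring input pixels, mx is the 4-bit fraction
 * (0..15), rnd comes from put_bilin_h_rnd. */
static uint16_t put_bilin_h_px(const uint16_t a, const uint16_t b,
                               const int mx, const int rnd)
{
    return (uint16_t)((a * (16 - mx) + b * mx + rnd) >> 4);
}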
RET +.h_w16: + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2+16*0], m0 + mova [dstq+r6*2+16*1], m1 + add r6, 16 + jl .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16_loop0 + RET +.v: + shl mxyd, 11 + movd m5, mxyd + pshufb m5, [base+pw_256] + movifnidn hd, hm + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd m0, [srcq+ssq*0] +.v_w2_loop: + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movd m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq m0, [srcq+ssq*0] +.v_w4_loop: + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movq m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + mov r7, srcq + lea r6d, [wq+hq-256] + mov r4, dstq +%else + mov r6, srcq +%endif +.v_w8_loop0: + movu m0, [srcq+ssq*0] +.v_w8_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +%if ARCH_X86_64 + add r7, 16 + add r4, 16 + movzx hd, r6b + mov srcq, r7 + mov dstq, r4 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .v_w8_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 8 + shl mxyd, 11 + mova m3, [base+pw_2] + movd m6, mxyd + mova m7, [base+pw_8192] + pshufb m6, [base+pw_256] + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + mova m7, [base+pw_2048] +.hv_12bpc: + movifnidn hd, hm + cmp wd, 4 + jg .hv_w8 + je .hv_w4 +.hv_w2: + movddup m0, [srcq+ssq*0] + pshufhw m1, m0, q0321 + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w2_loop: + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m2, [srcq+ssq*0] + pmullw m1, m4, m2 + psrlq m2, 16 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 _ 2 _ + shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + movddup m0, [srcq+ssq*0] + movddup m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w4_loop: + movq m1, [srcq+ssq*1] + movq m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps m1, [srcq+ssq*0] + movhps m2, [srcq+ssq*0+2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: +%if ARCH_X86_64 
+%if WIN64 + push r7 +%endif + shl wd, 5 + lea r6d, [wq+hq-256] + mov r4, srcq + mov r7, dstq +%else + mov r6, srcq +%endif +.hv_w8_loop0: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w8_loop: + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + movu m0, [srcq+ssq*0] + movu m2, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m2, m5 + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .hv_w8_loop0 +%if WIN64 + pop r7 +%endif + RET + +cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 +%define base r6-prep_ssse3 + movifnidn mxyd, r5m ; mx + LEA r6, prep_ssse3 + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [base+prep_ssse3_table+wq*2] + mov r5d, r7m ; bitdepth_max + mova m5, [base+pw_8192] + add wq, r6 + shr r5d, 11 + movddup m4, [base+prep_mul+r5*8] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*2] + movhps m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .prep_w16 + RET +.prep_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + dec hd + jg .prep_w32 + RET +.prep_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + add tmpq, 16*8 + dec hd 
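The .prep copy path above converts pixels to the biased int16_t intermediate form consumed by the compound kernels: values are scaled by prep_mul (16 for 10-bit input, 4 for 12-bit) and offset by -8192 (pw_8192), which keeps them within int16_t range. A hypothetical scalar equivalent:

#include <stdint.h>

static int16_t prep_px(const uint16_t px, const int bitdepth_max)
{
    const int mul = (bitdepth_max >> 11) ? 4 : 16; /* 12-bit : 10-bit, per prep_mul */
    return (int16_t)(px * mul - 8192);             /* pw_8192 bias */
}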
+ jg .prep_w64 + RET +.prep_w128: + movu m0, [srcq+16* 0] + movu m1, [srcq+16* 1] + movu m2, [srcq+16* 2] + movu m3, [srcq+16* 3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16* 4] + movu m1, [srcq+16* 5] + movu m2, [srcq+16* 6] + movu m3, [srcq+16* 7] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + movu m0, [srcq+16* 8] + movu m1, [srcq+16* 9] + movu m2, [srcq+16*10] + movu m3, [srcq+16*11] + add tmpq, 16*16 + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*8], m0 + mova [tmpq-16*7], m1 + mova [tmpq-16*6], m2 + mova [tmpq-16*5], m3 + movu m0, [srcq+16*12] + movu m1, [srcq+16*13] + movu m2, [srcq+16*14] + movu m3, [srcq+16*15] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*4], m0 + mova [tmpq-16*3], m1 + mova [tmpq-16*2], m2 + mova [tmpq-16*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd m4, mxyd + mov mxyd, r6m ; my + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m3, 2 + psllw m4, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + sub wd, 8 + je .h_w8 + jg .h_w16 +.h_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*0+2] + movhps m1, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + add r6, 16 + jl .h_w16_loop + add srcq, strideq + dec hd + jg .h_w16_loop0 + RET +.v: + movd m4, mxyd + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 + psllw m4, 2 +.v_12bpc: + cmp wd, 8 + je .v_w8 + jg .v_w16 +.v_w4: + movq m0, [srcq+strideq*0] +.v_w4_loop: + movq m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklqdq m1, m0, m2 ; 0 1 + movq m0, [srcq+strideq*0] + punpcklqdq m2, m0 ; 1 2 + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu m0, [srcq+strideq*0] +.v_w8_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+16*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .v_w8_loop 
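In the filtered prep paths above, pw_32766 folds the +2 rounding term and the -8192 intermediate bias into a single subtraction performed before the 2-bit shift, since 32766 == 32768 - 2. A brute-force check of the identity, assuming arithmetic right shift:

#include <assert.h>

int main(void) {
    /* x covers the full range of the weighted sum before the shift */
    for (int x = 0; x <= 65520; x++)
        assert(((x - 32766) >> 2) == ((x + 2) >> 2) - 8192);
    return 0;
}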
+ RET +.v_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.v_w16_loop0: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+wq*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+wq*2], m1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .v_w16_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + shl mxyd, 11 + movd m6, mxyd + pshufb m6, [base+pw_256] + cmp wd, 8 + je .hv_w8 + jg .hv_w16 +.hv_w4: + movddup m0, [srcq+strideq*0] + movddup m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w4_loop: + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + movhps m1, [srcq+strideq*0] + movhps m2, [srcq+strideq*0+2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w8_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+16*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.hv_w16_loop0: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+wq*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+wq*2], m2 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .hv_w16_loop0 +%if WIN64 + pop r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v +cglobal %1_8tap_%2_16bpc + mov 
t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) +%endif +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 1, 2, 6 +%elif WIN64 +DECLARE_REG_TMP 4, 5, 8 +%else +DECLARE_REG_TMP 7, 8, 8 +%endif + +MC_8TAP_FN put, sharp, SHARP, SHARP +MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN put, smooth, SMOOTH, SMOOTH +MC_8TAP_FN put, sharp_regular, SHARP, REGULAR +MC_8TAP_FN put, regular_sharp, REGULAR, SHARP +MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN put, regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r1b +%define myd r1 +%define myq r1 +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%else +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%endif +%define base t2-put_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, put_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + movifnidn ssq, ssmp + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + movifnidn dstq, dstmp + movifnidn dsq, dsmp + add wq, t2 +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + mov myd, r8m + movd m5, r8m + shr myd, 11 + movddup m4, [base+put_8tap_h_rnd+myq*8] + movifnidn dsq, dsmp + pshufb m5, [base+pw_256] + cmp wd, 4 + jg .h_w8 + movzx mxd, mxb + lea srcq, [srcq-2] + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + je .h_w4 +.h_w2: + mova m2, [base+spel_h_shuf2] + pshufd m3, m3, q2121 +.h_w2_loop: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m2 + pshufb m1, m2 + pmaddwd m0, m3 + pmaddwd m1, m3 + phaddd m0, m1 + paddd m0, m4 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movd [dstq+dsq*0], m0 + pshuflw m0, m0, q3232 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + WIN64_SPILL_XMM 8 + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] + pshufd m2, m3, q1111 + pshufd m3, m3, q2222 +.h_w4_loop: + movu m1, [srcq] + add srcq, ssq + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movq [dstq], m0 + add dstq, dsq + dec hd + jg .h_w4_loop + RET +.h_w8: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 +%endif + shr mxd, 16 + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] +%if UNIX64 + mov wd, wd +%endif + lea srcq, [srcq+wq*2] + punpcklbw m3, m3 + lea dstq, [dstq+wq*2] + psraw m3, 8 + neg wq +%if ARCH_X86_32 + ALLOC_STACK -16*4 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, 
q2222 + pshufd m11, m3, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6*2- 6] + movu m1, [srcq+r6*2+ 2] + pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 + pshufb m0, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m8 ; abcd0 + pmaddwd m0, m9 ; abcd1 + pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 + pshufb m1, m7 ; 6 7 7 8 8 9 9 a + paddd m2, m4 + paddd m0, m2 + pmaddwd m2, m10, m3 ; abcd2 + pmaddwd m3, m8 ; efgh0 + paddd m0, m2 + pmaddwd m2, m11, m1 ; abcd3 + pmaddwd m1, m9 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6*2+10] + paddd m3, m4 + paddd m1, m3 + pshufb m3, m2, m6 ; 8 9 9 a a b b c + pshufb m2, m7 ; a b b c c d d e + pmaddwd m3, m10 ; efgh2 + pmaddwd m2, m11 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + mova [dstq+r6*2], m0 + add r6, 8 + jl .h_w8_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if WIN64 + WIN64_SPILL_XMM 15 +%endif + movd m7, r8m + movifnidn dstq, dstmp + movifnidn dsq, dsmp + punpcklbw m3, m3 + pshufb m7, [base+pw_256] + psraw m3, 8 ; sign-extend +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + cmp wd, 2 + jne .v_w4 +.v_w2: + movd m1, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + movd m2, [srcq+ssq*2] + add srcq, r6 + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m6, [srcq+ssq*2] + add srcq, r6 + movd m0, [srcq+ssq*0] + punpckldq m1, m4 ; 0 1 + punpckldq m4, m2 ; 1 2 + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m6 ; 4 5 + punpckldq m6, m0 ; 5 6 + punpcklwd m1, m4 ; 01 12 + punpcklwd m2, m5 ; 23 34 + punpcklwd m3, m6 ; 45 56 + pxor m6, m6 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + punpckldq m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m6 + pavgw m5, m6 + pminsw m5, m7 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcmp, srcq +%endif + lea wd, [wq+hq-(1<<16)] +%else + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] +%endif +.v_w4_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, r6 + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, r6 + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_w4_loop_start +.v_w4_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_w4_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, 
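
For reference, the .h_w8 loop above amounts to the following scalar model; rnd stands in for the bit-depth dependent put_8tap_h_rnd constant, f for the sign-extended subpel_filters row, and strides are counted in pixels rather than bytes. A sketch, not the library's code:

    #include <stddef.h>
    #include <stdint.h>

    static void put_8tap_h_ref(uint16_t *dst, ptrdiff_t ds,
                               const uint16_t *src, ptrdiff_t ss,
                               int w, int h, const int16_t f[8],
                               int rnd, int pixel_max)
    {
        for (int y = 0; y < h; y++, dst += ds, src += ss)
            for (int x = 0; x < w; x++) {
                int s = rnd;                     /* paddd with put_8tap_h_rnd */
                for (int k = 0; k < 8; k++)      /* the shuffle/pmaddwd pairs */
                    s += f[k] * src[x + k - 3];  /* taps span x-3 .. x+4 (the "-6" byte offset) */
                s >>= 6;                         /* psrad 6 */
                dst[x] = s < 0 ? 0 : s > pixel_max ? pixel_max : s;  /* pmaxsw / pminsw */
            }
    }
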
m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m3 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + pxor m2, m2 + pmaxsw m1, m2 + pavgw m1, m2 + pminsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*29] + mov dstq, [esp+4*30] + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + sub wd, 1<<16 +%else +.v_w4_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packssdw m12, m13 + pxor m13, m13 + pmaxsw m12, m13 + pavgw m12, m13 + pminsw m12, m7 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .v_w4_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if ARCH_X86_32 + movd m4, r8m + mova m6, [base+pd_512] + pshufb m4, [base+pw_256] +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + movd m15, r8m + pshufb m15, [base+pw_256] +%endif + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + je .hv_w4 + movq m0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov dstq, dstmp + mov dsq, dsmp + mova m5, [base+spel_h_shuf2] + ALLOC_STACK -16*8 +%else + mova m6, [base+pd_512] + mova m9, [base+spel_h_shuf2] +%endif + pshuflw m0, m0, q2121 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_w2_10bpc + psraw m7, 2 + psllw m3, 2 +.hv_w2_10bpc: + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 +%if ARCH_X86_32 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m9, m5 + mova m11, m0 + mova m12, m1 + mova m13, m2 + mova m14, m3 + mova m15, m4 +%else + pshufd m11, m3, q0000 + pshufd m12, m3, q1111 + pshufd m13, m3, q2222 + pshufd m14, m3, q3333 +%endif + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m1, [srcq+ssq*2] + add srcq, r6 + movu m4, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m2, m3, m1, m4 +%else + REPX {pshufb x, m9}, m2, m3, m1, m4 +%endif + REPX {pmaddwd x, m7}, m2, m3, m1, m4 + phaddd m2, m3 ; 0 1 + phaddd m1, m4 ; 2 3 + movu m3, [srcq+ssq*1] + movu m4, [srcq+ssq*2] + add srcq, r6 + movu m0, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m3, m4, m0 +%else + REPX {pshufb x, m9}, m3, m4, m0 +%endif + REPX {pmaddwd x, m7}, m3, m4, m0 + phaddd m3, m4 ; 4 5 + phaddd m0, m0 ; 6 6 + REPX {paddd x, m6}, m2, m1, m3, m0 + REPX {psrad x, 10}, m2, m1, m3, m0 + packssdw m2, m1 ; 0 1 2 3 + packssdw 
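
The vertical loops keep row pairs interleaved ("01 12", "23 34", ...) so each pmaddwd covers two taps of two output rows, and they round with psrad 5 followed by pavgw against zero, which for non-negative sums equals a round-to-nearest shift by 6. A small model of that tail (packssdw saturation ignored; pixel_max is the clamp loaded from the stack):

    static inline unsigned put_v_round(int sum, unsigned pixel_max)
    {
        int v = sum >> 5;                  /* psrad 5 (arithmetic shift) */
        if (v < 0) v = 0;                  /* pmaxsw with zero */
        v = (v + 1) >> 1;                  /* pavgw with zero: (v + 1) >> 1 */
        /* for v >= 0 this is the same as (sum + 32) >> 6 */
        return (unsigned)v > pixel_max ? pixel_max : (unsigned)v;  /* pminsw */
    }
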
m3, m0 ; 4 5 6 _ + palignr m4, m3, m2, 4 ; 1 2 3 4 + pshufd m5, m3, q0321 ; 5 6 _ _ + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + punpcklwd m3, m5 ; 45 56 +.hv_w2_loop: + movu m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu m5, [srcq+ssq*0] + pshufb m4, m9 + pshufb m5, m9 + pmaddwd m4, m7 + pmaddwd m5, m7 + phaddd m4, m5 + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + paddd m4, m6 + psrad m4, 10 ; 7 8 + packssdw m0, m4 + pshufd m3, m0, q2103 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m5, m6 + paddd m5, m4 + psrad m5, 10 + packssdw m5, m5 + pxor m4, m4 + pminsw m5, m15 + pmaxsw m5, m4 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w8: + shr mxd, 16 +.hv_w4: + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + mov dstq, dstmp + mov dsq, dsmp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + ALLOC_STACK -16*15 + mova m8, m0 + mova m9, m1 + mova m14, m6 +%else + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m3, 8 + test dword r8m, 0x800 + jz .hv_w4_10bpc + psraw m0, 2 + psllw m3, 2 +.hv_w4_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 +%if ARCH_X86_32 + %define tmp esp+16*8 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcmp, srcq +%endif + mova [tmp+16*5], m4 + lea wd, [wq+hq-(1<<16)] + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-104 ; red zone +%endif + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 + mova [tmp+16*5], m15 +%endif + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] + pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 + pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 + pmaddwd m%3, m10 + pmaddwd m%1, m11 + paddd m%3, %5 + paddd m%1, m%3 + pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + pmaddwd m%3, m12 + pmaddwd m%2, m13 + paddd m%1, m%3 + paddd m%1, m%2 + psrad m%1, %4 +%endmacro +.hv_w4_loop0: +%if ARCH_X86_64 + mova m14, [pd_512] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + movu m6, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 4, 1, 0, 10 + PUT_8TAP_HV_H 5, 2, 0, 10 + PUT_8TAP_HV_H 6, 3, 0, 10 + movu m7, [srcq+ssq*0+0] + movu m2, [srcq+ssq*0+8] + movu m1, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + PUT_8TAP_HV_H 7, 2, 0, 10 + PUT_8TAP_HV_H 1, 3, 0, 10 + movu m2, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 2, 3, 0, 10 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 10 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 
; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_w4_loop_start +.hv_w4_loop: + mova m1, [tmp+16*6] + mova m2, m15 +.hv_w4_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*6], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 9 + psrad m2, 9 + packssdw m1, m2 + pxor m7, m7 + pmaxsw m1, m7 + pavgw m7, m1 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*61] + mov dstq, [esp+4*62] + add srcq, 8 + add dstq, 8 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + movzx hd, ww + sub wd, 1<<16 +%else +.hv_w4_loop: + mova m15, [tmp+16*1] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 9 + psrad m15, 9 + packssdw m14, m15 + pxor m7, m7 + pmaxsw m14, m7 + pavgw m7, m14 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .hv_w4_loop0 + RET +%undef tmp + +%if ARCH_X86_32 +DECLARE_REG_TMP 2, 1, 6, 4 +%elif WIN64 +DECLARE_REG_TMP 6, 4, 7, 4 +%else +DECLARE_REG_TMP 6, 7, 7, 8 +%endif + +MC_8TAP_FN prep, sharp, SHARP, SHARP +MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH +MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP +MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH +MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR +MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP +MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR +MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH +MC_8TAP_FN prep, regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r2b +%define myd r2 +%define myq r2 +%else +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my +%endif +%define base t2-prep_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 
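
The .hv paths are a separable 2-D filter: a horizontal pass produces 16-bit intermediates with one rounding/downshift (the pd_512 / psrad 10 pair above), then a vertical pass filters those intermediates and rounds again before clamping. A simplified scalar model with the shifts left as parameters, since the asm picks them per bit depth; src is assumed to already point three rows above and three pixels left of the first output, as the asm arranges, and w, h are at most 128:

    #include <stddef.h>
    #include <stdint.h>

    static void put_8tap_hv_ref(uint16_t *dst, ptrdiff_t ds,
                                const uint16_t *src, ptrdiff_t ss,
                                int w, int h,
                                const int16_t fh[8], const int16_t fv[8],
                                int sh_h, int sh_v, int pixel_max)
    {
        static int16_t mid[(128 + 7) * 128];       /* whole-block buffer, fine for a sketch */
        int16_t *m = mid;
        for (int y = 0; y < h + 7; y++, m += w, src += ss)
            for (int x = 0; x < w; x++) {
                int s = 1 << (sh_h - 1);           /* first rounding bias */
                for (int k = 0; k < 8; k++)
                    s += fh[k] * src[x + k];
                m[x] = s >> sh_h;                  /* 16-bit intermediate row */
            }
        m = mid;
        for (int y = 0; y < h; y++, m += w, dst += ds)
            for (int x = 0; x < w; x++) {
                int s = 1 << (sh_v - 1);           /* second rounding bias */
                for (int k = 0; k < 8; k++)
                    s += fv[k] * m[x + k * w];
                s >>= sh_v;
                dst[x] = s < 0 ? 0 : s > pixel_max ? pixel_max : s;
            }
    }
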
8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, prep_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + test mxd, 0xf00 + jnz .h + movifnidn hd, hm + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov myd, r7m ; bitdepth_max + movzx wd, word [base+prep_ssse3_table+wq*2] + mova m5, [base+pw_8192] + shr myd, 11 + add wq, t2 + movddup m4, [base+prep_mul+myq*8] + movifnidn ssq, ssmp + movifnidn tmpq, tmpmp + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + movifnidn ssq, r2mp + movifnidn hd, r4m + movddup m5, [base+prep_8tap_1d_rnd] + cmp wd, 4 + jne .h_w8 + movzx mxd, mxb + movq m0, [base+subpel_filters+mxq*8] + mova m3, [base+spel_h_shufA] + mova m4, [base+spel_h_shufB] + movifnidn tmpq, tmpmp + sub srcq, 2 + WIN64_SPILL_XMM 8 + punpcklbw m0, m0 + psraw m0, 8 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw m0, 2 +.h_w4_12bpc: + pshufd m6, m0, q1111 + pshufd m7, m0, q2222 +.h_w4_loop: + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + WIN64_SPILL_XMM 11 + shr mxd, 16 + movq m2, [base+subpel_filters+mxq*8] + mova m4, [base+spel_h_shufA] + mova m6, [base+spel_h_shufB] + movifnidn tmpq, r0mp + add wd, wd + punpcklbw m2, m2 + add srcq, wq + psraw m2, 8 + add tmpq, wq + neg wq + test dword r7m, 0x800 + jnz .h_w8_12bpc + psllw m2, 2 +.h_w8_12bpc: + pshufd m7, m2, q0000 +%if ARCH_X86_32 + ALLOC_STACK -16*3 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 +%else + pshufd m8, m2, q1111 + pshufd m9, m2, q2222 + pshufd m10, m2, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6- 6] + movu m1, [srcq+r6+ 2] + pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 + pshufb m0, m6 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m7 ; abcd0 + pmaddwd m0, m8 ; abcd1 + pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 + pshufb m1, m6 ; 6 7 7 8 8 9 9 a + paddd m2, m5 + paddd m0, m2 + pmaddwd m2, m9, m3 ; abcd2 + pmaddwd m3, m7 ; efgh0 + paddd m0, m2 + pmaddwd m2, m10, m1 ; abcd3 + pmaddwd m1, m8 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6+10] + paddd m3, m5 + paddd m1, m3 + pshufb m3, m2, m4 ; a b b c c d d e + pshufb m2, m6 ; 8 9 9 a a b b c + pmaddwd m3, m9 ; efgh2 + pmaddwd m2, m10 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq+r6], m0 + add r6, 16 + jl .h_w8_loop + add srcq, ssq + sub tmpq, wq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + WIN64_SPILL_XMM 15 + movddup m7, [base+prep_8tap_1d_rnd] + movifnidn ssq, r2mp + movifnidn tmpq, r0mp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 +.v_12bpc: +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, 
r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_64 + mov r7, tmpq +%elif STACK_ALIGNMENT < 16 + mov [esp+4*29], tmpq +%endif + lea wd, [wq+hq-(1<<8)] +.v_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m5, [srcq+ssq*0] + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_loop_start +.v_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m7 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m7 + paddd m2, m3 + psrad m1, 4 + psrad m2, 4 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*29] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*29], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.v_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m7 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m7 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + movq [tmpq+r6*0], m12 + movhps [tmpq+r6*2], m12 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .v_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + movzx t3d, mxb + shr mxd, 16 + cmp wd, 4 + cmove mxd, t3d + movifnidn hd, r4m + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov ssq, r2mp + mov tmpq, r0mp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + mova m4, [base+prep_8tap_2d_rnd] + ALLOC_STACK -16*14 + mova m8, m0 + mova m9, m1 + mova m14, m4 +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m0, 4 + psraw m3, 8 + test dword r7m, 0x800 + jz .hv_10bpc + psraw m0, 2 +.hv_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_32 + %define tmp esp+16*8 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], tmpq +%endif + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-88 ; red zone +%endif + mov r7, tmpq + pshufd 
m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + lea wd, [wq+hq-(1<<8)] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +.hv_loop0: +%if ARCH_X86_64 + mova m14, [prep_8tap_2d_rnd] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m6, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 4, 1, 0, 6 + PUT_8TAP_HV_H 5, 2, 0, 6 + PUT_8TAP_HV_H 6, 3, 0, 6 + movu m7, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m1, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 7, 2, 0, 6 + PUT_8TAP_HV_H 1, 3, 0, 6 + movu m2, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 2, 3, 0, 6 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 6 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_loop_start +.hv_loop: + mova m1, [tmp+16*5] + mova m2, m15 +.hv_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*5], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m14 + paddd m2, m14 + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 6 + psrad m2, 6 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*61] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*61], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.hv_loop: + mova m15, [tmp+16*1] + mova m7, [prep_8tap_2d_rnd] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + paddd m14, m7 + paddd m15, m7 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 6 + psrad m15, 6 + packssdw m14, m15 + movq [tmpq+r6*0], m14 + movhps [tmpq+r6*2], m14 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, 
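
prep_8tap differs from put in that it writes signed 16-bit intermediates to a scratch buffer instead of pixels, so there is no clamp; the asm also pre-scales the filter coefficients for 10-bit input (the psllw by 2 guarded by the 0x800 test) so the same shift keeps the intermediate range for both 10- and 12-bit. A scalar sketch of the one-dimensional case, with rnd and sh standing in for prep_8tap_1d_rnd and the shift of 4 used above:

    #include <stddef.h>
    #include <stdint.h>

    static void prep_8tap_h_ref(int16_t *tmp, const uint16_t *src, ptrdiff_t ss,
                                int w, int h, const int16_t f[8], int rnd, int sh)
    {
        for (int y = 0; y < h; y++, src += ss, tmp += w)
            for (int x = 0; x < w; x++) {
                int s = rnd;
                for (int k = 0; k < 8; k++)
                    s += f[k] * src[x + k - 3];
                tmp[x] = (int16_t)(s >> sh);   /* stays in int16 range by construction */
            }
    }
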
r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .hv_loop0 + RET +%undef tmp + +%if ARCH_X86_64 +; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that +; by allocating 16 bytes more stack space so that stack offsets match up. +%if WIN64 && STACK_ALIGNMENT == 16 +%assign stksz 16*14 +%else +%assign stksz 16*13 +%endif +cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +%assign stack_size_padded_8x8t stack_size_padded +%else +cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%define m8 [esp+16*13] +%define m9 [esp+16*14] +%define cntd dword [esp+4*63] +%define dstq tmpq +%define dsq 0 +%if STACK_ALIGNMENT < 16 +%define dstm [esp+4*65] +%define dsm [esp+4*66] +%else +%define dstm r0m +%define dsm r1m +%endif +%endif +%define base filterq-$$ + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8t_rnd] +%else + movddup m1, [base+warp8x8t_rnd] + mov r1, r1m + add r1, r1 + mova m8, m1 + mov r1m, r1 ; ds *= 2 +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*4] +%else + add dstq, dsm + mov dstm, dstq +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*0], m1 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*2], m1 + dec cntd + jg .loop + RET + +%if ARCH_X86_64 +cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +ASSERT stack_size_padded == stack_size_padded_8x8t +%else +cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%endif + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8_rnd2+t0*8] + movd m9, r7m ; pixel_max + pshufb m9, [base+pw_256] +%else + movddup m1, [base+warp8x8_rnd2+t0*8] + movd m2, r7m ; pixel_max + pshufb m2, [base+pw_256] + mova m8, m1 + mova m9, m2 +%endif + call .main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*2] +%else + add dstq, dsm + mov dstm, dstq +%endif + call .main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*0], m1 + call .main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*1], m1 + dec cntd + jg .loop + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) 
due to call +%if WIN64 + mov deltaq, r5m + mov mxd, r6m +%endif + movd m0, [base+warp8x8_shift+t0*4] + movddup m7, [base+warp8x8_rnd1+t0*8] + add filterq, mc_warp_filter-$$ +%if ARCH_X86_64 + movsx alphad, word [deltaq+2*0] + movsx betad, word [deltaq+2*1] + movsx gammad, word [deltaq+2*2] + movsx deltad, word [deltaq+2*3] + lea tmpq, [ssq*3] + add mxd, 512+(64<<10) + sub srcq, tmpq ; src -= ss*3 + imul tmpd, alphad, -7 + mov myd, r7m + add betad, tmpd ; beta -= alpha*7 + imul tmpd, gammad, -7 + add myd, 512+(64<<10) + mov cntd, 4 + add deltad, tmpd ; delta -= gamma*7 +%else +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset - gprsize +%endif + mov r3d, r5m ; abcd +%if STACK_ALIGNMENT < 16 + mov r0, r1m ; dst + mov r1, r2m ; ds + mov [esp+gprsize+4*65], r0 + mov [esp+gprsize+4*66], r1 +%endif + movsx alphad, word [r3+2*0] + movsx r2d, word [r3+2*1] + movsx gammad, word [r3+2*2] + movsx r3d, word [r3+2*3] + imul r5d, alphad, -7 + add r2d, r5d ; beta -= alpha*7 + imul r5d, gammad, -7 + mov [esp+gprsize+4*60], r2d + add r3d, r5d ; delta -= gamma*7 + mov [esp+gprsize+4*61], r3d + mov r3d, r4m ; ss + mov srcq, r3m + mov mxd, r6m + mov myd, r7m + mov dword [esp+gprsize+4*63], 4 ; cnt + mov [esp+gprsize+4*62], r3 + lea r3, [r3*3] + add mxd, 512+(64<<10) + add myd, 512+(64<<10) + sub srcq, r3 ; src -= ss*3 +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset + gprsize +%endif +%endif + mova [rsp+gprsize], m0 + pxor m6, m6 + call .h + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 01 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 1], m1 + mova [rsp+gprsize+16* 4], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 12 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 7], m1 + mova [rsp+gprsize+16*10], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 23 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 2], m1 + mova [rsp+gprsize+16* 5], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 34 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 8], m1 + mova [rsp+gprsize+16*11], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 45 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 3], m1 + mova [rsp+gprsize+16* 6], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 56 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 9], m1 + mova [rsp+gprsize+16*12], m5 + mova m5, m0 +.main2: + call .h +%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m4, [filterq+myq*8] ; a + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m2, [filterq+tmpq*8] ; b + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m3, [filterq+myq*8] ; c + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m1, [filterq+tmpq*8] ; d + lea tmpd, [myq+gammaq] + shr myd, 10 + punpcklwd m4, m2 + punpcklwd m3, m1 + punpckldq m2, m4, m3 + punpckhdq m4, m3 + punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + pmaddwd m1, [rsp+gprsize+16*%1] + punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + mova m2, [rsp+gprsize+16*%2] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%1], m2 + paddd m1, m3 + punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + mova m2, [rsp+gprsize+16*%3] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%2], m2 + paddd m1, m3 + punpcklwd m3, m5, m0 ; 67 + punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m2, m3 + mova [rsp+gprsize+16*%3], m3 + paddd m1, m2 + movq m4, [filterq+myq*8] ; e + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] ; f + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m2, [filterq+myq*8] ; g +%if ARCH_X86_64 + lea myd, [tmpq+deltaq] ; my += delta +%else + mov myd, [esp+gprsize+4*61] + add 
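
The warp .main above walks the filter phase with a single accumulator: alpha is added once per horizontal tap position, and because beta was pre-adjusted with "beta -= alpha*7" the final add lands directly on the next row's starting phase (gamma and delta play the same roles for the vertical pass). A sketch of the effective arithmetic using the original, unadjusted beta; the >>10 and the 512+(64<<10) bias are taken from the code, the layout of mc_warp_filter is not shown:

    static void warp_filter_phases(int phases[8][8], int mx, int alpha, int beta)
    {
        mx += 512 + (64 << 10);                    /* bias, as in the asm */
        for (int y = 0; y < 8; y++, mx += beta) {  /* asm folds -7*alpha into beta instead */
            int acc = mx;
            for (int x = 0; x < 8; x++, acc += alpha)
                phases[y][x] = acc >> 10;          /* selects an 8-tap row of the warp filter */
        }
    }
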
myd, tmpd +%endif + shr tmpd, 10 + punpcklwd m4, m3 + movq m3, [filterq+tmpq*8] ; h + punpcklwd m2, m3 + punpckldq m3, m4, m2 + punpckhdq m4, m2 + punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 + pmaddwd m2, [rsp+gprsize+16*%4] + punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 + mova m3, [rsp+gprsize+16*%5] + pmaddwd m6, m3 + mova [rsp+gprsize+16*%4], m3 + pxor m3, m3 + paddd m2, m6 + punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 + mova m6, [rsp+gprsize+16*%6] + pmaddwd m3, m6 + mova [rsp+gprsize+16*%5], m6 + punpckhwd m5, m0 + pxor m6, m6 + paddd m2, m3 + punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 + pmaddwd m3, m5 + mova [rsp+gprsize+16*%6], m5 + mova m5, m0 + paddd m2, m3 +%endmacro + WARP_V 1, 2, 3, 4, 5, 6 + ret +.main3: + call .h + WARP_V 7, 8, 9, 10, 11, 12 + ret +ALIGN function_align +.h: + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + punpcklbw m0, m6, m3 + movu m3, [srcq-6] + pmaddwd m0, m3 ; 0 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m2, m6, m3 + movu m3, [srcq-4] + pmaddwd m2, m3 ; 1 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m0, m2 ; 0 1 + punpcklbw m2, m6, m3 + movu m3, [srcq-2] + pmaddwd m2, m3 ; 2 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m1, m6, m3 + movu m3, [srcq+0] + pmaddwd m1, m3 ; 3 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m2, m1 ; 2 3 + punpcklbw m1, m6, m3 + movu m3, [srcq+2] + pmaddwd m1, m3 ; 4 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + phaddd m0, m2 ; 0 1 2 3 + punpcklbw m2, m6, m3 + movu m3, [srcq+4] + pmaddwd m2, m3 ; 5 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m1, m2 ; 4 5 + punpcklbw m2, m6, m3 + movu m3, [srcq+6] + pmaddwd m2, m3 ; 6 +%if ARCH_X86_64 + lea mxd, [tmpq+betaq] ; mx += beta +%else + mov mxd, [esp+gprsize*2+4*60] + add mxd, tmpd +%endif + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m4, m6, m3 + movu m3, [srcq+8] +%if ARCH_X86_64 + add srcq, ssq +%else + add srcq, [esp+gprsize*2+4*62] +%endif + pmaddwd m3, m4 ; 7 + phaddd m2, m3 ; 6 7 + phaddd m1, m2 ; 4 5 6 7 + paddd m0, m7 + paddd m1, m7 + psrad m0, [rsp+gprsize*2] + psrad m1, [rsp+gprsize*2] + packssdw m0, m1 + ret + +%macro BIDIR_FN 0 + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.ret: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jne .w8_loop + RET +.w16_loop: + call .main + add dstq, strideq +.w16: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + dec hd + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova 
[dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h +%define base r6-avg_ssse3_table + LEA r6, avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + movddup m2, [base+bidir_rnd+t0*8] + movddup m3, [base+bidir_mul+t0*8] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+16*0] + paddsw m0, [tmp2q+16*0] + mova m1, [tmp1q+16*1] + paddsw m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + pmulhw m0, m3 + pmulhw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h +%define base r6-w_avg_ssse3_table + LEA r6, w_avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; weight + movd m6, r7m ; pixel_max + movddup m5, [base+pd_65538] + movsxd wq, [r6+wq*4] + pshufb m6, [base+pw_256] + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + paddw m5, m6 + mov r6d, t0d + shl t0d, 2 + test dword r7m, 0x800 + cmovnz r6d, t0d + movifnidn hd, hm + movd m4, r6d + pslld m5, 7 + pxor m7, m7 + pshufd m4, m4, q0000 + BIDIR_FN +ALIGN function_align +.main: + mova m2, [tmp1q+16*0] + mova m0, [tmp2q+16*0] + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + mova m2, [tmp1q+16*1] + mova m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaddwd m3, m4 + pmaddwd m0, m4 + paddd m3, m5 + paddd m0, m5 + psrad m3, 8 + psrad m0, 8 + packssdw m0, m3 + punpckhwd m3, m1, m2 + punpcklwd m1, m2 + pmaddwd m3, m4 + pmaddwd m1, m4 + paddd m3, m5 + paddd m1, m5 + psrad m3, 8 + psrad m1, 8 + packssdw m1, m3 + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 + ret + +%if ARCH_X86_64 +cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask +%else +cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask +%define hd dword r5m +%define m8 [base+pw_64] +%endif +%define base r6-mask_ssse3_table + LEA r6, mask_ssse3_table + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + movddup m6, [base+bidir_rnd+t0*8] + movddup m7, [base+bidir_mul+t0*8] +%if ARCH_X86_64 + mova m8, [base+pw_64] + movifnidn hd, hm +%endif + add wq, r6 + mov maskq, r6mp + BIDIR_FN +ALIGN function_align +.main: + movq m3, [maskq+8*0] + mova m0, [tmp1q+16*0] + mova m4, [tmp2q+16*0] + pxor m5, m5 + punpcklbw m3, m5 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + psubw m1, m8, m3 + punpckhwd m4, m3, m1 ; m, 64-m + punpcklwd m3, m1 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m0, m3 + movq m3, [maskq+8*1] + mova m1, [tmp1q+16*1] + mova m4, [tmp2q+16*1] + add maskq, 8*2 + add tmp1q, 16*2 + add tmp2q, 16*2 + psrad m2, 5 + psrad m0, 5 + packssdw m0, m2 + punpcklbw m3, m5 + punpckhwd m2, m1, m4 + punpcklwd m1, m4 + psubw m5, m8, m3 + punpckhwd m4, m3, m5 ; m, 64-m + punpcklwd m3, m5 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m1, m3 + psrad m2, 5 + psrad m1, 5 + packssdw m1, m2 + pmaxsw m0, m6 + pmaxsw m1, m6 + psubsw m0, m6 + psubsw m1, m6 + pmulhw m0, m7 + pmulhw m1, m7 + ret + +cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_420_ssse3_table + LEA 
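
avg, w_avg and mask share the same tail: the two int16 prep buffers are combined, re-biased and scaled back to pixel range, which the asm does with a pmaxsw / psubsw / pmulhw sequence driven by the bidir_rnd and bidir_mul tables rather than a plain shift. The reference shape of the two combinations, with rnd and sh left symbolic since their values depend on bit depth (helper names are mine):

    static inline int clip_px(int v, int pixel_max)
    {
        return v < 0 ? 0 : v > pixel_max ? pixel_max : v;
    }

    static inline int bidir_avg(int t1, int t2, int rnd, int sh, int pixel_max)
    {
        return clip_px((t1 + t2 + rnd) >> sh, pixel_max);             /* avg_16bpc */
    }

    static inline int bidir_mask(int t1, int t2, int m,               /* m in 0..64 */
                                 int rnd, int sh, int pixel_max)
    {
        /* matches the "tmp1 * m + tmp2 * (64-m)" pmaddwd pairs in mask_16bpc */
        return clip_px((t1 * m + t2 * (64 - m) + rnd) >> sh, pixel_max);
    }
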
t0, w_mask_420_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m0, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 + %define m8 [rsp+gprsize+16*0] + %define m9 [rsp+gprsize+16*1] + %define m10 [rsp+gprsize+16*2] + %define m11 [rsp+gprsize+16*3] +%endif + movd m7, [base+pw_2] + psubw m7, m0 + pshufb m7, [base+pw_256] + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w4: + movq [dstq+strideq*0], m0 + phaddw m2, m3 + movhps [dstq+strideq*1], m0 + phaddd m2, m2 + lea dstq, [dstq+strideq*2] + paddw m2, m7 + movq [dstq+strideq*0], m1 + psrlw m2, 2 + movhps [dstq+strideq*1], m1 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w8: + mova [dstq+strideq*0], m0 + paddw m2, m3 + phaddw m2, m2 + mova [dstq+strideq*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 8 +.w16: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movq [maskq], m2 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*0+16*2], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*3], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*1], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + paddw m2, [dstq+strideq*1+16*3] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*2 +.w64: + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*2], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*1+16*4], m3 + mova [dstq+strideq*0+16*3], m1 + call .main + mova [dstq+strideq*1+16*5], m2 + mova [dstq+strideq*0+16*4], m0 + mova [dstq+strideq*1+16*6], m3 + mova [dstq+strideq*0+16*5], m1 + call .main + mova [dstq+strideq*0+16*6], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*7], m2 + mova [dstq+strideq*0+16*7], m1 + call .main + paddw m2, [dstq+strideq*1+16*1] + paddw m3, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova 
[dstq+strideq*1+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*3] + paddw m3, [dstq+strideq*1+16*4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16*5] + paddw m3, [dstq+strideq*1+16*6] + mova [dstq+strideq*1+16*4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*6], m2 + mova [dstq+strideq*1+16*5], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*6] + paddw m2, [dstq+strideq*1+16*7] + mova [dstq+strideq*1+16*6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*4 +.w128: + mova [dstq+strideq*1+16* 1], m2 + mova [dstq+strideq*0+16* 0], m0 + mova [dstq+strideq*1+16* 2], m3 + mova [dstq+strideq*0+16* 1], m1 + call .main + mova [dstq+strideq*1+16* 3], m2 + mova [dstq+strideq*0+16* 2], m0 + mova [dstq+strideq*1+16* 4], m3 + mova [dstq+strideq*0+16* 3], m1 + call .main + mova [dstq+strideq*1+16* 5], m2 + mova [dstq+strideq*0+16* 4], m0 + mova [dstq+strideq*1+16* 6], m3 + mova [dstq+strideq*0+16* 5], m1 + call .main + mova [dstq+strideq*1+16* 7], m2 + mova [dstq+strideq*0+16* 6], m0 + mova [dstq+strideq*1+16* 8], m3 + mova [dstq+strideq*0+16* 7], m1 + call .main + mova [dstq+strideq*1+16* 9], m2 + mova [dstq+strideq*0+16* 8], m0 + mova [dstq+strideq*1+16*10], m3 + mova [dstq+strideq*0+16* 9], m1 + call .main + mova [dstq+strideq*1+16*11], m2 + mova [dstq+strideq*0+16*10], m0 + mova [dstq+strideq*1+16*12], m3 + mova [dstq+strideq*0+16*11], m1 + call .main + mova [dstq+strideq*1+16*13], m2 + mova [dstq+strideq*0+16*12], m0 + mova [dstq+strideq*1+16*14], m3 + mova [dstq+strideq*0+16*13], m1 + call .main + mova [dstq+strideq*0+16*14], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*15], m2 + mova [dstq+strideq*0+16*15], m1 + call .main + paddw m2, [dstq+strideq*1+16* 1] + paddw m3, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 2], m2 + mova [dstq+strideq*1+16* 1], m1 + call .main + paddw m2, [dstq+strideq*1+16* 3] + paddw m3, [dstq+strideq*1+16* 4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16* 5] + paddw m3, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 6], m2 + mova [dstq+strideq*1+16* 5], m1 + call .main + paddw m2, [dstq+strideq*1+16* 7] + paddw m3, [dstq+strideq*1+16* 8] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + call .main + paddw m2, [dstq+strideq*1+16* 9] + paddw m3, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16* 8], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*10], m2 + mova [dstq+strideq*1+16* 9], m1 + call .main + paddw m2, [dstq+strideq*1+16*11] + paddw m3, [dstq+strideq*1+16*12] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16*10], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*11], m1 + packuswb m3, m2 + mova [maskq+16*2], m3 + call .main + paddw m2, 
[dstq+strideq*1+16*13] + paddw m3, [dstq+strideq*1+16*14] + mova [dstq+strideq*1+16*12], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*14], m2 + mova [dstq+strideq*1+16*13], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*14] + paddw m2, [dstq+strideq*1+16*15] + mova [dstq+strideq*1+16*14], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*15], m1 + packuswb m3, m2 + mova [maskq+16*3], m3 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2 ; dst/tmp_offset, mask + mova m%1, [tmp1q+16*%1] + mova m%2, [tmp2q+16*%1] + punpcklwd m4, m%2, m%1 + punpckhwd m5, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m6, m8, m%1 + psrlw m6, 10 ; 64-m + psubw m%2, m9, m6 ; m + punpcklwd m%1, m6, m%2 + punpckhwd m6, m%2 + pmaddwd m%1, m4 + pmaddwd m6, m5 + psrad m%1, 5 + psrad m6, 5 + packssdw m%1, m6 + pmaxsw m%1, m10 + psubsw m%1, m10 + pmulhw m%1, m11 +%endmacro + W_MASK 0, 2 + W_MASK 1, 3 + add tmp1q, 16*2 + add tmp2q, 16*2 + ret + +cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_422_ssse3_table + LEA t0, w_mask_422_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m7, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 +%endif + pxor m0, m0 + add wq, t0 + pshufb m7, m0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + phaddw m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + 
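
w_mask derives its per-pixel blend weight from the difference of the two prep buffers: pw_27615 is ((64 - 38) << 10) + 1023 - 32, a saturating subtract plus a shift by 10 turn |tmp1 - tmp2| into 64 - m, and the 4:2:0 variant then folds four luma weights into one chroma weight with the pw_2 / sign rounding seen above. A scalar sketch (function names are mine):

    static inline int wmask_weight(int t1, int t2)
    {
        unsigned d   = (unsigned)(t1 > t2 ? t1 - t2 : t2 - t1);  /* psubsw + pabsw */
        unsigned sat = d > 27615 ? 0 : 27615 - d;                /* psubusw from pw_27615 */
        return 64 - (int)(sat >> 10);                            /* m ends up in 38..64 */
    }

    static inline int wmask_420(int m00, int m01, int m10, int m11, int sign)
    {
        return (m00 + m01 + m10 + m11 + 2 - sign) >> 2;          /* paddw pw_2 - sign, psrlw 2 */
    }
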
packuswb m2, m2 + pxor m3, m3 + psubb m2, m7 + pavgb m2, m3 + movq [maskq], m2 + add maskq, 8 + ret + +cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_444_ssse3_table + LEA t0, w_mask_444_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m7, [base+bidir_mul+r6*8] + ALLOC_STACK -16*3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + %define m11 m7 +%endif + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + packuswb m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + mova [maskq], m2 + add maskq, 16 + ret + +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). 
+; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 +%define base r6-blend_ssse3_table + LEA r6, blend_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + mova m7, [base+pw_m512] + add wq, r6 + lea stride3q, [strideq*3] + pxor m6, m6 + jmp wq +.w4: + mova m5, [maskq] + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + movq m1, [dstq+strideq*2] + movhps m1, [dstq+stride3q ] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + mova m5, [maskq] + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m5, [maskq] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m5, [maskq+16*0] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova m5, [maskq+16*1] + mova m0, [dstq+16*2] + mova m1, [dstq+16*3] + psubw m2, m0, [tmpq+16*2] + psubw m3, m1, [tmpq+16*3] + add maskq, 32 + add tmpq, 64 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + add dstq, strideq + dec hd + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h +%define base r5-blend_v_ssse3_table + LEA r5, blend_v_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + movd m4, [base+obmc_masks+2*2] +.w2_loop: + movd m0, [dstq+strideq*0] + movd m2, [tmpq+4*0] + movd m1, [dstq+strideq*1] + movd m3, [tmpq+4*1] + add tmpq, 4*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + movddup m2, [base+obmc_masks+4*2] +.w4_loop: + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova m4, [base+obmc_masks+8*2] +.w8_loop: + mova m0, [dstq+strideq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+strideq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 
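
The rewrite in the comment above is easy to sanity-check in plain C: the straightforward (a*(64-m) + b*m + 32) >> 6 and the pmulhrsw form with the negated weight m * -512 agree for all pixel values and all m in 0..64. A small self-contained check (it assumes arithmetic right shift on negative ints, which is what the SIMD instruction provides):

    #include <assert.h>
    #include <stdint.h>

    static int blend_ref(int a, int b, int m)            /* 0 <= m <= 64 */
    {
        return (a * (64 - m) + b * m + 32) >> 6;
    }

    static int blend_via_pmulhrsw(int a, int b, int m)
    {
        int w = (int16_t)(m * -512);                     /* -64 << 9 == -32768 still fits */
        int d = (int16_t)(a - b);                        /* the psubw lanes */
        int p = (d * w + 16384) >> 15;                   /* what pmulhrsw computes */
        return a + p;                                    /* final paddw */
    }

    int main(void)
    {
        for (int m = 0; m <= 64; m++)
            for (int a = 0; a <= 4095; a += 7)
                for (int b = 0; b <= 4095; b += 13)
                    assert(blend_ref(a, b, m) == blend_via_pmulhrsw(a, b, m));
        return 0;
    }
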
+ paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks+16*2] + movq m5, [base+obmc_masks+16*3] +.w16_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+8], m6 +%endif + mova m4, [base+obmc_masks+16*4] + mova m5, [base+obmc_masks+16*5] + mova m6, [base+obmc_masks+16*6] +.w32_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + mova m2, [dstq+16*2] + paddw m1, m3 + mova m3, [tmpq+16*2] + add tmpq, 16*4 + psubw m3, m2 + pmulhrsw m3, m6 + paddw m2, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps m6, [rsp+8] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+16*(%1+0)] + mova m2, [tmpq+16*(%2+0)] + mova m1, [dstq+16*(%1+1)] + mova m3, [tmpq+16*(%2+1)] +%if %3 + add tmpq, 16*%3 +%endif + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*(%1+0)], m0 + mova [dstq+16*(%1+1)], m1 +%endmacro + +cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base r6-blend_h_ssse3_table + LEA r6, blend_h_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+blend_shuf] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + add wq, r6 + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + movd m2, [dstq+dsq*1] + movd m3, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpckldq m0, m2 + punpcklwd m3, m3 + psubw m1, m0 + pmulhrsw m1, m3 + paddw m0, m1 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [base+blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, [maskq+hq*2] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + movddup m5, [base+blend_shuf+8] +%if WIN64 + movaps [rsp+ 8], m6 + movaps [rsp+24], m7 +%endif +.w8_loop: + movd m7, [maskq+hq*2] + mova m0, [dstq+dsq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+dsq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + pshufb m6, m7, m4 + psubw m2, m0 + pshufb m7, m5 + psubw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m3, m7 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop +%if WIN64 + movaps m6, [rsp+ 8] + movaps m7, [rsp+24] +%endif + RET +.w16: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w16 + RET +.w32: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + BLEND_H_ROW 6, 6, 8 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + 
BLEND_H_ROW 6, 6, 16 + BLEND_H_ROW 8, -8 + BLEND_H_ROW 10, -6 + BLEND_H_ROW 12, -4 + BLEND_H_ROW 14, -2 + add dstq, dsq + inc hq + jl .w128 + RET + +; emu_edge args: +; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, +; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, +; const pixel *ref, const ptrdiff_t ref_stride +; +; bw, bh total filled size +; iw, ih, copied block -> fill bottom, right +; x, y, offset in bw/bh -> fill top, left +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + +%if ARCH_X86_64 + %define reg_zero r12q + %define reg_tmp r10 + %define reg_src srcq + %define reg_bottomext bottomextq + %define reg_rightext rightextq + %define reg_blkm r9m +%else + %define reg_zero r6 + %define reg_tmp r0 + %define reg_src r1 + %define reg_bottomext r0 + %define reg_rightext r1 + %define reg_blkm r2m +%endif + ; + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor reg_zero, reg_zero + lea reg_tmp, [ihq-1] + cmp yq, ihq + cmovs reg_tmp, yq + test yq, yq + cmovs reg_tmp, reg_zero +%if ARCH_X86_64 + imul reg_tmp, sstrideq + add srcq, reg_tmp +%else + imul reg_tmp, sstridem + mov reg_src, srcm + add reg_src, reg_tmp +%endif + ; + ; ref += iclip(x, 0, iw - 1) + lea reg_tmp, [iwq-1] + cmp xq, iwq + cmovs reg_tmp, xq + test xq, xq + cmovs reg_tmp, reg_zero + lea reg_src, [reg_src+reg_tmp*2] +%if ARCH_X86_32 + mov srcm, reg_src +%endif + ; + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) +%if ARCH_X86_32 + mov r1, r1m ; restore bh +%endif + lea reg_bottomext, [yq+bhq] + sub reg_bottomext, ihq + lea r3, [bhq-1] + cmovs reg_bottomext, reg_zero + ; + + DEFINE_ARGS bw, bh, iw, ih, x, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, reg_zero + cmp reg_bottomext, bhq + cmovns reg_bottomext, r3 + cmp topextq, bhq + cmovg topextq, r3 + %if ARCH_X86_32 + mov r4m, reg_bottomext + ; + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + mov r0, r0m ; restore bw + %endif + lea reg_rightext, [xq+bwq] + sub reg_rightext, iwq + lea r2, [bwq-1] + cmovs reg_rightext, reg_zero + + DEFINE_ARGS bw, bh, iw, ih, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, reg_zero + cmp reg_rightext, bwq + cmovns reg_rightext, r2 + %if ARCH_X86_32 + mov r3m, r1 + %endif + cmp leftextq, bwq + cmovns leftextq, r2 + +%undef reg_zero +%undef reg_tmp +%undef reg_src +%undef reg_bottomext +%undef reg_rightext + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; center_h = bh - top_ext - bottom_ext +%if ARCH_X86_64 + lea r3, [bottomextq+topextq] + sub centerhq, r3 +%else + mov r1, centerhm ; restore r1 + sub centerhq, topextq + sub centerhq, r4m + mov r1m, centerhq +%endif + ; + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq +%if ARCH_X86_64 + imul r2, dstrideq +%else + mov r6, r6m ; restore dstq + imul r2, dstridem +%endif + add dstq, r2 + mov reg_blkm, dstq ; save pointer for ext + ; + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq +%if ARCH_X86_64 + lea r3, [rightextq+leftextq] + sub centerwq, r3 +%else + sub centerwq, r3m + sub centerwq, leftextq +%endif + +; vloop Macro +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix 
+ %if ARCH_X86_64 + %define reg_tmp r12 + %else + %define reg_tmp r0 + %endif +.v_loop_%3: + %if ARCH_X86_32 + mov r0, r0m + mov r1, r1m + %endif +%if %1 + ; left extension + %if ARCH_X86_64 + movd m0, [srcq] + %else + mov r3, srcm + movd m0, [r3] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, mmsize/2 + cmp r3, leftextq + jl .left_loop_%3 + ; body + lea reg_tmp, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + %if ARCH_X86_64 + movu m0, [srcq+r3*2] + %else + mov r1, srcm + movu m0, [r1+r3*2] + %endif +%if %1 + movu [reg_tmp+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, mmsize/2 + cmp r3, centerwq + jl .body_loop_%3 +%if %2 + ; right extension +%if %1 + lea reg_tmp, [reg_tmp+centerwq*2] +%else + lea reg_tmp, [dstq+centerwq*2] +%endif + %if ARCH_X86_64 + movd m0, [srcq+centerwq*2-2] + %else + mov r3, srcm + movd m0, [r3+centerwq*2-2] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.right_loop_%3: + movu [reg_tmp+r3*2], m0 + add r3, mmsize/2 + %if ARCH_X86_64 + cmp r3, rightextq + %else + cmp r3, r3m + %endif + jl .right_loop_%3 +%endif + %if ARCH_X86_64 + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 + %else + add dstq, dstridem + mov r0, sstridem + add srcm, r0 + sub dword centerhm, 1 + jg .v_loop_%3 + mov r0, r0m ; restore r0 + %endif +%endmacro ; vloop MACRO + + test leftextq, leftextq + jnz .need_left_ext + %if ARCH_X86_64 + test rightextq, rightextq + jnz .need_right_ext + %else + cmp leftextq, r3m ; leftextq == 0 + jne .need_right_ext + %endif + v_loop 0, 0, 0 + jmp .body_done + + ;left right extensions +.need_left_ext: + %if ARCH_X86_64 + test rightextq, rightextq + %else + mov r3, r3m + test r3, r3 + %endif + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: +; r0 ; bw +; r1 ;; x loop +; r4 ;; y loop +; r5 ; topextq +; r6 ;dstq +; r7 ;dstrideq +; r8 ; srcq +%if ARCH_X86_64 + %define reg_dstride dstrideq +%else + %define reg_dstride r2 +%endif + ; + ; bottom edge extension + %if ARCH_X86_64 + test bottomextq, bottomextq + jz .top + %else + xor r1, r1 + cmp r1, r4m + je .top + %endif + ; + %if ARCH_X86_64 + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 + %else + mov r3, dstq + mov reg_dstride, dstridem + sub r3, reg_dstride + mov srcm, r3 + %endif + ; +.bottom_x_loop: + %if ARCH_X86_64 + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq + %else + mov r3, srcm + mova m0, [r3+r1*2] + lea r3, [dstq+r1*2] + mov r4, r4m + %endif + ; +.bottom_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .bottom_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end +%if ARCH_X86_64 + mov srcq, reg_blkm +%else + mov r3, reg_blkm + mov reg_dstride, dstridem +%endif + mov dstq, dstm + xor r1, r1 + ; +.top_x_loop: +%if ARCH_X86_64 + mova m0, [srcq+r1*2] +%else + mov r3, reg_blkm + mova m0, [r3+r1*2] +%endif + lea r3, [dstq+r1*2] + mov r4, topextq + ; +.top_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .top_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%undef reg_dstride +%undef reg_blkm +%undef reg_tmp diff -Nru dav1d-0.7.1/src/x86/mc.asm dav1d-0.9.1/src/x86/mc.asm --- dav1d-0.7.1/src/x86/mc.asm 2020-06-21 11:48:55.032126400 +0000 +++ dav1d-0.9.1/src/x86/mc.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,8066 +0,0 @@ -; 
Copyright © 2018, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC -; All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions are met: -; -; 1. Redistributions of source code must retain the above copyright notice, this -; list of conditions and the following disclaimer. -; -; 2. Redistributions in binary form must reproduce the above copyright notice, -; this list of conditions and the following disclaimer in the documentation -; and/or other materials provided with the distribution. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -%include "ext/x86/x86inc.asm" - -%if ARCH_X86_64 - -SECTION_RODATA 64 - -; dav1d_obmc_masks[] with 64-x interleaved -obmc_masks: db 0, 0, 0, 0 - ; 2 - db 45, 19, 64, 0 - ; 4 - db 39, 25, 50, 14, 59, 5, 64, 0 - ; 8 - db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 - ; 16 - db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 - db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 - ; 32 - db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 - db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 - db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 - db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 - -bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 - db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 - db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 - db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 -wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 - db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 - db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 - db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 -wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 - db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 - db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 - db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 -wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 - db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 - db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 - db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 -wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 - db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 - db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126 - db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 -wm_444_mask: db 
1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 - db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 - db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 - db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 -bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 - db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47 -bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 - db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 - db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 - db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 -bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 - db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 -bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 - db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 - db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 - db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 -bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 - db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 - db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 - db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 -bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 -spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 - db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 -spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 - db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 -spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 - db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 -spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 -spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 - db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 - db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 -spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 - db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 - db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 -spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 - db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 -spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 - db 
32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47 - db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 - db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 - -warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 - db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 -warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 - db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 -subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 - db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 -subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 -subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 -subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 -subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 -subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 -bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 -bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 -deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 -blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 -wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -pb_8x0_8x8: times 8 db 0 - times 8 db 8 -bdct_lb_dw: times 4 db 0 - times 4 db 4 - times 4 db 8 - times 4 db 12 - -ALIGN 32 -rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 -resize_shuf: times 5 db 0 - db 1, 2, 3, 4, 5, 6 - times 5+8 db 7 - -ALIGN 8 -wm_420_perm64: dq 0xfedcba9876543210 -wm_420_sign: dd 0x01020102, 0x01010101 -wm_422_sign: dd 0x80808080, 0x7f7f7f7f -wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040 - -ALIGN 4 -pb_0123: db 0, 1, 2, 3 -pb_4567: db 4, 5, 6, 7 -pw_m128 times 2 dw -128 -pw_m256: times 2 dw -256 -pw_32: times 2 dw 32 -pw_34: times 2 dw 34 -pw_258: times 2 dw 258 -pw_512: times 2 dw 512 -pw_1024: times 2 dw 1024 -pw_2048: times 2 dw 2048 -pw_6903: times 2 dw 6903 -pw_8192: times 2 dw 8192 -pd_2: dd 2 -pd_32: dd 32 -pd_63: dd 63 -pd_512: dd 512 -pd_32768: dd 32768 -pd_0x3ff: dd 0x3ff -pd_0x4000: dd 0x4000 -pq_0x40000000: dq 0x40000000 - -%define pb_m64 (wm_sign_avx512+4) -%define pb_64 (wm_sign_avx512+8) -%define pb_127 (wm_422_sign +4) - -cextern mc_subpel_filters -cextern mc_warp_filter -%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) - -%macro BASE_JMP_TABLE 3-* - %xdefine %1_%2_table (%%table - %3) - %xdefine %%base %1_%2 - %%table: - %rep %0 - 2 - dw %%base %+ _w%3 - %%base - %rotate 1 - %endrep -%endmacro - -%macro HV_JMP_TABLE 5-* - %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) - %xdefine %%base %1_%3 - %assign %%types %4 - %if %%types & 1 - %xdefine %1_%2_h_%3_table (%%h - %5) - %%h: - %rep %0 - 4 - dw %%prefix %+ .h_w%5 - %%base - %rotate 1 - %endrep - %rotate 4 - %endif - %if %%types & 2 - %xdefine %1_%2_v_%3_table (%%v - %5) - %%v: - %rep %0 - 4 - dw %%prefix %+ .v_w%5 - %%base - %rotate 1 - %endrep - %rotate 4 - %endif - %if %%types & 4 - %xdefine %1_%2_hv_%3_table (%%hv - %5) - %%hv: - %rep %0 - 4 - dw %%prefix %+ .hv_w%5 - %%base - %rotate 1 - %endrep - %endif -%endmacro - -%macro BIDIR_JMP_TABLE 1-* - %xdefine %1_table (%%table - 2*%2) - %xdefine %%base %1_table - %xdefine %%prefix mangle(private_prefix %+ _%1) - %%table: - %rep %0 - 1 - dd %%prefix %+ .w%2 - %%base - %rotate 1 - %endrep -%endmacro - -%macro SCALED_JMP_TABLE 1-* - 
%xdefine %1_table (%%table - %2) - %xdefine %%base mangle(private_prefix %+ _%1) -%%table: - %rep %0 - 1 - dw %%base %+ .w%2 - %%base - %rotate 1 - %endrep - %rotate 1 -%%dy_1024: - %xdefine %1_dy1_table (%%dy_1024 - %2) - %rep %0 - 1 - dw %%base %+ .dy1_w%2 - %%base - %rotate 1 - %endrep - %rotate 1 -%%dy_2048: - %xdefine %1_dy2_table (%%dy_2048 - %2) - %rep %0 - 1 - dw %%base %+ .dy2_w%2 - %%base - %rotate 1 - %endrep -%endmacro - -%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put) -%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep) -%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep) - -%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX - -BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 -BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 -SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128 -SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 - -%if HAVE_AVX512ICL -BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128 -%endif ; HAVE_AVX512ICL - -SECTION .text - -INIT_XMM avx2 -DECLARE_REG_TMP 4, 6, 7 -cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy - movifnidn mxyd, r6m ; mx - lea t2, [put_avx2] - tzcnt wd, wm - movifnidn hd, hm - test mxyd, mxyd - jnz .h - mov mxyd, r7m ; my - test mxyd, mxyd - jnz .v -.put: - movzx wd, word [t2+wq*2+table_offset(put,)] - add wq, t2 - jmp wq -.put_w2: - movzx t0d, word [srcq+ssq*0] - movzx t1d, word [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - mov [dstq+dsq*0], t0w - mov [dstq+dsq*1], t1w - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .put_w2 - RET -.put_w4: - mov t0d, [srcq+ssq*0] - mov t1d, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - mov [dstq+dsq*0], t0d - mov [dstq+dsq*1], t1d - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .put_w4 - RET -.put_w8: - mov t0, [srcq+ssq*0] - mov t1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - mov [dstq+dsq*0], t0 - mov [dstq+dsq*1], t1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .put_w8 - RET -.put_w16: - movu m0, [srcq+ssq*0] - movu m1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - mova [dstq+dsq*0], m0 - mova [dstq+dsq*1], m1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .put_w16 - RET -INIT_YMM avx2 -.put_w32: - movu m0, [srcq+ssq*0] - movu m1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - mova [dstq+dsq*0], m0 - mova [dstq+dsq*1], m1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .put_w32 - RET 
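The put_bilin entry above dispatches on block width through the BASE_JMP_TABLE/tzcnt pattern: widths are powers of two, so the trailing-zero count of w gives a dense index into a table of 16-bit "label minus base" offsets, and the .put cases are then plain row copies. A minimal C sketch of that dispatch idea, using function pointers and GCC/Clang's __builtin_ctz in place of compact offsets and tzcnt (an illustration with assumed helper names, not dav1d's actual C code):

#include <stddef.h>
#include <string.h>

typedef void (*copy_row_fn)(unsigned char *dst, const unsigned char *src);

static void copy_w2 (unsigned char *d, const unsigned char *s) { memcpy(d, s, 2); }
static void copy_w4 (unsigned char *d, const unsigned char *s) { memcpy(d, s, 4); }
static void copy_w8 (unsigned char *d, const unsigned char *s) { memcpy(d, s, 8); }
static void copy_w16(unsigned char *d, const unsigned char *s) { memcpy(d, s, 16); }

/* indexed by ctz(width); w32/w64/w128 entries omitted for brevity */
static const copy_row_fn copy_tbl[] = { 0, copy_w2, copy_w4, copy_w8, copy_w16 };

static void put_copy(unsigned char *dst, ptrdiff_t dst_stride,
                     const unsigned char *src, ptrdiff_t src_stride,
                     int w, int h)
{
    const copy_row_fn copy_row = copy_tbl[__builtin_ctz(w)]; /* asm: tzcnt wd, wm */
    do {
        copy_row(dst, src);
        dst += dst_stride;
        src += src_stride;
    } while (--h);
}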
-.put_w64: - movu m0, [srcq+ssq*0+32*0] - movu m1, [srcq+ssq*0+32*1] - movu m2, [srcq+ssq*1+32*0] - movu m3, [srcq+ssq*1+32*1] - lea srcq, [srcq+ssq*2] - mova [dstq+dsq*0+32*0], m0 - mova [dstq+dsq*0+32*1], m1 - mova [dstq+dsq*1+32*0], m2 - mova [dstq+dsq*1+32*1], m3 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .put_w64 - RET -.put_w128: - movu m0, [srcq+32*0] - movu m1, [srcq+32*1] - movu m2, [srcq+32*2] - movu m3, [srcq+32*3] - add srcq, ssq - mova [dstq+32*0], m0 - mova [dstq+32*1], m1 - mova [dstq+32*2], m2 - mova [dstq+32*3], m3 - add dstq, dsq - dec hd - jg .put_w128 - RET -.h: - ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 - ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 - imul mxyd, 0xff01 - vbroadcasti128 m4, [bilin_h_shuf8] - add mxyd, 16 << 8 - movd xm5, mxyd - mov mxyd, r7m ; my - vpbroadcastw m5, xm5 - test mxyd, mxyd - jnz .hv - movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)] - vpbroadcastd m3, [pw_2048] - add wq, t2 - jmp wq -.h_w2: - movd xm0, [srcq+ssq*0] - pinsrd xm0, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - pshufb xm0, xm4 - pmaddubsw xm0, xm5 - pmulhrsw xm0, xm3 - packuswb xm0, xm0 - pextrw [dstq+dsq*0], xm0, 0 - pextrw [dstq+dsq*1], xm0, 2 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w2 - RET -.h_w4: - mova xm4, [bilin_h_shuf4] -.h_w4_loop: - movq xm0, [srcq+ssq*0] - movhps xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xm0, xm4 - pmaddubsw xm0, xm5 - pmulhrsw xm0, xm3 - packuswb xm0, xm0 - movd [dstq+dsq*0], xm0 - pextrd [dstq+dsq*1], xm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w4_loop - RET -.h_w8: - movu xm0, [srcq+ssq*0] - movu xm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xm0, xm4 - pshufb xm1, xm4 - pmaddubsw xm0, xm5 - pmaddubsw xm1, xm5 - pmulhrsw xm0, xm3 - pmulhrsw xm1, xm3 - packuswb xm0, xm1 - movq [dstq+dsq*0], xm0 - movhps [dstq+dsq*1], xm0 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w8 - RET -.h_w16: - movu xm0, [srcq+ssq*0+8*0] - vinserti128 m0, m0, [srcq+ssq*1+8*0], 1 - movu xm1, [srcq+ssq*0+8*1] - vinserti128 m1, m1, [srcq+ssq*1+8*1], 1 - lea srcq, [srcq+ssq*2] - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], m0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w16 - RET -.h_w32: - movu m0, [srcq+8*0] - movu m1, [srcq+8*1] - add srcq, ssq - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - mova [dstq], m0 - add dstq, dsq - dec hd - jg .h_w32 - RET -.h_w64: - movu m0, [srcq+8*0] - movu m1, [srcq+8*1] - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - movu m1, [srcq+8*4] - movu m2, [srcq+8*5] - add srcq, ssq - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmulhrsw m1, m3 - pmulhrsw m2, m3 - packuswb m1, m2 - mova [dstq+32*0], m0 - mova [dstq+32*1], m1 - add dstq, dsq - dec hd - jg .h_w64 - RET -.h_w128: - mov t1, -32*3 -.h_w128_loop: - movu m0, [srcq+t1+32*3+8*0] - movu m1, [srcq+t1+32*3+8*1] - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmulhrsw m0, m3 - pmulhrsw m1, m3 - packuswb m0, m1 - mova [dstq+t1+32*3], m0 - add t1, 32 - jle .h_w128_loop - add srcq, ssq - add dstq, dsq - dec hd - jg .h_w128 - RET -.v: - movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)] - imul mxyd, 0xff01 - vpbroadcastd m5, [pw_2048] - add mxyd, 16 << 8 - add wq, t2 - movd xm4, mxyd - vpbroadcastw m4, xm4 - jmp wq 
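The .h comment above spells out the bilinear horizontal step: a 4-bit fraction mx blends two neighbouring pixels with rounding. A scalar model of that arithmetic, taken directly from the comment (an illustration, not dav1d's reference C code):

static void bilin_h_row(unsigned char *dst, const unsigned char *src,
                        int w, int mx /* subpel fraction, 0..15 */)
{
    /* (16 * src[x] + mx * (src[x + 1] - src[x]) + 8) >> 4 */
    for (int x = 0; x < w; x++)
        dst[x] = (unsigned char)(((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4);
}

In the vectorized path the weight pair (mx, 16 - mx) is packed into one word so that pmaddubsw produces both products and their sum per output pixel, and the pmulhrsw against pw_2048 supplies the (+8) >> 4 rounding.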
-.v_w2: - movd xm0, [srcq+ssq*0] -.v_w2_loop: - pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 - lea srcq, [srcq+ssq*2] - pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 - pshuflw xm1, xm1, q2301 ; 1 0 - punpcklbw xm1, xm0, xm1 - pmaddubsw xm1, xm4 - pmulhrsw xm1, xm5 - packuswb xm1, xm1 - pextrw [dstq+dsq*0], xm1, 1 - pextrw [dstq+dsq*1], xm1, 0 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w2_loop - RET -.v_w4: - movd xm0, [srcq+ssq*0] -.v_w4_loop: - vpbroadcastd xm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vpblendd xm2, xm1, xm0, 0x01 ; 0 1 - vpbroadcastd xm0, [srcq+ssq*0] - vpblendd xm1, xm1, xm0, 0x02 ; 1 2 - punpcklbw xm1, xm2 - pmaddubsw xm1, xm4 - pmulhrsw xm1, xm5 - packuswb xm1, xm1 - movd [dstq+dsq*0], xm1 - pextrd [dstq+dsq*1], xm1, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w4_loop - RET -.v_w8: - movq xm0, [srcq+ssq*0] -.v_w8_loop: - movq xm3, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpcklbw xm1, xm3, xm0 - movq xm0, [srcq+ssq*0] - punpcklbw xm2, xm0, xm3 - pmaddubsw xm1, xm4 - pmaddubsw xm2, xm4 - pmulhrsw xm1, xm5 - pmulhrsw xm2, xm5 - packuswb xm1, xm2 - movq [dstq+dsq*0], xm1 - movhps [dstq+dsq*1], xm1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w8_loop - RET -.v_w16: - movu xm0, [srcq+ssq*0] -.v_w16_loop: - vbroadcasti128 m2, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vpblendd m3, m2, m0, 0x0f ; 0 1 - vbroadcasti128 m0, [srcq+ssq*0] - vpblendd m2, m2, m0, 0xf0 ; 1 2 - punpcklbw m1, m2, m3 - punpckhbw m2, m3 - pmaddubsw m1, m4 - pmaddubsw m2, m4 - pmulhrsw m1, m5 - pmulhrsw m2, m5 - packuswb m1, m2 - mova [dstq+dsq*0], xm1 - vextracti128 [dstq+dsq*1], m1, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w16_loop - RET -.v_w32: -%macro PUT_BILIN_V_W32 0 - movu m0, [srcq+ssq*0] -%%loop: - movu m3, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 - punpckhbw m2, m3, m0 - movu m0, [srcq+ssq*0] - pmaddubsw m1, m4 - pmaddubsw m2, m4 - pmulhrsw m1, m5 - pmulhrsw m2, m5 - packuswb m1, m2 - mova [dstq+dsq*0], m1 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 - pmaddubsw m1, m4 - pmaddubsw m2, m4 - pmulhrsw m1, m5 - pmulhrsw m2, m5 - packuswb m1, m2 - mova [dstq+dsq*1], m1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg %%loop -%endmacro - PUT_BILIN_V_W32 - RET -.v_w64: - movu m0, [srcq+32*0] - movu m1, [srcq+32*1] -.v_w64_loop: - add srcq, ssq - movu m3, [srcq+32*0] - punpcklbw m2, m3, m0 - punpckhbw m0, m3, m0 - pmaddubsw m2, m4 - pmaddubsw m0, m4 - pmulhrsw m2, m5 - pmulhrsw m0, m5 - packuswb m2, m0 - mova m0, m3 - movu m3, [srcq+32*1] - mova [dstq+32*0], m2 - punpcklbw m2, m3, m1 - punpckhbw m1, m3, m1 - pmaddubsw m2, m4 - pmaddubsw m1, m4 - pmulhrsw m2, m5 - pmulhrsw m1, m5 - packuswb m2, m1 - mova m1, m3 - mova [dstq+32*1], m2 - add dstq, dsq - dec hd - jg .v_w64_loop - RET -.v_w128: - mov t0, dstq - mov t1, srcq - lea t2d, [hq+(3<<8)] -.v_w128_loop: - PUT_BILIN_V_W32 - movzx hd, t2b - add t0, 32 - add t1, 32 - mov dstq, t0 - mov srcq, t1 - sub t2d, 1<<8 - jg .v_w128_loop - RET -.hv: - ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 - ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 - movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)] - WIN64_SPILL_XMM 8 - shl mxyd, 11 ; can't shift by 12 due to signed overflow - vpbroadcastd m7, [pw_2048] - movd xm6, mxyd - add wq, t2 - vpbroadcastw m6, xm6 - jmp wq -.hv_w2: - vpbroadcastd xm0, [srcq+ssq*0] - pshufb xm0, xm4 - pmaddubsw xm0, xm5 -.hv_w2_loop: - movd xm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pinsrd xm1, [srcq+ssq*0], 1 - pshufb xm1, xm4 - pmaddubsw xm1, xm5 ; 1 _ 2 _ - shufps xm2, xm0, xm1, 
q1032 ; 0 _ 1 _ - mova xm0, xm1 - psubw xm1, xm2 - paddw xm1, xm1 - pmulhw xm1, xm6 - paddw xm1, xm2 - pmulhrsw xm1, xm7 - packuswb xm1, xm1 - pextrw [dstq+dsq*0], xm1, 0 - pextrw [dstq+dsq*1], xm1, 2 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w2_loop - RET -.hv_w4: - mova xm4, [bilin_h_shuf4] - movddup xm0, [srcq+ssq*0] - pshufb xm0, xm4 - pmaddubsw xm0, xm5 -.hv_w4_loop: - movq xm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - movhps xm1, [srcq+ssq*0] - pshufb xm1, xm4 - pmaddubsw xm1, xm5 ; 1 2 - shufps xm2, xm0, xm1, q1032 ; 0 1 - mova xm0, xm1 - psubw xm1, xm2 - paddw xm1, xm1 - pmulhw xm1, xm6 - paddw xm1, xm2 - pmulhrsw xm1, xm7 - packuswb xm1, xm1 - movd [dstq+dsq*0], xm1 - pextrd [dstq+dsq*1], xm1, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w4_loop - RET -.hv_w8: - vbroadcasti128 m0, [srcq+ssq*0] - pshufb m0, m4 - pmaddubsw m0, m5 -.hv_w8_loop: - movu xm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti128 m1, m1, [srcq+ssq*0], 1 - pshufb m1, m4 - pmaddubsw m1, m5 ; 1 2 - vperm2i128 m2, m0, m1, 0x21 ; 0 1 - mova m0, m1 - psubw m1, m2 - paddw m1, m1 - pmulhw m1, m6 - paddw m1, m2 - pmulhrsw m1, m7 - vextracti128 xm2, m1, 1 - packuswb xm1, xm2 - movq [dstq+dsq*0], xm1 - movhps [dstq+dsq*1], xm1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w8_loop - RET -.hv_w16: - movu m0, [srcq+ssq*0+8*0] - vinserti128 m0, m0, [srcq+ssq*0+8*1], 1 - pshufb m0, m4 - pmaddubsw m0, m5 -.hv_w16_loop: - movu xm2, [srcq+ssq*1+8*0] - vinserti128 m2, m2, [srcq+ssq*1+8*1], 1 - lea srcq, [srcq+ssq*2] - movu xm3, [srcq+ssq*0+8*0] - vinserti128 m3, m3, [srcq+ssq*0+8*1], 1 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m2, m5 - psubw m1, m2, m0 - paddw m1, m1 - pmulhw m1, m6 - paddw m1, m0 - pmaddubsw m0, m3, m5 - psubw m3, m0, m2 - paddw m3, m3 - pmulhw m3, m6 - paddw m3, m2 - pmulhrsw m1, m7 - pmulhrsw m3, m7 - packuswb m1, m3 - vpermq m1, m1, q3120 - mova [dstq+dsq*0], xm1 - vextracti128 [dstq+dsq*1], m1, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w16_loop - RET -.hv_w32: - xor t2d, t2d -.hv_w32gt: - mov t0, dstq - mov t1, srcq -%if WIN64 - movaps r4m, xmm8 -%endif -.hv_w32_loop0: - movu m0, [srcq+8*0] - vinserti128 m0, m0, [srcq+8*2], 1 - movu m1, [srcq+8*1] - vinserti128 m1, m1, [srcq+8*3], 1 - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 -.hv_w32_loop: - add srcq, ssq - movu xm2, [srcq+8*1] - vinserti128 m2, m2, [srcq+8*3], 1 - pshufb m2, m4 - pmaddubsw m2, m5 - psubw m3, m2, m1 - paddw m3, m3 - pmulhw m3, m6 - paddw m3, m1 - mova m1, m2 - pmulhrsw m8, m3, m7 - movu xm2, [srcq+8*0] - vinserti128 m2, m2, [srcq+8*2], 1 - pshufb m2, m4 - pmaddubsw m2, m5 - psubw m3, m2, m0 - paddw m3, m3 - pmulhw m3, m6 - paddw m3, m0 - mova m0, m2 - pmulhrsw m3, m7 - packuswb m3, m8 - mova [dstq], m3 - add dstq, dsq - dec hd - jg .hv_w32_loop - movzx hd, t2b - add t0, 32 - add t1, 32 - mov dstq, t0 - mov srcq, t1 - sub t2d, 1<<8 - jg .hv_w32_loop0 -%if WIN64 - movaps xmm8, r4m -%endif - RET -.hv_w64: - lea t2d, [hq+(1<<8)] - jmp .hv_w32gt -.hv_w128: - lea t2d, [hq+(3<<8)] - jmp .hv_w32gt - -%macro PREP_BILIN 0 -DECLARE_REG_TMP 3, 5, 6 -cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 - movifnidn mxyd, r5m ; mx - lea t2, [prep%+SUFFIX] - tzcnt wd, wm - movifnidn hd, hm - test mxyd, mxyd - jnz .h - mov mxyd, r6m ; my - test mxyd, mxyd - jnz .v -.prep: - movzx wd, word [t2+wq*2+table_offset(prep,)] - add wq, t2 - lea stride3q, [strideq*3] - jmp wq -.prep_w4: - movd xm0, [srcq+strideq*0] - pinsrd xm0, [srcq+strideq*1], 1 - pinsrd xm0, [srcq+strideq*2], 2 - pinsrd xm0, [srcq+stride3q 
], 3 - lea srcq, [srcq+strideq*4] - pmovzxbw ym0, xm0 - psllw ym0, 4 - mova [tmpq], ym0 - add tmpq, 32 - sub hd, 4 - jg .prep_w4 - RET -.prep_w8: - movq xm0, [srcq+strideq*0] -%if cpuflag(avx512) - movq xm1, [srcq+strideq*1] - vinserti128 ym0, [srcq+strideq*2], 1 - vinserti128 ym1, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - punpcklqdq ym0, ym1 - pmovzxbw m0, ym0 - psllw m0, 4 - mova [tmpq], m0 -%else - movhps xm0, [srcq+strideq*1] - movq xm1, [srcq+strideq*2] - movhps xm1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - pmovzxbw m0, xm0 - pmovzxbw m1, xm1 - psllw m0, 4 - psllw m1, 4 - mova [tmpq+32*0], m0 - mova [tmpq+32*1], m1 -%endif - add tmpq, 32*2 - sub hd, 4 - jg .prep_w8 - RET -.prep_w16: -%if cpuflag(avx512) - movu xm0, [srcq+strideq*0] - vinserti128 ym0, [srcq+strideq*1], 1 - movu xm1, [srcq+strideq*2] - vinserti128 ym1, [srcq+stride3q ], 1 - pmovzxbw m0, ym0 - pmovzxbw m1, ym1 -%else - pmovzxbw m0, [srcq+strideq*0] - pmovzxbw m1, [srcq+strideq*1] - pmovzxbw m2, [srcq+strideq*2] - pmovzxbw m3, [srcq+stride3q ] -%endif - lea srcq, [srcq+strideq*4] - psllw m0, 4 - psllw m1, 4 -%if notcpuflag(avx512) - psllw m2, 4 - psllw m3, 4 -%endif - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m1 -%if notcpuflag(avx512) - mova [tmpq+32*2], m2 - mova [tmpq+32*3], m3 -%endif - add tmpq, 32*4 - sub hd, 4 - jg .prep_w16 - RET -.prep_w32: -%if cpuflag(avx512) - pmovzxbw m0, [srcq+strideq*0] - pmovzxbw m1, [srcq+strideq*1] - pmovzxbw m2, [srcq+strideq*2] - pmovzxbw m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] -%else - pmovzxbw m0, [srcq+strideq*0+16*0] - pmovzxbw m1, [srcq+strideq*0+16*1] - pmovzxbw m2, [srcq+strideq*1+16*0] - pmovzxbw m3, [srcq+strideq*1+16*1] - lea srcq, [srcq+strideq*2] -%endif - psllw m0, 4 - psllw m1, 4 - psllw m2, 4 - psllw m3, 4 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m1 - mova [tmpq+mmsize*2], m2 - mova [tmpq+mmsize*3], m3 - add tmpq, mmsize*4 - sub hd, mmsize*4/(32*2) - jg .prep_w32 - RET -.prep_w64: -%if cpuflag(avx512) - pmovzxbw m0, [srcq+strideq*0+32*0] - pmovzxbw m1, [srcq+strideq*0+32*1] - pmovzxbw m2, [srcq+strideq*1+32*0] - pmovzxbw m3, [srcq+strideq*1+32*1] - lea srcq, [srcq+strideq*2] -%else - pmovzxbw m0, [srcq+16*0] - pmovzxbw m1, [srcq+16*1] - pmovzxbw m2, [srcq+16*2] - pmovzxbw m3, [srcq+16*3] - add srcq, strideq -%endif - psllw m0, 4 - psllw m1, 4 - psllw m2, 4 - psllw m3, 4 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m1 - mova [tmpq+mmsize*2], m2 - mova [tmpq+mmsize*3], m3 - add tmpq, mmsize*4 -%if cpuflag(avx512) - sub hd, 2 -%else - dec hd -%endif - jg .prep_w64 - RET -.prep_w128: - pmovzxbw m0, [srcq+(mmsize/2)*0] - pmovzxbw m1, [srcq+(mmsize/2)*1] - pmovzxbw m2, [srcq+(mmsize/2)*2] - pmovzxbw m3, [srcq+(mmsize/2)*3] - psllw m0, 4 - psllw m1, 4 - psllw m2, 4 - psllw m3, 4 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m1 - mova [tmpq+mmsize*2], m2 - mova [tmpq+mmsize*3], m3 -%if notcpuflag(avx512) - pmovzxbw m0, [srcq+16*4] - pmovzxbw m1, [srcq+16*5] - pmovzxbw m2, [srcq+16*6] - pmovzxbw m3, [srcq+16*7] -%endif - add tmpq, 32*8 - add srcq, strideq -%if notcpuflag(avx512) - psllw m0, 4 - psllw m1, 4 - psllw m2, 4 - psllw m3, 4 - mova [tmpq-32*4], m0 - mova [tmpq-32*3], m1 - mova [tmpq-32*2], m2 - mova [tmpq-32*1], m3 -%endif - dec hd - jg .prep_w128 - RET -.h: - ; 16 * src[x] + (mx * (src[x + 1] - src[x])) - ; = (16 - mx) * src[x] + mx * src[x + 1] - imul mxyd, 0xff01 - add mxyd, 16 << 8 -%if cpuflag(avx512) - vpbroadcastw m5, mxyd -%else - movd xm5, mxyd - vbroadcasti128 m4, [bilin_h_shuf8] - vpbroadcastw m5, xm5 
-%endif - mov mxyd, r6m ; my - test mxyd, mxyd - jnz .hv - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] - add wq, t2 - lea stride3q, [strideq*3] - jmp wq -.h_w4: - vbroadcasti128 ym4, [bilin_h_shuf4] -.h_w4_loop: - movq xm0, [srcq+strideq*0] - movhps xm0, [srcq+strideq*1] - movq xm1, [srcq+strideq*2] - movhps xm1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vinserti128 ym0, xm1, 1 - pshufb ym0, ym4 - pmaddubsw ym0, ym5 - mova [tmpq], ym0 - add tmpq, 32 - sub hd, 4 - jg .h_w4_loop - RET -.h_w8: -%if cpuflag(avx512) - vbroadcasti128 m4, [bilin_h_shuf8] -.h_w8_loop: - movu xm0, [srcq+strideq*0] - vinserti128 ym0, [srcq+strideq*1], 1 - vinserti128 m0, [srcq+strideq*2], 2 - vinserti128 m0, [srcq+stride3q ], 3 - lea srcq, [srcq+strideq*4] - pshufb m0, m4 - pmaddubsw m0, m5 - mova [tmpq+64*0], m0 -%else -.h_w8_loop: - movu xm0, [srcq+strideq*0] - vinserti128 m0, [srcq+strideq*1], 1 - movu xm1, [srcq+strideq*2] - vinserti128 m1, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - mova [tmpq+32*0], m0 - mova [tmpq+32*1], m1 -%endif - add tmpq, 32*2 - sub hd, 4 - jg .h_w8_loop - RET -.h_w16: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm16] -.h_w16_loop: - movu ym0, [srcq+strideq*0] - vinserti32x8 m0, [srcq+strideq*1], 1 - movu ym1, [srcq+strideq*2] - vinserti32x8 m1, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - vpermb m0, m4, m0 - vpermb m1, m4, m1 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - mova [tmpq+64*0], m0 - mova [tmpq+64*1], m1 -%else -.h_w16_loop: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - movu xm1, [srcq+strideq*1+8*0] - vinserti128 m1, [srcq+strideq*1+8*1], 1 - movu xm2, [srcq+strideq*2+8*0] - vinserti128 m2, [srcq+strideq*2+8*1], 1 - movu xm3, [srcq+stride3q +8*0] - vinserti128 m3, [srcq+stride3q +8*1], 1 - lea srcq, [srcq+strideq*4] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - mova [tmpq+32*0], m0 - mova [tmpq+32*1], m1 - mova [tmpq+32*2], m2 - mova [tmpq+32*3], m3 -%endif - add tmpq, 32*4 - sub hd, 4 - jg .h_w16_loop - RET -.h_w32: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm32] -.h_w32_loop: - vpermb m0, m4, [srcq+strideq*0] - vpermb m1, m4, [srcq+strideq*1] - vpermb m2, m4, [srcq+strideq*2] - vpermb m3, m4, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] -%else -.h_w32_loop: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - movu xm1, [srcq+strideq*0+8*2] - vinserti128 m1, [srcq+strideq*0+8*3], 1 - movu xm2, [srcq+strideq*1+8*0] - vinserti128 m2, [srcq+strideq*1+8*1], 1 - movu xm3, [srcq+strideq*1+8*2] - vinserti128 m3, [srcq+strideq*1+8*3], 1 - lea srcq, [srcq+strideq*2] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 -%endif - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m1 - mova [tmpq+mmsize*2], m2 - mova [tmpq+mmsize*3], m3 - add tmpq, mmsize*4 - sub hd, mmsize*4/(32*2) - jg .h_w32_loop - RET -.h_w64: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm32] -.h_w64_loop: - vpermb m0, m4, [srcq+strideq*0+32*0] - vpermb m1, m4, [srcq+strideq*0+32*1] - vpermb m2, m4, [srcq+strideq*1+32*0] - vpermb m3, m4, [srcq+strideq*1+32*1] - lea srcq, [srcq+strideq*2] -%else -.h_w64_loop: - movu xm0, [srcq+8*0] - vinserti128 m0, [srcq+8*1], 1 - movu xm1, [srcq+8*2] - vinserti128 m1, [srcq+8*3], 1 - movu xm2, [srcq+8*4] - vinserti128 m2, [srcq+8*5], 1 - movu xm3, 
[srcq+8*6] - vinserti128 m3, [srcq+8*7], 1 - add srcq, strideq - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 -%endif - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m1 - mova [tmpq+mmsize*2], m2 - mova [tmpq+mmsize*3], m3 - add tmpq, mmsize*4 -%if cpuflag(avx512) - sub hd, 2 -%else - dec hd -%endif - jg .h_w64_loop - RET -.h_w128: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm32] -.h_w128_loop: - vpermb m0, m4, [srcq+32*0] - vpermb m1, m4, [srcq+32*1] - vpermb m2, m4, [srcq+32*2] - vpermb m3, m4, [srcq+32*3] -%else -.h_w128_loop: - movu xm0, [srcq+8*0] - vinserti128 m0, [srcq+8*1], 1 - movu xm1, [srcq+8*2] - vinserti128 m1, [srcq+8*3], 1 - movu xm2, [srcq+8*4] - vinserti128 m2, [srcq+8*5], 1 - movu xm3, [srcq+8*6] - vinserti128 m3, [srcq+8*7], 1 - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 -%endif - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m1 - mova [tmpq+mmsize*2], m2 - mova [tmpq+mmsize*3], m3 -%if notcpuflag(avx512) - movu xm0, [srcq+8* 8] - vinserti128 m0, [srcq+8* 9], 1 - movu xm1, [srcq+8*10] - vinserti128 m1, [srcq+8*11], 1 - movu xm2, [srcq+8*12] - vinserti128 m2, [srcq+8*13], 1 - movu xm3, [srcq+8*14] - vinserti128 m3, [srcq+8*15], 1 -%endif - add tmpq, 32*8 - add srcq, strideq -%if notcpuflag(avx512) - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - mova [tmpq-32*4], m0 - mova [tmpq-32*3], m1 - mova [tmpq-32*2], m2 - mova [tmpq-32*1], m3 -%endif - dec hd - jg .h_w128_loop - RET -.v: - WIN64_SPILL_XMM 7 - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] - imul mxyd, 0xff01 - add mxyd, 16 << 8 - add wq, t2 - lea stride3q, [strideq*3] -%if cpuflag(avx512) - vpbroadcastw m6, mxyd -%else - movd xm6, mxyd - vpbroadcastw m6, xm6 -%endif - jmp wq -.v_w4: -%if cpuflag(avx512) - vpbroadcastd xm0, [srcq+strideq*0] - mov r3d, 0x29 - vbroadcasti128 ym3, [bilin_v_shuf4] - kmovb k1, r3d -.v_w4_loop: - vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ - vpbroadcastd ym2, [srcq+strideq*2] - vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ - lea srcq, [srcq+strideq*4] - vpbroadcastd ym0, [srcq+strideq*0] - punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ - pshufb ym2, ym3 -%else - movd xm0, [srcq+strideq*0] -.v_w4_loop: - vpbroadcastd m1, [srcq+strideq*2] - vpbroadcastd xm2, [srcq+strideq*1] - vpbroadcastd m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd m1, m1, m0, 0x05 ; 0 2 2 2 - vpbroadcastd m0, [srcq+strideq*0] - vpblendd m3, m3, m2, 0x0f ; 1 1 3 3 - vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 - vpblendd m1, m1, m3, 0xaa ; 0 1 2 3 - vpblendd m2, m2, m3, 0x55 ; 1 2 3 4 - punpcklbw m2, m1 -%endif - pmaddubsw ym2, ym6 - mova [tmpq], ym2 - add tmpq, 32 - sub hd, 4 - jg .v_w4_loop - RET -.v_w8: -%if cpuflag(avx512icl) - mova m5, [bilin_v_perm8] - vbroadcasti128 ym0, [srcq+strideq*0] -%else - movq xm0, [srcq+strideq*0] -%endif -.v_w8_loop: -%if cpuflag(avx512icl) - vinserti128 ym1, ym0, [srcq+strideq*1], 1 - vpbroadcastq ym0, [srcq+strideq*2] - vinserti128 m1, [srcq+stride3q ], 2 - lea srcq, [srcq+strideq*4] - vinserti128 ym0, [srcq+strideq*0], 0 - vpermt2b m1, m5, m0 - pmaddubsw m1, m6 - mova [tmpq], m1 -%else - vpbroadcastq m1, [srcq+strideq*2] - vpbroadcastq m2, [srcq+strideq*1] - vpbroadcastq m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd m1, m1, m0, 0x03 ; 0 2 2 2 - vpbroadcastq 
m0, [srcq+strideq*0] - vpblendd m3, m3, m2, 0x33 ; 1 3 1 3 - vpblendd m2, m1, m3, 0x0f ; 1 3 2 2 - vpblendd m1, m1, m3, 0xf0 ; 0 2 1 3 - vpblendd m2, m2, m0, 0xc0 ; 1 3 2 4 - punpcklbw m3, m2, m1 - punpckhbw m2, m1 - pmaddubsw m3, m6 - pmaddubsw m2, m6 - mova [tmpq+32*0], m3 - mova [tmpq+32*1], m2 -%endif - add tmpq, 32*2 - sub hd, 4 - jg .v_w8_loop - RET -.v_w16: -%if cpuflag(avx512icl) - mova m5, [bilin_v_perm16] - movu xm0, [srcq+strideq*0] -.v_w16_loop: - movu xm2, [srcq+strideq*2] - vinserti128 ym1, ym0, [srcq+strideq*1], 1 - vpermt2b m1, m5, m2 - vinserti128 ym2, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - movu xm0, [srcq+strideq*0] - vpermt2b m2, m5, m0 - pmaddubsw m1, m6 - pmaddubsw m2, m6 - mova [tmpq+64*0], m1 - mova [tmpq+64*1], m2 -%else - vbroadcasti128 m0, [srcq+strideq*0] -.v_w16_loop: - vbroadcasti128 m1, [srcq+strideq*2] - vbroadcasti128 m2, [srcq+strideq*1] - vbroadcasti128 m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - shufpd m4, m0, m1, 0x0c ; 0 2 ; 0l2l 0h2h - vbroadcasti128 m0, [srcq+strideq*0] - shufpd m2, m2, m3, 0x0c ; 1 3 ; 1l3l 1h3h - shufpd m1, m1, m0, 0x0c ; 2 4 ; 2l4l 2h4h - punpcklbw m3, m2, m4 - punpcklbw m5, m1, m2 - punpckhbw m1, m2 - punpckhbw m2, m4 - pmaddubsw m3, m6 - pmaddubsw m5, m6 - pmaddubsw m2, m6 - pmaddubsw m1, m6 - mova [tmpq+32*0], m3 - mova [tmpq+32*1], m5 - mova [tmpq+32*2], m2 - mova [tmpq+32*3], m1 -%endif - add tmpq, 32*4 - sub hd, 4 - jg .v_w16_loop - RET -.v_w32: -%if cpuflag(avx512icl) - mova m5, [bilin_v_perm32] - movu ym0, [srcq+strideq*0] -.v_w32_loop: - movu ym2, [srcq+strideq*1] - movu ym3, [srcq+strideq*2] - movu ym4, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpermt2b m0, m5, m2 - vpermt2b m2, m5, m3 - vpermt2b m3, m5, m4 - pmaddubsw m1, m0, m6 - movu ym0, [srcq+strideq*0] - vpermt2b m4, m5, m0 - pmaddubsw m2, m6 - pmaddubsw m3, m6 - pmaddubsw m4, m6 - mova [tmpq+64*0], m1 - mova [tmpq+64*1], m2 - mova [tmpq+64*2], m3 - mova [tmpq+64*3], m4 - add tmpq, 64*4 -%else - vpermq ym0, [srcq+strideq*0], q3120 -.v_w32_loop: - vpermq ym1, [srcq+strideq*1], q3120 - vpermq ym2, [srcq+strideq*2], q3120 - vpermq ym3, [srcq+stride3q ], q3120 - lea srcq, [srcq+strideq*4] - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - vpermq ym0, [srcq+strideq*0], q3120 - pmaddubsw m4, m6 - pmaddubsw m5, m6 - mova [tmpq+32*0], ym4 - mova [tmpq+32*1], ym5 - punpcklbw m4, m2, m1 - punpckhbw m5, m2, m1 - pmaddubsw m4, m6 - pmaddubsw m5, m6 - mova [tmpq+32*2], ym4 - mova [tmpq+32*3], ym5 - add tmpq, 32*8 - punpcklbw m4, m3, m2 - punpckhbw m5, m3, m2 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 - pmaddubsw m4, m6 - pmaddubsw m5, m6 - pmaddubsw m1, m6 - pmaddubsw m2, m6 - mova [tmpq-32*4], m4 - mova [tmpq-32*3], m5 - mova [tmpq-32*2], m1 - mova [tmpq-32*1], m2 -%endif - sub hd, 4 - jg .v_w32_loop - RET -.v_w64: -%if cpuflag(avx512) - mova m5, [bilin_v_perm64] - vpermq m0, m5, [srcq+strideq*0] -.v_w64_loop: - vpermq m1, m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - punpcklbw m4, m1, m0 - punpckhbw m2, m1, m0 - vpermq m0, m5, [srcq+strideq*0] - punpcklbw m3, m0, m1 - punpckhbw m1, m0, m1 - pmaddubsw m4, m6 - pmaddubsw m2, m6 - pmaddubsw m3, m6 - pmaddubsw m1, m6 - mova [tmpq+64*0], m4 - mova [tmpq+64*1], m2 - mova [tmpq+64*2], m3 - mova [tmpq+64*3], m1 - add tmpq, 64*4 -%else - vpermq m0, [srcq+strideq*0+32*0], q3120 - vpermq m1, [srcq+strideq*0+32*1], q3120 -.v_w64_loop: - vpermq m2, [srcq+strideq*1+32*0], q3120 - vpermq m3, [srcq+strideq*1+32*1], q3120 - lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 - punpckhbw m5, m2, m0 - pmaddubsw m4, 
m6 - pmaddubsw m5, m6 - mova [tmpq+32*0], m4 - mova [tmpq+32*1], m5 - punpcklbw m4, m3, m1 - punpckhbw m5, m3, m1 - vpermq m0, [srcq+strideq*0+32*0], q3120 - vpermq m1, [srcq+strideq*0+32*1], q3120 - pmaddubsw m4, m6 - pmaddubsw m5, m6 - mova [tmpq+32*2], m4 - mova [tmpq+32*3], m5 - add tmpq, 32*8 - punpcklbw m4, m0, m2 - punpckhbw m5, m0, m2 - punpcklbw m2, m1, m3 - punpckhbw m3, m1, m3 - pmaddubsw m4, m6 - pmaddubsw m5, m6 - pmaddubsw m2, m6 - pmaddubsw m3, m6 - mova [tmpq-32*4], m4 - mova [tmpq-32*3], m5 - mova [tmpq-32*2], m2 - mova [tmpq-32*1], m3 -%endif - sub hd, 2 - jg .v_w64_loop - RET -.v_w128: -%if cpuflag(avx512) - mova m5, [bilin_v_perm64] - vpermq m0, m5, [srcq+strideq*0+ 0] - vpermq m1, m5, [srcq+strideq*0+64] -.v_w128_loop: - vpermq m2, m5, [srcq+strideq*1+ 0] - vpermq m3, m5, [srcq+strideq*1+64] - lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 - punpckhbw m0, m2, m0 - pmaddubsw m4, m6 - pmaddubsw m0, m6 - mova [tmpq+64*0], m4 - mova [tmpq+64*1], m0 - punpcklbw m4, m3, m1 - punpckhbw m1, m3, m1 - pmaddubsw m4, m6 - pmaddubsw m1, m6 - mova [tmpq+64*2], m4 - mova [tmpq+64*3], m1 - vpermq m0, m5, [srcq+strideq*0+ 0] - vpermq m1, m5, [srcq+strideq*0+64] - punpcklbw m4, m0, m2 - punpckhbw m2, m0, m2 - pmaddubsw m4, m6 - pmaddubsw m2, m6 - mova [tmpq+64*4], m4 - mova [tmpq+64*5], m2 - punpcklbw m4, m1, m3 - punpckhbw m3, m1, m3 - pmaddubsw m4, m6 - pmaddubsw m3, m6 - mova [tmpq+64*6], m4 - mova [tmpq+64*7], m3 - add tmpq, 64*8 - sub hd, 2 - jg .v_w128_loop -%else - mov t0, tmpq - mov t1, srcq - lea t2d, [hq+(3<<8)] -.v_w128_loop0: - vpermq m0, [srcq+strideq*0], q3120 -.v_w128_loop: - vpermq m1, [srcq+strideq*1], q3120 - lea srcq, [srcq+strideq*2] - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - vpermq m0, [srcq+strideq*0], q3120 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m2, m6 - pmaddubsw m3, m6 - pmaddubsw m4, m6 - pmaddubsw m5, m6 - mova [tmpq+32*0], m2 - mova [tmpq+32*1], m3 - mova [tmpq+32*8], m4 - mova [tmpq+32*9], m5 - add tmpq, 32*16 - sub hd, 2 - jg .v_w128_loop - movzx hd, t2b - add t0, 64 - add t1, 32 - mov tmpq, t0 - mov srcq, t1 - sub t2d, 1<<8 - jg .v_w128_loop0 -%endif - RET -.hv: - ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 - ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 7 - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] - shl mxyd, 11 -%if cpuflag(avx512) - vpbroadcastw m6, mxyd -%else - movd xm6, mxyd - vpbroadcastw m6, xm6 -%endif - add wq, t2 - lea stride3q, [strideq*3] - jmp wq -.hv_w4: - vbroadcasti128 ym4, [bilin_h_shuf4] - vpbroadcastq ym0, [srcq+strideq*0] - pshufb ym0, ym4 - pmaddubsw ym0, ym5 -.hv_w4_loop: - movq xm1, [srcq+strideq*1] - movhps xm1, [srcq+strideq*2] - movq xm2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - movhps xm2, [srcq+strideq*0] - vinserti128 ym1, xm2, 1 - pshufb ym1, ym4 - pmaddubsw ym1, ym5 ; 1 2 3 4 -%if cpuflag(avx512) - valignq ym2, ym1, ym0, 3 ; 0 1 2 3 -%else - vpblendd ym2, ym1, ym0, 0xc0 - vpermq ym2, ym2, q2103 ; 0 1 2 3 -%endif - mova ym0, ym1 - psubw ym1, ym2 - pmulhrsw ym1, ym6 - paddw ym1, ym2 - mova [tmpq], ym1 - add tmpq, 32 - sub hd, 4 - jg .hv_w4_loop - RET -.hv_w8: -%if cpuflag(avx512) - vbroadcasti128 m4, [bilin_h_shuf8] -%endif - vbroadcasti128 m0, [srcq+strideq*0] - pshufb m0, m4 - pmaddubsw m0, m5 -.hv_w8_loop: - movu xm1, [srcq+strideq*1] -%if cpuflag(avx512) - vinserti128 ym1, [srcq+strideq*2], 1 - vinserti128 m1, [srcq+stride3q ], 2 - lea srcq, [srcq+strideq*4] - 
vinserti128 m1, [srcq+strideq*0], 3 - pshufb m1, m4 - pmaddubsw m1, m5 ; 1 2 3 4 - valignq m2, m1, m0, 6 ; 0 1 2 3 - mova m0, m1 - psubw m1, m2 - pmulhrsw m1, m6 - paddw m1, m2 - mova [tmpq], m1 -%else - vinserti128 m1, m1, [srcq+strideq*2], 1 - movu xm2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vinserti128 m2, m2, [srcq+strideq*0], 1 - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 ; 1 2 - vperm2i128 m3, m0, m1, 0x21 ; 0 1 - pmaddubsw m0, m2, m5 ; 3 4 - vperm2i128 m2, m1, m0, 0x21 ; 2 3 - psubw m1, m3 - pmulhrsw m1, m6 - paddw m1, m3 - psubw m3, m0, m2 - pmulhrsw m3, m6 - paddw m3, m2 - mova [tmpq+32*0], m1 - mova [tmpq+32*1], m3 -%endif - add tmpq, 32*2 - sub hd, 4 - jg .hv_w8_loop - RET -.hv_w16: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm16] - vbroadcasti32x8 m0, [srcq+strideq*0] - vpermb m0, m4, m0 -%else - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - pshufb m0, m4 -%endif - pmaddubsw m0, m5 -.hv_w16_loop: -%if cpuflag(avx512icl) - movu ym1, [srcq+strideq*1] - vinserti32x8 m1, [srcq+strideq*2], 1 - movu ym2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vinserti32x8 m2, [srcq+strideq*0], 1 - vpermb m1, m4, m1 - vpermb m2, m4, m2 - pmaddubsw m1, m5 ; 1 2 - vshufi32x4 m3, m0, m1, q1032 ; 0 1 - pmaddubsw m0, m2, m5 ; 3 4 - vshufi32x4 m2, m1, m0, q1032 ; 2 3 - psubw m1, m3 - pmulhrsw m1, m6 - paddw m1, m3 - psubw m3, m0, m2 - pmulhrsw m3, m6 - paddw m3, m2 - mova [tmpq+64*0], m1 - mova [tmpq+64*1], m3 -%else - movu xm1, [srcq+strideq*1+8*0] - vinserti128 m1, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - movu xm2, [srcq+strideq*0+8*0] - vinserti128 m2, [srcq+strideq*0+8*1], 1 - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 - psubw m3, m1, m0 - pmulhrsw m3, m6 - paddw m3, m0 - pmaddubsw m0, m2, m5 - psubw m2, m0, m1 - pmulhrsw m2, m6 - paddw m2, m1 - mova [tmpq+32*0], m3 - mova [tmpq+32*1], m2 -%endif - add tmpq, mmsize*2 - sub hd, mmsize*2/(16*2) - jg .hv_w16_loop - RET -.hv_w32: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm32] - vpermb m0, m4, [srcq+strideq*0] - pmaddubsw m0, m5 -.hv_w32_loop: - vpermb m1, m4, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermb m2, m4, [srcq+strideq*0] - pmaddubsw m1, m5 - psubw m3, m1, m0 - pmulhrsw m3, m6 - paddw m3, m0 - pmaddubsw m0, m2, m5 - psubw m2, m0, m1 - pmulhrsw m2, m6 - paddw m2, m1 - mova [tmpq+64*0], m3 - mova [tmpq+64*1], m2 - add tmpq, 64*2 - sub hd, 2 -%else - movu xm0, [srcq+8*0] - vinserti128 m0, [srcq+8*1], 1 - movu xm1, [srcq+8*2] - vinserti128 m1, [srcq+8*3], 1 - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 -.hv_w32_loop: - add srcq, strideq - movu xm2, [srcq+8*0] - vinserti128 m2, m2, [srcq+8*1], 1 - pshufb m2, m4 - pmaddubsw m2, m5 - psubw m3, m2, m0 - pmulhrsw m3, m6 - paddw m3, m0 - mova m0, m2 - mova [tmpq+ 0], m3 - movu xm2, [srcq+8*2] - vinserti128 m2, m2, [srcq+8*3], 1 - pshufb m2, m4 - pmaddubsw m2, m5 - psubw m3, m2, m1 - pmulhrsw m3, m6 - paddw m3, m1 - mova m1, m2 - mova [tmpq+32], m3 - add tmpq, 32*2 - dec hd -%endif - jg .hv_w32_loop - RET -.hv_w64: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm32] - vpermb m0, m4, [srcq+32*0] - vpermb m1, m4, [srcq+32*1] - pmaddubsw m0, m5 - pmaddubsw m1, m5 -.hv_w64_loop: - add srcq, strideq - vpermb m2, m4, [srcq+32*0] - vpermb m3, m4, [srcq+32*1] - pmaddubsw m2, m5 - pmaddubsw m3, m5 - psubw m7, m2, m0 - psubw m8, m3, m1 - pmulhrsw m7, m6 - pmulhrsw m8, m6 - paddw m7, m0 - paddw m8, m1 - mova [tmpq+ 0], m7 - mova [tmpq+64], m8 - mova m0, m2 - mova m1, m3 - add tmpq, 64*2 - dec hd - jg 
.hv_w64_loop -%else - mov t0, tmpq - mov t1, srcq - lea t2d, [hq+(3<<8)] -.hv_w64_loop0: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, m0, [srcq+strideq*0+8*1], 1 - pshufb m0, m4 - pmaddubsw m0, m5 -.hv_w64_loop: - movu xm1, [srcq+strideq*1+8*0] - vinserti128 m1, m1, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - movu xm2, [srcq+strideq*0+8*0] - vinserti128 m2, m2, [srcq+strideq*0+8*1], 1 - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 - psubw m3, m1, m0 - pmulhrsw m3, m6 - paddw m3, m0 - pmaddubsw m0, m2, m5 - psubw m2, m0, m1 - pmulhrsw m2, m6 - paddw m2, m1 - mova [tmpq+32*0], m3 - add tmpq, 32*8 - mova [tmpq-32*4], m2 - sub hd, 2 - jg .hv_w64_loop - movzx hd, t2b - add t0, 32 - add t1, 16 - mov tmpq, t0 - mov srcq, t1 - sub t2d, 1<<8 - jg .hv_w64_loop0 -%endif - RET -.hv_w128: -%if cpuflag(avx512icl) - mova m4, [bilin_h_perm32] - vpermb m0, m4, [srcq+32*0] - vpermb m1, m4, [srcq+32*1] - vpermb m2, m4, [srcq+32*2] - vpermb m3, m4, [srcq+32*3] - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 -.hv_w128_loop: - add srcq, strideq - vpermb m7, m4, [srcq+32*0] - vpermb m8, m4, [srcq+32*1] - vpermb m9, m4, [srcq+32*2] - vpermb m10, m4, [srcq+32*3] - pmaddubsw m7, m5 - pmaddubsw m8, m5 - pmaddubsw m9, m5 - pmaddubsw m10, m5 - psubw m11, m7, m0 - psubw m12, m8, m1 - psubw m13, m9, m2 - psubw m14, m10, m3 - pmulhrsw m11, m6 - pmulhrsw m12, m6 - pmulhrsw m13, m6 - pmulhrsw m14, m6 - paddw m11, m0 - paddw m12, m1 - paddw m13, m2 - paddw m14, m3 - mova [tmpq+64*0], m11 - mova [tmpq+64*1], m12 - mova [tmpq+64*2], m13 - mova [tmpq+64*3], m14 - mova m0, m7 - mova m1, m8 - mova m2, m9 - mova m3, m10 - add tmpq, 64*4 - dec hd - jg .hv_w128_loop -%else - mov t0, tmpq - mov t1, srcq - lea t2d, [hq+(7<<8)] -.hv_w128_loop0: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, m0, [srcq+strideq*0+8*1], 1 - pshufb m0, m4 - pmaddubsw m0, m5 -.hv_w128_loop: - movu xm1, [srcq+strideq*1+8*0] - vinserti128 m1, m1, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - movu xm2, [srcq+strideq*0+8*0] - vinserti128 m2, m2, [srcq+strideq*0+8*1], 1 - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 - psubw m3, m1, m0 - pmulhrsw m3, m6 - paddw m3, m0 - pmaddubsw m0, m2, m5 - psubw m2, m0, m1 - pmulhrsw m2, m6 - paddw m2, m1 - mova [tmpq+32*0], m3 - mova [tmpq+32*8], m2 - add tmpq, 32*16 - sub hd, 2 - jg .hv_w128_loop - movzx hd, t2b - add t0, mmsize - add t1, mmsize/2 - mov tmpq, t0 - mov srcq, t1 - sub t2d, 1<<8 - jg .hv_w128_loop0 -%endif - RET -%endmacro - -; int8_t subpel_filters[5][15][8] -%assign FILTER_REGULAR (0*15 << 16) | 3*15 -%assign FILTER_SMOOTH (1*15 << 16) | 4*15 -%assign FILTER_SHARP (2*15 << 16) | 3*15 - -%macro FN 4 ; fn, type, type_h, type_v -cglobal %1_%2 - mov t0d, FILTER_%3 - mov t1d, FILTER_%4 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _%1 %+ SUFFIX) -%endif -%endmacro - -%if WIN64 -DECLARE_REG_TMP 4, 5 -%else -DECLARE_REG_TMP 7, 8 -%endif - -%define PUT_8TAP_FN FN put_8tap, - -PUT_8TAP_FN regular, REGULAR, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH - -cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 - imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h - imul myd, mym, 0x010101 - add myd, t1d ; 
8tap_v, my, 4tap_v - lea r8, [put_avx2] - movsxd wq, wm - movifnidn hd, hm - test mxd, 0xf00 - jnz .h - test myd, 0xf00 - jnz .v - tzcnt wd, wd - movzx wd, word [r8+wq*2+table_offset(put,)] - add wq, r8 - lea r6, [ssq*3] - lea r7, [dsq*3] -%if WIN64 - pop r8 -%endif - jmp wq -.h: - test myd, 0xf00 - jnz .hv - vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) - WIN64_SPILL_XMM 11 - cmp wd, 4 - jl .h_w2 - vbroadcasti128 m6, [subpel_h_shufA] - je .h_w4 - tzcnt wd, wd - vbroadcasti128 m7, [subpel_h_shufB] - vbroadcasti128 m8, [subpel_h_shufC] - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] - vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] - vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] - add wq, r8 - jmp wq -.h_w2: - movzx mxd, mxb - dec srcq - mova xm4, [subpel_h_shuf4] - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] -.h_w2_loop: - movq xm0, [srcq+ssq*0] - movhps xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xm0, xm4 - pmaddubsw xm0, xm3 - phaddw xm0, xm0 - paddw xm0, xm5 - psraw xm0, 6 - packuswb xm0, xm0 - pextrw [dstq+dsq*0], xm0, 0 - pextrw [dstq+dsq*1], xm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w2_loop - RET -.h_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] -.h_w4_loop: - movq xm0, [srcq+ssq*0] - movq xm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xm0, xm6 - pshufb xm1, xm6 - pmaddubsw xm0, xm3 - pmaddubsw xm1, xm3 - phaddw xm0, xm1 - paddw xm0, xm5 - psraw xm0, 6 - packuswb xm0, xm0 - movd [dstq+dsq*0], xm0 - pextrd [dstq+dsq*1], xm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w4_loop - RET -.h_w8: -%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] - pshufb m%2, m%1, m7 - pshufb m%3, m%1, m8 - pshufb m%1, m6 - pmaddubsw m%4, m%2, m9 - pmaddubsw m%2, m10 - pmaddubsw m%3, m10 - pmaddubsw m%1, m9 - paddw m%3, m%4 - paddw m%1, m%2 - phaddw m%1, m%3 - paddw m%1, m5 - psraw m%1, 6 -%endmacro - movu xm0, [srcq+ssq*0] - vinserti128 m0, m0, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - PUT_8TAP_H 0, 1, 2, 3 - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - movq [dstq+dsq*0], xm0 - movhps [dstq+dsq*1], xm0 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w8 - RET -.h_w16: - movu xm0, [srcq+ssq*0+8*0] - vinserti128 m0, m0, [srcq+ssq*1+8*0], 1 - movu xm1, [srcq+ssq*0+8*1] - vinserti128 m1, m1, [srcq+ssq*1+8*1], 1 - PUT_8TAP_H 0, 2, 3, 4 - lea srcq, [srcq+ssq*2] - PUT_8TAP_H 1, 2, 3, 4 - packuswb m0, m1 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], m0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .h_w16 - RET -.h_w32: - xor r6d, r6d - jmp .h_start -.h_w64: - mov r6, -32*1 - jmp .h_start -.h_w128: - mov r6, -32*3 -.h_start: - sub srcq, r6 - sub dstq, r6 - mov r4, r6 -.h_loop: - movu m0, [srcq+r6+8*0] - movu m1, [srcq+r6+8*1] - PUT_8TAP_H 0, 2, 3, 4 - PUT_8TAP_H 1, 2, 3, 4 - packuswb m0, m1 - mova [dstq+r6], m0 - add r6, 32 - jle .h_loop - add srcq, ssq - add dstq, dsq - mov r6, r4 - dec hd - jg .h_loop - RET -.v: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 16 - movzx mxd, myb - shr myd, 16 - cmp hd, 6 - cmovs myd, mxd - tzcnt r6d, wd - movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] - vpbroadcastd m7, [pw_512] - lea myq, [r8+myq*8+subpel_filters-put_avx2] - vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] - add r6, r8 - lea ss3q, [ssq*3] - sub srcq, ss3q - jmp r6 -.v_w2: - movd xm2, [srcq+ssq*0] - pinsrw xm2, [srcq+ssq*1], 2 - pinsrw xm2, [srcq+ssq*2], 4 - pinsrw xm2, [srcq+ss3q ], 6 ; 0 1 2 3 - lea srcq, 
[srcq+ssq*4] - movd xm3, [srcq+ssq*0] - vpbroadcastd xm1, [srcq+ssq*1] - vpbroadcastd xm0, [srcq+ssq*2] - add srcq, ss3q - vpblendd xm3, xm3, xm1, 0x02 ; 4 5 - vpblendd xm1, xm1, xm0, 0x02 ; 5 6 - palignr xm4, xm3, xm2, 4 ; 1 2 3 4 - punpcklbw xm3, xm1 ; 45 56 - punpcklbw xm1, xm2, xm4 ; 01 12 - punpckhbw xm2, xm4 ; 23 34 -.v_w2_loop: - pmaddubsw xm5, xm1, xm8 ; a0 b0 - mova xm1, xm2 - pmaddubsw xm2, xm9 ; a1 b1 - paddw xm5, xm2 - mova xm2, xm3 - pmaddubsw xm3, xm10 ; a2 b2 - paddw xm5, xm3 - vpbroadcastd xm4, [srcq+ssq*0] - vpblendd xm3, xm0, xm4, 0x02 ; 6 7 - vpbroadcastd xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vpblendd xm4, xm4, xm0, 0x02 ; 7 8 - punpcklbw xm3, xm4 ; 67 78 - pmaddubsw xm4, xm3, xm11 ; a3 b3 - paddw xm5, xm4 - pmulhrsw xm5, xm7 - packuswb xm5, xm5 - pextrw [dstq+dsq*0], xm5, 0 - pextrw [dstq+dsq*1], xm5, 2 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w2_loop - RET -.v_w4: - movd xm2, [srcq+ssq*0] - pinsrd xm2, [srcq+ssq*1], 1 - pinsrd xm2, [srcq+ssq*2], 2 - pinsrd xm2, [srcq+ss3q ], 3 ; 0 1 2 3 - lea srcq, [srcq+ssq*4] - movd xm3, [srcq+ssq*0] - vpbroadcastd xm1, [srcq+ssq*1] - vpbroadcastd xm0, [srcq+ssq*2] - add srcq, ss3q - vpblendd xm3, xm3, xm1, 0x02 ; 4 5 - vpblendd xm1, xm1, xm0, 0x02 ; 5 6 - palignr xm4, xm3, xm2, 4 ; 1 2 3 4 - punpcklbw xm3, xm1 ; 45 56 - punpcklbw xm1, xm2, xm4 ; 01 12 - punpckhbw xm2, xm4 ; 23 34 -.v_w4_loop: - pmaddubsw xm5, xm1, xm8 ; a0 b0 - mova xm1, xm2 - pmaddubsw xm2, xm9 ; a1 b1 - paddw xm5, xm2 - mova xm2, xm3 - pmaddubsw xm3, xm10 ; a2 b2 - paddw xm5, xm3 - vpbroadcastd xm4, [srcq+ssq*0] - vpblendd xm3, xm0, xm4, 0x02 ; 6 7 - vpbroadcastd xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vpblendd xm4, xm4, xm0, 0x02 ; 7 8 - punpcklbw xm3, xm4 ; 67 78 - pmaddubsw xm4, xm3, xm11 ; a3 b3 - paddw xm5, xm4 - pmulhrsw xm5, xm7 - packuswb xm5, xm5 - movd [dstq+dsq*0], xm5 - pextrd [dstq+dsq*1], xm5, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w4_loop - RET -.v_w8: - movq xm1, [srcq+ssq*0] - vpbroadcastq m4, [srcq+ssq*1] - vpbroadcastq m2, [srcq+ssq*2] - vpbroadcastq m5, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - vpbroadcastq m3, [srcq+ssq*0] - vpbroadcastq m6, [srcq+ssq*1] - vpbroadcastq m0, [srcq+ssq*2] - add srcq, ss3q - vpblendd m1, m1, m4, 0x30 - vpblendd m4, m4, m2, 0x30 - punpcklbw m1, m4 ; 01 12 - vpblendd m2, m2, m5, 0x30 - vpblendd m5, m5, m3, 0x30 - punpcklbw m2, m5 ; 23 34 - vpblendd m3, m3, m6, 0x30 - vpblendd m6, m6, m0, 0x30 - punpcklbw m3, m6 ; 45 56 -.v_w8_loop: - pmaddubsw m5, m1, m8 ; a0 b0 - mova m1, m2 - pmaddubsw m2, m9 ; a1 b1 - paddw m5, m2 - mova m2, m3 - pmaddubsw m3, m10 ; a2 b2 - paddw m5, m3 - vpbroadcastq m4, [srcq+ssq*0] - vpblendd m3, m0, m4, 0x30 - vpbroadcastq m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vpblendd m4, m4, m0, 0x30 - punpcklbw m3, m4 ; 67 78 - pmaddubsw m4, m3, m11 ; a3 b3 - paddw m5, m4 - pmulhrsw m5, m7 - vextracti128 xm4, m5, 1 - packuswb xm5, xm4 - movq [dstq+dsq*0], xm5 - movhps [dstq+dsq*1], xm5 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w8_loop - RET -.v_w16: -.v_w32: -.v_w64: -.v_w128: - lea r6d, [wq-16] - mov r4, dstq - mov r7, srcq - shl r6d, 4 - mov r6b, hb -.v_w16_loop0: - vbroadcasti128 m4, [srcq+ssq*0] - vbroadcasti128 m5, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m0, [srcq+ssq*1] - vbroadcasti128 m6, [srcq+ssq*0] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m1, [srcq+ssq*0] - vbroadcasti128 m2, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m3, [srcq+ssq*0] - shufpd m4, m4, m0, 0x0c - shufpd m5, m5, m1, 0x0c - punpcklbw m1, m4, m5 ; 01 - punpckhbw m4, m5 ; 34 
- shufpd m6, m6, m2, 0x0c - punpcklbw m2, m5, m6 ; 12 - punpckhbw m5, m6 ; 45 - shufpd m0, m0, m3, 0x0c - punpcklbw m3, m6, m0 ; 23 - punpckhbw m6, m0 ; 56 -.v_w16_loop: - vbroadcasti128 m12, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vbroadcasti128 m13, [srcq+ssq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 - mova m1, m3 - mova m2, m4 - pmaddubsw m3, m9 ; a1 - pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 - mova m3, m5 - mova m4, m6 - pmaddubsw m5, m10 ; a2 - pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 - shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m6, m0 ; 67 - punpckhbw m6, m0 ; 78 - pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 - paddw m14, m12 - paddw m15, m13 - pmulhrsw m14, m7 - pmulhrsw m15, m7 - packuswb m14, m15 - vpermq m14, m14, q3120 - mova [dstq+dsq*0], xm14 - vextracti128 [dstq+dsq*1], m14, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .v_w16_loop - movzx hd, r6b - add r4, 16 - add r7, 16 - mov dstq, r4 - mov srcq, r7 - sub r6d, 1<<8 - jg .v_w16_loop0 - RET -.hv: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 16 - cmp wd, 4 - jg .hv_w8 - movzx mxd, mxb - dec srcq - vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2] - movzx mxd, myb - shr myd, 16 - cmp hd, 6 - cmovs myd, mxd - vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] - lea ss3q, [ssq*3] - sub srcq, ss3q - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - vpbroadcastd m8, [pw_8192] - vpbroadcastd m9, [pd_512] - pshufd m10, m0, q0000 - pshufd m11, m0, q1111 - pshufd m12, m0, q2222 - pshufd m13, m0, q3333 - cmp wd, 4 - je .hv_w4 - vbroadcasti128 m6, [subpel_h_shuf4] - movq xm2, [srcq+ssq*0] - movhps xm2, [srcq+ssq*1] - movq xm0, [srcq+ssq*2] - movhps xm0, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - vpbroadcastq m3, [srcq+ssq*0] - vpbroadcastq m4, [srcq+ssq*1] - vpbroadcastq m1, [srcq+ssq*2] - add srcq, ss3q - vpblendd m2, m2, m3, 0x30 - vpblendd m0, m0, m1, 0x30 - vpblendd m2, m2, m4, 0xc0 - pshufb m2, m6 - pshufb m0, m6 - pmaddubsw m2, m7 - pmaddubsw m0, m7 - phaddw m2, m0 - pmulhrsw m2, m8 - vextracti128 xm3, m2, 1 - palignr xm4, xm3, xm2, 4 - punpcklwd xm1, xm2, xm4 ; 01 12 - punpckhwd xm2, xm4 ; 23 34 - pshufd xm0, xm3, q2121 - punpcklwd xm3, xm0 ; 45 56 -.hv_w2_loop: - pmaddwd xm5, xm1, xm10 ; a0 b0 - mova xm1, xm2 - pmaddwd xm2, xm11 ; a1 b1 - paddd xm5, xm2 - mova xm2, xm3 - pmaddwd xm3, xm12 ; a2 b2 - paddd xm5, xm3 - movq xm4, [srcq+ssq*0] - movhps xm4, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pshufb xm4, xm6 - pmaddubsw xm4, xm7 - phaddw xm4, xm4 - pmulhrsw xm4, xm8 - palignr xm3, xm4, xm0, 12 - mova xm0, xm4 - punpcklwd xm3, xm0 ; 67 78 - pmaddwd xm4, xm3, xm13 ; a3 b3 - paddd xm5, xm9 - paddd xm5, xm4 - psrad xm5, 10 - packssdw xm5, xm5 - packuswb xm5, xm5 - pextrw [dstq+dsq*0], xm5, 0 - pextrw [dstq+dsq*1], xm5, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w2_loop - RET -.hv_w4: - mova m6, [subpel_h_shuf4] - vpbroadcastq m2, [srcq+ssq*0] - vpbroadcastq m4, [srcq+ssq*1] - vpbroadcastq m0, [srcq+ssq*2] - vpbroadcastq m5, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - vpbroadcastq m3, [srcq+ssq*0] - vpblendd m2, m2, m4, 0xcc ; 0 1 - vpbroadcastq m4, [srcq+ssq*1] - vpbroadcastq m1, [srcq+ssq*2] - add srcq, ss3q - vpblendd m0, m0, m5, 0xcc ; 2 3 - vpblendd m3, m3, m4, 0xcc ; 4 5 - pshufb m2, m6 - pshufb m0, m6 - pshufb m3, m6 - pshufb m1, m6 - pmaddubsw m2, m7 - pmaddubsw m0, m7 - pmaddubsw m3, m7 - pmaddubsw m1, m7 - phaddw m2, m0 - phaddw m3, m1 - pmulhrsw m2, m8 - pmulhrsw m3, m8 - palignr m4, m3, m2, 4 - punpcklwd m1, m2, 
m4 ; 01 12 - punpckhwd m2, m4 ; 23 34 - pshufd m0, m3, q2121 - punpcklwd m3, m0 ; 45 56 -.hv_w4_loop: - pmaddwd m5, m1, m10 ; a0 b0 - mova m1, m2 - pmaddwd m2, m11 ; a1 b1 - paddd m5, m2 - mova m2, m3 - pmaddwd m3, m12 ; a2 b2 - paddd m5, m3 - vpbroadcastq m4, [srcq+ssq*0] - vpbroadcastq m3, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vpblendd m4, m4, m3, 0xcc ; 7 8 - pshufb m4, m6 - pmaddubsw m4, m7 - phaddw m4, m4 - pmulhrsw m4, m8 - palignr m3, m4, m0, 12 - mova m0, m4 - punpcklwd m3, m0 ; 67 78 - pmaddwd m4, m3, m13 ; a3 b3 - paddd m5, m9 - paddd m5, m4 - psrad m5, 10 - vextracti128 xm4, m5, 1 - packssdw xm5, xm4 - packuswb xm5, xm5 - pshuflw xm5, xm5, q3120 - movd [dstq+dsq*0], xm5 - pextrd [dstq+dsq*1], xm5, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w4_loop - RET -.hv_w8: - shr mxd, 16 - sub srcq, 3 - vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] - vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] - movzx mxd, myb - shr myd, 16 - cmp hd, 6 - cmovs myd, mxd - vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] - lea ss3q, [ssq*3] - sub srcq, ss3q - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - pshufd m12, m0, q0000 - pshufd m13, m0, q1111 - pshufd m14, m0, q2222 - pshufd m15, m0, q3333 - lea r6d, [wq-8] - mov r4, dstq - mov r7, srcq - shl r6d, 5 - mov r6b, hb -.hv_w8_loop0: - vbroadcasti128 m7, [subpel_h_shufA] - vbroadcasti128 m8, [subpel_h_shufB] - vbroadcasti128 m9, [subpel_h_shufC] - movu xm4, [srcq+ssq*0] - movu xm5, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - movu xm6, [srcq+ssq*0] - vbroadcasti128 m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vpblendd m4, m4, m0, 0xf0 ; 0 3 - vinserti128 m5, m5, [srcq+ssq*0], 1 ; 1 4 - vinserti128 m6, m6, [srcq+ssq*1], 1 ; 2 5 - lea srcq, [srcq+ssq*2] - vinserti128 m0, m0, [srcq+ssq*0], 1 ; 3 6 -%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] - pshufb %3, %1, %6 - pshufb %4, %1, %7 - pshufb %1, %5 - pmaddubsw %2, %3, m10 - pmaddubsw %4, m11 - pmaddubsw %3, m11 - pmaddubsw %1, m10 - paddw %2, %4 - paddw %1, %3 - phaddw %1, %2 -%endmacro - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 - vpbroadcastd m7, [pw_8192] - vpermq m4, m4, q3120 - vpermq m5, m5, q3120 - vpermq m6, m6, q3120 - pmulhrsw m0, m7 - pmulhrsw m4, m7 - pmulhrsw m5, m7 - pmulhrsw m6, m7 - vpermq m7, m0, q3120 - punpcklwd m1, m4, m5 ; 01 - punpckhwd m4, m5 ; 34 - punpcklwd m2, m5, m6 ; 12 - punpckhwd m5, m6 ; 45 - punpcklwd m3, m6, m7 ; 23 - punpckhwd m6, m7 ; 56 -.hv_w8_loop: - vextracti128 r6m, m0, 1 ; not enough registers - movu xm0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - vinserti128 m0, m0, [srcq+ssq*0], 1 ; 7 8 - pmaddwd m8, m1, m12 ; a0 - pmaddwd m9, m2, m12 ; b0 - mova m1, m3 - mova m2, m4 - pmaddwd m3, m13 ; a1 - pmaddwd m4, m13 ; b1 - paddd m8, m3 - paddd m9, m4 - mova m3, m5 - mova m4, m6 - pmaddwd m5, m14 ; a2 - pmaddwd m6, m14 ; b2 - paddd m8, m5 - paddd m9, m6 - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] - vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 - vpbroadcastd m5, [pw_8192] - vpbroadcastd m7, [pd_512] - vbroadcasti128 m6, r6m - pmulhrsw m0, m5 - paddd m8, m7 - paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 - punpcklwd m5, m6, m7 ; 67 - punpckhwd m6, m7 ; 78 - pmaddwd m7, m5, m15 ; a3 - paddd m8, m7 - pmaddwd m7, m6, m15 ; b3 - paddd m7, m9 - psrad m8, 10 - psrad m7, 10 - packssdw m8, m7 - vextracti128 xm7, m8, 1 - packuswb xm8, xm7 - pshufd xm7, xm8, q3120 - movq 
[dstq+dsq*0], xm7 - movhps [dstq+dsq*1], xm7 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .hv_w8_loop - movzx hd, r6b - add r4, 8 - add r7, 8 - mov dstq, r4 - mov srcq, r7 - sub r6d, 1<<8 - jg .hv_w8_loop0 - RET - -%macro PREP_8TAP_H 0 - %if cpuflag(avx512) - vpermb m10, m5, m0 - vpermb m11, m5, m1 - vpermb m12, m6, m0 - vpermb m13, m6, m1 - vpermb m14, m7, m0 - vpermb m15, m7, m1 - mova m0, m4 - mova m2, m4 - mova m1, m4 - mova m3, m4 - vpdpbusd m0, m10, m8 - vpdpbusd m2, m12, m8 - vpdpbusd m1, m11, m8 - vpdpbusd m3, m13, m8 - vpdpbusd m0, m12, m9 - vpdpbusd m2, m14, m9 - vpdpbusd m1, m13, m9 - vpdpbusd m3, m15, m9 - packssdw m0, m2 - packssdw m1, m3 - psraw m0, 2 - psraw m1, 2 - mova [tmpq+ 0], m0 - mova [tmpq+64], m1 - %else - pshufb m1, m0, m5 - pshufb m2, m0, m6 - pshufb m3, m0, m7 - pmaddubsw m1, m8 - pmaddubsw m0, m2, m8 - pmaddubsw m2, m9 - pmaddubsw m3, m9 - paddw m1, m2 - paddw m0, m3 - phaddw m0, m1, m0 - pmulhrsw m0, m4 - %endif -%endmacro - -%macro PREP_8TAP_V_W4 5 ; round, weights - movd xm0, [srcq+strideq*0] - vpbroadcastd ym1, [srcq+strideq*2] - vpbroadcastd xm2, [srcq+strideq*1] - vpbroadcastd ym3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd ym1, ym1, ym0, 0x01 ; 0 2 2 _ 2 _ _ _ - vpblendd ym3, ym3, ym2, 0x03 ; 1 1 3 3 3 3 _ _ - vpbroadcastd ym0, [srcq+strideq*0] - vpbroadcastd ym2, [srcq+strideq*1] - vpblendd ym1, ym1, ym0, 0x68 ; 0 2 2 4 2 4 4 _ - vpbroadcastd ym0, [srcq+strideq*2] - vbroadcasti128 ym5, [deint_shuf4] - vpblendd ym3, ym3, ym2, 0xc0 ; 1 1 3 3 3 3 5 5 - vpblendd ym2, ym3, ym1, 0x55 ; 0 1 2 3 2 3 4 5 - vpblendd ym3, ym3, ym1, 0xaa ; 1 2 3 4 3 4 5 _ - punpcklbw ym1, ym2, ym3 ; 01 12 23 34 - vpblendd ym3, ym3, ym0, 0x80 ; 1 2 3 4 3 4 5 6 - punpckhbw ym2, ym3 ; 23 34 45 56 -.v_w4_loop: - pinsrd xm0, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - vpbroadcastd ym3, [srcq+strideq*0] - vpbroadcastd ym4, [srcq+strideq*1] - vpblendd ym3, ym3, ym4, 0x20 ; _ _ 8 _ 8 9 _ _ - vpblendd ym3, ym3, ym0, 0x03 ; 6 7 8 _ 8 9 _ _ - vpbroadcastd ym0, [srcq+strideq*2] - vpblendd ym3, ym3, ym0, 0x40 ; 6 7 8 _ 8 9 a _ - pshufb ym3, ym5 ; 67 78 89 9a - pmaddubsw ym4, ym1, ym%2 - vperm2i128 ym1, ym2, ym3, 0x21 ; 45 56 67 78 - pmaddubsw ym2, ym%3 - paddw ym4, ym2 - mova ym2, ym3 - pmaddubsw ym3, ym%5 - paddw ym3, ym4 - pmaddubsw ym4, ym1, ym%4 - paddw ym3, ym4 - pmulhrsw ym3, ym%1 - mova [tmpq], ym3 -%endmacro - -%macro PREP_8TAP_FN 3 ; type, type_h, type_v -cglobal prep_8tap_%1 - mov t0d, FILTER_%2 - mov t1d, FILTER_%3 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX) -%endif -%endmacro - -%macro PREP_8TAP 0 - %if WIN64 - DECLARE_REG_TMP 6, 4 - %else - DECLARE_REG_TMP 6, 7 - %endif -PREP_8TAP_FN regular, REGULAR, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH - -cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 - imul mxd, mxm, 0x010101 - add mxd, t0d ; 8tap_h, mx, 4tap_h - imul myd, mym, 0x010101 - add myd, t1d ; 8tap_v, my, 4tap_v - lea r7, [prep%+SUFFIX] - movsxd wq, wm - movifnidn hd, hm - test mxd, 0xf00 - jnz .h - test myd, 0xf00 - jnz .v - tzcnt wd, wd - movzx wd, word [r7+wq*2+table_offset(prep,)] - add wq, r7 - lea r6, [strideq*3] -%if WIN64 - pop r7 -%endif - jmp wq -.h: - test myd, 0xf00 - 
jnz .hv -%if cpuflag(avx512) - vpbroadcastd m4, [pd_2] -%else - vpbroadcastd m4, [pw_8192] - vbroadcasti128 m5, [subpel_h_shufA] -%endif - WIN64_SPILL_XMM 10 - cmp wd, 4 - je .h_w4 - tzcnt wd, wd -%if notcpuflag(avx512) - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] -%endif - shr mxd, 16 - sub srcq, 3 - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] - vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] - add wq, r7 - jmp wq -.h_w4: -%if cpuflag(avx512) - mov r3d, 0x4 - kmovb k1, r3d - vbroadcasti128 ym5, [subpel_h_shufA] -%endif - movzx mxd, mxb - dec srcq - vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] - lea stride3q, [strideq*3] -.h_w4_loop: -%if cpuflag(avx512icl) - mova ym0, ym4 - mova ym1, ym4 - movq xm2, [srcq+strideq*0] - movq xm3, [srcq+strideq*1] - vpbroadcastq ym2{k1}, [srcq+strideq*2] - vpbroadcastq ym3{k1}, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - pshufb ym2, ym5 - pshufb ym3, ym5 - vpdpbusd ym0, ym2, ym6 - vpdpbusd ym1, ym3, ym6 - packssdw ym0, ym1 - psraw ym0, 2 -%else - movq xm0, [srcq+strideq*0] - vpbroadcastq m2, [srcq+strideq*2] - movq xm1, [srcq+strideq*1] - vpblendd m0, m0, m2, 0xf0 - vpbroadcastq m2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpblendd m1, m1, m2, 0xf0 - pshufb m0, m5 - pshufb m1, m5 - pmaddubsw m0, m6 - pmaddubsw m1, m6 - phaddw m0, m1 - pmulhrsw m0, m4 -%endif - mova [tmpq], ym0 - add tmpq, 32 - sub hd, 4 - jg .h_w4_loop - RET -.h_w8: -%if cpuflag(avx512) - vbroadcasti128 m5, [subpel_h_shufA] - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] - lea stride3q, [strideq*3] -%endif -.h_w8_loop: - movu xm0, [srcq+strideq*0] - vinserti128 ym0, [srcq+strideq*1], 1 -%if cpuflag(avx512) - vinserti128 m0, [srcq+strideq*2], 2 - vinserti128 m0, [srcq+stride3q ], 3 -%endif - lea srcq, [srcq+strideq*(mmsize/(8*2))] -%if cpuflag(avx512icl) - mova m10, m4 - mova m11, m4 - pshufb m1, m0, m5 - pshufb m2, m0, m6 - pshufb m3, m0, m7 - vpdpbusd m10, m1, m8 - vpdpbusd m11, m2, m8 - vpdpbusd m10, m2, m9 - vpdpbusd m11, m3, m9 - packssdw m10, m11 - psraw m0, m10, 2 -%else - PREP_8TAP_H -%endif - mova [tmpq], m0 - add tmpq, mmsize - sub hd, mmsize/(8*2) - jg .h_w8_loop - RET -.h_w16: -%if cpuflag(avx512icl) - mova m5, [spel_h_perm16a] - mova m6, [spel_h_perm16b] - mova m7, [spel_h_perm16c] - lea stride3q, [strideq*3] -.h_w16_loop: - movu ym0, [srcq+strideq*0] - movu ym1, [srcq+strideq*2] - vinserti32x8 m0, [srcq+strideq*1], 1 - vinserti32x8 m1, [srcq+stride3q ], 1 - lea srcq, [srcq+strideq*4] - PREP_8TAP_H -%else -.h_w16_loop: - movu xm0, [srcq+strideq*0+8*0] - vinserti128 m0, [srcq+strideq*0+8*1], 1 - PREP_8TAP_H - mova [tmpq+32*0], m0 - movu xm0, [srcq+strideq*1+8*0] - vinserti128 m0, m0, [srcq+strideq*1+8*1], 1 - lea srcq, [srcq+strideq*2] - PREP_8TAP_H - mova [tmpq+32*1], m0 -%endif - add tmpq, mmsize*2 - sub hd, mmsize*2/(16*2) - jg .h_w16_loop - RET -.h_w32: -%if cpuflag(avx512icl) - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] - mova m7, [spel_h_perm32c] -.h_w32_loop: - movu m0, [srcq+strideq*0] - movu m1, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - PREP_8TAP_H - add tmpq, 64*2 - sub hd, 2 - jg .h_w32_loop - RET -%else - xor r6d, r6d - jmp .h_start -%endif -.h_w64: -%if cpuflag(avx512) - xor r6d, r6d -%else - mov r6, -32*1 -%endif - jmp .h_start -.h_w128: -%if cpuflag(avx512) - mov r6, -64*1 -%else - mov r6, -32*3 -%endif -.h_start: -%if cpuflag(avx512) - mova m5, [spel_h_perm32a] - mova m6, [spel_h_perm32b] 
- mova m7, [spel_h_perm32c] -%endif - sub srcq, r6 - mov r5, r6 -.h_loop: -%if cpuflag(avx512icl) - movu m0, [srcq+r6+32*0] - movu m1, [srcq+r6+32*1] - PREP_8TAP_H -%else - movu xm0, [srcq+r6+8*0] - vinserti128 ym0, [srcq+r6+8*1], 1 - PREP_8TAP_H - mova [tmpq+32*0], m0 - movu xm0, [srcq+r6+8*2] - vinserti128 ym0, [srcq+r6+8*3], 1 - PREP_8TAP_H - mova [tmpq+32*1], m0 -%endif - add tmpq, mmsize*2 - add r6, mmsize - jle .h_loop - add srcq, strideq - mov r6, r5 - dec hd - jg .h_loop - RET -.v: - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 16 - movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. - shr myd, 16 ; Note that the code is 8-tap only, having - cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 - cmove myd, mxd ; had a negligible effect on performance. - ; TODO: Would a 6-tap code path be worth it? -%if cpuflag(avx512) - tzcnt wd, wd - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] - add wq, r7 -%endif - lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] - lea stride3q, [strideq*3] - sub srcq, stride3q - vpbroadcastd m7, [pw_8192] - vpbroadcastw m8, [myq+0] - vpbroadcastw m9, [myq+2] - vpbroadcastw m10, [myq+4] - vpbroadcastw m11, [myq+6] -%if cpuflag(avx512) - jmp wq -%else - cmp wd, 8 - jg .v_w16 - je .v_w8 -%endif -.v_w4: -%if cpuflag(avx512) - AVX512_MM_PERMUTATION - PREP_8TAP_V_W4 23, 24, 25, 26, 27 - AVX512_MM_PERMUTATION -%else - PREP_8TAP_V_W4 7, 8, 9, 10, 11 -%endif - add tmpq, 32 - sub hd, 4 - jg .v_w4_loop -%if cpuflag(avx512) - vzeroupper -%endif - RET -.v_w8: -%if cpuflag(avx512) - mov r3d, 0xf044 - kmovw k1, r3d - kshiftrw k2, k1, 8 - movq xm0, [srcq+strideq*0] - vpbroadcastq ym1, [srcq+strideq*1] - vpbroadcastq m2, [srcq+strideq*2] - vpbroadcastq m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m4, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - vpbroadcastq m6, [srcq+strideq*2] - vmovdqa64 ym0{k1}, ym1 - vmovdqa64 ym1{k1}, ym2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - punpcklbw ym0, ym1 ; 01 12 __ __ - punpcklbw m2, m3 ; 23 34 23 34 - punpcklbw m4, m5 ; 45 56 45 56 - vmovdqa64 m0{k2}, m2 ; 01 12 23 34 - vmovdqa64 m2{k2}, m4 ; 23 34 45 56 -.v_w8_loop: - vpbroadcastq m1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m3, [srcq+strideq*0] - vpbroadcastq m5, [srcq+strideq*1] - pmaddubsw m14, m0, m8 - pmaddubsw m15, m2, m9 - vpblendmq m0{k1}, m6, m1 - vpblendmq m2{k1}, m1, m3 - vpbroadcastq m6, [srcq+strideq*2] - paddw m14, m15 - punpcklbw m2, m0, m2 ; 67 78 67 78 - vpblendmq m12{k1}, m3, m5 - vpblendmq m13{k1}, m5, m6 - vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 - punpcklbw m4, m12, m13 ; 89 9a 89 9a - vmovdqa64 m2{k2}, m4 ; 67 78 89 9a - pmaddubsw m12, m0, m10 - pmaddubsw m13, m2, m11 - paddw m14, m12 - paddw m14, m13 - pmulhrsw m14, m7 - mova [tmpq], m14 -%else - movq xm1, [srcq+strideq*0] - vpbroadcastq m4, [srcq+strideq*1] - vpbroadcastq m2, [srcq+strideq*2] - vpbroadcastq m5, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m3, [srcq+strideq*0] - vpbroadcastq m6, [srcq+strideq*1] - vpbroadcastq m0, [srcq+strideq*2] - vpblendd m1, m1, m4, 0x30 - vpblendd m4, m4, m2, 0x30 - punpcklbw m1, m4 ; 01 12 - vpblendd m2, m2, m5, 0x30 - vpblendd m5, m5, m3, 0x30 - punpcklbw m2, m5 ; 23 34 - vpblendd m3, m3, m6, 0x30 - vpblendd m6, m6, m0, 0x30 - punpcklbw m3, m6 ; 45 56 -.v_w8_loop: - vpbroadcastq m4, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - pmaddubsw m5, m2, m9 ; a1 - pmaddubsw m6, m2, m8 ; b0 - vpblendd m2, m0, m4, 0x30 - vpbroadcastq m0, 
[srcq+strideq*0] - vpblendd m4, m4, m0, 0x30 - punpcklbw m2, m4 ; 67 78 - pmaddubsw m1, m8 ; a0 - pmaddubsw m4, m3, m9 ; b1 - paddw m5, m1 - mova m1, m3 - pmaddubsw m3, m10 ; a2 - paddw m6, m4 - paddw m5, m3 - vpbroadcastq m4, [srcq+strideq*1] - vpblendd m3, m0, m4, 0x30 - vpbroadcastq m0, [srcq+strideq*2] - vpblendd m4, m4, m0, 0x30 - punpcklbw m3, m4 ; 89 9a - pmaddubsw m4, m2, m11 ; a3 - paddw m5, m4 - pmaddubsw m4, m2, m10 ; b2 - paddw m6, m4 - pmaddubsw m4, m3, m11 ; b3 - paddw m6, m4 - pmulhrsw m5, m7 - pmulhrsw m6, m7 - mova [tmpq+32*0], m5 - mova [tmpq+32*1], m6 -%endif - add tmpq, 32*2 - sub hd, 4 - jg .v_w8_loop - RET -.v_w16: -%if cpuflag(avx512) - mov r3d, 0xf0 - kmovb k1, r3d - vbroadcasti128 m0, [srcq+strideq*0] - vbroadcasti128 m1, [srcq+strideq*1] - vbroadcasti128 m2, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m4, [srcq+strideq*0] - vbroadcasti128 m5, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - vmovdqa64 m0{k1}, m1 - vmovdqa64 m1{k1}, m2 - vmovdqa64 m2{k1}, m3 - vmovdqa64 m3{k1}, m4 - vmovdqa64 m4{k1}, m5 - vmovdqa64 m5{k1}, m6 - shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b - shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b - shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- - shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- - punpckhbw m2, m0, m1 ; 23a 23b 34a 34b - punpcklbw m0, m1 ; 01a 01b 12a 12b - punpcklbw m4, m5 ; 45a 45b 56a 56b -.v_w16_loop: - vbroadcasti128 m3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vbroadcasti128 m5, [srcq+strideq*0] - vpblendmq m1{k1}, m6, m3 - vmovdqa64 m3{k1}, m5 - pmaddubsw m12, m0, m8 - pmaddubsw m13, m2, m8 - pmaddubsw m14, m2, m9 - pmaddubsw m15, m4, m9 - pmaddubsw m0, m4, m10 - vbroadcasti128 m2, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*2] - paddw m12, m14 - paddw m13, m15 - paddw m12, m0 - vmovdqa64 m5{k1}, m2 - vmovdqa64 m2{k1}, m6 - mova m0, m4 - shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b - shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab - punpcklbw m2, m1, m3 ; 67a 67b 78a 78b - punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab - pmaddubsw m14, m2, m10 - pmaddubsw m15, m2, m11 - paddw m13, m14 - paddw m12, m15 - pmaddubsw m14, m4, m11 - paddw m13, m14 - pmulhrsw m12, m7 - pmulhrsw m13, m7 - mova [tmpq+ 0], m12 - mova [tmpq+64], m13 - add tmpq, 64*2 - sub hd, 4 - jg .v_w16_loop -%else - lea r6d, [wq-16] - mov r5, tmpq - mov r7, srcq - shl r6d, 4 - mov r6b, hb -.v_w16_loop0: - vbroadcasti128 m4, [srcq+strideq*0] - vbroadcasti128 m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m0, [srcq+strideq*1] - vbroadcasti128 m6, [srcq+strideq*0] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m1, [srcq+strideq*0] - vbroadcasti128 m2, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m3, [srcq+strideq*0] - shufpd m4, m4, m0, 0x0c - shufpd m5, m5, m1, 0x0c - punpcklbw m1, m4, m5 ; 01 - punpckhbw m4, m5 ; 34 - shufpd m6, m6, m2, 0x0c - punpcklbw m2, m5, m6 ; 12 - punpckhbw m5, m6 ; 45 - shufpd m0, m0, m3, 0x0c - punpcklbw m3, m6, m0 ; 23 - punpckhbw m6, m0 ; 56 -.v_w16_loop: - vbroadcasti128 m12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vbroadcasti128 m13, [srcq+strideq*0] - pmaddubsw m14, m1, m8 ; a0 - pmaddubsw m15, m2, m8 ; b0 - mova m1, m3 - mova m2, m4 - pmaddubsw m3, m9 ; a1 - pmaddubsw m4, m9 ; b1 - paddw m14, m3 - paddw m15, m4 - mova m3, m5 - mova m4, m6 - pmaddubsw m5, m10 ; a2 - pmaddubsw m6, m10 ; b2 - paddw m14, m5 - paddw m15, m6 - shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m6, m0 ; 67 - punpckhbw m6, m0 
; 78 - pmaddubsw m12, m5, m11 ; a3 - pmaddubsw m13, m6, m11 ; b3 - paddw m14, m12 - paddw m15, m13 - pmulhrsw m14, m7 - pmulhrsw m15, m7 - mova [tmpq+wq*0], m14 - mova [tmpq+wq*2], m15 - lea tmpq, [tmpq+wq*4] - sub hd, 2 - jg .v_w16_loop - movzx hd, r6b - add r5, 32 - add r7, 16 - mov tmpq, r5 - mov srcq, r7 - sub r6d, 1<<8 - jg .v_w16_loop0 -%endif - RET -%if cpuflag(avx512) -.v_w32: - mova m18, [bilin_v_perm64] - movu ym0, [srcq+strideq*0] - movu ym1, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - movu ym3, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym4, [srcq+strideq*0] - movu ym5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym6, [srcq+strideq*0] - vpermq m0, m18, m0 - vpermq m1, m18, m1 - vpermq m2, m18, m2 - vpermq m3, m18, m3 - vpermq m4, m18, m4 - vpermq m5, m18, m5 - vpermq m6, m18, m6 - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - punpcklbw m3, m4 - punpcklbw m4, m5 - punpcklbw m5, m6 -.v_w32_loop: - movu ym12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu ym13, [srcq+strideq*0] - pmaddubsw m14, m0, m8 - pmaddubsw m16, m2, m9 - pmaddubsw m15, m1, m8 - pmaddubsw m17, m3, m9 - mova m0, m2 - mova m1, m3 - vpermq m12, m18, m12 - vpermq m13, m18, m13 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m4, m10 - pmaddubsw m17, m5, m10 - punpcklbw m6, m12 - punpcklbw m12, m13 - mova m2, m4 - mova m3, m5 - paddw m14, m16 - paddw m15, m17 - pmaddubsw m16, m6, m11 - pmaddubsw m17, m12, m11 - mova m4, m6 - mova m5, m12 - paddw m14, m16 - paddw m15, m17 - pmulhrsw m14, m7 - pmulhrsw m15, m7 - mova m6, m13 - mova [tmpq+ 0], m14 - mova [tmpq+64], m15 - add tmpq, 64*2 - sub hd, 2 - jg .v_w32_loop - vzeroupper - RET -.v_w64: - mov r6d, hd - mov wd, 64 - jmp .v_start -.v_w128: - lea r6d, [(1<<8)+hq] - mov wd, 128 -.v_start: - WIN64_SPILL_XMM 27 - mova m26, [bilin_v_perm64] - mov r5, tmpq - mov r7, srcq -.v_loop0: - vpermq m0, m26, [srcq+strideq*0] - vpermq m1, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m2, m26, [srcq+strideq*0] - vpermq m3, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m4, m26, [srcq+strideq*0] - vpermq m5, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m6, m26, [srcq+strideq*0] - punpckhbw m12, m0, m1 - punpcklbw m0, m1 - punpckhbw m13, m1, m2 - punpcklbw m1, m2 - punpckhbw m14, m2, m3 - punpcklbw m2, m3 - punpckhbw m15, m3, m4 - punpcklbw m3, m4 - punpckhbw m16, m4, m5 - punpcklbw m4, m5 - punpckhbw m17, m5, m6 - punpcklbw m5, m6 -.v_loop: - vpermq m18, m26, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpermq m19, m26, [srcq+strideq*0] - pmaddubsw m20, m0, m8 - pmaddubsw m21, m12, m8 - pmaddubsw m22, m1, m8 - pmaddubsw m23, m13, m8 - mova m0, m2 - mova m12, m14 - mova m1, m3 - mova m13, m15 - pmaddubsw m2, m9 - pmaddubsw m14, m9 - pmaddubsw m3, m9 - pmaddubsw m15, m9 - punpckhbw m24, m6, m18 - punpcklbw m6, m18 - paddw m20, m2 - paddw m21, m14 - paddw m22, m3 - paddw m23, m15 - mova m2, m4 - mova m14, m16 - mova m3, m5 - mova m15, m17 - pmaddubsw m4, m10 - pmaddubsw m16, m10 - pmaddubsw m5, m10 - pmaddubsw m17, m10 - punpckhbw m25, m18, m19 - punpcklbw m18, m19 - paddw m20, m4 - paddw m21, m16 - paddw m22, m5 - paddw m23, m17 - mova m4, m6 - mova m16, m24 - mova m5, m18 - mova m17, m25 - pmaddubsw m6, m11 - pmaddubsw m24, m11 - pmaddubsw m18, m11 - pmaddubsw m25, m11 - paddw m20, m6 - paddw m21, m24 - paddw m22, m18 - paddw m23, m25 - pmulhrsw m20, m7 - pmulhrsw m21, m7 - pmulhrsw m22, m7 - pmulhrsw m23, m7 - mova m6, m19 - mova [tmpq+wq*0+ 0], m20 - mova 
[tmpq+wq*0+64], m21 - mova [tmpq+wq*2+ 0], m22 - mova [tmpq+wq*2+64], m23 - lea tmpq, [tmpq+wq*4] - sub hd, 2 - jg .v_loop - movzx hd, r6b - add r5, 64*2 - add r7, 64 - mov tmpq, r5 - mov srcq, r7 - sub r6d, 1<<8 - jg .v_loop0 -%endif - RET -.hv: - %assign stack_offset stack_offset - stack_size_padded - %assign stack_size_padded 0 - WIN64_SPILL_XMM 16 - cmp wd, 4 - je .hv_w4 - shr mxd, 16 - sub srcq, 3 - vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] - vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmove myd, mxd -%if cpuflag(avx512) - tzcnt wd, wd - vpbroadcastd m8, [pd_2] - movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] - vpbroadcastd m9, [pd_32] - add wq, r7 -%endif - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] - lea stride3q, [strideq*3] - sub srcq, stride3q - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - pshufd m12, m0, q0000 - pshufd m13, m0, q1111 - pshufd m14, m0, q2222 - pshufd m15, m0, q3333 -%if cpuflag(avx512) - jmp wq -%else - jmp .hv_w8 -%endif -.hv_w4: - movzx mxd, mxb - dec srcq - vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] - movzx mxd, myb - shr myd, 16 - cmp hd, 4 - cmove myd, mxd - vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] - lea stride3q, [strideq*3] - sub srcq, stride3q -%if cpuflag(avx512) - mov r3d, 0x04 - kmovb k1, r3d - kshiftlb k2, k1, 2 - kshiftlb k3, k1, 4 - vpbroadcastd m10, [pd_2] - vbroadcasti128 m16, [subpel_h_shufA] -%else - mova m7, [subpel_h_shuf4] - pmovzxbd m9, [deint_shuf4] - vpbroadcastd m10, [pw_8192] -%endif - punpcklbw m0, m0 - psraw m0, 8 ; sign-extend - vpbroadcastd m11, [pd_32] - pshufd m12, m0, q0000 - pshufd m13, m0, q1111 - pshufd m14, m0, q2222 - pshufd m15, m0, q3333 -%if cpuflag(avx512icl) - movq xm3, [srcq+strideq*0] - vpbroadcastq ym2, [srcq+strideq*1] - vpbroadcastq ym3{k1}, [srcq+strideq*2] - vpbroadcastq m2{k2}, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m3{k2}, [srcq+strideq*0] - vpbroadcastq m2{k3}, [srcq+strideq*1] - vpbroadcastq m3{k3}, [srcq+strideq*2] - mova m17, [spel_hv_perm4a] - movu m18, [spel_hv_perm4b] - mova m0, m10 - mova m1, m10 - pshufb m2, m16 - pshufb m3, m16 - vpdpbusd m0, m2, m8 - vpdpbusd m1, m3, m8 - packssdw m0, m1 ; _ 0 1 2 3 4 5 6 - psraw m0, 2 - vpermb m1, m17, m0 ; 01 12 23 34 - vpermb m2, m18, m0 ; 23 34 45 56 -.hv_w4_loop: - movq xm3, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - movq xm4, [srcq+strideq*0] - vpbroadcastq ym3{k1}, [srcq+strideq*1] - vpbroadcastq ym4{k1}, [srcq+strideq*2] - mova ym5, ym10 - mova ym6, ym10 - pshufb ym3, ym16 - pshufb ym4, ym16 - vpdpbusd ym5, ym3, ym8 - vpdpbusd ym6, ym4, ym8 - mova m7, m11 - packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ - psraw ym5, 2 - valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a - vpdpwssd m7, m1, m12 - vpdpwssd m7, m2, m13 - vpermb m1, m17, m0 ; 45 56 67 78 - vpermb m2, m18, m0 ; 67 78 89 9a - vpdpwssd m7, m1, m14 - vpdpwssd m7, m2, m15 - psrad m7, 6 - vpmovdw [tmpq], m7 -%else - vpbroadcastq m2, [srcq+strideq*0] - vpbroadcastq m4, [srcq+strideq*1] - vpbroadcastq m0, [srcq+strideq*2] - vpbroadcastq m5, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vpbroadcastq m3, [srcq+strideq*0] - vpbroadcastq m6, [srcq+strideq*1] - vpbroadcastq m1, [srcq+strideq*2] - vpblendd m2, m2, m4, 0xcc ; 0 1 - vpblendd m0, m0, m5, 0xcc ; 2 3 - vpblendd m3, m3, m6, 0xcc ; 4 5 - pshufb m2, m7 ; 00 01 10 11 02 03 12 13 - pshufb m0, m7 ; 20 21 30 31 22 23 32 33 - pshufb m3, m7 ; 40 41 50 51 42 43 52 53 - pshufb m1, m7 ; 60 61 60 61 62 63 62 63 - pmaddubsw m2, m8 
- pmaddubsw m0, m8 - pmaddubsw m3, m8 - pmaddubsw m1, m8 - phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b - phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ - pmulhrsw m2, m10 - pmulhrsw m3, m10 - palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b - punpcklwd m1, m2, m4 ; 01 12 - punpckhwd m2, m4 ; 23 34 - pshufd m0, m3, q2121 - punpcklwd m3, m0 ; 45 56 -.hv_w4_loop: - pmaddwd m5, m1, m12 ; a0 b0 - pmaddwd m6, m2, m12 ; c0 d0 - pmaddwd m2, m13 ; a1 b1 - pmaddwd m4, m3, m13 ; c1 d1 - mova m1, m3 - pmaddwd m3, m14 ; a2 b2 - paddd m5, m2 - vpbroadcastq m2, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - paddd m6, m4 - paddd m5, m3 - vpbroadcastq m4, [srcq+strideq*0] - vpbroadcastq m3, [srcq+strideq*1] - vpblendd m2, m2, m4, 0xcc - vpbroadcastq m4, [srcq+strideq*2] - vpblendd m3, m3, m4, 0xcc - pshufb m2, m7 - pshufb m3, m7 - pmaddubsw m2, m8 - pmaddubsw m3, m8 - phaddw m2, m3 - pmulhrsw m2, m10 - palignr m3, m2, m0, 12 - mova m0, m2 - punpcklwd m2, m3, m0 ; 67 78 - punpckhwd m3, m0 ; 89 9a - pmaddwd m4, m2, m14 ; c2 d2 - paddd m6, m11 - paddd m5, m11 - paddd m6, m4 - pmaddwd m4, m2, m15 ; a3 b3 - paddd m5, m4 - pmaddwd m4, m3, m15 ; c3 d3 - paddd m6, m4 - psrad m5, 6 - psrad m6, 6 - packssdw m5, m6 - vpermd m5, m9, m5 - mova [tmpq], m5 -%endif - add tmpq, 32 - sub hd, 4 - jg .hv_w4_loop -%if cpuflag(avx512) - vzeroupper -%endif - RET -.hv_w8: -%if cpuflag(avx512icl) - WIN64_SPILL_XMM 24 - vbroadcasti128 m16, [subpel_h_shufA] - vbroadcasti128 m17, [subpel_h_shufB] - vbroadcasti128 m18, [subpel_h_shufC] - vinserti128 ym0, [srcq+strideq*0], 1 - vinserti128 m0, [srcq+strideq*1], 2 - vinserti128 m0, [srcq+strideq*2], 3 - movu xm1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vinserti128 ym1, [srcq+strideq*0], 1 - vinserti128 m1, [srcq+strideq*1], 2 - vinserti128 m1, [srcq+strideq*2], 3 - mova m2, m8 - mova m4, m8 - mova m3, m8 - mova m5, m8 - pshufb m20, m0, m16 - pshufb m21, m0, m17 - pshufb m22, m0, m18 - pshufb m23, m1, m16 - pshufb m6, m1, m17 - pshufb m7, m1, m18 - vpdpbusd m2, m20, m10 - vpdpbusd m4, m21, m10 - vpdpbusd m2, m21, m11 - vpdpbusd m4, m22, m11 - vpdpbusd m3, m23, m10 - vpdpbusd m5, m6, m10 - vpdpbusd m3, m6, m11 - vpdpbusd m5, m7, m11 - packssdw m2, m4 - packssdw m3, m5 - psraw m2, 2 ; _ 0 1 2 - psraw m3, 2 ; 3 4 5 6 - valignq m0, m3, m2, 2 ; 0 1 2 3 - valignq m1, m3, m2, 4 ; 1 2 3 4 - valignq m2, m3, m2, 6 ; 2 3 4 5 - punpcklwd m4, m0, m1 ; 01a 12a 23a 34a - punpckhwd m5, m0, m1 ; 01b 12b 23b 34b - punpcklwd m6, m2, m3 ; 23a 34a 45a 56a - punpckhwd m7, m2, m3 ; 23b 34b 45b 56b -.hv_w8_loop: - movu xm19, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - vinserti128 ym19, [srcq+strideq*0], 1 - vinserti128 m19, [srcq+strideq*1], 2 - vinserti128 m19, [srcq+strideq*2], 3 - mova m20, m9 - mova m21, m9 - mova m22, m8 - mova m23, m8 - vpdpwssd m20, m4, m12 - vpdpwssd m21, m5, m12 - vpdpwssd m20, m6, m13 - vpdpwssd m21, m7, m13 - pshufb m0, m19, m16 - pshufb m1, m19, m17 - pshufb m2, m19, m18 - vpdpbusd m22, m0, m10 - vpdpbusd m23, m1, m10 - vpdpbusd m22, m1, m11 - vpdpbusd m23, m2, m11 - packssdw m22, m23 - psraw m22, 2 ; 7 8 9 A - valignq m0, m22, m3, 2 ; 4 5 6 7 - valignq m1, m22, m3, 4 ; 5 6 7 8 - valignq m2, m22, m3, 6 ; 6 7 8 9 - mova m3, m22 - punpcklwd m4, m0, m1 ; 45a 56a 67a 78a - punpckhwd m5, m0, m1 ; 45b 56b 67b 78b - punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa - punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 - psrad m20, 6 - psrad m21, 6 - packssdw m20, m21 - mova [tmpq], m20 - add tmpq, 64 - sub hd, 4 - jg 
.hv_w8_loop -%else - lea r6d, [wq-8] - mov r5, tmpq - mov r7, srcq - shl r6d, 5 - mov r6b, hb -.hv_w8_loop0: - vbroadcasti128 m7, [subpel_h_shufA] - vbroadcasti128 m8, [subpel_h_shufB] - vbroadcasti128 m9, [subpel_h_shufC] - movu xm4, [srcq+strideq*0] - movu xm5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movu xm6, [srcq+strideq*0] - vbroadcasti128 m0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vpblendd m4, m4, m0, 0xf0 ; 0 3 - vinserti128 m5, m5, [srcq+strideq*0], 1 ; 1 4 - vinserti128 m6, m6, [srcq+strideq*1], 1 ; 2 5 - lea srcq, [srcq+strideq*2] - vinserti128 m0, m0, [srcq+strideq*0], 1 ; 3 6 - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 - vpbroadcastd m7, [pw_8192] - vpermq m4, m4, q3120 - vpermq m5, m5, q3120 - vpermq m6, m6, q3120 - pmulhrsw m0, m7 - pmulhrsw m4, m7 - pmulhrsw m5, m7 - pmulhrsw m6, m7 - vpermq m7, m0, q3120 - punpcklwd m1, m4, m5 ; 01 - punpckhwd m4, m5 ; 34 - punpcklwd m2, m5, m6 ; 12 - punpckhwd m5, m6 ; 45 - punpcklwd m3, m6, m7 ; 23 - punpckhwd m6, m7 ; 56 -.hv_w8_loop: - vextracti128 [tmpq], m0, 1 ; not enough registers - movu xm0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti128 m0, m0, [srcq+strideq*0], 1 ; 7 8 - pmaddwd m8, m1, m12 ; a0 - pmaddwd m9, m2, m12 ; b0 - mova m1, m3 - mova m2, m4 - pmaddwd m3, m13 ; a1 - pmaddwd m4, m13 ; b1 - paddd m8, m3 - paddd m9, m4 - mova m3, m5 - mova m4, m6 - pmaddwd m5, m14 ; a2 - pmaddwd m6, m14 ; b2 - paddd m8, m5 - paddd m9, m6 - vbroadcasti128 m6, [subpel_h_shufB] - vbroadcasti128 m7, [subpel_h_shufC] - vbroadcasti128 m5, [subpel_h_shufA] - HV_H_W8 m0, m5, m6, m7, m5, m6, m7 - vpbroadcastd m5, [pw_8192] - vpbroadcastd m7, [pd_32] - vbroadcasti128 m6, [tmpq] - pmulhrsw m0, m5 - paddd m8, m7 - paddd m9, m7 - vpermq m7, m0, q3120 ; 7 8 - shufpd m6, m6, m7, 0x04 ; 6 7 - punpcklwd m5, m6, m7 ; 67 - punpckhwd m6, m7 ; 78 - pmaddwd m7, m5, m15 ; a3 - paddd m8, m7 - pmaddwd m7, m6, m15 ; b3 - paddd m7, m9 - psrad m8, 6 - psrad m7, 6 - packssdw m8, m7 - vpermq m7, m8, q3120 - mova [tmpq+wq*0], xm7 - vextracti128 [tmpq+wq*2], m7, 1 - lea tmpq, [tmpq+wq*4] - sub hd, 2 - jg .hv_w8_loop - movzx hd, r6b - add r5, 16 - add r7, 8 - mov tmpq, r5 - mov srcq, r7 - sub r6d, 1<<8 - jg .hv_w8_loop0 -%endif - RET -%if cpuflag(avx512icl) -.hv_w16: - mov wd, 16*2 - jmp .hv_start -.hv_w32: - mov wd, 32*2 - jmp .hv_start -.hv_w64: - mov wd, 64*2 - jmp .hv_start -.hv_w128: - mov wd, 128*2 -.hv_start: - WIN64_SPILL_XMM 31 - mova m16, [spel_h_perm16a] - mova m17, [spel_h_perm16b] - mova m18, [spel_h_perm16c] - lea r6d, [wq*8-16*2*8+hq] - mov r5, tmpq - mov r7, srcq -.hv_loop0: - movu ym0, [srcq+strideq*0] - vinserti32x8 m0, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym1, [srcq+strideq*0] - vinserti32x8 m1, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym2, [srcq+strideq*0] - vinserti32x8 m2, [srcq+strideq*1], 1 - lea srcq, [srcq+strideq*2] - movu ym3, [srcq+strideq*0] - mova m4, m8 - mova m5, m8 - mova m6, m8 - mova m7, m8 - vpermb m19, m16, m0 - vpermb m20, m17, m0 - vpermb m21, m18, m0 - vpermb m22, m16, m1 - vpermb m23, m17, m1 - vpermb m24, m18, m1 - vpermb m25, m16, m2 - vpermb m26, m17, m2 - vpermb m27, m18, m2 - vpermb ym28, ym16, ym3 - vpermb ym29, ym17, ym3 - vpermb ym30, ym18, ym3 - mova m0, m8 - mova m1, m8 - mova ym2, ym8 - mova ym3, ym8 - vpdpbusd m4, m19, m10 - vpdpbusd m5, m20, m10 - vpdpbusd m6, m22, m10 - vpdpbusd m7, m23, m10 - vpdpbusd m0, m25, m10 - vpdpbusd m1, m26, m10 - vpdpbusd 
ym2, ym28, ym10 - vpdpbusd ym3, ym29, ym10 - vpdpbusd m4, m20, m11 - vpdpbusd m5, m21, m11 - vpdpbusd m6, m23, m11 - vpdpbusd m7, m24, m11 - vpdpbusd m0, m26, m11 - vpdpbusd m1, m27, m11 - vpdpbusd ym2, ym29, ym11 - vpdpbusd ym3, ym30, ym11 - packssdw m4, m5 - packssdw m6, m7 - packssdw m0, m1 - packssdw ym2, ym3 - psraw m4, 2 ; 0a 0b 1a 1b - psraw m6, 2 ; 2a 2b 3a 3b - psraw m0, 2 ; 4a 4b 5a 5b - psraw ym2, 2 ; 6a 6b __ __ - vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b - vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b - vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b - punpcklwd m2, m4, m5 ; 01a 01c 12a 12c - punpckhwd m3, m4, m5 ; 01b 01d 12b 12d - punpcklwd m4, m6, m7 ; 23a 23c 34a 34c - punpckhwd m5, m6, m7 ; 23b 23d 34b 34d - punpcklwd m6, m0, m1 ; 45a 45c 56a 56c - punpckhwd m7, m0, m1 ; 45b 45d 56b 56d -.hv_loop: - movu ym19, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - vinserti32x8 m19, [srcq+strideq*0], 1 - mova m20, m9 - mova m21, m9 - mova m22, m8 - mova m23, m8 - vpdpwssd m20, m2, m12 - vpdpwssd m21, m3, m12 - vpdpwssd m20, m4, m13 - vpdpwssd m21, m5, m13 - vpermb m24, m16, m19 - vpermb m25, m17, m19 - vpermb m26, m18, m19 - vpdpbusd m22, m24, m10 - vpdpbusd m23, m25, m10 - vpdpbusd m22, m25, m11 - vpdpbusd m23, m26, m11 - packssdw m22, m23 - psraw m22, 2 ; 7a 7b 8a 8b - vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b - mova m2, m4 - mova m3, m5 - mova m1, m22 - mova m4, m6 - mova m5, m7 - punpcklwd m6, m0, m1 ; 67a 67c 78a 78c - punpckhwd m7, m0, m1 ; 67b 67d 78b 78d - vpdpwssd m20, m4, m14 - vpdpwssd m21, m5, m14 - vpdpwssd m20, m6, m15 - vpdpwssd m21, m7, m15 - psrad m20, 6 - psrad m21, 6 - packssdw m20, m21 - mova [tmpq+wq*0], ym20 - vextracti32x8 [tmpq+wq*1], m20, 1 - lea tmpq, [tmpq+wq*2] - sub hd, 2 - jg .hv_loop - movzx hd, r6b - add r5, 32 - add r7, 16 - mov tmpq, r5 - mov srcq, r7 - sub r6d, 1<<8 - jg .hv_loop0 -%endif - RET -%endmacro - -%macro movifprep 2 - %if isprep - mov %1, %2 - %endif -%endmacro - -%macro REMAP_REG 2 - %xdefine r%1 r%2 - %xdefine r%1q r%2q - %xdefine r%1d r%2d -%endmacro - -%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 - %if isprep - %xdefine r14_save r14 - %assign %%i 14 - %rep 14 - %assign %%j %%i-1 - REMAP_REG %%i, %%j - %assign %%i %%i-1 - %endrep - %endif -%endmacro - -%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 - %if isprep - %assign %%i 1 - %rep 13 - %assign %%j %%i+1 - REMAP_REG %%i, %%j - %assign %%i %%i+1 - %endrep - %xdefine r14 r14_save - %undef r14_save - %endif -%endmacro - -%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged - MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT - RET - %if %1 - MCT_8TAP_SCALED_REMAP_REGS_TO_PREV - %endif -%endmacro - -%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] - movq xm%1, [srcq+ r4] - movq xm%2, [srcq+ r6] - movhps xm%1, [srcq+ r7] - movhps xm%2, [srcq+ r9] - vinserti128 m%1, [srcq+r10], 1 - vinserti128 m%2, [srcq+r11], 1 - vpbroadcastq m%5, [srcq+r13] - vpbroadcastq m%6, [srcq+ rX] - add srcq, ssq - movq xm%3, [srcq+ r4] - movq xm%4, [srcq+ r6] - movhps xm%3, [srcq+ r7] - movhps xm%4, [srcq+ r9] - vinserti128 m%3, [srcq+r10], 1 - vinserti128 m%4, [srcq+r11], 1 - vpbroadcastq m%7, [srcq+r13] - vpbroadcastq m%8, [srcq+ rX] - add srcq, ssq - vpblendd m%1, m%5, 0xc0 - vpblendd m%2, m%6, 0xc0 - vpblendd m%3, m%7, 0xc0 - vpblendd m%4, m%8, 0xc0 - pmaddubsw m%1, m15 - pmaddubsw m%2, m10 - pmaddubsw m%3, m15 - pmaddubsw m%4, m10 - phaddw m%1, m%2 - phaddw m%3, m%4 - phaddw m%1, m%3 - pmulhrsw m%1, m12 -%endmacro - -%macro MC_8TAP_SCALED 1 -%ifidn %1, put - %assign isprep 0 - %if required_stack_alignment <= STACK_ALIGNMENT 
-cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy - %else -cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy - %endif - %xdefine base_reg r12 - %define rndshift 10 -%else - %assign isprep 1 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy - %xdefine tmp_stridem r14q - %else -cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy - %define tmp_stridem qword [rsp+104] - %endif - %xdefine base_reg r11 - %define rndshift 6 -%endif - lea base_reg, [%1_8tap_scaled_avx2] -%define base base_reg-%1_8tap_scaled_avx2 - tzcnt wd, wm - vpbroadcastd m8, dxm -%if isprep && UNIX64 - movd xm14, mxd - vpbroadcastd m14, xm14 - mov r5d, t0d - DECLARE_REG_TMP 5, 7 -%else - vpbroadcastd m14, mxm -%endif - mov dyd, dym -%ifidn %1, put - %if WIN64 - mov r8d, hm - DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 - %define hm r5m - %define dxm r8m - %else - DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 - %define hm r6m - %endif - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+96] - %define rX r1 - %define rXd r1d - %else - %define dsm dsq - %define rX r14 - %define rXd r14d - %endif -%else ; prep - %if WIN64 - mov r7d, hm - DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 - %define hm r4m - %define dxm r7m - %else - DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 - %define hm [rsp+96] - %endif - MCT_8TAP_SCALED_REMAP_REGS_TO_PREV - %define rX r14 - %define rXd r14d -%endif - vpbroadcastd m10, [base+pd_0x3ff] - vpbroadcastd m12, [base+pw_8192] -%ifidn %1, put - vpbroadcastd m13, [base+pd_512] -%else - vpbroadcastd m13, [base+pd_32] -%endif - pxor m9, m9 - lea ss3q, [ssq*3] - movzx r7d, t1b - shr t1d, 16 - cmp hd, 6 - cmovs t1d, r7d - sub srcq, ss3q - cmp dyd, 1024 - je .dy1 - cmp dyd, 2048 - je .dy2 - movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] - add wq, base_reg - jmp wq -%ifidn %1, put -.w2: - mov myd, mym - movzx t0d, t0b - dec srcq - movd xm15, t0d - punpckldq m8, m9, m8 - paddd m14, m8 ; mx+dx*[0-1] - vpbroadcastd m11, [base+pd_0x4000] - vpbroadcastd xm15, xm15 - pand m8, m14, m10 - psrld m8, 6 - paddd xm15, xm8 - movd r4d, xm15 - pextrd r6d, xm15, 1 - vbroadcasti128 m5, [base+bdct_lb_dw] - vbroadcasti128 m6, [base+subpel_s_shuf2] - vpbroadcastd m15, [base+subpel_filters+r4*8+2] - vpbroadcastd m7, [base+subpel_filters+r6*8+2] - pcmpeqd m8, m9 - psrld m14, 10 - movq xm0, [srcq+ssq*0] - movq xm1, [srcq+ssq*2] - movhps xm0, [srcq+ssq*1] - movhps xm1, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - pshufb m14, m5 - paddb m14, m6 - vinserti128 m0, [srcq+ssq*0], 1 - vinserti128 m1, [srcq+ssq*2], 1 - vpbroadcastq m2, [srcq+ssq*1] - vpbroadcastq m3, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - vpblendd m15, m7, 0xaa - vpblendd m0, m2, 0xc0 ; 0 1 4 5 - vpblendd m1, m3, 0xc0 ; 2 3 6 7 - pblendvb m15, m11, m8 - pshufb m0, m14 - pshufb m1, m14 - pmaddubsw m0, m15 - pmaddubsw m1, m15 - phaddw m0, m1 - pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 - vextracti128 xm1, m0, 1 ; 4 5 6 7 - palignr xm2, xm1, xm0, 4 ; 1 2 3 4 - punpcklwd xm3, xm0, xm2 ; 01 12 - punpckhwd xm0, xm2 ; 23 34 - pshufd xm4, xm1, q0321 ; 5 6 7 _ - punpcklwd xm2, xm1, xm4 ; 45 56 - punpckhwd xm4, xm1, xm4 ; 67 __ -.w2_loop: - and myd, 0x3ff - mov r6d, 64 << 24 - mov r4d, myd - shr r4d, 6 - lea r4d, [t1+r4] - cmovnz r6q, [base+subpel_filters+r4*8] - movq xm11, r6q - punpcklbw xm11, xm11 - psraw xm11, 8 - pshufd xm8, xm11, q0000 - pshufd xm9, xm11, q1111 - pshufd xm10, xm11, 
q2222 - pshufd xm11, xm11, q3333 - pmaddwd xm5, xm3, xm8 - pmaddwd xm6, xm0, xm9 - pmaddwd xm7, xm2, xm10 - pmaddwd xm8, xm4, xm11 - paddd xm5, xm6 - paddd xm7, xm8 - paddd xm5, xm13 - paddd xm5, xm7 - psrad xm5, 10 - packssdw xm5, xm5 - packuswb xm5, xm5 - pextrw [dstq], xm5, 0 - add dstq, dsq - dec hd - jz .ret - add myd, dyd - test myd, ~0x3ff - jz .w2_loop - movq xm5, [srcq] - test myd, 0x400 - jz .w2_skip_line - add srcq, ssq - shufps xm3, xm0, q1032 ; 01 12 - shufps xm0, xm2, q1032 ; 23 34 - shufps xm2, xm4, q1032 ; 45 56 - pshufb xm5, xm14 - pmaddubsw xm5, xm15 - phaddw xm5, xm5 - pmulhrsw xm5, xm12 - palignr xm1, xm5, xm1, 12 - punpcklqdq xm1, xm1 ; 6 7 6 7 - punpcklwd xm4, xm1, xm5 ; 67 __ - jmp .w2_loop -.w2_skip_line: - movhps xm5, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - mova xm3, xm0 ; 01 12 - mova xm0, xm2 ; 23 34 - pshufb xm5, xm14 - pmaddubsw xm5, xm15 - phaddw xm5, xm5 - pmulhrsw xm5, xm12 ; 6 7 6 7 - palignr xm1, xm5, xm1, 8 ; 4 5 6 7 - pshufd xm5, xm1, q0321 ; 5 6 7 _ - punpcklwd xm2, xm1, xm5 ; 45 56 - punpckhwd xm4, xm1, xm5 ; 67 __ - jmp .w2_loop -%endif -.w4: - mov myd, mym - vbroadcasti128 m7, [base+rescale_mul] - movzx t0d, t0b - dec srcq - movd xm15, t0d - pmaddwd m8, m7 - vpbroadcastd m11, [base+pd_0x4000] - vpbroadcastd xm15, xm15 - paddd m14, m8 ; mx+dx*[0-3] - pand m0, m14, m10 - psrld m0, 6 - paddd xm15, xm0 - movd r4d, xm15 - pextrd r6d, xm15, 1 - pextrd r11d, xm15, 2 - pextrd r13d, xm15, 3 - movd xm15, [base+subpel_filters+r4*8+2] - vbroadcasti128 m5, [base+bdct_lb_dw] - vpbroadcastq m6, [base+subpel_s_shuf2] - pinsrd xm15, [base+subpel_filters+r6*8+2], 1 - pcmpeqd m0, m9 - psrld m14, 10 - movu xm7, [srcq+ssq*0] - movu xm9, [srcq+ssq*1] - pinsrd xm15, [base+subpel_filters+r11*8+2], 2 - movu xm8, [srcq+ssq*2] - movu xm10, [srcq+ss3q ] - pinsrd xm15, [base+subpel_filters+r13*8+2], 3 - lea srcq, [srcq+ssq*4] - pshufb m14, m5 - paddb m14, m6 - vinserti128 m7, [srcq+ssq*0], 1 - vinserti128 m9, [srcq+ssq*1], 1 - vinserti128 m15, xm15, 1 - vinserti128 m8, [srcq+ssq*2], 1 - vinserti128 m10, [srcq+ss3q ], 1 - lea srcq, [srcq+ssq*4] - pblendvb m15, m11, m0 - pshufb m7, m14 - pshufb m9, m14 - pshufb m8, m14 - pshufb m10, m14 - pmaddubsw m7, m15 - pmaddubsw m9, m15 - pmaddubsw m8, m15 - pmaddubsw m10, m15 - phaddw m7, m9 - phaddw m8, m10 - pmulhrsw m7, m12 ; 0 1 4 5 - pmulhrsw m8, m12 ; 2 3 6 7 - vextracti128 xm9, m7, 1 ; 4 5 - vextracti128 xm3, m8, 1 ; 6 7 - shufps xm4, xm7, xm8, q1032 ; 1 2 - shufps xm5, xm8, xm9, q1032 ; 3 4 - shufps xm6, xm9, xm3, q1032 ; 5 6 - psrldq xm11, xm3, 8 ; 7 _ - punpcklwd xm0, xm7, xm4 ; 01 - punpckhwd xm7, xm4 ; 12 - punpcklwd xm1, xm8, xm5 ; 23 - punpckhwd xm8, xm5 ; 34 - punpcklwd xm2, xm9, xm6 ; 45 - punpckhwd xm9, xm6 ; 56 - punpcklwd xm3, xm11 ; 67 - mova [rsp+0x00], xm7 - mova [rsp+0x10], xm8 - mova [rsp+0x20], xm9 -.w4_loop: - and myd, 0x3ff - mov r6d, 64 << 24 - mov r4d, myd - shr r4d, 6 - lea r4d, [t1+r4] - cmovnz r6q, [base+subpel_filters+r4*8] - movq xm10, r6q - punpcklbw xm10, xm10 - psraw xm10, 8 - pshufd xm7, xm10, q0000 - pshufd xm8, xm10, q1111 - pshufd xm9, xm10, q2222 - pshufd xm10, xm10, q3333 - pmaddwd xm4, xm0, xm7 - pmaddwd xm5, xm1, xm8 - pmaddwd xm6, xm2, xm9 - pmaddwd xm7, xm3, xm10 - paddd xm4, xm5 - paddd xm6, xm7 - paddd xm4, xm13 - paddd xm4, xm6 - psrad xm4, rndshift - packssdw xm4, xm4 -%ifidn %1, put - packuswb xm4, xm4 - movd [dstq], xm4 - add dstq, dsq -%else - movq [tmpq], xm4 - add tmpq, 8 -%endif - dec hd - jz .ret - add myd, dyd - test myd, ~0x3ff - jz .w4_loop - movu xm4, [srcq] - test myd, 0x400 - 
jz .w4_skip_line - mova xm0, [rsp+0x00] - mova [rsp+0x00], xm1 - mova xm1, [rsp+0x10] - mova [rsp+0x10], xm2 - mova xm2, [rsp+0x20] - mova [rsp+0x20], xm3 - pshufb xm4, xm14 - pmaddubsw xm4, xm15 - phaddw xm4, xm4 - pmulhrsw xm4, xm12 - punpcklwd xm3, xm11, xm4 - mova xm11, xm4 - add srcq, ssq - jmp .w4_loop -.w4_skip_line: - movu xm5, [srcq+ssq*1] - movu m6, [rsp+0x10] - pshufb xm4, xm14 - pshufb xm5, xm14 - pmaddubsw xm4, xm15 - pmaddubsw xm5, xm15 - movu [rsp+0x00], m6 - phaddw xm4, xm5 - pmulhrsw xm4, xm12 - punpcklwd xm9, xm11, xm4 - mova [rsp+0x20], xm9 - psrldq xm11, xm4, 8 - mova xm0, xm1 - mova xm1, xm2 - mova xm2, xm3 - punpcklwd xm3, xm4, xm11 - lea srcq, [srcq+ssq*2] - jmp .w4_loop -.w8: -%ifidn %1, put - movifnidn dsm, dsq -%endif - shr t0d, 16 - sub srcq, 3 - movd xm15, t0d - pmaddwd m8, [base+rescale_mul] - vpbroadcastq m11, [base+pq_0x40000000] - vpbroadcastd m15, xm15 - paddd m14, m8 ; mx+dx*[0-7] - pand m6, m14, m10 - psrld m6, 6 - paddd m15, m6 - pcmpeqd m6, m9 - vextracti128 xm7, m15, 1 - movd r4d, xm15 - pextrd r6d, xm15, 2 - pextrd r7d, xm15, 1 - pextrd r9d, xm15, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - movq xm15, [base+subpel_filters+r4*8] - movq xm10, [base+subpel_filters+r6*8] - movhps xm15, [base+subpel_filters+r7*8] - movhps xm10, [base+subpel_filters+r9*8] - vinserti128 m15, [base+subpel_filters+r10*8], 1 - vinserti128 m10, [base+subpel_filters+r11*8], 1 - vpbroadcastq m9, [base+subpel_filters+r13*8] - vpbroadcastq m8, [base+subpel_filters+rX*8] - psrld m14, 10 - mova [rsp], xm14 - vextracti128 xm7, m14, 1 - movd r4d, xm14 - pextrd r6d, xm14, 2 - pextrd r7d, xm14, 1 - pextrd r9d, xm14, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - pshufd m5, m6, q1100 - pshufd m6, m6, q3322 - vpblendd m15, m9, 0xc0 - vpblendd m10, m8, 0xc0 - pblendvb m15, m11, m5 - pblendvb m10, m11, m6 - vbroadcasti128 m14, [base+subpel_s_shuf8] - MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b - MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b - MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b - MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b - mov myd, mym - mov dyd, dym - pshufb m0, m14 ; 01a 01b - pshufb m1, m14 ; 23a 23b - pshufb m2, m14 ; 45a 45b - pshufb m3, m14 ; 67a 67b - vbroadcasti128 m14, [base+wswap] -.w8_loop: - and myd, 0x3ff - mov r6d, 64 << 24 - mov r4d, myd - shr r4d, 6 - lea r4d, [t1+r4] - cmovnz r6q, [base+subpel_filters+r4*8] - movq xm11, r6q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 - pshufd m8, m11, q0000 - pshufd m9, m11, q1111 - pmaddwd m4, m0, m8 - pmaddwd m5, m1, m9 - pshufd m8, m11, q2222 - pshufd m11, m11, q3333 - pmaddwd m6, m2, m8 - pmaddwd m7, m3, m11 - paddd m4, m5 - paddd m6, m7 - paddd m4, m13 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - movq [dstq], xm4 - add dstq, dsm -%else - mova [tmpq], xm4 - add tmpq, 16 -%endif - dec hd - jz .ret - add myd, dyd - test myd, ~0x3ff - jz .w8_loop - test myd, 0x400 - mov [rsp+16], myd - mov r4d, [rsp+ 0] - mov r6d, [rsp+ 8] - mov r7d, [rsp+ 4] - mov r9d, [rsp+12] - jz .w8_skip_line - vpbroadcastq m6, [srcq+r13] - vpbroadcastq m7, [srcq+ rX] - movq xm4, [srcq+ r4] - movq xm5, [srcq+ r6] - movhps xm4, [srcq+ r7] - movhps xm5, [srcq+ r9] - vinserti128 m4, [srcq+r10], 1 - vinserti128 m5, [srcq+r11], 1 - add srcq, ssq - mov myd, [rsp+16] - mov dyd, dym - pshufb m0, m14 - pshufb m1, m14 - pshufb m2, m14 - pshufb m3, m14 - 
vpblendd m4, m6, 0xc0 - vpblendd m5, m7, 0xc0 - pmaddubsw m4, m15 - pmaddubsw m5, m10 - phaddw m4, m5 - pslld m5, m4, 16 - paddw m4, m5 - pmulhrsw m4, m12 - pblendw m0, m1, 0xaa - pblendw m1, m2, 0xaa - pblendw m2, m3, 0xaa - pblendw m3, m4, 0xaa - jmp .w8_loop -.w8_skip_line: - mova m0, m1 - mova m1, m2 - mova m2, m3 - vpbroadcastq m7, [srcq+r13] - vpbroadcastq m8, [srcq+ rX] - movq xm3, [srcq+ r4] - movq xm4, [srcq+ r6] - movhps xm3, [srcq+ r7] - movhps xm4, [srcq+ r9] - vinserti128 m3, [srcq+r10], 1 - vinserti128 m4, [srcq+r11], 1 - add srcq, ssq - movq xm5, [srcq+ r4] - movq xm6, [srcq+ r6] - movhps xm5, [srcq+ r7] - movhps xm6, [srcq+ r9] - vinserti128 m5, [srcq+r10], 1 - vinserti128 m6, [srcq+r11], 1 - vpbroadcastq m9, [srcq+r13] - vpbroadcastq m11, [srcq+ rX] - add srcq, ssq - mov myd, [rsp+16] - mov dyd, dym - vpblendd m3, m7, 0xc0 - vpblendd m4, m8, 0xc0 - vpblendd m5, m9, 0xc0 - vpblendd m6, m11, 0xc0 - pmaddubsw m3, m15 - pmaddubsw m4, m10 - pmaddubsw m5, m15 - pmaddubsw m6, m10 - phaddw m3, m4 - phaddw m5, m6 - psrld m4, m3, 16 - pslld m6, m5, 16 - paddw m3, m4 - paddw m5, m6 - pblendw m3, m5, 0xaa - pmulhrsw m3, m12 - jmp .w8_loop -.w16: - mov dword [rsp+48], 2 - movifprep tmp_stridem, 32 - jmp .w_start -.w32: - mov dword [rsp+48], 4 - movifprep tmp_stridem, 64 - jmp .w_start -.w64: - mov dword [rsp+48], 8 - movifprep tmp_stridem, 128 - jmp .w_start -.w128: - mov dword [rsp+48], 16 - movifprep tmp_stridem, 256 -.w_start: -%ifidn %1, put - movifnidn dsm, dsq -%endif - shr t0d, 16 - sub srcq, 3 - pmaddwd m8, [base+rescale_mul] - movd xm15, t0d - mov [rsp+72], t0d - mov [rsp+56], srcq - mov [rsp+64], r0q ; dstq / tmpq -%if UNIX64 - mov hm, hd -%endif - shl dword dxm, 3 ; dx*8 - vpbroadcastd m15, xm15 - paddd m14, m8 ; mx+dx*[0-7] - jmp .hloop -.hloop_prep: - dec dword [rsp+48] - jz .ret - add qword [rsp+64], 8*(isprep+1) - mov hd, hm - vpbroadcastd m8, dxm - vpbroadcastd m10, [base+pd_0x3ff] - paddd m14, m8, [rsp+16] - vpbroadcastd m15, [rsp+72] - pxor m9, m9 - mov srcq, [rsp+56] - mov r0q, [rsp+64] ; dstq / tmpq -.hloop: - vpbroadcastq m11, [base+pq_0x40000000] - pand m6, m14, m10 - psrld m6, 6 - paddd m15, m6 - pcmpeqd m6, m9 - vextracti128 xm7, m15, 1 - movd r4d, xm15 - pextrd r6d, xm15, 2 - pextrd r7d, xm15, 1 - pextrd r9d, xm15, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - movu [rsp+16], m14 - movq xm15, [base+subpel_filters+ r4*8] - movq xm10, [base+subpel_filters+ r6*8] - movhps xm15, [base+subpel_filters+ r7*8] - movhps xm10, [base+subpel_filters+ r9*8] - vinserti128 m15, [base+subpel_filters+r10*8], 1 - vinserti128 m10, [base+subpel_filters+r11*8], 1 - vpbroadcastq m9, [base+subpel_filters+r13*8] - vpbroadcastq m8, [base+subpel_filters+ rX*8] - psrld m14, 10 - vextracti128 xm7, m14, 1 - mova [rsp], xm14 - movd r4d, xm14 - pextrd r6d, xm14, 2 - pextrd r7d, xm14, 1 - pextrd r9d, xm14, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - pshufd m5, m6, q1100 - pshufd m6, m6, q3322 - vpblendd m15, m9, 0xc0 - vpblendd m10, m8, 0xc0 - pblendvb m15, m11, m5 - pblendvb m10, m11, m6 - vbroadcasti128 m14, [base+subpel_s_shuf8] - MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b - MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b - MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b - MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b - mov myd, mym - mov dyd, dym - pshufb m0, m14 ; 01a 01b - pshufb m1, m14 ; 23a 23b - pshufb m2, m14 ; 45a 45b - pshufb m3, m14 ; 67a 67b - vbroadcasti128 m14, 
[base+wswap] -.vloop: - and myd, 0x3ff - mov r6d, 64 << 24 - mov r4d, myd - shr r4d, 6 - lea r4d, [t1+r4] - cmovnz r6q, [base+subpel_filters+r4*8] - movq xm11, r6q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 - pshufd m8, m11, q0000 - pshufd m9, m11, q1111 - pmaddwd m4, m0, m8 - pmaddwd m5, m1, m9 - pshufd m8, m11, q2222 - pshufd m11, m11, q3333 - pmaddwd m6, m2, m8 - pmaddwd m7, m3, m11 - paddd m4, m5 - paddd m6, m7 - paddd m4, m13 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - movq [dstq], xm4 - add dstq, dsm -%else - mova [tmpq], xm4 - add tmpq, tmp_stridem -%endif - dec hd - jz .hloop_prep - add myd, dyd - test myd, ~0x3ff - jz .vloop - test myd, 0x400 - mov [rsp+52], myd - mov r4d, [rsp+ 0] - mov r6d, [rsp+ 8] - mov r7d, [rsp+ 4] - mov r9d, [rsp+12] - jz .skip_line - vpbroadcastq m6, [srcq+r13] - vpbroadcastq m7, [srcq+ rX] - movq xm4, [srcq+ r4] - movq xm5, [srcq+ r6] - movhps xm4, [srcq+ r7] - movhps xm5, [srcq+ r9] - vinserti128 m4, [srcq+r10], 1 - vinserti128 m5, [srcq+r11], 1 - add srcq, ssq - mov myd, [rsp+52] - mov dyd, dym - pshufb m0, m14 - pshufb m1, m14 - pshufb m2, m14 - pshufb m3, m14 - vpblendd m4, m6, 0xc0 - vpblendd m5, m7, 0xc0 - pmaddubsw m4, m15 - pmaddubsw m5, m10 - phaddw m4, m5 - pslld m5, m4, 16 - paddw m4, m5 - pmulhrsw m4, m12 - pblendw m0, m1, 0xaa - pblendw m1, m2, 0xaa - pblendw m2, m3, 0xaa - pblendw m3, m4, 0xaa - jmp .vloop -.skip_line: - mova m0, m1 - mova m1, m2 - mova m2, m3 - vpbroadcastq m7, [srcq+r13] - vpbroadcastq m8, [srcq+ rX] - movq xm3, [srcq+ r4] - movq xm4, [srcq+ r6] - movhps xm3, [srcq+ r7] - movhps xm4, [srcq+ r9] - vinserti128 m3, [srcq+r10], 1 - vinserti128 m4, [srcq+r11], 1 - add srcq, ssq - movq xm5, [srcq+ r4] - movq xm6, [srcq+ r6] - movhps xm5, [srcq+ r7] - movhps xm6, [srcq+ r9] - vinserti128 m5, [srcq+r10], 1 - vinserti128 m6, [srcq+r11], 1 - vpbroadcastq m9, [srcq+r13] - vpbroadcastq m11, [srcq+ rX] - add srcq, ssq - mov myd, [rsp+52] - mov dyd, dym - vpblendd m3, m7, 0xc0 - vpblendd m4, m8, 0xc0 - vpblendd m5, m9, 0xc0 - vpblendd m6, m11, 0xc0 - pmaddubsw m3, m15 - pmaddubsw m4, m10 - pmaddubsw m5, m15 - pmaddubsw m6, m10 - phaddw m3, m4 - phaddw m5, m6 - psrld m4, m3, 16 - pslld m6, m5, 16 - paddw m3, m4 - paddw m5, m6 - pblendw m3, m5, 0xaa - pmulhrsw m3, m12 - jmp .vloop -.dy1: - movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] - add wq, base_reg - jmp wq -%ifidn %1, put -.dy1_w2: - mov myd, mym - movzx t0d, t0b - dec srcq - movd xm15, t0d - punpckldq m8, m9, m8 - paddd m14, m8 ; mx+dx*[0-1] - vpbroadcastd m11, [base+pd_0x4000] - vpbroadcastd xm15, xm15 - pand m8, m14, m10 - psrld m8, 6 - paddd xm15, xm8 - movd r4d, xm15 - pextrd r6d, xm15, 1 - vbroadcasti128 m5, [base+bdct_lb_dw] - vbroadcasti128 m6, [base+subpel_s_shuf2] - vpbroadcastd m15, [base+subpel_filters+r4*8+2] - vpbroadcastd m7, [base+subpel_filters+r6*8+2] - pcmpeqd m8, m9 - psrld m14, 10 - movq xm0, [srcq+ssq*0] - movq xm1, [srcq+ssq*2] - movhps xm0, [srcq+ssq*1] - movhps xm1, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - shr myd, 6 - mov r4d, 64 << 24 - lea myd, [t1+myq] - cmovnz r4q, [base+subpel_filters+myq*8] - pshufb m14, m5 - paddb m14, m6 - vinserti128 m0, [srcq+ssq*0], 1 - vinserti128 m1, [srcq+ssq*2], 1 - vpbroadcastq m2, [srcq+ssq*1] - add srcq, ss3q - movq xm10, r4q - punpcklbw xm10, xm10 - psraw xm10, 8 - vpblendd m15, m7, 0xaa - pblendvb m15, m11, m8 - pshufd xm8, xm10, q0000 - pshufd xm9, xm10, q1111 - pshufd xm11, xm10, q3333 - pshufd xm10, xm10, q2222 
- vpblendd m0, m2, 0xc0 - pshufb m1, m14 - pshufb m0, m14 - pmaddubsw m1, m15 - pmaddubsw m0, m15 - phaddw m0, m1 - pmulhrsw m0, m12 - vextracti128 xm1, m0, 1 - palignr xm2, xm1, xm0, 4 - pshufd xm4, xm1, q2121 - punpcklwd xm3, xm0, xm2 ; 01 12 - punpckhwd xm0, xm2 ; 23 34 - punpcklwd xm2, xm1, xm4 ; 45 56 -.dy1_w2_loop: - movq xm1, [srcq+ssq*0] - movhps xm1, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - pmaddwd xm5, xm3, xm8 - pmaddwd xm6, xm0, xm9 - pmaddwd xm7, xm2, xm10 - mova xm3, xm0 - mova xm0, xm2 - paddd xm5, xm13 - paddd xm6, xm7 - pshufb xm1, xm14 - pmaddubsw xm1, xm15 - phaddw xm1, xm1 - pmulhrsw xm1, xm12 - palignr xm7, xm1, xm4, 12 - punpcklwd xm2, xm7, xm1 ; 67 78 - pmaddwd xm7, xm2, xm11 - mova xm4, xm1 - paddd xm5, xm6 - paddd xm5, xm7 - psrad xm5, rndshift - packssdw xm5, xm5 - packuswb xm5, xm5 - pextrw [dstq+dsq*0], xm5, 0 - pextrw [dstq+dsq*1], xm5, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .dy1_w2_loop - RET -%endif -.dy1_w4: - mov myd, mym - vbroadcasti128 m7, [base+rescale_mul] - movzx t0d, t0b - dec srcq - movd xm15, t0d - pmaddwd m8, m7 - vpbroadcastd m11, [base+pd_0x4000] - vpbroadcastd xm15, xm15 - paddd m14, m8 ; mx+dx*[0-3] - pand m8, m14, m10 - psrld m8, 6 - paddd xm15, xm8 - vpermq m8, m8, q3120 - movd r4d, xm15 - pextrd r6d, xm15, 2 - pextrd r11d, xm15, 1 - pextrd r13d, xm15, 3 - movd xm15, [base+subpel_filters+r4*8+2] - vpbroadcastd m7, [base+subpel_filters+r6*8+2] - movu xm2, [srcq+ssq*0] - movu xm3, [srcq+ssq*2] - vbroadcasti128 m5, [base+bdct_lb_dw] - vpbroadcastq m6, [base+subpel_s_shuf2] - pcmpeqd m8, m9 - psrld m14, 10 - pinsrd xm15, [base+subpel_filters+r11*8+2], 1 - vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 - vinserti128 m2, [srcq+ssq*1], 1 - vinserti128 m3, [srcq+ss3q ], 1 - lea srcq, [srcq+ssq*4] - shr myd, 6 - mov r4d, 64 << 24 - lea myd, [t1+myq] - cmovnz r4q, [base+subpel_filters+myq*8] - pshufb m14, m5 - paddb m14, m6 - movu xm4, [srcq+ssq*0] - movu xm5, [srcq+ssq*2] - vinserti128 m4, [srcq+ssq*1], 1 - add srcq, ss3q - vpblendd m15, m7, 0x30 - punpcklqdq m15, m15 - pblendvb m15, m11, m8 - movq xm10, r4q - punpcklbw xm10, xm10 - psraw xm10, 8 - vinserti128 m10, xm10, 1 - pshufb m2, m14 - pshufb m3, m14 - pshufb m4, m14 - pshufb xm5, xm14 - vpermq m2, m2, q3120 - vpermq m3, m3, q3120 - vpermq m4, m4, q3120 - vpermq m5, m5, q3120 - pshufd m7, m10, q0000 - pshufd m8, m10, q1111 - pshufd m9, m10, q2222 - pshufd m10, m10, q3333 - pmaddubsw m2, m15 - pmaddubsw m3, m15 - pmaddubsw m4, m15 - pmaddubsw m5, m15 - phaddw m2, m3 - phaddw m4, m5 - pmulhrsw m2, m12 - pmulhrsw m4, m12 - palignr m5, m4, m2, 4 - pshufd m3, m4, q2121 - punpcklwd m0, m2, m5 ; 01 12 - punpckhwd m1, m2, m5 ; 23 34 - punpcklwd m2, m4, m3 ; 45 56 -.dy1_w4_loop: - movu xm11, [srcq+ssq*0] - vinserti128 m11, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - pmaddwd m4, m0, m7 - pmaddwd m5, m1, m8 - pmaddwd m6, m2, m9 - mova m0, m1 - mova m1, m2 - paddd m4, m13 - paddd m5, m6 - pshufb m11, m14 - vpermq m11, m11, q3120 - pmaddubsw m11, m15 - phaddw m11, m11 - pmulhrsw m11, m12 - palignr m6, m11, m3, 12 - punpcklwd m2, m6, m11 ; 67 78 - mova m3, m11 - pmaddwd m6, m2, m10 - paddd m4, m5 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - pshuflw xm4, xm4, q3120 - movd [dstq+dsq*0], xm4 - pextrd [dstq+dsq*1], xm4, 1 - lea dstq, [dstq+dsq*2] -%else - pshufd xm4, xm4, q3120 - mova [tmpq], xm4 - add tmpq, 16 -%endif - sub hd, 2 - jg .dy1_w4_loop - MC_8TAP_SCALED_RET -.dy1_w8: -%ifidn %1, put - movifnidn dsm, dsq -%endif - shr 
t0d, 16 - sub srcq, 3 - movd xm15, t0d - pmaddwd m8, [base+rescale_mul] - vpbroadcastq m11, [base+pq_0x40000000] - vpbroadcastd m15, xm15 - paddd m14, m8 ; mx+dx*[0-7] - pand m6, m14, m10 - psrld m6, 6 - paddd m15, m6 - pcmpeqd m6, m9 - vextracti128 xm7, m15, 1 - movd r4d, xm15 - pextrd r6d, xm15, 2 - pextrd r7d, xm15, 1 - pextrd r9d, xm15, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - movq xm15, [base+subpel_filters+ r4*8] - movq xm10, [base+subpel_filters+ r6*8] - movhps xm15, [base+subpel_filters+ r7*8] - movhps xm10, [base+subpel_filters+ r9*8] - vinserti128 m15, [base+subpel_filters+r10*8], 1 - vinserti128 m10, [base+subpel_filters+r11*8], 1 - vpbroadcastq m9, [base+subpel_filters+r13*8] - vpbroadcastq m8, [base+subpel_filters+ rX*8] - psrld m14, 10 - vextracti128 xm7, m14, 1 - movd r4d, xm14 - pextrd r6d, xm14, 2 - pextrd r7d, xm14, 1 - pextrd r9d, xm14, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - mov [rsp+32], r7d - pshufd m5, m6, q1100 - pshufd m6, m6, q3322 - vpblendd m15, m9, 0xc0 - vpblendd m10, m8, 0xc0 - pblendvb m15, m11, m5 - pblendvb m10, m11, m6 - vbroadcasti128 m14, [base+subpel_s_shuf8] - MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b - MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b - MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b - MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b - mov myd, mym - movu [rsp], m10 - pshufb m0, m14 ; 01a 01b - pshufb m1, m14 ; 23a 23b - pshufb m2, m14 ; 45a 45b - pshufb m3, m14 ; 67a 67b - shr myd, 6 - lea myd, [t1+myq] - mov t1d, 64 << 24 - cmovnz t1q, [base+subpel_filters+myq*8] - vbroadcasti128 m14, [base+wswap] - movq xm11, t1q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 - mov r7d, [rsp+32] - pshufd m8, m11, q0000 - pshufd m9, m11, q1111 - pshufd m10, m11, q2222 - pshufd m11, m11, q3333 -.dy1_w8_loop: - pmaddwd m4, m0, m8 - pmaddwd m5, m1, m9 - pmaddwd m6, m2, m10 - pmaddwd m7, m3, m11 - paddd m4, m5 - paddd m6, m7 - paddd m4, m13 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - movq [dstq], xm4 - add dstq, dsm -%else - mova [tmpq], xm4 - add tmpq, 16 -%endif - dec hd - jz .ret - movq xm4, [srcq+ r4] - movq xm5, [srcq+ r6] - movhps xm4, [srcq+ r7] - movhps xm5, [srcq+ r9] - vinserti128 m4, [srcq+r10], 1 - vinserti128 m5, [srcq+r11], 1 - vpbroadcastq m6, [srcq+r13] - vpbroadcastq m7, [srcq+ rX] - add srcq, ssq - pshufb m0, m14 - pshufb m1, m14 - pshufb m2, m14 - pshufb m3, m14 - vpblendd m4, m6, 0xc0 - vpblendd m5, m7, 0xc0 - pmaddubsw m4, m15 - pmaddubsw m5, [rsp] - phaddw m4, m5 - pslld m5, m4, 16 - paddw m4, m5 - pmulhrsw m4, m12 - pblendw m0, m1, 0xaa - pblendw m1, m2, 0xaa - pblendw m2, m3, 0xaa - pblendw m3, m4, 0xaa - jmp .dy1_w8_loop -.dy1_w16: - mov dword [rsp+72], 2 - movifprep tmp_stridem, 32 - jmp .dy1_w_start -.dy1_w32: - mov dword [rsp+72], 4 - movifprep tmp_stridem, 64 - jmp .dy1_w_start -.dy1_w64: - mov dword [rsp+72], 8 - movifprep tmp_stridem, 128 - jmp .dy1_w_start -.dy1_w128: - mov dword [rsp+72], 16 - movifprep tmp_stridem, 256 -.dy1_w_start: -%ifidn %1, put - movifnidn dsm, dsq -%endif - shr t0d, 16 - sub srcq, 3 - pmaddwd m8, [base+rescale_mul] - movd xm15, t0d - mov [rsp+76], t0d - mov [rsp+80], srcq - mov [rsp+88], r0q ; dstq / tmpq -%if UNIX64 - mov hm, hd -%endif - shl dword dxm, 3 ; dx*8 - vpbroadcastd m15, xm15 - paddd m14, m8 ; mx+dx*[0-7] - jmp .dy1_hloop -.dy1_hloop_prep: - dec dword [rsp+72] - jz .ret - 
add qword [rsp+88], 8*(isprep+1) - mov hd, hm - vpbroadcastd m8, dxm - vpbroadcastd m10, [base+pd_0x3ff] - paddd m14, m8, [rsp+32] - vpbroadcastd m15, [rsp+76] - pxor m9, m9 - mov srcq, [rsp+80] - mov r0q, [rsp+88] ; dstq / tmpq -.dy1_hloop: - vpbroadcastq m11, [base+pq_0x40000000] - pand m6, m14, m10 - psrld m6, 6 - paddd m15, m6 - pcmpeqd m6, m9 - vextracti128 xm7, m15, 1 - movd r4d, xm15 - pextrd r6d, xm15, 2 - pextrd r7d, xm15, 1 - pextrd r9d, xm15, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - movu [rsp+32], m14 - movq xm15, [base+subpel_filters+ r4*8] - movq xm10, [base+subpel_filters+ r6*8] - movhps xm15, [base+subpel_filters+ r7*8] - movhps xm10, [base+subpel_filters+ r9*8] - vinserti128 m15, [base+subpel_filters+r10*8], 1 - vinserti128 m10, [base+subpel_filters+r11*8], 1 - vpbroadcastq m9, [base+subpel_filters+r13*8] - vpbroadcastq m8, [base+subpel_filters+ rX*8] - psrld m14, 10 - vextracti128 xm7, m14, 1 - movq [rsp+64], xm14 - movd r4d, xm14 - pextrd r6d, xm14, 2 - pextrd r7d, xm14, 1 - pextrd r9d, xm14, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - pshufd m5, m6, q1100 - pshufd m6, m6, q3322 - vpblendd m15, m9, 0xc0 - vpblendd m10, m8, 0xc0 - pblendvb m15, m11, m5 - pblendvb m10, m11, m6 - vbroadcasti128 m14, [base+subpel_s_shuf8] - MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b - MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b - MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b - MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b - mov myd, mym - movu [rsp], m10 - pshufb m0, m14 ; 01a 01b - pshufb m1, m14 ; 23a 23b - pshufb m2, m14 ; 45a 45b - pshufb m3, m14 ; 67a 67b - shr myd, 6 - mov r4d, 64 << 24 - lea myd, [t1+myq] - cmovnz r4q, [base+subpel_filters+myq*8] - vbroadcasti128 m14, [base+wswap] - movq xm11, r4q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 - mov r4d, [rsp+64] - mov r7d, [rsp+68] - pshufd m8, m11, q0000 - pshufd m9, m11, q1111 - pshufd m10, m11, q2222 - pshufd m11, m11, q3333 -.dy1_vloop: - pmaddwd m4, m0, m8 - pmaddwd m5, m1, m9 - pmaddwd m6, m2, m10 - pmaddwd m7, m3, m11 - paddd m4, m5 - paddd m6, m7 - paddd m4, m13 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - movq [dstq], xm4 - add dstq, dsm -%else - mova [tmpq], xm4 - add tmpq, tmp_stridem -%endif - dec hd - jz .dy1_hloop_prep - movq xm4, [srcq+ r4] - movq xm5, [srcq+ r6] - movhps xm4, [srcq+ r7] - movhps xm5, [srcq+ r9] - vinserti128 m4, [srcq+r10], 1 - vinserti128 m5, [srcq+r11], 1 - vpbroadcastq m6, [srcq+r13] - vpbroadcastq m7, [srcq+ rX] - add srcq, ssq - pshufb m0, m14 - pshufb m1, m14 - pshufb m2, m14 - pshufb m3, m14 - vpblendd m4, m6, 0xc0 - vpblendd m5, m7, 0xc0 - pmaddubsw m4, m15 - pmaddubsw m5, [rsp] - phaddw m4, m5 - pslld m5, m4, 16 - paddw m4, m5 - pmulhrsw m4, m12 - pblendw m0, m1, 0xaa - pblendw m1, m2, 0xaa - pblendw m2, m3, 0xaa - pblendw m3, m4, 0xaa - jmp .dy1_vloop -.dy2: - movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] - add wq, base_reg - jmp wq -%ifidn %1, put -.dy2_w2: - mov myd, mym - movzx t0d, t0b - dec srcq - movd xm15, t0d - punpckldq m8, m9, m8 - paddd m14, m8 ; mx+dx*[0-1] - vpbroadcastd m11, [base+pd_0x4000] - vpbroadcastd xm15, xm15 - pand m8, m14, m10 - psrld m8, 6 - paddd xm15, xm8 - movd r4d, xm15 - pextrd r6d, xm15, 1 - vbroadcasti128 m5, [base+bdct_lb_dw] - vbroadcasti128 m6, [base+subpel_s_shuf2] - vpbroadcastd m15, [base+subpel_filters+r4*8+2] - vpbroadcastd m7, 
[base+subpel_filters+r6*8+2] - pcmpeqd m8, m9 - psrld m14, 10 - movq xm0, [srcq+ssq*0] - vpbroadcastq m2, [srcq+ssq*1] - movhps xm0, [srcq+ssq*2] - vpbroadcastq m3, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - pshufb m14, m5 - paddb m14, m6 - vpblendd m15, m7, 0xaa - pblendvb m15, m11, m8 - movhps xm1, [srcq+ssq*0] - vpbroadcastq m4, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - shr myd, 6 - mov r4d, 64 << 24 - lea myd, [t1+myq] - cmovnz r4q, [base+subpel_filters+myq*8] - vpblendd m0, m2, 0x30 - vpblendd m1, m4, 0xc0 - vpblendd m0, m3, 0xc0 - pshufb m0, m14 - pshufb m1, m14 - pmaddubsw m0, m15 - pmaddubsw m1, m15 - movq xm11, r4q - punpcklbw xm11, xm11 - psraw xm11, 8 - phaddw m0, m1 - pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 - pshufd xm8, xm11, q0000 - pshufd xm9, xm11, q1111 - pshufd xm10, xm11, q2222 - pshufd xm11, xm11, q3333 - pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 - vextracti128 xm1, m2, 1 - punpcklwd xm3, xm2, xm1 ; 01 23 - punpckhwd xm2, xm1 ; 23 45 -.dy2_w2_loop: - movq xm6, [srcq+ssq*0] - vpbroadcastq m7, [srcq+ssq*1] - movhps xm6, [srcq+ssq*2] - vpbroadcastq m1, [srcq+ss3q ] - lea srcq, [srcq+ssq*4] - pmaddwd xm4, xm3, xm8 - pmaddwd xm5, xm2, xm9 - vpblendd m6, m7, 0x30 - vpblendd m6, m1, 0xc0 - pshufb m6, m14 - pmaddubsw m6, m15 - phaddw m6, m6 - pmulhrsw m6, m12 - palignr m0, m6, m0, 8 - pshufd m2, m0, q3221 - vextracti128 xm1, m2, 1 - punpcklwd xm3, xm2, xm1 ; 45 67 - punpckhwd xm2, xm1 ; 67 89 - pmaddwd xm6, xm3, xm10 - pmaddwd xm7, xm2, xm11 - paddd xm4, xm5 - paddd xm4, xm13 - paddd xm6, xm7 - paddd xm4, xm6 - psrad xm4, rndshift - packssdw xm4, xm4 - packuswb xm4, xm4 - pextrw [dstq+dsq*0], xm4, 0 - pextrw [dstq+dsq*1], xm4, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .dy2_w2_loop - RET -%endif -.dy2_w4: - mov myd, mym - vbroadcasti128 m7, [base+rescale_mul] - movzx t0d, t0b - dec srcq - movd xm15, t0d - pmaddwd m8, m7 - vpbroadcastd m11, [base+pd_0x4000] - vpbroadcastd xm15, xm15 - paddd m14, m8 ; mx+dx*[0-3] - pand m8, m14, m10 - psrld m8, 6 - paddd xm15, xm8 - movd r4d, xm15 - pextrd r6d, xm15, 1 - pextrd r11d, xm15, 2 - pextrd r13d, xm15, 3 - movd xm15, [base+subpel_filters+r4*8+2] - vbroadcasti128 m5, [base+bdct_lb_dw] - vpbroadcastq m6, [base+subpel_s_shuf2] - pinsrd xm15, [base+subpel_filters+r6*8+2], 1 - pcmpeqd m8, m9 - psrld m14, 10 - movu xm0, [srcq+ssq*0] - movu xm2, [srcq+ssq*2] - pinsrd xm15, [base+subpel_filters+r11*8+2], 2 - movu xm1, [srcq+ssq*1] - movu xm3, [srcq+ss3q ] - pinsrd xm15, [base+subpel_filters+r13*8+2], 3 - lea srcq, [srcq+ssq*4] - shr myd, 6 - mov r4d, 64 << 24 - lea myd, [t1+myq] - cmovnz r4q, [base+subpel_filters+myq*8] - vinserti128 m15, xm15, 1 - pshufb m14, m5 - paddb m14, m6 - vinserti128 m2, [srcq+ssq*0], 1 - vinserti128 m3, [srcq+ssq*1], 1 - lea srcq, [srcq+ssq*2] - pblendvb m15, m11, m8 - pshufb xm0, xm14 - pshufb m2, m14 - pshufb xm1, xm14 - pshufb m3, m14 - pmaddubsw xm0, xm15 - pmaddubsw m2, m15 - pmaddubsw xm1, xm15 - pmaddubsw m3, m15 - movq xm11, r4q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 - phaddw m0, m2 - phaddw m1, m3 - pmulhrsw m0, m12 ; 0 2 _ 4 - pmulhrsw m1, m12 ; 1 3 _ 5 - pshufd m8, m11, q0000 - pshufd m9, m11, q1111 - pshufd m10, m11, q2222 - pshufd m11, m11, q3333 - punpcklwd xm2, xm0, xm1 - punpckhwd m1, m0, m1 ; 23 45 - vinserti128 m0, m2, xm1, 1 ; 01 23 -.dy2_w4_loop: - movu xm6, [srcq+ssq*0] - movu xm7, [srcq+ssq*1] - vinserti128 m6, [srcq+ssq*2], 1 - vinserti128 m7, [srcq+ss3q ], 1 - lea srcq, [srcq+ssq*4] - pmaddwd m4, m0, m8 - pmaddwd m5, m1, m9 - pshufb m6, m14 - pshufb m7, m14 - pmaddubsw m6, m15 - 
pmaddubsw m7, m15 - psrld m2, m6, 16 - pslld m3, m7, 16 - paddw m6, m2 - paddw m7, m3 - pblendw m6, m7, 0xaa ; 67 89 - pmulhrsw m6, m12 - paddd m4, m5 - vpblendd m0, m1, m6, 0x0f - mova m1, m6 - vpermq m0, m0, q1032 ; 45 67 - pmaddwd m6, m0, m10 - pmaddwd m7, m1, m11 - paddd m4, m13 - paddd m6, m7 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - movd [dstq+dsq*0], xm4 - pextrd [dstq+dsq*1], xm4, 1 - lea dstq, [dstq+dsq*2] -%else - mova [tmpq], xm4 - add tmpq, 16 -%endif - sub hd, 2 - jg .dy2_w4_loop - MC_8TAP_SCALED_RET -.dy2_w8: -%ifidn %1, put - movifnidn dsm, dsq -%endif - shr t0d, 16 - sub srcq, 3 - movd xm15, t0d - pmaddwd m8, [base+rescale_mul] - vpbroadcastq m11, [base+pq_0x40000000] - vpbroadcastd m15, xm15 - paddd m14, m8 ; mx+dx*[0-7] - pand m6, m14, m10 - psrld m6, 6 - paddd m15, m6 - pcmpeqd m6, m9 - vextracti128 xm7, m15, 1 - movd r4d, xm15 - pextrd r6d, xm15, 2 - pextrd r7d, xm15, 1 - pextrd r9d, xm15, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - movq xm15, [base+subpel_filters+ r4*8] - movq xm10, [base+subpel_filters+ r6*8] - movhps xm15, [base+subpel_filters+ r7*8] - movhps xm10, [base+subpel_filters+ r9*8] - vinserti128 m15, [base+subpel_filters+r10*8], 1 - vinserti128 m10, [base+subpel_filters+r11*8], 1 - vpbroadcastq m9, [base+subpel_filters+r13*8] - vpbroadcastq m8, [base+subpel_filters+ rX*8] - psrld m14, 10 - vextracti128 xm7, m14, 1 - movd r4d, xm14 - pextrd r6d, xm14, 2 - pextrd r7d, xm14, 1 - pextrd r9d, xm14, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - mov [rsp], r7d - pshufd m5, m6, q1100 - pshufd m6, m6, q3322 - vpblendd m15, m9, 0xc0 - vpblendd m10, m8, 0xc0 - pblendvb m15, m11, m5 - pblendvb m10, m11, m6 - vbroadcasti128 m14, [base+subpel_s_shuf8] - MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b - MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b - MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b - MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b - mov myd, mym - pshufb m0, m14 ; 01a 01b - pshufb m1, m14 ; 23a 23b - pshufb m2, m14 ; 45a 45b - pshufb m3, m14 ; 67a 67b - shr myd, 6 - lea myd, [t1+myq] - mov t1d, 64 << 24 - cmovnz t1q, [base+subpel_filters+myq*8] - movq xm11, t1q - punpcklbw xm11, xm11 - psraw xm11, 8 - vinserti128 m11, xm11, 1 - mov r7d, [rsp] - pshufd m8, m11, q0000 - pshufd m9, m11, q1111 - pshufd m14, m11, q2222 - pshufd m11, m11, q3333 -.dy2_w8_loop: - pmaddwd m4, m0, m8 - pmaddwd m5, m1, m9 - pmaddwd m6, m2, m14 - pmaddwd m7, m3, m11 - paddd m4, m5 - paddd m6, m7 - paddd m4, m13 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - movq [dstq], xm4 - add dstq, dsm -%else - mova [tmpq], xm4 - add tmpq, 16 -%endif - dec hd - jz .ret - mova m0, m1 - mova m1, m2 - mova m2, m3 - movq xm3, [srcq+ r4] - movq xm4, [srcq+ r6] - movhps xm3, [srcq+ r7] - movhps xm4, [srcq+ r9] - vinserti128 m3, [srcq+r10], 1 - vinserti128 m4, [srcq+r11], 1 - vpbroadcastq m5, [srcq+r13] - vpbroadcastq m6, [srcq+ rX] - add srcq, ssq - vpblendd m3, m5, 0xc0 - vpblendd m4, m6, 0xc0 - pmaddubsw m3, m15 - pmaddubsw m4, m10 - phaddw m3, m4 - movq xm4, [srcq+ r4] - movq xm5, [srcq+ r6] - movhps xm4, [srcq+ r7] - movhps xm5, [srcq+ r9] - vinserti128 m4, [srcq+r10], 1 - vinserti128 m5, [srcq+r11], 1 - vpbroadcastq m6, [srcq+r13] - vpbroadcastq m7, [srcq+ rX] - add srcq, ssq - vpblendd m4, m6, 0xc0 - vpblendd m5, m7, 0xc0 - pmaddubsw m4, m15 
- pmaddubsw m5, m10 - phaddw m4, m5 - psrld m5, m3, 16 - pslld m6, m4, 16 - paddw m3, m5 - paddw m4, m6 - pblendw m3, m4, 0xaa - pmulhrsw m3, m12 - jmp .dy2_w8_loop -.dy2_w16: - mov dword [rsp+40], 2 - movifprep tmp_stridem, 32 - jmp .dy2_w_start -.dy2_w32: - mov dword [rsp+40], 4 - movifprep tmp_stridem, 64 - jmp .dy2_w_start -.dy2_w64: - mov dword [rsp+40], 8 - movifprep tmp_stridem, 128 - jmp .dy2_w_start -.dy2_w128: - mov dword [rsp+40], 16 - movifprep tmp_stridem, 256 -.dy2_w_start: -%ifidn %1, put - movifnidn dsm, dsq -%endif - shr t0d, 16 - sub srcq, 3 - pmaddwd m8, [base+rescale_mul] - movd xm15, t0d - mov [rsp+64], t0d - mov [rsp+48], srcq - mov [rsp+56], r0q ; dstq / tmpq -%if UNIX64 - mov hm, hd -%endif - shl dword dxm, 3 ; dx*8 - vpbroadcastd m15, xm15 - paddd m14, m8 ; mx+dx*[0-7] - jmp .dy2_hloop -.dy2_hloop_prep: - dec dword [rsp+40] - jz .ret - add qword [rsp+56], 8*(isprep+1) - mov hd, hm - vpbroadcastd m8, dxm - vpbroadcastd m10, [base+pd_0x3ff] - paddd m14, m8, [rsp] - vpbroadcastd m15, [rsp+64] - pxor m9, m9 - mov srcq, [rsp+48] - mov r0q, [rsp+56] ; dstq / tmpq -.dy2_hloop: - vpbroadcastq m11, [base+pq_0x40000000] - pand m6, m14, m10 - psrld m6, 6 - paddd m15, m6 - pcmpeqd m6, m9 - vextracti128 xm7, m15, 1 - movd r4d, xm15 - pextrd r6d, xm15, 2 - pextrd r7d, xm15, 1 - pextrd r9d, xm15, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - movu [rsp], m14 - movq xm15, [base+subpel_filters+ r4*8] - movq xm10, [base+subpel_filters+ r6*8] - movhps xm15, [base+subpel_filters+ r7*8] - movhps xm10, [base+subpel_filters+ r9*8] - vinserti128 m15, [base+subpel_filters+r10*8], 1 - vinserti128 m10, [base+subpel_filters+r11*8], 1 - vpbroadcastq m9, [base+subpel_filters+r13*8] - vpbroadcastq m8, [base+subpel_filters+ rX*8] - psrld m14, 10 - vextracti128 xm7, m14, 1 - movq [rsp+32], xm14 - movd r4d, xm14 - pextrd r6d, xm14, 2 - pextrd r7d, xm14, 1 - pextrd r9d, xm14, 3 - movd r10d, xm7 - pextrd r11d, xm7, 2 - pextrd r13d, xm7, 1 - pextrd rXd, xm7, 3 - pshufd m5, m6, q1100 - pshufd m6, m6, q3322 - vpblendd m15, m9, 0xc0 - vpblendd m10, m8, 0xc0 - pblendvb m15, m11, m5 - pblendvb m10, m11, m6 - vbroadcasti128 m14, [base+subpel_s_shuf8] - MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b - MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b - MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b - MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b - mov myd, mym - pshufb m0, m14 ; 01a 01b - pshufb m1, m14 ; 23a 23b - pshufb m2, m14 ; 45a 45b - pshufb m3, m14 ; 67a 67b - shr myd, 6 - mov r4d, 64 << 24 - lea myd, [t1+myq] - cmovnz r4q, [base+subpel_filters+myq*8] - movq xm14, r4q - punpcklbw xm14, xm14 - psraw xm14, 8 - vinserti128 m14, xm14, 1 - mov r4d, [rsp+32] - mov r7d, [rsp+36] - pshufd m8, m14, q0000 - pshufd m9, m14, q1111 - pshufd m11, m14, q2222 - pshufd m14, m14, q3333 -.dy2_vloop: - pmaddwd m4, m0, m8 - pmaddwd m5, m1, m9 - pmaddwd m6, m2, m11 - pmaddwd m7, m3, m14 - paddd m4, m5 - paddd m6, m7 - paddd m4, m13 - paddd m4, m6 - psrad m4, rndshift - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 -%ifidn %1, put - packuswb xm4, xm4 - movq [dstq], xm4 - add dstq, dsm -%else - mova [tmpq], xm4 - add tmpq, tmp_stridem -%endif - dec hd - jz .dy2_hloop_prep - mova m0, m1 - mova m1, m2 - mova m2, m3 - movq xm3, [srcq+ r4] - movq xm4, [srcq+ r6] - movhps xm3, [srcq+ r7] - movhps xm4, [srcq+ r9] - vinserti128 m3, [srcq+r10], 1 - vinserti128 m4, [srcq+r11], 1 - vpbroadcastq m5, [srcq+r13] - vpbroadcastq m6, [srcq+ rX] - add srcq, ssq - vpblendd m3, 
m5, 0xc0 - vpblendd m4, m6, 0xc0 - pmaddubsw m3, m15 - pmaddubsw m4, m10 - phaddw m3, m4 - movq xm4, [srcq+ r4] - movq xm5, [srcq+ r6] - movhps xm4, [srcq+ r7] - movhps xm5, [srcq+ r9] - vinserti128 m4, [srcq+r10], 1 - vinserti128 m5, [srcq+r11], 1 - vpbroadcastq m6, [srcq+r13] - vpbroadcastq m7, [srcq+ rX] - add srcq, ssq - vpblendd m4, m6, 0xc0 - vpblendd m5, m7, 0xc0 - pmaddubsw m4, m15 - pmaddubsw m5, m10 - phaddw m4, m5 - psrld m5, m3, 16 - pslld m6, m4, 16 - paddw m3, m5 - paddw m4, m6 - pblendw m3, m4, 0xaa - pmulhrsw m3, m12 - jmp .dy2_vloop -.ret: - MC_8TAP_SCALED_RET 0 -%undef isprep -%endmacro - -%macro BILIN_SCALED_FN 1 -cglobal %1_bilin_scaled - mov t0d, (5*15 << 16) | 5*15 - mov t1d, (5*15 << 16) | 5*15 - jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) -%endmacro -%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, -%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, - -%if WIN64 -DECLARE_REG_TMP 6, 5 -%else -DECLARE_REG_TMP 6, 8 -%endif -BILIN_SCALED_FN put -PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR -PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH -PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_SCALED_FN sharp, SHARP, SHARP -PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -MC_8TAP_SCALED put - -%if WIN64 -DECLARE_REG_TMP 5, 4 -%else -DECLARE_REG_TMP 6, 7 -%endif -BILIN_SCALED_FN prep -PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR -PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_SCALED_FN sharp, SHARP, SHARP -PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH -MC_8TAP_SCALED prep - -%macro WARP_V 5 ; dst, 02, 46, 13, 57 - ; Can be done using gathers, but that's terribly slow on many CPU:s - lea tmp1d, [myq+deltaq*4] - lea tmp2d, [myq+deltaq*1] - shr myd, 10 - shr tmp1d, 10 - movq xm8, [filterq+myq *8] - vinserti128 m8, [filterq+tmp1q*8], 1 ; a e - lea tmp1d, [tmp2q+deltaq*4] - lea myd, [tmp2q+deltaq*1] - shr tmp2d, 10 - shr tmp1d, 10 - movq xm0, [filterq+tmp2q*8] - vinserti128 m0, [filterq+tmp1q*8], 1 ; b f - lea tmp1d, [myq+deltaq*4] - lea tmp2d, [myq+deltaq*1] - shr myd, 10 - shr tmp1d, 10 - movq xm9, [filterq+myq *8] - vinserti128 m9, [filterq+tmp1q*8], 1 ; c g - lea tmp1d, [tmp2q+deltaq*4] - lea myd, [tmp2q+gammaq] ; my += gamma - shr tmp2d, 10 - shr tmp1d, 10 - punpcklwd m8, m0 - movq xm0, [filterq+tmp2q*8] - vinserti128 m0, [filterq+tmp1q*8], 1 ; d h - punpcklwd m0, m9, m0 - punpckldq m9, m8, m0 - punpckhdq m0, m8, m0 - punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 - punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 - pmaddwd m%2, m8 - pmaddwd m9, m%3 - punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 - punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 - pmaddwd m8, m%4 - pmaddwd m0, m%5 - paddd m%2, m9 - paddd m0, m8 - paddd m%1, m0, m%2 -%endmacro - -cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts -%if WIN64 - sub rsp, 0xa0 -%endif - call mangle(private_prefix %+ _warp_affine_8x8_avx2).main -.loop: - psrad m7, 13 - psrad m0, 13 - packssdw m7, m0 - pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7 - vpermq m7, m7, q3120 - mova [tmpq+tsq*0], xm7 - vextracti128 [tmpq+tsq*2], m7, 1 - dec r4d - jz 
mangle(private_prefix %+ _warp_affine_8x8_avx2).end - call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2 - lea tmpq, [tmpq+tsq*4] - jmp .loop - -cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ - beta, filter, tmp1, delta, my, gamma -%if WIN64 - sub rsp, 0xa0 - %assign xmm_regs_used 16 - %assign stack_size_padded 0xa0 - %assign stack_offset stack_offset+stack_size_padded -%endif - call .main - jmp .start -.loop: - call .main2 - lea dstq, [dstq+dsq*2] -.start: - psrad m7, 18 - psrad m0, 18 - packusdw m7, m0 - pavgw m7, m11 ; (x + (1 << 10)) >> 11 - vextracti128 xm0, m7, 1 - packuswb xm7, xm0 - pshufd xm7, xm7, q3120 - movq [dstq+dsq*0], xm7 - movhps [dstq+dsq*1], xm7 - dec r4d - jg .loop -.end: - RET -ALIGN function_align -.main: - ; Stack args offset by one (r4m -> r5m etc.) due to call -%if WIN64 - mov abcdq, r5m - mov mxd, r6m - movaps [rsp+stack_offset+0x10], xmm6 - movaps [rsp+stack_offset+0x20], xmm7 - movaps [rsp+0x28], xmm8 - movaps [rsp+0x38], xmm9 - movaps [rsp+0x48], xmm10 - movaps [rsp+0x58], xmm11 - movaps [rsp+0x68], xmm12 - movaps [rsp+0x78], xmm13 - movaps [rsp+0x88], xmm14 - movaps [rsp+0x98], xmm15 -%endif - movsx alphad, word [abcdq+2*0] - movsx betad, word [abcdq+2*1] - mova m12, [warp_8x8_shufA] - mova m13, [warp_8x8_shufB] - vpbroadcastd m14, [pw_8192] - vpbroadcastd m15, [pd_32768] - pxor m11, m11 - lea filterq, [mc_warp_filter] - lea tmp1q, [ssq*3+3] - add mxd, 512+(64<<10) - lea tmp2d, [alphaq*3] - sub srcq, tmp1q ; src -= src_stride*3 + 3 - sub betad, tmp2d ; beta -= alpha*3 - mov myd, r7m - call .h - psrld m1, m0, 16 - call .h - psrld m4, m0, 16 - call .h - pblendw m1, m0, 0xaa ; 02 - call .h - pblendw m4, m0, 0xaa ; 13 - call .h - psrld m2, m1, 16 - pblendw m2, m0, 0xaa ; 24 - call .h - psrld m5, m4, 16 - pblendw m5, m0, 0xaa ; 35 - call .h - psrld m3, m2, 16 - pblendw m3, m0, 0xaa ; 46 - movsx deltad, word [abcdq+2*2] - movsx gammad, word [abcdq+2*3] - add myd, 512+(64<<10) - mov r4d, 4 - lea tmp1d, [deltaq*3] - sub gammad, tmp1d ; gamma -= delta*3 -.main2: - call .h - psrld m6, m5, 16 - pblendw m6, m0, 0xaa ; 57 - WARP_V 7, 1, 3, 4, 6 - call .h - mova m1, m2 - mova m2, m3 - psrld m3, 16 - pblendw m3, m0, 0xaa ; 68 - WARP_V 0, 4, 6, 1, 3 - mova m4, m5 - mova m5, m6 - ret -ALIGN function_align -.h: - lea tmp1d, [mxq+alphaq*4] - lea tmp2d, [mxq+alphaq*1] - vbroadcasti128 m10, [srcq] - shr mxd, 10 - shr tmp1d, 10 - movq xm8, [filterq+mxq *8] - vinserti128 m8, [filterq+tmp1q*8], 1 - lea tmp1d, [tmp2q+alphaq*4] - lea mxd, [tmp2q+alphaq*1] - shr tmp2d, 10 - shr tmp1d, 10 - movq xm0, [filterq+tmp2q*8] - vinserti128 m0, [filterq+tmp1q*8], 1 - lea tmp1d, [mxq+alphaq*4] - lea tmp2d, [mxq+alphaq*1] - shr mxd, 10 - shr tmp1d, 10 - movq xm9, [filterq+mxq *8] - vinserti128 m9, [filterq+tmp1q*8], 1 - lea tmp1d, [tmp2q+alphaq*4] - lea mxd, [tmp2q+betaq] ; mx += beta - shr tmp2d, 10 - shr tmp1d, 10 - punpcklqdq m8, m0 ; 0 1 4 5 - movq xm0, [filterq+tmp2q*8] - vinserti128 m0, [filterq+tmp1q*8], 1 - punpcklqdq m9, m0 ; 2 3 6 7 - pshufb m0, m10, m12 - pmaddubsw m0, m8 - pshufb m10, m13 - pmaddubsw m10, m9 - add srcq, ssq - phaddw m0, m10 - pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 - paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword - ret - -%macro WRAP_YMM 1+ - INIT_YMM cpuname - %1 - INIT_ZMM cpuname -%endmacro - -%macro BIDIR_FN 1 ; op -%if mmsize == 64 - lea stride3q, [strideq*3] - jmp wq -.w4: - cmp hd, 8 - jg .w4_h16 - WRAP_YMM %1 0 - vextracti32x4 xmm1, ym0, 1 - movd [dstq ], xm0 - pextrd [dstq+strideq*1], xm0, 1 
- movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 - jl .w4_ret - lea dstq, [dstq+strideq*4] - pextrd [dstq ], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 -.w4_ret: - RET -.w4_h16: - vpbroadcastd m7, strided - pmulld m7, [bidir_sctr_w4] - %1 0 - kxnorw k1, k1, k1 - vpscatterdd [dstq+m7]{k1}, m0 - RET -.w8: - cmp hd, 4 - jne .w8_h8 - WRAP_YMM %1 0 - vextracti128 xmm1, ym0, 1 - movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 - RET -.w8_loop: - %1_INC_PTR 2 - lea dstq, [dstq+strideq*4] -.w8_h8: - %1 0 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 - movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 - lea dstq, [dstq+strideq*4] - movhps [dstq ], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 - sub hd, 8 - jg .w8_loop - RET -.w16_loop: - %1_INC_PTR 2 - lea dstq, [dstq+strideq*4] -.w16: - %1 0 - vpermq m0, m0, q3120 - mova [dstq ], xm0 - vextracti32x4 [dstq+strideq*1], m0, 2 - vextracti32x4 [dstq+strideq*2], ym0, 1 - vextracti32x4 [dstq+stride3q ], m0, 3 - sub hd, 4 - jg .w16_loop - RET -.w32: - pmovzxbq m7, [warp_8x8_shufA] -.w32_loop: - %1 0 - %1_INC_PTR 2 - vpermq m0, m7, m0 - mova [dstq+strideq*0], ym0 - vextracti32x8 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w32_loop - RET -.w64: - pmovzxbq m7, [warp_8x8_shufA] -.w64_loop: - %1 0 - %1_INC_PTR 2 - vpermq m0, m7, m0 - mova [dstq], m0 - add dstq, strideq - dec hd - jg .w64_loop - RET -.w128: - pmovzxbq m7, [warp_8x8_shufA] -.w128_loop: - %1 0 - vpermq m6, m7, m0 - %1 2 - mova [dstq+64*0], m6 - %1_INC_PTR 4 - vpermq m6, m7, m0 - mova [dstq+64*1], m6 - add dstq, strideq - dec hd - jg .w128_loop - RET -%else - %1 0 - lea stride3q, [strideq*3] - jmp wq -.w4: - vextracti128 xm1, m0, 1 - movd [dstq ], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q ], xm1, 1 - cmp hd, 4 - je .ret - lea dstq, [dstq+strideq*4] - pextrd [dstq ], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 - cmp hd, 8 - je .ret - %1 2 - lea dstq, [dstq+strideq*4] - vextracti128 xm1, m0, 1 - movd [dstq ], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q ], xm1, 1 - lea dstq, [dstq+strideq*4] - pextrd [dstq ], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 -.ret: - RET -.w8_loop: - %1_INC_PTR 2 - %1 0 - lea dstq, [dstq+strideq*4] -.w8: - vextracti128 xm1, m0, 1 - movq [dstq ], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xm1 - sub hd, 4 - jg .w8_loop - RET -.w16_loop: - %1_INC_PTR 4 - %1 0 - lea dstq, [dstq+strideq*4] -.w16: - vpermq m0, m0, q3120 - mova [dstq ], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - %1 2 - vpermq m0, m0, q3120 - mova [dstq+strideq*2], xm0 - vextracti128 [dstq+stride3q ], m0, 1 - sub hd, 4 - jg .w16_loop - RET -.w32_loop: - %1_INC_PTR 4 - %1 0 - lea dstq, [dstq+strideq*2] -.w32: - vpermq m0, m0, q3120 - mova [dstq+strideq*0], m0 - %1 2 - vpermq m0, m0, q3120 - mova [dstq+strideq*1], m0 - sub hd, 2 - jg .w32_loop - RET -.w64_loop: - %1_INC_PTR 4 - %1 0 - add dstq, strideq -.w64: - vpermq m0, m0, q3120 - mova [dstq], m0 - %1 2 - vpermq m0, m0, q3120 - mova [dstq+32], m0 - dec hd - jg .w64_loop - 
RET -.w128_loop: - %1 0 - add dstq, strideq -.w128: - vpermq m0, m0, q3120 - mova [dstq+0*32], m0 - %1 2 - vpermq m0, m0, q3120 - mova [dstq+1*32], m0 - %1_INC_PTR 8 - %1 -4 - vpermq m0, m0, q3120 - mova [dstq+2*32], m0 - %1 -2 - vpermq m0, m0, q3120 - mova [dstq+3*32], m0 - dec hd - jg .w128_loop - RET -%endif -%endmacro - -%macro AVG 1 ; src_offset - mova m0, [tmp1q+(%1+0)*mmsize] - paddw m0, [tmp2q+(%1+0)*mmsize] - mova m1, [tmp1q+(%1+1)*mmsize] - paddw m1, [tmp2q+(%1+1)*mmsize] - pmulhrsw m0, m2 - pmulhrsw m1, m2 - packuswb m0, m1 -%endmacro - -%macro AVG_INC_PTR 1 - add tmp1q, %1*mmsize - add tmp2q, %1*mmsize -%endmacro - -%macro AVG_FN 0 -cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 -%define base r6-avg %+ SUFFIX %+ _table - lea r6, [avg %+ SUFFIX %+ _table] - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, dword [r6+wq*4] - vpbroadcastd m2, [base+pw_1024] - add wq, r6 - BIDIR_FN AVG -%endmacro - -%macro W_AVG 1 ; src_offset - ; (a * weight + b * (16 - weight) + 128) >> 8 - ; = ((a - b) * weight + (b << 4) + 128) >> 8 - ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 - ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 - mova m0, [tmp1q+(%1+0)*mmsize] - psubw m2, m0, [tmp2q+(%1+0)*mmsize] - mova m1, [tmp1q+(%1+1)*mmsize] - psubw m3, m1, [tmp2q+(%1+1)*mmsize] - pmulhw m2, m4 - pmulhw m3, m4 - paddw m0, m2 - paddw m1, m3 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 -%endmacro - -%define W_AVG_INC_PTR AVG_INC_PTR - -%macro W_AVG_FN 0 -cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 -%define base r6-w_avg %+ SUFFIX %+ _table - lea r6, [w_avg %+ SUFFIX %+ _table] - tzcnt wd, wm - movifnidn hd, hm - vpbroadcastw m4, r6m ; weight - movsxd wq, dword [r6+wq*4] - vpbroadcastd m5, [base+pw_2048] - psllw m4, 12 ; (weight-16) << 12 when interpreted as signed - add wq, r6 - cmp dword r6m, 7 - jg .weight_gt7 - mov r6, tmp1q - pxor m0, m0 - mov tmp1q, tmp2q - psubw m4, m0, m4 ; -weight - mov tmp2q, r6 -.weight_gt7: - BIDIR_FN W_AVG -%endmacro - -%macro MASK 1 ; src_offset - ; (a * m + b * (64 - m) + 512) >> 10 - ; = ((a - b) * m + (b << 6) + 512) >> 10 - ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 -%if mmsize == 64 - vpermq m3, m8, [maskq+%1*32] -%else - vpermq m3, [maskq+%1*16], q3120 -%endif - mova m0, [tmp2q+(%1+0)*mmsize] - psubw m1, m0, [tmp1q+(%1+0)*mmsize] - psubb m3, m4, m3 - paddw m1, m1 ; (b - a) << 1 - paddb m3, m3 - punpcklbw m2, m4, m3 ; -m << 9 - pmulhw m1, m2 - paddw m0, m1 - mova m1, [tmp2q+(%1+1)*mmsize] - psubw m2, m1, [tmp1q+(%1+1)*mmsize] - paddw m2, m2 - punpckhbw m3, m4, m3 - pmulhw m2, m3 - paddw m1, m2 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 -%endmacro - -%macro MASK_INC_PTR 1 - add maskq, %1*mmsize/2 - add tmp2q, %1*mmsize - add tmp1q, %1*mmsize -%endmacro - -%macro MASK_FN 0 -cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 -%define base r7-mask %+ SUFFIX %+ _table - lea r7, [mask %+ SUFFIX %+ _table] - tzcnt wd, wm - movifnidn hd, hm - mov maskq, maskmp - movsxd wq, dword [r7+wq*4] - pxor m4, m4 -%if mmsize == 64 - mova m8, [base+bilin_v_perm64] -%endif - vpbroadcastd m5, [base+pw_2048] - add wq, r7 - BIDIR_FN MASK -%endmacro MASK_FN - -%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 - mova m%1, [tmp1q+mmsize*%3] - mova m1, [tmp2q+mmsize*%3] - psubw m1, m%1 - pabsw m%2, m1 - psubusw m%2, m6, m%2 - psrlw m%2, 8 ; 64 - m - psllw m2, m%2, 10 - pmulhw m1, m2 - paddw m%1, m1 - mova m1, [tmp1q+mmsize*%4] - mova m2, [tmp2q+mmsize*%4] - psubw m2, m1 - pabsw m3, m2 - psubusw m3, m6, 
m3 -%if cpuflag(avx512icl) - vpshldw m%2, m3, 8 - psllw m3, m%2, 10 -%if %5 - psubb m%2, m5, m%2 -%endif -%else - psrlw m3, 8 -%if %5 - packuswb m%2, m3 - psubb m%2, m5, m%2 - vpermq m%2, m%2, q3120 -%else - phaddw m%2, m3 -%endif - psllw m3, 10 -%endif - pmulhw m2, m3 - paddw m1, m2 - pmulhrsw m%1, m7 - pmulhrsw m1, m7 - packuswb m%1, m1 -%endmacro - -cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask -%define base r6-blend_avx2_table - lea r6, [blend_avx2_table] - tzcnt wd, wm - movifnidn hd, hm - movifnidn maskq, maskmp - movsxd wq, dword [r6+wq*4] - vpbroadcastd m4, [base+pb_64] - vpbroadcastd m5, [base+pw_512] - add wq, r6 - lea r6, [dsq*3] - jmp wq -.w4: - movd xm0, [dstq+dsq*0] - pinsrd xm0, [dstq+dsq*1], 1 - vpbroadcastd xm1, [dstq+dsq*2] - pinsrd xm1, [dstq+r6 ], 3 - mova xm6, [maskq] - psubb xm3, xm4, xm6 - punpcklbw xm2, xm3, xm6 - punpckhbw xm3, xm6 - mova xm6, [tmpq] - add maskq, 4*4 - add tmpq, 4*4 - punpcklbw xm0, xm6 - punpckhbw xm1, xm6 - pmaddubsw xm0, xm2 - pmaddubsw xm1, xm3 - pmulhrsw xm0, xm5 - pmulhrsw xm1, xm5 - packuswb xm0, xm1 - movd [dstq+dsq*0], xm0 - pextrd [dstq+dsq*1], xm0, 1 - pextrd [dstq+dsq*2], xm0, 2 - pextrd [dstq+r6 ], xm0, 3 - lea dstq, [dstq+dsq*4] - sub hd, 4 - jg .w4 - RET -ALIGN function_align -.w8: - movq xm1, [dstq+dsq*0] - movhps xm1, [dstq+dsq*1] - vpbroadcastq m2, [dstq+dsq*2] - vpbroadcastq m3, [dstq+r6 ] - mova m0, [maskq] - mova m6, [tmpq] - add maskq, 8*4 - add tmpq, 8*4 - vpblendd m1, m2, 0x30 - vpblendd m1, m3, 0xc0 - psubb m3, m4, m0 - punpcklbw m2, m3, m0 - punpckhbw m3, m0 - punpcklbw m0, m1, m6 - punpckhbw m1, m6 - pmaddubsw m0, m2 - pmaddubsw m1, m3 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movq [dstq+dsq*0], xm0 - movhps [dstq+dsq*1], xm0 - movq [dstq+dsq*2], xm1 - movhps [dstq+r6 ], xm1 - lea dstq, [dstq+dsq*4] - sub hd, 4 - jg .w8 - RET -ALIGN function_align -.w16: - mova m0, [maskq] - mova xm1, [dstq+dsq*0] - vinserti128 m1, [dstq+dsq*1], 1 - psubb m3, m4, m0 - punpcklbw m2, m3, m0 - punpckhbw m3, m0 - mova m6, [tmpq] - add maskq, 16*2 - add tmpq, 16*2 - punpcklbw m0, m1, m6 - punpckhbw m1, m6 - pmaddubsw m0, m2 - pmaddubsw m1, m3 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], m0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .w16 - RET -ALIGN function_align -.w32: - mova m0, [maskq] - mova m1, [dstq] - mova m6, [tmpq] - add maskq, 32 - add tmpq, 32 - psubb m3, m4, m0 - punpcklbw m2, m3, m0 - punpckhbw m3, m0 - punpcklbw m0, m1, m6 - punpckhbw m1, m6 - pmaddubsw m0, m2 - pmaddubsw m1, m3 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 - mova [dstq], m0 - add dstq, dsq - dec hd - jg .w32 - RET - -cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask -%define base r5-blend_v_avx2_table - lea r5, [blend_v_avx2_table] - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, dword [r5+wq*4] - vpbroadcastd m5, [base+pw_512] - add wq, r5 - add maskq, obmc_masks-blend_v_avx2_table - jmp wq -.w2: - vpbroadcastd xm2, [maskq+2*2] -.w2_s0_loop: - movd xm0, [dstq+dsq*0] - pinsrw xm0, [dstq+dsq*1], 1 - movd xm1, [tmpq] - add tmpq, 2*2 - punpcklbw xm0, xm1 - pmaddubsw xm0, xm2 - pmulhrsw xm0, xm5 - packuswb xm0, xm0 - pextrw [dstq+dsq*0], xm0, 0 - pextrw [dstq+dsq*1], xm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .w2_s0_loop - RET -ALIGN function_align -.w4: - vpbroadcastq xm2, [maskq+4*2] -.w4_loop: - movd xm0, [dstq+dsq*0] - pinsrd xm0, [dstq+dsq*1], 1 - movq xm1, [tmpq] - add tmpq, 4*2 - punpcklbw xm0, xm1 - pmaddubsw xm0, xm2 - pmulhrsw xm0, xm5 
- packuswb xm0, xm0 - movd [dstq+dsq*0], xm0 - pextrd [dstq+dsq*1], xm0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .w4_loop - RET -ALIGN function_align -.w8: - vbroadcasti128 m4, [maskq+8*2] -.w8_loop: - vpbroadcastq m2, [dstq+dsq*0] - movq xm0, [dstq+dsq*1] - vpblendd m0, m2, 0x30 - movq xm1, [tmpq+8*1] - vinserti128 m1, [tmpq+8*0], 1 - add tmpq, 8*2 - punpcklbw m0, m1 - pmaddubsw m0, m4 - pmulhrsw m0, m5 - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - movhps [dstq+dsq*0], xm0 - movq [dstq+dsq*1], xm0 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .w8_loop - RET -ALIGN function_align -.w16: - vbroadcasti128 m3, [maskq+16*2] - vbroadcasti128 m4, [maskq+16*3] -.w16_loop: - mova xm1, [dstq+dsq*0] - vinserti128 m1, [dstq+dsq*1], 1 - mova m2, [tmpq] - add tmpq, 16*2 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m4 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], m0, 1 - lea dstq, [dstq+dsq*2] - sub hd, 2 - jg .w16_loop - RET -ALIGN function_align -.w32: - mova xm3, [maskq+16*4] - vinserti128 m3, [maskq+16*6], 1 - mova xm4, [maskq+16*5] - vinserti128 m4, [maskq+16*7], 1 -.w32_loop: - mova m1, [dstq] - mova m2, [tmpq] - add tmpq, 32 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m4 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 - mova [dstq], m0 - add dstq, dsq - dec hd - jg .w32_loop - RET - -cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask -%define base r5-blend_h_avx2_table - lea r5, [blend_h_avx2_table] - mov r6d, wd - tzcnt wd, wd - mov hd, hm - movsxd wq, dword [r5+wq*4] - vpbroadcastd m5, [base+pw_512] - add wq, r5 - lea maskq, [base+obmc_masks+hq*2] - lea hd, [hq*3] - shr hd, 2 ; h * 3/4 - lea maskq, [maskq+hq*2] - neg hq - jmp wq -.w2: - movd xm0, [dstq+dsq*0] - pinsrw xm0, [dstq+dsq*1], 1 - movd xm2, [maskq+hq*2] - movd xm1, [tmpq] - add tmpq, 2*2 - punpcklwd xm2, xm2 - punpcklbw xm0, xm1 - pmaddubsw xm0, xm2 - pmulhrsw xm0, xm5 - packuswb xm0, xm0 - pextrw [dstq+dsq*0], xm0, 0 - pextrw [dstq+dsq*1], xm0, 1 - lea dstq, [dstq+dsq*2] - add hq, 2 - jl .w2 - RET -ALIGN function_align -.w4: - mova xm3, [blend_shuf] -.w4_loop: - movd xm0, [dstq+dsq*0] - pinsrd xm0, [dstq+dsq*1], 1 - movd xm2, [maskq+hq*2] - movq xm1, [tmpq] - add tmpq, 4*2 - pshufb xm2, xm3 - punpcklbw xm0, xm1 - pmaddubsw xm0, xm2 - pmulhrsw xm0, xm5 - packuswb xm0, xm0 - movd [dstq+dsq*0], xm0 - pextrd [dstq+dsq*1], xm0, 1 - lea dstq, [dstq+dsq*2] - add hq, 2 - jl .w4_loop - RET -ALIGN function_align -.w8: - vbroadcasti128 m4, [blend_shuf] - shufpd m4, m4, 0x03 -.w8_loop: - vpbroadcastq m1, [dstq+dsq*0] - movq xm0, [dstq+dsq*1] - vpblendd m0, m1, 0x30 - vpbroadcastd m3, [maskq+hq*2] - movq xm1, [tmpq+8*1] - vinserti128 m1, [tmpq+8*0], 1 - add tmpq, 8*2 - pshufb m3, m4 - punpcklbw m0, m1 - pmaddubsw m0, m3 - pmulhrsw m0, m5 - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - movhps [dstq+dsq*0], xm0 - movq [dstq+dsq*1], xm0 - lea dstq, [dstq+dsq*2] - add hq, 2 - jl .w8_loop - RET -ALIGN function_align -.w16: - vbroadcasti128 m4, [blend_shuf] - shufpd m4, m4, 0x0c -.w16_loop: - mova xm1, [dstq+dsq*0] - vinserti128 m1, [dstq+dsq*1], 1 - vpbroadcastd m3, [maskq+hq*2] - mova m2, [tmpq] - add tmpq, 16*2 - pshufb m3, m4 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 - mova [dstq+dsq*0], xm0 - vextracti128 [dstq+dsq*1], m0, 1 - lea dstq, [dstq+dsq*2] - add hq, 2 - jl .w16_loop - RET -ALIGN function_align -.w32: ; w32/w64/w128 
- sub dsq, r6 -.w32_loop0: - vpbroadcastw m3, [maskq+hq*2] - mov wd, r6d -.w32_loop: - mova m1, [dstq] - mova m2, [tmpq] - add tmpq, 32 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - packuswb m0, m1 - mova [dstq], m0 - add dstq, 32 - sub wd, 32 - jg .w32_loop - add dstq, dsq - inc hq - jl .w32_loop0 - RET - -cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ - bottomext, rightext - ; we assume that the buffer (stride) is larger than width, so we can - ; safely overwrite by a few bytes - - ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) - xor r12d, r12d - lea r10, [ihq-1] - cmp yq, ihq - cmovs r10, yq - test yq, yq - cmovs r10, r12 - imul r10, sstrideq - add srcq, r10 - - ; ref += iclip(x, 0, iw - 1) - lea r10, [iwq-1] - cmp xq, iwq - cmovs r10, xq - test xq, xq - cmovs r10, r12 - add srcq, r10 - - ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) - lea bottomextq, [yq+bhq] - sub bottomextq, ihq - lea r3, [bhq-1] - cmovs bottomextq, r12 - - DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ - bottomext, rightext - - ; top_ext = iclip(-y, 0, bh - 1) - neg topextq - cmovs topextq, r12 - cmp bottomextq, bhq - cmovns bottomextq, r3 - cmp topextq, bhq - cmovg topextq, r3 - - ; right_ext = iclip(x + bw - iw, 0, bw - 1) - lea rightextq, [xq+bwq] - sub rightextq, iwq - lea r2, [bwq-1] - cmovs rightextq, r12 - - DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ - bottomext, rightext - - ; left_ext = iclip(-x, 0, bw - 1) - neg leftextq - cmovs leftextq, r12 - cmp rightextq, bwq - cmovns rightextq, r2 - cmp leftextq, bwq - cmovns leftextq, r2 - - DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ - dst, dstride, src, sstride, bottomext, rightext - - ; center_h = bh - top_ext - bottom_ext - lea r3, [bottomextq+topextq] - sub centerhq, r3 - - ; blk += top_ext * PXSTRIDE(dst_stride) - mov r2, topextq - imul r2, dstrideq - add dstq, r2 - mov r9m, dstq - - ; center_w = bw - left_ext - right_ext - mov centerwq, bwq - lea r3, [rightextq+leftextq] - sub centerwq, r3 - -%macro v_loop 3 ; need_left_ext, need_right_ext, suffix -.v_loop_%3: -%if %1 - ; left extension - xor r3, r3 - vpbroadcastb m0, [srcq] -.left_loop_%3: - mova [dstq+r3], m0 - add r3, 32 - cmp r3, leftextq - jl .left_loop_%3 - - ; body - lea r12, [dstq+leftextq] -%endif - xor r3, r3 -.body_loop_%3: - movu m0, [srcq+r3] -%if %1 - movu [r12+r3], m0 -%else - movu [dstq+r3], m0 -%endif - add r3, 32 - cmp r3, centerwq - jl .body_loop_%3 - -%if %2 - ; right extension -%if %1 - add r12, centerwq -%else - lea r12, [dstq+centerwq] -%endif - xor r3, r3 - vpbroadcastb m0, [srcq+centerwq-1] -.right_loop_%3: - movu [r12+r3], m0 - add r3, 32 - cmp r3, rightextq - jl .right_loop_%3 - -%endif - add dstq, dstrideq - add srcq, sstrideq - dec centerhq - jg .v_loop_%3 -%endmacro - - test leftextq, leftextq - jnz .need_left_ext - test rightextq, rightextq - jnz .need_right_ext - v_loop 0, 0, 0 - jmp .body_done - -.need_left_ext: - test rightextq, rightextq - jnz .need_left_right_ext - v_loop 1, 0, 1 - jmp .body_done - -.need_left_right_ext: - v_loop 1, 1, 2 - jmp .body_done - -.need_right_ext: - v_loop 0, 1, 3 - -.body_done: - ; bottom edge extension - test bottomextq, bottomextq - jz .top - mov srcq, dstq - sub srcq, dstrideq - xor r1, r1 -.bottom_x_loop: - mova m0, [srcq+r1] - lea r3, [dstq+r1] - mov r4, bottomextq -.bottom_y_loop: - mova [r3], m0 - add r3, dstrideq - dec r4 - jg .bottom_y_loop - add r1, 32 - cmp 
r1, bwq - jl .bottom_x_loop - -.top: - ; top edge extension - test topextq, topextq - jz .end - mov srcq, r9m - mov dstq, dstm - xor r1, r1 -.top_x_loop: - mova m0, [srcq+r1] - lea r3, [dstq+r1] - mov r4, topextq -.top_y_loop: - mova [r3], m0 - add r3, dstrideq - dec r4 - jg .top_y_loop - add r1, 32 - cmp r1, bwq - jl .top_x_loop - -.end: - RET - -cextern resize_filter - -INIT_YMM avx2 -cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ - dst_w, h, src_w, dx, mx0 - sub dword mx0m, 4<<14 - sub dword src_wm, 8 - vpbroadcastd m5, dxm - vpbroadcastd m8, mx0m - vpbroadcastd m6, src_wm - - DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr - LEA r7, $$ -%define base r7-$$ - - vpbroadcastd m3, [base+pw_m256] - vpbroadcastd m7, [base+pd_63] - vbroadcasti128 m15, [base+pb_8x0_8x8] - pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] - pslld m5, 3 ; dx*8 - pslld m6, 14 - paddd m8, m2 ; mx+[0..7]*dx - pxor m2, m2 - - ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 - ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8 - -.loop_y: - xor xd, xd - mova m4, m8 ; per-line working version of mx - -.loop_x: - pmaxsd m0, m4, m2 - psrad m9, m4, 8 ; filter offset (unmasked) - pminsd m0, m6 ; iclip(mx, 0, src_w-8) - psubd m1, m4, m0 ; pshufb offset - psrad m0, 14 ; clipped src_x offset - psrad m1, 14 ; pshufb edge_emu offset - pand m9, m7 ; filter offset (masked) - - ; load source pixels - this ugly code is vpgatherdq emulation since - ; directly using vpgatherdq on Haswell is quite a bit slower :( - movd r8d, xm0 - pextrd r9d, xm0, 1 - pextrd r10d, xm0, 2 - pextrd r11d, xm0, 3 - vextracti128 xm0, m0, 1 - movq xm12, [srcq+r8] - movq xm13, [srcq+r10] - movhps xm12, [srcq+r9] - movhps xm13, [srcq+r11] - movd r8d, xm0 - pextrd r9d, xm0, 1 - pextrd r10d, xm0, 2 - pextrd r11d, xm0, 3 - vinserti128 m12, [srcq+r8], 1 - vinserti128 m13, [srcq+r10], 1 - vpbroadcastq m10, [srcq+r9] - vpbroadcastq m11, [srcq+r11] - vpblendd m12, m12, m10, 11000000b - vpblendd m13, m13, m11, 11000000b - - ; if no emulation is required, we don't need to shuffle or emulate edges - ; this also saves 2 quasi-vpgatherdqs - vptest m1, m1 - jz .filter - - movd r8d, xm1 - pextrd r9d, xm1, 1 - pextrd r10d, xm1, 2 - pextrd r11d, xm1, 3 - movsxd r8, r8d - movsxd r9, r9d - movsxd r10, r10d - movsxd r11, r11d - vextracti128 xm1, m1, 1 - movq xm14, [base+resize_shuf+4+r8] - movq xm0, [base+resize_shuf+4+r10] - movhps xm14, [base+resize_shuf+4+r9] - movhps xm0, [base+resize_shuf+4+r11] - movd r8d, xm1 - pextrd r9d, xm1, 1 - pextrd r10d, xm1, 2 - pextrd r11d, xm1, 3 - movsxd r8, r8d - movsxd r9, r9d - movsxd r10, r10d - movsxd r11, r11d - vinserti128 m14, [base+resize_shuf+4+r8], 1 - vinserti128 m0, [base+resize_shuf+4+r10], 1 - vpbroadcastq m10, [base+resize_shuf+4+r9] - vpbroadcastq m11, [base+resize_shuf+4+r11] - vpblendd m14, m14, m10, 11000000b - vpblendd m0, m0, m11, 11000000b - - paddb m14, m15 - paddb m0, m15 - pshufb m12, m14 - pshufb m13, m0 - -.filter: - movd r8d, xm9 - pextrd r9d, xm9, 1 - pextrd r10d, xm9, 2 - pextrd r11d, xm9, 3 - vextracti128 xm9, m9, 1 - movq xm10, [base+resize_filter+r8*8] - movq xm11, [base+resize_filter+r10*8] - movhps xm10, [base+resize_filter+r9*8] - movhps xm11, [base+resize_filter+r11*8] - movd r8d, xm9 - pextrd r9d, xm9, 1 - pextrd r10d, xm9, 2 - pextrd r11d, xm9, 3 - vinserti128 m10, [base+resize_filter+r8*8], 1 - vinserti128 m11, [base+resize_filter+r10*8], 1 - vpbroadcastq m14, [base+resize_filter+r9*8] - vpbroadcastq m1, [base+resize_filter+r11*8] - vpblendd m10, 
m10, m14, 11000000b - vpblendd m11, m11, m1, 11000000b - - pmaddubsw m12, m10 - pmaddubsw m13, m11 - phaddw m12, m13 - vextracti128 xm13, m12, 1 - phaddsw xm12, xm13 - pmulhrsw xm12, xm3 ; x=(x+64)>>7 - packuswb xm12, xm12 - movq [dstq+xq], xm12 - - paddd m4, m5 - add xd, 8 - cmp xd, dst_wd - jl .loop_x - - add dstq, dst_strideq - add srcq, src_strideq - dec hd - jg .loop_y - RET - -INIT_YMM avx2 -PREP_BILIN -PREP_8TAP -AVG_FN -W_AVG_FN -MASK_FN - -cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 -%define base r7-w_mask_420_avx2_table - lea r7, [w_mask_420_avx2_table] - tzcnt wd, wm - mov r6d, r7m ; sign - movifnidn hd, hm - movsxd wq, [r7+wq*4] - vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 - vpbroadcastd m7, [base+pw_2048] - pmovzxbd m9, [base+deint_shuf4] - vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign - add wq, r7 - W_MASK 0, 4, 0, 1 - mov maskq, maskmp - lea stride3q, [strideq*3] - jmp wq -.w4: - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q ], xm1, 1 - cmp hd, 8 - jl .w4_end - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 - jg .w4_h16 -.w4_end: - vextracti128 xm0, m4, 1 - vpblendd xm1, xm4, xm0, 0x05 - vpblendd xm4, xm4, xm0, 0x0a - pshufd xm1, xm1, q2301 - psubw xm4, xm8, xm4 - psubw xm4, xm1 - psrlw xm4, 2 - packuswb xm4, xm4 - movq [maskq], xm4 - RET -.w4_h16: - W_MASK 0, 5, 2, 3 - lea dstq, [dstq+strideq*4] - phaddd m4, m5 - vextracti128 xm1, m0, 1 - psubw m4, m8, m4 - psrlw m4, 2 - vpermd m4, m9, m4 - vextracti128 xm5, m4, 1 - packuswb xm4, xm5 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q], xm1, 1 - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 - mova [maskq], xm4 - RET -.w8_loop: - add tmp1q, 2*32 - add tmp2q, 2*32 - W_MASK 0, 4, 0, 1 - lea dstq, [dstq+strideq*4] - add maskq, 8 -.w8: - vextracti128 xm2, m4, 1 - vextracti128 xm1, m0, 1 - psubw xm4, xm8, xm4 - psubw xm4, xm2 - psrlw xm4, 2 - packuswb xm4, xm4 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xm1 - movq [maskq], xm4 - sub hd, 4 - jg .w8_loop - RET -.w16_loop: - add tmp1q, 4*32 - add tmp2q, 4*32 - W_MASK 0, 4, 0, 1 - lea dstq, [dstq+strideq*4] - add maskq, 16 -.w16: - vpermq m0, m0, q3120 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - W_MASK 0, 5, 2, 3 - punpckhqdq m1, m4, m5 - punpcklqdq m4, m5 - psubw m1, m8, m1 - psubw m1, m4 - psrlw m1, 2 - vpermq m0, m0, q3120 - packuswb m1, m1 - vpermd m1, m9, m1 - mova [dstq+strideq*2], xm0 - vextracti128 [dstq+stride3q ], m0, 1 - mova [maskq], xm1 - sub hd, 4 - jg .w16_loop - RET -.w32_loop: - add tmp1q, 4*32 - add tmp2q, 4*32 - W_MASK 0, 4, 0, 1 - lea dstq, [dstq+strideq*2] - add maskq, 16 -.w32: - vpermq m0, m0, q3120 - mova [dstq+strideq*0], m0 - W_MASK 0, 5, 2, 3 - psubw m4, m8, m4 - psubw m4, m5 - psrlw m4, 2 - vpermq m0, m0, q3120 - packuswb m4, m4 - vpermd m4, m9, m4 - mova [dstq+strideq*1], m0 - mova [maskq], xm4 - sub hd, 2 - jg .w32_loop - RET -.w64_loop_even: - psubw m10, m8, m4 - psubw m11, m8, m5 - dec hd -.w64_loop: - add tmp1q, 4*32 - add tmp2q, 4*32 - W_MASK 0, 4, 0, 1 - add dstq, strideq -.w64: - vpermq m0, m0, q3120 - mova 
[dstq+32*0], m0 - W_MASK 0, 5, 2, 3 - vpermq m0, m0, q3120 - mova [dstq+32*1], m0 - test hd, 1 - jz .w64_loop_even - psubw m4, m10, m4 - psubw m5, m11, m5 - psrlw m4, 2 - psrlw m5, 2 - packuswb m4, m5 - vpermd m4, m9, m4 - mova [maskq], m4 - add maskq, 32 - dec hd - jg .w64_loop - RET -.w128_loop_even: - psubw m12, m8, m4 - psubw m13, m8, m5 - dec hd -.w128_loop: - W_MASK 0, 4, 0, 1 - add dstq, strideq -.w128: - vpermq m0, m0, q3120 - mova [dstq+32*0], m0 - W_MASK 0, 5, 2, 3 - vpermq m0, m0, q3120 - mova [dstq+32*1], m0 - add tmp1q, 8*32 - add tmp2q, 8*32 - test hd, 1 - jz .w128_even - psubw m4, m10, m4 - psubw m5, m11, m5 - psrlw m4, 2 - psrlw m5, 2 - packuswb m4, m5 - vpermd m4, m9, m4 - mova [maskq+32*0], m4 - jmp .w128_odd -.w128_even: - psubw m10, m8, m4 - psubw m11, m8, m5 -.w128_odd: - W_MASK 0, 4, -4, -3 - vpermq m0, m0, q3120 - mova [dstq+32*2], m0 - W_MASK 0, 5, -2, -1 - vpermq m0, m0, q3120 - mova [dstq+32*3], m0 - test hd, 1 - jz .w128_loop_even - psubw m4, m12, m4 - psubw m5, m13, m5 - psrlw m4, 2 - psrlw m5, 2 - packuswb m4, m5 - vpermd m4, m9, m4 - mova [maskq+32*1], m4 - add maskq, 64 - dec hd - jg .w128_loop - RET - -cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 -%define base r7-w_mask_422_avx2_table - lea r7, [w_mask_422_avx2_table] - tzcnt wd, wm - mov r6d, r7m ; sign - movifnidn hd, hm - pxor m9, m9 - movsxd wq, dword [r7+wq*4] - vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 - vpbroadcastd m7, [base+pw_2048] - pmovzxbd m10, [base+deint_shuf4] - vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign - add wq, r7 - mov maskq, maskmp - W_MASK 0, 4, 0, 1 - lea stride3q, [strideq*3] - jmp wq -.w4: - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q ], xm1, 1 - cmp hd, 8 - jl .w4_end - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 - jg .w4_h16 -.w4_end: - vextracti128 xm5, m4, 1 - packuswb xm4, xm5 - psubb xm5, xm8, xm4 - pavgb xm5, xm9 - pshufd xm5, xm5, q3120 - mova [maskq], xm5 - RET -.w4_h16: - W_MASK 0, 5, 2, 3 - lea dstq, [dstq+strideq*4] - packuswb m4, m5 - psubb m5, m8, m4 - pavgb m5, m9 - vpermd m5, m10, m5 - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q ], xm1, 1 - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 - mova [maskq], m5 - RET -.w8_loop: - add tmp1q, 32*2 - add tmp2q, 32*2 - W_MASK 0, 4, 0, 1 - lea dstq, [dstq+strideq*4] - add maskq, 16 -.w8: - vextracti128 xm5, m4, 1 - vextracti128 xm1, m0, 1 - packuswb xm4, xm5 - psubb xm5, xm8, xm4 - pavgb xm5, xm9 - pshufd xm5, xm5, q3120 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xm1 - mova [maskq], xm5 - sub hd, 4 - jg .w8_loop - RET -.w16_loop: - add tmp1q, 32*4 - add tmp2q, 32*4 - W_MASK 0, 4, 0, 1 - lea dstq, [dstq+strideq*4] - add maskq, 32 -.w16: - vpermq m0, m0, q3120 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - W_MASK 0, 5, 2, 3 - packuswb m4, m5 - psubb m5, m8, m4 - pavgb m5, m9 - vpermq m0, m0, q3120 - vpermd m5, m10, m5 - mova [dstq+strideq*2], xm0 - vextracti128 [dstq+stride3q ], m0, 1 - mova [maskq], m5 - sub hd, 4 - jg .w16_loop - RET -.w32_loop: - add 
tmp1q, 32*4 - add tmp2q, 32*4 - W_MASK 0, 4, 0, 1 - lea dstq, [dstq+strideq*2] - add maskq, 32 -.w32: - vpermq m0, m0, q3120 - mova [dstq+strideq*0], m0 - W_MASK 0, 5, 2, 3 - packuswb m4, m5 - psubb m5, m8, m4 - pavgb m5, m9 - vpermq m0, m0, q3120 - vpermd m5, m10, m5 - mova [dstq+strideq*1], m0 - mova [maskq], m5 - sub hd, 2 - jg .w32_loop - RET -.w64_loop: - add tmp1q, 32*4 - add tmp2q, 32*4 - W_MASK 0, 4, 0, 1 - add dstq, strideq - add maskq, 32 -.w64: - vpermq m0, m0, q3120 - mova [dstq+32*0], m0 - W_MASK 0, 5, 2, 3 - packuswb m4, m5 - psubb m5, m8, m4 - pavgb m5, m9 - vpermq m0, m0, q3120 - vpermd m5, m10, m5 - mova [dstq+32*1], m0 - mova [maskq], m5 - dec hd - jg .w64_loop - RET -.w128_loop: - add tmp1q, 32*8 - add tmp2q, 32*8 - W_MASK 0, 4, 0, 1 - add dstq, strideq - add maskq, 32*2 -.w128: - vpermq m0, m0, q3120 - mova [dstq+32*0], m0 - W_MASK 0, 5, 2, 3 - packuswb m4, m5 - psubb m5, m8, m4 - pavgb m5, m9 - vpermq m0, m0, q3120 - vpermd m5, m10, m5 - mova [dstq+32*1], m0 - mova [maskq+32*0], m5 - W_MASK 0, 4, 4, 5 - vpermq m0, m0, q3120 - mova [dstq+32*2], m0 - W_MASK 0, 5, 6, 7 - packuswb m4, m5 - psubb m5, m8, m4 - pavgb m5, m9 - vpermq m0, m0, q3120 - vpermd m5, m10, m5 - mova [dstq+32*3], m0 - mova [maskq+32*1], m5 - dec hd - jg .w128_loop - RET - -cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 -%define base r7-w_mask_444_avx2_table - lea r7, [w_mask_444_avx2_table] - tzcnt wd, wm - movifnidn hd, hm - mov maskq, maskmp - movsxd wq, dword [r7+wq*4] - vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 - vpbroadcastd m5, [base+pb_64] - vpbroadcastd m7, [base+pw_2048] - add wq, r7 - W_MASK 0, 4, 0, 1, 1 - lea stride3q, [strideq*3] - jmp wq -.w4: - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q ], xm1, 1 - mova [maskq+32*0], m4 - cmp hd, 8 - jl .w4_end - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 - je .w4_end - W_MASK 0, 4, 2, 3, 1 - lea dstq, [dstq+strideq*4] - vextracti128 xm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xm1 - pextrd [dstq+stride3q ], xm1, 1 - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xm1, 2 - pextrd [dstq+stride3q ], xm1, 3 - mova [maskq+32*1], m4 -.w4_end: - RET -.w8_loop: - add tmp1q, 32*2 - add tmp2q, 32*2 - W_MASK 0, 4, 0, 1, 1 - lea dstq, [dstq+strideq*4] - add maskq, 32 -.w8: - vextracti128 xm1, m0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xm1 - mova [maskq], m4 - sub hd, 4 - jg .w8_loop - RET -.w16_loop: - add tmp1q, 32*2 - add tmp2q, 32*2 - W_MASK 0, 4, 0, 1, 1 - lea dstq, [dstq+strideq*2] - add maskq, 32 -.w16: - vpermq m0, m0, q3120 - mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - mova [maskq], m4 - sub hd, 2 - jg .w16_loop - RET -.w32_loop: - add tmp1q, 32*2 - add tmp2q, 32*2 - W_MASK 0, 4, 0, 1, 1 - add dstq, strideq - add maskq, 32 -.w32: - vpermq m0, m0, q3120 - mova [dstq], m0 - mova [maskq], m4 - dec hd - jg .w32_loop - RET -.w64_loop: - add tmp1q, 32*4 - add tmp2q, 32*4 - W_MASK 0, 4, 0, 1, 1 - add dstq, strideq - add maskq, 32*2 -.w64: - vpermq m0, m0, q3120 - mova [dstq+32*0], m0 - mova [maskq+32*0], m4 - W_MASK 0, 4, 2, 3, 1 - vpermq m0, m0, q3120 - mova [dstq+32*1], m0 - 
mova [maskq+32*1], m4 - dec hd - jg .w64_loop - RET -.w128_loop: - add tmp1q, 32*8 - add tmp2q, 32*8 - W_MASK 0, 4, 0, 1, 1 - add dstq, strideq - add maskq, 32*4 -.w128: - vpermq m0, m0, q3120 - mova [dstq+32*0], m0 - mova [maskq+32*0], m4 - W_MASK 0, 4, 2, 3, 1 - vpermq m0, m0, q3120 - mova [dstq+32*1], m0 - mova [maskq+32*1], m4 - W_MASK 0, 4, 4, 5, 1 - vpermq m0, m0, q3120 - mova [dstq+32*2], m0 - mova [maskq+32*2], m4 - W_MASK 0, 4, 6, 7, 1 - vpermq m0, m0, q3120 - mova [dstq+32*3], m0 - mova [maskq+32*3], m4 - dec hd - jg .w128_loop - RET - -%if HAVE_AVX512ICL -INIT_ZMM avx512icl -PREP_BILIN -PREP_8TAP -AVG_FN -W_AVG_FN -MASK_FN - -cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 -%define base r7-w_mask_420_avx512icl_table - lea r7, [w_mask_420_avx512icl_table] - tzcnt wd, wm - mov r6d, r7m ; sign - movifnidn hd, hm - movsxd wq, [r7+wq*4] - vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 - vpbroadcastd m7, [base+pw_2048] - vpbroadcastd m9, [base+pb_m64] ; -1 << 6 - mova ym10, [base+wm_420_mask+32] - vpbroadcastd m8, [base+wm_sign_avx512+r6*8] ; (258 - sign) << 6 - add wq, r7 - mov maskq, maskmp - lea stride3q, [strideq*3] - jmp wq -.w4: - mova m5, [wm_420_perm4] - cmp hd, 8 - jg .w4_h16 - WRAP_YMM W_MASK 0, 4, 0, 1 - vinserti128 ym5, [wm_420_perm4+32], 1 - vpermb ym4, ym5, ym4 - vpdpbusd ym8, ym4, ym9 - vextracti128 xmm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 - jl .w4_end - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 -.w4_end: - vpermb ym8, ym10, ym8 - movq [maskq], xm8 - RET -.w4_h16: - vpbroadcastd m11, strided - pmulld m11, [bidir_sctr_w4] - W_MASK 0, 4, 0, 1 - vpermb m4, m5, m4 - vpdpbusd m8, m4, m9 - kxnorw k1, k1, k1 - vpermb m8, m10, m8 - mova [maskq], xm8 - vpscatterdd [dstq+m11]{k1}, m0 - RET -.w8: - mova m5, [wm_420_perm8] - cmp hd, 4 - jne .w8_h8 - WRAP_YMM W_MASK 0, 4, 0, 1 - vinserti128 ym5, [wm_420_perm8+32], 1 - vpermb ym4, ym5, ym4 - vpdpbusd ym8, ym4, ym9 - vpermb m8, m10, m8 - mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 - RET -.w8_loop: - add tmp1q, 128 - add tmp2q, 128 - add maskq, 16 - lea dstq, [dstq+strideq*4] -.w8_h8: - W_MASK 0, 4, 0, 1 - vpermb m4, m5, m4 - mova m1, m8 - vpdpbusd m1, m4, m9 - vpermb m1, m10, m1 - mova [maskq], xm1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 - lea dstq, [dstq+strideq*4] - movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 - sub hd, 8 - jg .w8_loop - RET -.w16: - mova m5, [wm_420_perm16] -.w16_loop: - W_MASK 0, 4, 0, 1 - vpermb m4, m5, m4 - mova m1, m8 - vpdpbusd m1, m4, m9 - add tmp1q, 128 - add tmp2q, 128 - vpermb m1, m10, m1 - vpermq m0, m0, q3120 - mova [maskq], xm1 - add maskq, 16 - mova [dstq+strideq*0], xm0 - vextracti32x4 [dstq+strideq*1], m0, 2 - vextracti32x4 [dstq+strideq*2], ym0, 1 - vextracti32x4 [dstq+stride3q ], m0, 3 - lea dstq, [dstq+strideq*4] - sub hd, 4 - jg .w16_loop - RET -.w32: - pmovzxbq m5, [warp_8x8_shufA] -.w32_loop: - W_MASK 0, 4, 0, 1 - mova m1, m8 - vpdpbusd m1, m4, m9 - add tmp1q, 128 - add 
tmp2q, 128 - vpermb m1, m10, m1 - vpermq m0, m5, m0 - mova [maskq], xm1 - add maskq, 16 - mova [dstq+strideq*0], ym0 - vextracti32x8 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w32_loop - RET -.w64: - pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 - psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 -.w64_loop: - W_MASK 0, 4, 0, 2 - W_MASK 11, 5, 1, 3 - mova m2, m8 - vpdpbusd m2, m4, m9 - mova m3, m8 - vpdpbusd m3, m5, m9 - add tmp1q, 256 - add tmp2q, 256 - vpermt2b m2, m10, m3 - mova m1, m0 - vpermt2q m0, m12, m11 - vpermt2q m1, m13, m11 - mova [maskq], ym2 - add maskq, 32 - mova [dstq+strideq*0], m0 - mova [dstq+strideq*1], m1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w64_loop - RET -.w128: - pmovzxbq m14, [wm_420_perm64] - mova m10, [wm_420_mask] - psrlq m15, m14, 4 -.w128_loop: - W_MASK 0, 12, 0, 4 - W_MASK 11, 13, 1, 5 - mova m4, m8 - vpdpbusd m4, m12, m9 - mova m5, m8 - vpdpbusd m5, m13, m9 - mova m1, m0 - vpermt2q m0, m14, m11 - vpermt2q m1, m15, m11 - mova [dstq+strideq*0+64*0], m0 - mova [dstq+strideq*1+64*0], m1 - W_MASK 0, 12, 2, 6 - W_MASK 11, 13, 3, 7 - vprold m4, 16 - vprold m5, 16 - vpdpbusd m4, m12, m9 - vpdpbusd m5, m13, m9 - add tmp1q, 512 - add tmp2q, 512 - vpermt2b m4, m10, m5 - mova m1, m0 - vpermt2q m0, m14, m11 - vpermt2q m1, m15, m11 - mova [maskq], m4 - add maskq, 64 - mova [dstq+strideq*0+64*1], m0 - mova [dstq+strideq*1+64*1], m1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w128_loop - RET - -cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 -%define base r7-w_mask_422_avx512icl_table - lea r7, [w_mask_422_avx512icl_table] - tzcnt wd, wm - mov r6d, r7m ; sign - movifnidn hd, hm - movsxd wq, dword [r7+wq*4] - vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 - vpbroadcastd m7, [base+pw_2048] - vpbroadcastd m9, [base+pw_m128] - mova m10, [base+wm_422_mask] - vpbroadcastd m11, [base+pb_127] - add wq, r7 - vpbroadcastd m8, [base+wm_sign_avx512+4+r6*4] - mov maskq, maskmp - lea stride3q, [strideq*3] - jmp wq -.w4: - cmp hd, 8 - jg .w4_h16 - WRAP_YMM W_MASK 0, 4, 0, 1 - movhps xm10, [wm_422_mask+16] - vpdpwssd ym8, ym4, ym9 - vpermb ym8, ym10, ym8 - vextracti128 xmm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 - jl .w4_end - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 -.w4_end: - pand xm8, xm11 - mova [maskq], xm8 - RET -.w4_h16: - vpbroadcastd m5, strided - pmulld m5, [bidir_sctr_w4] - W_MASK 0, 4, 0, 1 - vpdpwssd m8, m4, m9 - kxnorw k1, k1, k1 - vpermb m8, m10, m8 - pand ym8, ym11 - mova [maskq], ym8 - vpscatterdd [dstq+m5]{k1}, m0 - RET -.w8: - cmp hd, 4 - jne .w8_h8 - WRAP_YMM W_MASK 0, 4, 0, 1 - movhps xm10, [wm_422_mask+16] - vpdpwssd ym8, ym4, ym9 - vpermb ym8, ym10, ym8 - pand xm8, xm11 - mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 - RET -.w8_loop: - add tmp1q, 128 - add tmp2q, 128 - add maskq, 32 - lea dstq, [dstq+strideq*4] -.w8_h8: - W_MASK 0, 4, 0, 1 - mova m1, m8 - vpdpwssd m1, m4, m9 - vpermb m1, m10, m1 - pand ym1, ym11 - mova [maskq], ym1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 - lea 
dstq, [dstq+strideq*4] - movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 - sub hd, 8 - jg .w8_loop - RET -.w16_loop: - add tmp1q, 128 - add tmp2q, 128 - add maskq, 32 - lea dstq, [dstq+strideq*4] -.w16: - W_MASK 0, 4, 0, 1 - mova m1, m8 - vpdpwssd m1, m4, m9 - vpermb m1, m10, m1 - vpermq m0, m0, q3120 - pand ym1, ym11 - mova [maskq], ym1 - mova [dstq+strideq*0], xm0 - vextracti32x4 [dstq+strideq*1], m0, 2 - vextracti32x4 [dstq+strideq*2], ym0, 1 - vextracti32x4 [dstq+stride3q ], m0, 3 - sub hd, 4 - jg .w16_loop - RET -.w32: - pmovzxbq m5, [warp_8x8_shufA] -.w32_loop: - W_MASK 0, 4, 0, 1 - mova m1, m8 - vpdpwssd m1, m4, m9 - add tmp1q, 128 - add tmp2q, 128 - vpermb m1, m10, m1 - vpermq m0, m5, m0 - pand ym1, ym11 - mova [maskq], ym1 - add maskq, 32 - mova [dstq+strideq*0], ym0 - vextracti32x8 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w32_loop - RET -.w64: - pmovzxbq m5, [warp_8x8_shufA] -.w64_loop: - W_MASK 0, 4, 0, 1 - mova m1, m8 - vpdpwssd m1, m4, m9 - add tmp1q, 128 - add tmp2q, 128 - vpermb m1, m10, m1 - vpermq m0, m5, m0 - pand ym1, ym11 - mova [maskq], ym1 - add maskq, 32 - mova [dstq], m0 - add dstq, strideq - dec hd - jg .w64_loop - RET -.w128: - pmovzxbq m13, [warp_8x8_shufA] -.w128_loop: - W_MASK 0, 4, 0, 1 - W_MASK 12, 5, 2, 3 - mova m2, m8 - vpdpwssd m2, m4, m9 - mova m3, m8 - vpdpwssd m3, m5, m9 - add tmp1q, 256 - add tmp2q, 256 - vpermt2b m2, m10, m3 - vpermq m0, m13, m0 - vpermq m1, m13, m12 - pand m2, m11 - mova [maskq], m2 - add maskq, 64 - mova [dstq+64*0], m0 - mova [dstq+64*1], m1 - add dstq, strideq - dec hd - jg .w128_loop - RET - -cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 -%define base r7-w_mask_444_avx512icl_table - lea r7, [w_mask_444_avx512icl_table] - tzcnt wd, wm - movifnidn hd, hm - movsxd wq, dword [r7+wq*4] - vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 - vpbroadcastd m5, [base+pb_64] - vpbroadcastd m7, [base+pw_2048] - mova m8, [base+wm_444_mask] - add wq, r7 - mov maskq, maskmp - lea stride3q, [strideq*3] - jmp wq -.w4: - cmp hd, 8 - jg .w4_h16 - WRAP_YMM W_MASK 0, 4, 0, 1, 1 - vinserti128 ym8, [wm_444_mask+32], 1 - vpermb ym4, ym8, ym4 - mova [maskq], ym4 - vextracti128 xmm1, m0, 1 - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 - jl .w4_end - lea dstq, [dstq+strideq*4] - pextrd [dstq+strideq*0], xm0, 2 - pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 -.w4_end: - RET -.w4_h16: - vpbroadcastd m9, strided - pmulld m9, [bidir_sctr_w4] - W_MASK 0, 4, 0, 1, 1 - vpermb m4, m8, m4 - kxnorw k1, k1, k1 - mova [maskq], m4 - vpscatterdd [dstq+m9]{k1}, m0 - RET -.w8: - cmp hd, 4 - jne .w8_h8 - WRAP_YMM W_MASK 0, 4, 0, 1, 1 - vinserti128 ym8, [wm_444_mask+32], 1 - vpermb ym4, ym8, ym4 - mova [maskq], ym4 - vextracti128 xmm1, ym0, 1 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 - RET -.w8_loop: - add tmp1q, 128 - add tmp2q, 128 - add maskq, 64 - lea dstq, [dstq+strideq*4] -.w8_h8: - W_MASK 0, 4, 0, 1, 1 - vpermb m4, m8, m4 - mova [maskq], m4 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 - lea dstq, [dstq+strideq*4] - movhps [dstq+strideq*0], xm0 - movhps 
[dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 - sub hd, 8 - jg .w8_loop - RET -.w16_loop: - add tmp1q, 128 - add tmp2q, 128 - add maskq, 64 - lea dstq, [dstq+strideq*4] -.w16: - W_MASK 0, 4, 0, 1, 1 - vpermb m4, m8, m4 - vpermq m0, m0, q3120 - mova [maskq], m4 - mova [dstq+strideq*0], xm0 - vextracti32x4 [dstq+strideq*1], m0, 2 - vextracti32x4 [dstq+strideq*2], ym0, 1 - vextracti32x4 [dstq+stride3q ], m0, 3 - sub hd, 4 - jg .w16_loop - RET -.w32: - pmovzxbq m9, [warp_8x8_shufA] -.w32_loop: - W_MASK 0, 4, 0, 1, 1 - vpermb m4, m8, m4 - add tmp1q, 128 - add tmp2q, 128 - vpermq m0, m9, m0 - mova [maskq], m4 - add maskq, 64 - mova [dstq+strideq*0], ym0 - vextracti32x8 [dstq+strideq*1], m0, 1 - lea dstq, [dstq+strideq*2] - sub hd, 2 - jg .w32_loop - RET -.w64: - pmovzxbq m9, [warp_8x8_shufA] -.w64_loop: - W_MASK 0, 4, 0, 1, 1 - vpermb m4, m8, m4 - add tmp1q, 128 - add tmp2q, 128 - vpermq m0, m9, m0 - mova [maskq], m4 - add maskq, 64 - mova [dstq], m0 - add dstq, strideq - dec hd - jg .w64_loop - RET -.w128: - pmovzxbq m11, [warp_8x8_shufA] -.w128_loop: - W_MASK 0, 4, 0, 1, 1 - W_MASK 10, 9, 2, 3, 1 - vpermb m4, m8, m4 - vpermb m9, m8, m9 - add tmp1q, 256 - add tmp2q, 256 - vpermq m0, m11, m0 - vpermq m10, m11, m10 - mova [maskq+64*0], m4 - mova [maskq+64*1], m9 - add maskq, 128 - mova [dstq+64*0], m0 - mova [dstq+64*1], m10 - add dstq, strideq - dec hd - jg .w128_loop - RET - -%endif ; HAVE_AVX512ICL - -%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/mc_avx2.asm dav1d-0.9.1/src/x86/mc_avx2.asm --- dav1d-0.7.1/src/x86/mc_avx2.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/mc_avx2.asm 2021-07-28 21:38:28.913852200 +0000 @@ -0,0 +1,5703 @@ +; Copyright © 2018-2020, VideoLAN and dav1d authors +; Copyright © 2018-2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +; dav1d_obmc_masks[] with 64-x interleaved +obmc_masks: db 0, 0, 0, 0 + ; 2 + db 45, 19, 64, 0 + ; 4 + db 39, 25, 50, 14, 59, 5, 64, 0 + ; 8 + db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 + ; 16 + db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 + db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 + ; 32 + db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 + db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 + db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 + db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 + +warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 + db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 +warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 + db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 +subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 +subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 +bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 + db 7, 7, 7, 7, 7, 7, 7, 7 + +wm_420_sign: dd 0x01020102, 0x01010101 +wm_422_sign: dd 0x80808080, 0x7f7f7f7f + +pb_64: times 4 db 64 +pw_m256: times 2 dw -256 +pw_15: times 2 dw 15 +pw_32: times 2 dw 32 +pw_34: times 2 dw 34 +pw_258: times 2 dw 258 +pw_512: times 2 dw 512 +pw_1024: times 2 dw 1024 +pw_2048: times 2 dw 2048 +pw_6903: times 2 dw 6903 +pw_8192: times 2 dw 8192 +pd_32: dd 32 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 +pd_0x3ff: dd 0x3ff +pd_0x4000: dd 0x4000 +pq_0x40000000: dq 0x40000000 + +cextern mc_subpel_filters +cextern mc_warp_filter2 +cextern resize_filter + +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + 
%rotate 1 + %endrep + %endif +%endmacro + +%macro BIDIR_JMP_TABLE 1-* + %xdefine %1_table (%%table - 2*%2) + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%macro SCALED_JMP_TABLE 1-* + %xdefine %1_table (%%table - %2) + %xdefine %%base mangle(private_prefix %+ _%1) +%%table: + %rep %0 - 1 + dw %%base %+ .w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_1024: + %xdefine %1_dy1_table (%%dy_1024 - %2) + %rep %0 - 1 + dw %%base %+ .dy1_w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_2048: + %xdefine %1_dy2_table (%%dy_2048 - %2) + %rep %0 - 1 + dw %%base %+ .dy2_w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put) +%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep) + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 + +SECTION .text + +INIT_XMM avx2 +cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + lea r7, [put_avx2] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + movzx r6d, word [srcq+ssq*0] + movzx r7d, word [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6w + mov [dstq+dsq*1], r7w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +INIT_YMM avx2 +.put_w32: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*0+32*1], m1 + mova [dstq+dsq*1+32*0], m2 + mova [dstq+dsq*1+32*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+32*0] + movu m1, 
[srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + add srcq, ssq + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_shuf8] + add mxyd, 16 + movd xm5, mxyd + mov mxyd, r7m ; my + vpbroadcastw m5, xm5 + test mxyd, mxyd + jnz .hv + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + vpbroadcastd m3, [pw_2048] + add wq, r7 + jmp wq +.h_w2: + movd xm0, [srcq+ssq*0] + pinsrd xm0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 + pmulhrsw xm0, xm3 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + mova xm4, [bilin_h_shuf4] +.h_w4_loop: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 + pmulhrsw xm0, xm3 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pshufb xm1, xm4 + pmaddubsw xm0, xm5 + pmaddubsw xm1, xm5 + pmulhrsw xm0, xm3 + pmulhrsw xm1, xm3 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + movu m1, [srcq+8*4] + movu m2, [srcq+8*5] + add srcq, ssq + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + packuswb m1, m2 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + mov r6, -32*3 +.h_w128_loop: + movu m0, [srcq+r6+32*3+8*0] + movu m1, [srcq+r6+32*3+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+32*3], m0 + add r6, 32 + jle .h_w128_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 255 + vpbroadcastd m5, [pw_2048] + add mxyd, 16 + add wq, r7 + movd xm4, mxyd + vpbroadcastw m4, xm4 + jmp wq +.v_w2: + movd xm0, [srcq+ssq*0] +.v_w2_loop: + pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 + lea srcq, [srcq+ssq*2] + pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 + pshuflw xm1, xm1, q2301 ; 1 0 + punpcklbw xm1, xm0 + pmaddubsw xm1, xm4 + pmulhrsw xm1, xm5 + packuswb xm1, xm1 + pextrw [dstq+dsq*0], xm1, 1 + pextrw [dstq+dsq*1], xm1, 0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + 
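The horizontal bilinear put above encodes both weights in one 16-bit word: imul mxyd, 255 followed by add mxyd, 16 turns the 4-bit fraction mx into (mx << 8) + (16 - mx), so pmaddubsw applied to the (src[x], src[x+1]) byte pairs produced by bilin_h_shuf8 yields (16 - mx)*src[x] + mx*src[x+1] per lane, and pmulhrsw with pw_2048 performs the final (+8) >> 4 rounding from the comment. A scalar C model of the same computation, with illustrative (non-dav1d) names, might look like this:

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the horizontal bilinear put described by the asm comment:
 * dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
 * (function and parameter names are illustrative, not dav1d's API) */
static void put_bilin_h_ref(uint8_t *dst, ptrdiff_t dst_stride,
                            const uint8_t *src, ptrdiff_t src_stride,
                            int w, int h, int mx /* 1..15 */)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
        dst += dst_stride;
        src += src_stride;
    }
}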
movd xm0, [srcq+ssq*0] +.v_w4_loop: + vpbroadcastd xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm1, xm2, xm0, 0x01 ; 0 1 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm2, xm0, 0x02 ; 1 2 + punpcklbw xm1, xm2 + pmaddubsw xm1, xm4 + pmulhrsw xm1, xm5 + packuswb xm1, xm1 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm0, [srcq+ssq*0] +.v_w8_loop: + movq xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw xm1, xm0, xm2 + movq xm0, [srcq+ssq*0] + punpcklbw xm2, xm0 + pmaddubsw xm1, xm4 + pmaddubsw xm2, xm4 + pmulhrsw xm1, xm5 + pmulhrsw xm2, xm5 + packuswb xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu xm0, [srcq+ssq*0] +.v_w16_loop: + vbroadcasti128 m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m2, m3, m0, 0x0f ; 0 1 + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m3, m0, 0xf0 ; 1 2 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +%macro PUT_BILIN_V_W32 0 + movu m0, [srcq+ssq*0] +%%loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m1, m0, m3 + punpckhbw m2, m0, m3 + movu m0, [srcq+ssq*0] + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + pmaddubsw m2, m4 + pmaddubsw m3, m4 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + packuswb m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg %%loop +%endmacro + PUT_BILIN_V_W32 + RET +.v_w64: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] +.v_w64_loop: + add srcq, ssq + movu m3, [srcq+32*0] + punpcklbw m2, m0, m3 + punpckhbw m0, m3 + pmaddubsw m2, m4 + pmaddubsw m0, m4 + pmulhrsw m2, m5 + pmulhrsw m0, m5 + packuswb m2, m0 + mova m0, m3 + movu m3, [srcq+32*1] + mova [dstq+32*0], m2 + punpcklbw m2, m1, m3 + punpckhbw m1, m3 + pmaddubsw m2, m4 + pmaddubsw m1, m4 + pmulhrsw m2, m5 + pmulhrsw m1, m5 + packuswb m2, m1 + mova m1, m3 + mova [dstq+32*1], m2 + add dstq, dsq + dec hd + jg .v_w64_loop + RET +.v_w128: + lea r6d, [hq+(3<<8)] + mov r4, srcq + mov r7, dstq +.v_w128_loop: + PUT_BILIN_V_W32 + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + vpbroadcastd m7, [pw_15] + movd xm6, mxyd + add wq, r7 + paddb m5, m5 + vpbroadcastw m6, xm6 + jmp wq +.hv_w2: + vpbroadcastd xm0, [srcq+ssq*0] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +.hv_w2_loop: + movd xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pinsrd xm1, [srcq+ssq*0], 1 + pshufb xm1, xm4 + pmaddubsw xm1, xm5 ; 1 _ 2 _ + shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ + mova xm0, xm1 + psubw xm1, xm2 + pmulhw xm1, xm6 + pavgw xm2, xm7 + paddw xm1, xm2 + psrlw xm1, 4 + packuswb xm1, xm1 + pextrw [dstq+dsq*0], xm1, 0 + pextrw [dstq+dsq*1], xm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova xm4, [bilin_h_shuf4] + movddup xm0, [srcq+ssq*0] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +.hv_w4_loop: + movq xm1, [srcq+ssq*1] + lea srcq, 
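For the combined h+v (hv) put, the comment splits the rounding so the horizontal stage can keep its fractional bits and the vertical stage absorbs the whole (+128) >> 8; shl mxyd, 11 builds the fixed-point factor consumed by the pmulhw/pavgw/psrlw sequence. A straightforward scalar model of the formula stated in the comment, again with illustrative names and no claim to match the SIMD fixed-point layout exactly:

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the h+v bilinear put: the horizontal stage keeps 4
 * fractional bits, the vertical stage does the final (+128) >> 8 rounding
 * (illustrative names, not dav1d's API). */
static void put_bilin_hv_ref(uint8_t *dst, ptrdiff_t dst_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             int w, int h, int mx, int my)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int a = 16 * src[x] + mx * (src[x + 1] - src[x]);
            const int b = 16 * src[x + src_stride] +
                          mx * (src[x + src_stride + 1] - src[x + src_stride]);
            dst[x] = (uint8_t)((16 * a + my * (b - a) + 128) >> 8);
        }
        dst += dst_stride;
        src += src_stride;
    }
}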
[srcq+ssq*2] + movhps xm1, [srcq+ssq*0] + pshufb xm1, xm4 + pmaddubsw xm1, xm5 ; 1 2 + shufps xm2, xm0, xm1, q1032 ; 0 1 + mova xm0, xm1 + psubw xm1, xm2 + pmulhw xm1, xm6 + pavgw xm2, xm7 + paddw xm1, xm2 + psrlw xm1, 4 + packuswb xm1, xm1 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti128 m0, [srcq+ssq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m1, [srcq+ssq*0], 1 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhw m1, m6 + pavgw m2, m7 + paddw m1, m2 + psrlw m1, 4 + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + movu m0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*0+8*1], 1 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w16_loop: + movu xm2, [srcq+ssq*1+8*0] + vinserti128 m2, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0+8*0] + vinserti128 m3, [srcq+ssq*0+8*1], 1 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + psubw m1, m2, m0 + pmulhw m1, m6 + pavgw m0, m7 + paddw m1, m0 + pmaddubsw m0, m3, m5 + psubw m3, m0, m2 + pmulhw m3, m6 + pavgw m2, m7 + paddw m3, m2 + psrlw m1, 4 + psrlw m3, 4 + packuswb m1, m3 + vpermq m1, m1, q3120 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w128: + lea r6d, [hq+(3<<16)] + jmp .hv_w32_start +.hv_w64: + lea r6d, [hq+(1<<16)] +.hv_w32_start: + mov r4, srcq + mov r7, dstq +.hv_w32: +%if WIN64 + movaps r4m, xmm8 +%endif +.hv_w32_loop0: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w32_loop: + add srcq, ssq + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m8, m2, m0 + pmulhw m8, m6 + pavgw m0, m7 + paddw m8, m0 + mova m0, m2 + psubw m2, m3, m1 + pmulhw m2, m6 + pavgw m1, m7 + paddw m2, m1 + mova m1, m3 + psrlw m8, 4 + psrlw m2, 4 + packuswb m8, m2 + mova [dstq], m8 + add dstq, dsq + dec hd + jg .hv_w32_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<16 + jg .hv_w32_loop0 +%if WIN64 + movaps xmm8, r4m +%endif + RET + +cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep%+SUFFIX] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd xm0, [srcq+strideq*0] + pinsrd xm0, [srcq+strideq*1], 1 + pinsrd xm0, [srcq+strideq*2], 2 + pinsrd xm0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmovzxbw m0, xm0 + psllw m0, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + movq xm1, [srcq+strideq*2] + movhps xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmovzxbw m0, xm0 + pmovzxbw m1, xm1 + psllw m0, 4 + psllw m1, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + pmovzxbw m0, [srcq+strideq*0] + pmovzxbw m1, [srcq+strideq*1] + pmovzxbw m2, [srcq+strideq*2] + pmovzxbw m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw 
m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmovzxbw m0, [srcq+strideq*0+16*0] + pmovzxbw m1, [srcq+strideq*0+16*1] + pmovzxbw m2, [srcq+strideq*1+16*0] + pmovzxbw m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .prep_w32 + RET +.prep_w64: + pmovzxbw m0, [srcq+16*0] + pmovzxbw m1, [srcq+16*1] + pmovzxbw m2, [srcq+16*2] + pmovzxbw m3, [srcq+16*3] + add srcq, strideq + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .prep_w64 + RET +.prep_w128: + pmovzxbw m0, [srcq+16*0] + pmovzxbw m1, [srcq+16*1] + pmovzxbw m2, [srcq+16*2] + pmovzxbw m3, [srcq+16*3] + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + pmovzxbw m0, [srcq+16*4] + pmovzxbw m1, [srcq+16*5] + pmovzxbw m2, [srcq+16*6] + pmovzxbw m3, [srcq+16*7] + add tmpq, 32*8 + add srcq, strideq + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .prep_w128 + RET +.h: + ; 16 * src[x] + (mx * (src[x + 1] - src[x])) + ; = (16 - mx) * src[x] + mx * src[x + 1] + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_shuf8] + add mxyd, 16 + movd xm5, mxyd + mov mxyd, r6m ; my + vpbroadcastw m5, xm5 + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + vbroadcasti128 m4, [bilin_h_shuf4] +.h_w4_loop: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + movq xm1, [srcq+strideq*2] + movhps xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 m0, xm1, 1 + pshufb m0, m4 + pmaddubsw m0, m5 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: +.h_w8_loop: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*2] + vinserti128 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: +.h_w16_loop: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + movu xm1, [srcq+strideq*1+8*0] + vinserti128 m1, [srcq+strideq*1+8*1], 1 + movu xm2, [srcq+strideq*2+8*0] + vinserti128 m2, [srcq+strideq*2+8*1], 1 + movu xm3, [srcq+stride3q +8*0] + vinserti128 m3, [srcq+stride3q +8*1], 1 + lea srcq, [srcq+strideq*4] + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: +.h_w32_loop: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + movu xm1, [srcq+strideq*0+8*2] + vinserti128 m1, [srcq+strideq*0+8*3], 1 + movu xm2, [srcq+strideq*1+8*0] + vinserti128 m2, [srcq+strideq*1+8*1], 1 + movu xm3, [srcq+strideq*1+8*2] + vinserti128 m3, [srcq+strideq*1+8*3], 1 + lea srcq, [srcq+strideq*2] + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb 
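The prep (bidirectional intermediate) variant of the same horizontal filter stores 16-bit values instead of pixels: as the comment notes, no rounding shift is applied, so the output stays at the same pixel << 4 scale that the plain-copy .prep path produces with psllw by 4. A scalar sketch under those assumptions (illustrative names; tmp is treated as packed with stride w, which is how the asm writes it out sequentially):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the prep horizontal bilinear stage: 16-bit output at
 * "pixel << 4" precision, no rounding (illustrative, not dav1d's API). */
static void prep_bilin_h_ref(int16_t *tmp, const uint8_t *src,
                             ptrdiff_t src_stride, int w, int h, int mx)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            tmp[x] = (int16_t)((16 - mx) * src[x] + mx * src[x + 1]);
        tmp += w;
        src += src_stride;
    }
}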
m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .h_w32_loop + RET +.h_w64: + movu xm0, [srcq+8*0] + vinserti128 m0, [srcq+8*1], 1 + movu xm1, [srcq+8*2] + vinserti128 m1, [srcq+8*3], 1 + movu xm2, [srcq+8*4] + vinserti128 m2, [srcq+8*5], 1 + movu xm3, [srcq+8*6] + vinserti128 m3, [srcq+8*7], 1 + add srcq, strideq + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .h_w64 + RET +.h_w128: + movu xm0, [srcq+8*0] + vinserti128 m0, [srcq+8*1], 1 + movu xm1, [srcq+8*2] + vinserti128 m1, [srcq+8*3], 1 + movu xm2, [srcq+8*4] + vinserti128 m2, [srcq+8*5], 1 + movu xm3, [srcq+8*6] + vinserti128 m3, [srcq+8*7], 1 + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + movu xm0, [srcq+8* 8] + vinserti128 m0, [srcq+8* 9], 1 + movu xm1, [srcq+8*10] + vinserti128 m1, [srcq+8*11], 1 + movu xm2, [srcq+8*12] + vinserti128 m2, [srcq+8*13], 1 + movu xm3, [srcq+8*14] + vinserti128 m3, [srcq+8*15], 1 + add tmpq, 32*8 + add srcq, strideq + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .h_w128 + RET +.v: + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + imul mxyd, 255 + add mxyd, 16 + add wq, r6 + lea stride3q, [strideq*3] + movd xm6, mxyd + vpbroadcastw m6, xm6 + jmp wq +.v_w4: + movd xm0, [srcq+strideq*0] +.v_w4_loop: + vpbroadcastd m1, [srcq+strideq*2] + vpbroadcastd xm2, [srcq+strideq*1] + vpbroadcastd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m0, 0x05 ; 0 2 2 2 + vpbroadcastd m0, [srcq+strideq*0] + vpblendd m3, m2, 0x0f ; 1 1 3 3 + vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 + vpblendd m1, m3, 0xaa ; 0 1 2 3 + vpblendd m2, m3, 0x55 ; 1 2 3 4 + punpcklbw m1, m2 + pmaddubsw m1, m6 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq xm0, [srcq+strideq*0] +.v_w8_loop: + vpbroadcastq m1, [srcq+strideq*2] + vpbroadcastq m2, [srcq+strideq*1] + vpbroadcastq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m0, 0x03 ; 0 2 2 2 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m2, m3, 0xcc ; 1 3 1 3 + vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 + vpblendd m2, m1, 0x0f ; 0 2 1 3 + vpblendd m3, m0, 0xc0 ; 1 3 2 4 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m6 + pmaddubsw m2, m6 + mova [tmpq+32*0], m1 + mova [tmpq+32*1], m2 + add tmpq, 32*2 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + vbroadcasti128 m0, [srcq+strideq*0] +.v_w16_loop: + vbroadcasti128 m1, [srcq+strideq*1] + vbroadcasti128 m2, [srcq+strideq*2] + vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + shufpd m4, m0, m2, 0x0c ; 0 2 + vbroadcasti128 m0, [srcq+strideq*0] + shufpd m1, m3, 0x0c ; 1 3 + shufpd m2, m0, 0x0c ; 2 4 + punpcklbw m3, m4, m1 + punpcklbw m5, m1, m2 + punpckhbw m4, m1 + punpckhbw m1, m2 + pmaddubsw m3, m6 + pmaddubsw m5, m6 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + mova [tmpq+32*0], m3 + mova [tmpq+32*1], m5 
+ mova [tmpq+32*2], m4 + mova [tmpq+32*3], m1 + add tmpq, 32*4 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + vpermq m0, [srcq+strideq*0], q3120 +.v_w32_loop: + vpermq m1, [srcq+strideq*1], q3120 + vpermq m2, [srcq+strideq*2], q3120 + vpermq m3, [srcq+stride3q ], q3120 + lea srcq, [srcq+strideq*4] + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + vpermq m0, [srcq+strideq*0], q3120 + pmaddubsw m4, m6 + pmaddubsw m5, m6 + mova [tmpq+32*0], m4 + mova [tmpq+32*1], m5 + punpcklbw m4, m1, m2 + punpckhbw m1, m2 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + punpcklbw m5, m2, m3 + punpckhbw m2, m3 + pmaddubsw m5, m6 + pmaddubsw m2, m6 + mova [tmpq+32*2], m4 + mova [tmpq+32*3], m1 + add tmpq, 32*8 + punpcklbw m1, m3, m0 + punpckhbw m3, m0 + pmaddubsw m1, m6 + pmaddubsw m3, m6 + mova [tmpq-32*4], m5 + mova [tmpq-32*3], m2 + mova [tmpq-32*2], m1 + mova [tmpq-32*1], m3 + sub hd, 4 + jg .v_w32_loop + RET +.v_w64: + vpermq m0, [srcq+strideq*0+32*0], q3120 + vpermq m1, [srcq+strideq*0+32*1], q3120 +.v_w64_loop: + vpermq m2, [srcq+strideq*1+32*0], q3120 + vpermq m3, [srcq+strideq*1+32*1], q3120 + lea srcq, [srcq+strideq*2] + punpcklbw m4, m0, m2 + punpckhbw m0, m2 + pmaddubsw m4, m6 + pmaddubsw m0, m6 + mova [tmpq+32*0], m4 + mova [tmpq+32*1], m0 + punpcklbw m4, m1, m3 + punpckhbw m5, m1, m3 + vpermq m0, [srcq+strideq*0+32*0], q3120 + vpermq m1, [srcq+strideq*0+32*1], q3120 + pmaddubsw m4, m6 + pmaddubsw m5, m6 + mova [tmpq+32*2], m4 + mova [tmpq+32*3], m5 + add tmpq, 32*8 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 + punpcklbw m5, m3, m1 + punpckhbw m3, m1 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + pmaddubsw m5, m6 + pmaddubsw m3, m6 + mova [tmpq-32*4], m4 + mova [tmpq-32*3], m2 + mova [tmpq-32*2], m5 + mova [tmpq-32*1], m3 + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + lea r6d, [hq+(3<<8)] + mov r3, srcq + mov r5, tmpq +.v_w128_loop0: + vpermq m0, [srcq+strideq*0], q3120 +.v_w128_loop: + vpermq m1, [srcq+strideq*1], q3120 + lea srcq, [srcq+strideq*2] + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vpermq m0, [srcq+strideq*0], q3120 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + punpcklbw m4, m1, m0 + punpckhbw m1, m0 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + mova [tmpq+32*0], m2 + mova [tmpq+32*1], m3 + mova [tmpq+32*8], m4 + mova [tmpq+32*9], m1 + add tmpq, 32*16 + sub hd, 2 + jg .v_w128_loop + add r3, 32 + add r5, 64 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .v_w128_loop0 + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 + ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + movd xm6, mxyd + vpbroadcastw m6, xm6 + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.hv_w4: + vbroadcasti128 m4, [bilin_h_shuf4] + vpbroadcastq m0, [srcq+strideq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w4_loop: + movq xm1, [srcq+strideq*1] + movhps xm1, [srcq+strideq*2] + movq xm2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + movhps xm2, [srcq+strideq*0] + vinserti128 m1, xm2, 1 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 3 4 + vpblendd m2, m1, m0, 0xc0 + vpermq m2, m2, q2103 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti128 m0, [srcq+strideq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu xm1, [srcq+strideq*1] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + 
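As in the put case, the prep hv path runs the vertical interpolation on the 16-bit horizontal intermediates; per the comment, pmulhrsw with my << 11 implements the (my * diff + 8) >> 4 term that is then added back to the top row, so the intermediates never get a final pixel-range rounding. A scalar model under those assumptions (illustrative names):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the prep h+v bilinear stage (illustrative names):
 * tmp[x] = mid[x] + ((my * (mid_below[x] - mid[x]) + 8) >> 4),
 * where mid[] are the 16-bit horizontal intermediates. */
static void prep_bilin_hv_ref(int16_t *tmp, const uint8_t *src,
                              ptrdiff_t src_stride, int w, int h,
                              int mx, int my)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int a = (16 - mx) * src[x] + mx * src[x + 1];
            const int b = (16 - mx) * src[x + src_stride] +
                          mx * src[x + src_stride + 1];
            tmp[x] = (int16_t)(a + ((my * (b - a) + 8) >> 4));
        }
        tmp += w;
        src += src_stride;
    }
}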
vinserti128 m2, [srcq+strideq*0], 1 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 ; 1 2 + vperm2i128 m3, m0, m1, 0x21 ; 0 1 + pmaddubsw m0, m2, m5 ; 3 4 + vperm2i128 m2, m1, m0, 0x21 ; 2 3 + psubw m1, m3 + pmulhrsw m1, m6 + paddw m1, m3 + psubw m3, m0, m2 + pmulhrsw m3, m6 + paddw m3, m2 + mova [tmpq+32*0], m1 + mova [tmpq+32*1], m3 + add tmpq, 32*2 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w16_loop: + movu xm1, [srcq+strideq*1+8*0] + vinserti128 m1, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + movu xm2, [srcq+strideq*0+8*0] + vinserti128 m2, [srcq+strideq*0+8*1], 1 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + psubw m3, m1, m0 + pmulhrsw m3, m6 + paddw m3, m0 + pmaddubsw m0, m2, m5 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+32*0], m3 + mova [tmpq+32*1], m2 + add tmpq, 32*2 + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + movu xm0, [srcq+8*0] + vinserti128 m0, [srcq+8*1], 1 + movu xm1, [srcq+8*2] + vinserti128 m1, [srcq+8*3], 1 + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w32_loop: + add srcq, strideq + movu xm2, [srcq+8*0] + vinserti128 m2, [srcq+8*1], 1 + pshufb m2, m4 + pmaddubsw m2, m5 + psubw m3, m2, m0 + pmulhrsw m3, m6 + paddw m3, m0 + mova m0, m2 + movu xm2, [srcq+8*2] + vinserti128 m2, [srcq+8*3], 1 + pshufb m2, m4 + pmaddubsw m2, m5 + mova [tmpq+32*0], m3 + psubw m3, m2, m1 + pmulhrsw m3, m6 + paddw m3, m1 + mova m1, m2 + mova [tmpq+32*1], m3 + add tmpq, 32*2 + dec hd + jg .hv_w32_loop + RET +.hv_w128: + lea r3d, [hq+(7<<8)] + mov r6d, 256 + jmp .hv_w64_start +.hv_w64: + lea r3d, [hq+(3<<8)] + mov r6d, 128 +.hv_w64_start: +%if WIN64 + PUSH r7 +%endif + mov r5, srcq + mov r7, tmpq +.hv_w64_loop0: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w64_loop: + movu xm1, [srcq+strideq*1+8*0] + vinserti128 m1, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + movu xm2, [srcq+strideq*0+8*0] + vinserti128 m2, [srcq+strideq*0+8*1], 1 + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + psubw m3, m1, m0 + pmulhrsw m3, m6 + paddw m3, m0 + pmaddubsw m0, m2, m5 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+r6*0], m3 + mova [tmpq+r6*1], m2 + lea tmpq, [tmpq+r6*2] + sub hd, 2 + jg .hv_w64_loop + add r5, 16 + add r7, 32 + movzx hd, r3b + mov srcq, r5 + mov tmpq, r7 + sub r3d, 1<<8 + jg .hv_w64_loop0 +%if WIN64 + POP r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +%define PUT_8TAP_FN FN put_8tap, + +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR + +cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 + imul mxd, mxm, 0x010101 + add 
mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 + lea r6, [ssq*3] + lea r7, [dsq*3] +%if WIN64 + pop r8 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) + WIN64_SPILL_XMM 11 + cmp wd, 4 + jl .h_w2 + vbroadcasti128 m6, [subpel_h_shufA] + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m7, [subpel_h_shufB] + vbroadcasti128 m8, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] + vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] + add wq, r8 + jmp wq +.h_w2: + movzx mxd, mxb + dec srcq + mova xm4, [subpel_h_shuf4] + vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] +.h_w2_loop: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm3 + phaddw xm0, xm0 + paddw xm0, xm5 + psraw xm0, 6 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] +.h_w4_loop: + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm6 + pshufb xm1, xm6 + pmaddubsw xm0, xm3 + pmaddubsw xm1, xm3 + phaddw xm0, xm1 + paddw xm0, xm5 + psraw xm0, 6 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: +%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] + pshufb m%2, m%1, m7 + pshufb m%3, m%1, m8 + pshufb m%1, m6 + pmaddubsw m%4, m%2, m9 + pmaddubsw m%2, m10 + pmaddubsw m%3, m10 + pmaddubsw m%1, m9 + paddw m%3, m%4 + paddw m%1, m%2 + phaddw m%1, m%3 + paddw m%1, m5 + psraw m%1, 6 +%endmacro + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + PUT_8TAP_H 0, 2, 3, 4 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + sub dstq, r6 + mov r4, r6 +.h_loop: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 2, 3, 4 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 32 + jle .h_loop + add srcq, ssq + add dstq, dsq + mov r6, r4 + dec hd + jg .h_loop + RET +.v: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] + vpbroadcastd m7, [pw_512] + lea myq, [r8+myq*8+subpel_filters-put_avx2] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + add r6, r8 + lea ss3q, [ssq*3] + sub srcq, ss3q + jmp r6 +.v_w2: + movd xm2, [srcq+ssq*0] + pinsrw xm2, [srcq+ssq*1], 2 + pinsrw xm2, 
[srcq+ssq*2], 4 + add srcq, ss3q + pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3 + movd xm3, [srcq+ssq*1] + vpbroadcastd xm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm3, xm1, 0x02 ; 4 5 + vpblendd xm1, xm0, 0x02 ; 5 6 + palignr xm4, xm3, xm2, 4 ; 1 2 3 4 + punpcklbw xm3, xm1 ; 45 56 + punpcklbw xm1, xm2, xm4 ; 01 12 + punpckhbw xm2, xm4 ; 23 34 +.v_w2_loop: + pmaddubsw xm5, xm1, xm8 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm9 ; a1 b1 + paddw xm5, xm2 + mova xm2, xm3 + pmaddubsw xm3, xm10 ; a2 b2 + paddw xm5, xm3 + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 7 8 + punpcklbw xm3, xm4 ; 67 78 + pmaddubsw xm4, xm3, xm11 ; a3 b3 + paddw xm5, xm4 + pmulhrsw xm5, xm7 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xm2, [srcq+ssq*0] + pinsrd xm2, [srcq+ssq*1], 1 + pinsrd xm2, [srcq+ssq*2], 2 + add srcq, ss3q + pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xm3, [srcq+ssq*1] + vpbroadcastd xm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm3, xm1, 0x02 ; 4 5 + vpblendd xm1, xm0, 0x02 ; 5 6 + palignr xm4, xm3, xm2, 4 ; 1 2 3 4 + punpcklbw xm3, xm1 ; 45 56 + punpcklbw xm1, xm2, xm4 ; 01 12 + punpckhbw xm2, xm4 ; 23 34 +.v_w4_loop: + pmaddubsw xm5, xm1, xm8 ; a0 b0 + mova xm1, xm2 + pmaddubsw xm2, xm9 ; a1 b1 + paddw xm5, xm2 + mova xm2, xm3 + pmaddubsw xm3, xm10 ; a2 b2 + paddw xm5, xm3 + vpbroadcastd xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm4, xm0, 0x02 ; 7 8 + punpcklbw xm3, xm4 ; 67 78 + pmaddubsw xm4, xm3, xm11 ; a3 b3 + paddw xm5, xm4 + pmulhrsw xm5, xm7 + packuswb xm5, xm5 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + vpbroadcastq m2, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m5, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m6, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m1, m4, 0x30 + vpblendd m4, m2, 0x30 + punpcklbw m1, m4 ; 01 12 + vpblendd m2, m5, 0x30 + vpblendd m5, m3, 0x30 + punpcklbw m2, m5 ; 23 34 + vpblendd m3, m6, 0x30 + vpblendd m6, m0, 0x30 + punpcklbw m3, m6 ; 45 56 +.v_w8_loop: + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m5, m1, m8 ; a0 b0 + mova m1, m2 + pmaddubsw m2, m9 ; a1 b1 + paddw m5, m2 + mova m2, m3 + pmaddubsw m3, m10 ; a2 b2 + paddw m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*0] + vpblendd m4, m0, 0x30 + punpcklbw m3, m4 ; 67 78 + pmaddubsw m4, m3, m11 ; a3 b3 + paddw m5, m4 + pmulhrsw m5, m7 + vextracti128 xm4, m5, 1 + packuswb xm5, xm4 + movq [dstq+dsq*0], xm5 + movhps [dstq+dsq*1], xm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +.v_w32: +.v_w64: +.v_w128: + lea r6d, [wq*8-128] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*2] +.v_w16_loop0: + vbroadcasti128 m4, [srcq+ssq*0] + vbroadcasti128 m5, [srcq+ssq*1] + vbroadcasti128 m6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m0, [srcq+ssq*0] + vbroadcasti128 m1, [srcq+ssq*1] + vbroadcasti128 m2, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m3, [srcq+ssq*0] + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklbw m1, m4, m5 ; 01 + punpckhbw m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklbw m2, m5, m6 ; 12 + punpckhbw m5, 
m6 ; 45 + shufpd m0, m3, 0x0c + punpcklbw m3, m6, m0 ; 23 + punpckhbw m6, m0 ; 56 +.v_w16_loop: + vbroadcasti128 m12, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti128 m13, [srcq+ssq*0] + pmaddubsw m14, m1, m8 ; a0 + pmaddubsw m15, m2, m8 ; b0 + mova m1, m3 + mova m2, m4 + pmaddubsw m3, m9 ; a1 + pmaddubsw m4, m9 ; b1 + paddw m14, m3 + paddw m15, m4 + mova m3, m5 + mova m4, m6 + pmaddubsw m5, m10 ; a2 + pmaddubsw m6, m10 ; b2 + paddw m14, m5 + paddw m15, m6 + shufpd m6, m0, m12, 0x0d + shufpd m0, m12, m13, 0x0c + punpcklbw m5, m6, m0 ; 67 + punpckhbw m6, m0 ; 78 + pmaddubsw m12, m5, m11 ; a3 + pmaddubsw m13, m6, m11 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + packuswb m14, m15 + vpermq m14, m14, q3120 + mova [dstq+dsq*0], xm14 + vextracti128 [dstq+dsq*1], m14, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] + lea ss3q, [ssq*3] + sub srcq, ss3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + vpbroadcastd m8, [pw_8192] + vpbroadcastd m9, [pd_512] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m6, [subpel_h_shuf4] + movq xm2, [srcq+ssq*0] + movhps xm2, [srcq+ssq*1] + movq xm0, [srcq+ssq*2] + add srcq, ss3q + movhps xm0, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpbroadcastq m4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1, [srcq+ssq*0] + vpblendd m2, m3, 0x30 + vpblendd m0, m1, 0x30 + vpblendd m2, m4, 0xc0 + pshufb m2, m6 + pshufb m0, m6 + pmaddubsw m2, m7 + pmaddubsw m0, m7 + phaddw m2, m0 + pmulhrsw m2, m8 + vextracti128 xm3, m2, 1 + palignr xm4, xm3, xm2, 4 + punpcklwd xm1, xm2, xm4 ; 01 12 + punpckhwd xm2, xm4 ; 23 34 + pshufd xm0, xm3, q2121 + punpcklwd xm3, xm0 ; 45 56 +.hv_w2_loop: + movq xm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm4, [srcq+ssq*0] + pshufb xm4, xm6 + pmaddubsw xm4, xm7 + pmaddwd xm5, xm1, xm10 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm11 ; a1 b1 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm12 ; a2 b2 + phaddw xm4, xm4 + pmulhrsw xm4, xm8 + paddd xm5, xm3 + palignr xm3, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm3, xm0 ; 67 78 + pmaddwd xm4, xm3, xm13 ; a3 b3 + paddd xm5, xm9 + paddd xm5, xm4 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova m6, [subpel_h_shuf4] + vpbroadcastq m2, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + vpbroadcastq m0, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m5, [srcq+ssq*0] + vpbroadcastq m3, [srcq+ssq*1] + vpblendd m2, m4, 0xcc ; 0 1 + vpbroadcastq m4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq m1, [srcq+ssq*0] + vpblendd m0, m5, 0xcc ; 2 3 + vpblendd m3, m4, 0xcc ; 4 5 + pshufb m2, m6 + pshufb m0, m6 + pshufb m3, m6 + pshufb m1, m6 + pmaddubsw m2, m7 + pmaddubsw m0, m7 + pmaddubsw m3, m7 + pmaddubsw m1, m7 + phaddw m2, m0 + phaddw m3, m1 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + palignr m4, m3, m2, 4 + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + pshufd m0, m3, q2121 + punpcklwd m3, m0 ; 45 56 +.hv_w4_loop: + 
vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m1, m10 ; a0 b0 + mova m1, m2 + pmaddwd m2, m11 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m12 ; a2 b2 + paddd m5, m3 + vpbroadcastq m3, [srcq+ssq*0] + vpblendd m4, m3, 0xcc ; 7 8 + pshufb m4, m6 + pmaddubsw m4, m7 + phaddw m4, m4 + pmulhrsw m4, m8 + palignr m3, m4, m0, 12 + mova m0, m4 + punpcklwd m3, m0 ; 67 78 + pmaddwd m4, m3, m13 ; a3 b3 + paddd m5, m9 + paddd m5, m4 + psrad m5, 10 + vextracti128 xm4, m5, 1 + packssdw xm5, xm4 + packuswb xm5, xm5 + pshuflw xm5, xm5, q3120 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] + vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] + lea ss3q, [ssq*3] + sub srcq, ss3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + lea r6d, [wq*8-64] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*4] +.hv_w8_loop0: + vbroadcasti128 m7, [subpel_h_shufA] + movu xm4, [srcq+ssq*0] + vbroadcasti128 m8, [subpel_h_shufB] + movu xm5, [srcq+ssq*1] + vbroadcasti128 m9, [subpel_h_shufC] + movu xm6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 + vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 + add srcq, ss3q + vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 +%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] + pshufb %3, %1, %6 + pshufb %4, %1, %7 + pshufb %1, %5 + pmaddubsw %2, %3, m10 + pmaddubsw %4, m11 + pmaddubsw %3, m11 + pmaddubsw %1, m10 + paddw %2, %4 + paddw %1, %3 + phaddw %1, %2 +%endmacro + HV_H_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + vpbroadcastd m7, [pw_8192] + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + pmulhrsw m0, m7 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + vpermq m7, m0, q3120 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vextracti128 r6m, m0, 1 ; not enough registers + movu xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 + pmaddwd m8, m1, m12 ; a0 + pmaddwd m9, m2, m12 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m13 ; a1 + pmaddwd m4, m13 ; b1 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m14 ; a2 + pmaddwd m6, m14 ; b2 + paddd m8, m5 + paddd m9, m6 + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + vbroadcasti128 m5, [subpel_h_shufA] + HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + vpbroadcastd m5, [pw_8192] + vpbroadcastd m7, [pd_512] + vbroadcasti128 m6, r6m + pmulhrsw m0, m5 + paddd m8, m7 + paddd m9, m7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, m15 ; a3 + paddd m8, m7 + pmaddwd m7, m6, m15 ; b3 + paddd m7, m9 + psrad m8, 10 + psrad m7, 10 + packssdw m8, m7 + vextracti128 xm7, m8, 1 + packuswb xm8, xm7 + pshufd xm7, xm8, q3120 + movq [dstq+dsq*0], xm7 + movhps [dstq+dsq*1], xm7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add r4, 8 + add r7, 8 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub 
r6d, 1<<8 + jg .hv_w8_loop0 + RET + +%macro PREP_8TAP_H 0 + pshufb m1, m0, m5 + pshufb m2, m0, m6 + pshufb m3, m0, m7 + pmaddubsw m1, m8 + pmaddubsw m0, m2, m8 + pmaddubsw m2, m9 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m0, m3 + phaddw m0, m1, m0 + pmulhrsw m0, m4 +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, + +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep%+SUFFIX] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pw_8192] + vbroadcasti128 m5, [subpel_h_shufA] + WIN64_SPILL_XMM 10 + cmp wd, 4 + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] + add wq, r7 + jmp wq +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + lea stride3q, [strideq*3] +.h_w4_loop: + movq xm0, [srcq+strideq*0] + vpbroadcastq m2, [srcq+strideq*2] + movq xm1, [srcq+strideq*1] + vpblendd m0, m2, 0xf0 + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m2, 0xf0 + pshufb m0, m5 + pshufb m1, m5 + pmaddubsw m0, m6 + pmaddubsw m1, m6 + phaddw m0, m1 + pmulhrsw m0, m4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+strideq*0+8*0] + vinserti128 m0, [srcq+strideq*0+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+strideq*1+8*0] + vinserti128 m0, [srcq+strideq*1+8*1], 1 + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: + xor r6d, r6d + jmp .h_start +.h_w64: + mov r6, -32*1 + jmp .h_start +.h_w128: + mov r6, -32*3 +.h_start: + sub srcq, r6 + mov r5, r6 +.h_loop: + movu xm0, [srcq+r6+8*0] + vinserti128 m0, [srcq+r6+8*1], 1 + PREP_8TAP_H + mova [tmpq+32*0], m0 + movu xm0, [srcq+r6+8*2] + vinserti128 m0, [srcq+r6+8*3], 1 + PREP_8TAP_H + mova [tmpq+32*1], m0 + add tmpq, 32*2 + add r6, 32 + jle .h_loop + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop + RET +.v: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. + shr myd, 16 ; Note that the code is 8-tap only, having + cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 + cmove myd, mxd ; had a negligible effect on performance. + ; TODO: Would a 6-tap code path be worth it? 
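As the comments above note, only an 8-tap vertical code path exists: for four-row blocks the low byte of myd selects the 4-tap filter instead, and those filters are stored as 8-tap rows whose outer taps are zero, so the same loop serves both. A rough C model of what the vertical pass below computes per sample (hypothetical helper, 8 bpc prep scaling assumed):

    #include <stdint.h>
    #include <stddef.h>

    /* fv is the selected 8-tap (or zero-padded 4-tap) vertical filter row;
     * the result is the 16-bit prep() intermediate. */
    static void prep_v_sketch(int16_t *tmp, const uint8_t *src, ptrdiff_t ss,
                              int w, int h, const int8_t fv[8])
    {
        src -= 3 * ss;                                /* filter is centred on tap 3 */
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += fv[k] * src[(y + k) * ss + x];
                tmp[y * w + x] = (int16_t)((sum + 2) >> 2); /* pmulhrsw, pw_8192 */
            }
    }
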
+ lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] + lea stride3q, [strideq*3] + sub srcq, stride3q + vpbroadcastd m7, [pw_8192] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + cmp wd, 8 + jg .v_w16 + je .v_w8 +.v_w4: + movd xm0, [srcq+strideq*0] + vpbroadcastd m1, [srcq+strideq*2] + vpbroadcastd xm2, [srcq+strideq*1] + add srcq, stride3q + vpbroadcastd m3, [srcq+strideq*0] + vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd m0, [srcq+strideq*1] + vpbroadcastd m2, [srcq+strideq*2] + vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd m0, [srcq+stride3q ] + vbroadcasti128 m5, [deint_shuf4] + vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw m1, m2, m3 ; 01 12 23 34 + vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw m2, m3 ; 23 34 45 56 +.v_w4_loop: + lea srcq, [srcq+strideq*4] + pinsrd xm0, [srcq+strideq*0], 1 + vpbroadcastd m3, [srcq+strideq*1] + vpbroadcastd m4, [srcq+strideq*2] + vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ + vpbroadcastd m0, [srcq+stride3q ] + vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ + vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb m3, m5 ; 67 78 89 9a + pmaddubsw m4, m1, m8 + vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 + pmaddubsw m2, m9 + paddw m4, m2 + mova m2, m3 + pmaddubsw m3, m11 + paddw m3, m4 + pmaddubsw m4, m1, m10 + paddw m3, m4 + pmulhrsw m3, m7 + mova [tmpq], m3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq xm1, [srcq+strideq*0] + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m5, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m6, [srcq+strideq*1] + vpbroadcastq m0, [srcq+strideq*2] + vpblendd m1, m4, 0x30 + vpblendd m4, m2, 0x30 + punpcklbw m1, m4 ; 01 12 + vpblendd m2, m5, 0x30 + vpblendd m5, m3, 0x30 + punpcklbw m2, m5 ; 23 34 + vpblendd m3, m6, 0x30 + vpblendd m6, m0, 0x30 + punpcklbw m3, m6 ; 45 56 +.v_w8_loop: + vpbroadcastq m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmaddubsw m5, m2, m9 ; a1 + pmaddubsw m6, m2, m8 ; b0 + vpblendd m2, m0, m4, 0x30 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m4, m0, 0x30 + punpcklbw m2, m4 ; 67 78 + pmaddubsw m1, m8 ; a0 + pmaddubsw m4, m3, m9 ; b1 + paddw m5, m1 + mova m1, m3 + pmaddubsw m3, m10 ; a2 + paddw m6, m4 + paddw m5, m3 + vpbroadcastq m4, [srcq+strideq*1] + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+strideq*2] + vpblendd m4, m0, 0x30 + punpcklbw m3, m4 ; 89 9a + pmaddubsw m4, m2, m11 ; a3 + paddw m5, m4 + pmaddubsw m4, m2, m10 ; b2 + paddw m6, m4 + pmaddubsw m4, m3, m11 ; b3 + paddw m6, m4 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + mova [tmpq+32*0], m5 + mova [tmpq+32*1], m6 + add tmpq, 32*2 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + add wd, wd + mov r5, srcq + mov r7, tmpq + lea r6d, [hq+wq*8-256] +.v_w16_loop0: + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m0, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*0] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m1, [srcq+strideq*0] + vbroadcasti128 m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m3, [srcq+strideq*0] + shufpd m4, m4, m0, 0x0c + shufpd m5, m5, m1, 0x0c + punpcklbw m1, m4, m5 ; 01 + punpckhbw m4, m5 ; 34 + shufpd m6, m6, m2, 0x0c + punpcklbw m2, m5, m6 ; 12 + punpckhbw m5, m6 ; 45 + shufpd m0, m0, m3, 0x0c + punpcklbw m3, m6, m0 ; 23 
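The 01/12/23/... register comments above describe the layout that makes these loops cheap: adjacent source rows are byte-interleaved once, so a single pmaddubsw applies two filter taps to sixteen pixels at a time, and four such multiply-adds cover all eight taps. A small intrinsics sketch of that pairing (low halves only; the high bytes go through punpckhbw the same way; names are assumptions):

    #include <immintrin.h>

    /* coef01 holds the byte pair {f[k], f[k+1]} repeated across the register,
     * as produced by the vpbroadcastw loads above. */
    static inline __m256i two_taps(__m256i row_k, __m256i row_k1, __m256i coef01)
    {
        __m256i pair = _mm256_unpacklo_epi8(row_k, row_k1);  /* p_k, p_k+1, ...        */
        return _mm256_maddubs_epi16(pair, coef01);           /* f[k]*p_k + f[k+1]*p_k+1 */
    }

Summing four such results and scaling with pmulhrsw against pw_8192 reproduces one filtered row of the loop that follows.
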
+ punpckhbw m6, m0 ; 56 +.v_w16_loop: + vbroadcasti128 m12, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m13, [srcq+strideq*0] + pmaddubsw m14, m1, m8 ; a0 + pmaddubsw m15, m2, m8 ; b0 + mova m1, m3 + mova m2, m4 + pmaddubsw m3, m9 ; a1 + pmaddubsw m4, m9 ; b1 + paddw m14, m3 + paddw m15, m4 + mova m3, m5 + mova m4, m6 + pmaddubsw m5, m10 ; a2 + pmaddubsw m6, m10 ; b2 + paddw m14, m5 + paddw m15, m6 + shufpd m6, m0, m12, 0x0d + shufpd m0, m12, m13, 0x0c + punpcklbw m5, m6, m0 ; 67 + punpckhbw m6, m0 ; 78 + pmaddubsw m12, m5, m11 ; a3 + pmaddubsw m13, m6, m11 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + mova [tmpq+wq*0], m14 + mova [tmpq+wq*1], m15 + lea tmpq, [tmpq+wq*2] + sub hd, 2 + jg .v_w16_loop + add r5, 16 + add r7, 32 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + WIN64_SPILL_XMM 16 + cmp wd, 4 + je .hv_w4 + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] + vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + jmp .hv_w8 +.hv_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] + lea stride3q, [strideq*3] + sub srcq, stride3q + mova m7, [subpel_h_shuf4] + pmovzxbd m9, [deint_shuf4] + vpbroadcastd m10, [pw_8192] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + vpbroadcastd m11, [pd_32] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + vpbroadcastq m2, [srcq+strideq*0] + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m0, [srcq+strideq*2] + vpbroadcastq m5, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m6, [srcq+strideq*1] + vpbroadcastq m1, [srcq+strideq*2] + vpblendd m2, m4, 0xcc ; 0 1 + vpblendd m0, m5, 0xcc ; 2 3 + vpblendd m3, m6, 0xcc ; 4 5 + pshufb m2, m7 ; 00 01 10 11 02 03 12 13 + pshufb m0, m7 ; 20 21 30 31 22 23 32 33 + pshufb m3, m7 ; 40 41 50 51 42 43 52 53 + pshufb m1, m7 ; 60 61 60 61 62 63 62 63 + pmaddubsw m2, m8 + pmaddubsw m0, m8 + pmaddubsw m3, m8 + pmaddubsw m1, m8 + phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b + phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ + pmulhrsw m2, m10 + pmulhrsw m3, m10 + palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + pshufd m0, m3, q2121 + punpcklwd m3, m0 ; 45 56 +.hv_w4_loop: + pmaddwd m5, m1, m12 ; a0 b0 + pmaddwd m6, m2, m12 ; c0 d0 + pmaddwd m2, m13 ; a1 b1 + pmaddwd m4, m3, m13 ; c1 d1 + mova m1, m3 + pmaddwd m3, m14 ; a2 b2 + paddd m5, m2 + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + paddd m6, m4 + vpbroadcastq m4, [srcq+strideq*0] + paddd m5, m3 + vpbroadcastq m3, [srcq+strideq*1] + vpblendd m2, m4, 0xcc + vpbroadcastq m4, [srcq+strideq*2] + vpblendd m3, m4, 0xcc + pshufb m2, m7 + pshufb m3, m7 + pmaddubsw m2, m8 + pmaddubsw m3, m8 + phaddw m2, m3 + pmulhrsw m2, m10 + palignr m3, m2, m0, 12 + mova m0, m2 + punpcklwd m2, m3, m0 ; 67 78 + punpckhwd m3, m0 ; 89 9a + pmaddwd m4, 
m2, m14 ; c2 d2 + paddd m6, m11 + paddd m5, m11 + paddd m6, m4 + pmaddwd m4, m2, m15 ; a3 b3 + paddd m5, m4 + pmaddwd m4, m3, m15 ; c3 d3 + paddd m6, m4 + psrad m5, 6 + psrad m6, 6 + packssdw m5, m6 + vpermd m5, m9, m5 + mova [tmpq], m5 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + lea r6d, [wq*8-64] + mov r5, srcq + mov r7, tmpq + lea r6d, [hq+r6*4] +.hv_w8_loop0: + vbroadcasti128 m7, [subpel_h_shufA] + movu xm4, [srcq+strideq*0] + vbroadcasti128 m8, [subpel_h_shufB] + movu xm5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vbroadcasti128 m9, [subpel_h_shufC] + movu xm6, [srcq+strideq*0] + vbroadcasti128 m0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m4, m0, 0xf0 ; 0 3 + vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 + vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 + lea srcq, [srcq+strideq*2] + vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 + HV_H_W8 m4, m1, m2, m3, m7, m8, m9 + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 + HV_H_W8 m0, m1, m2, m3, m7, m8, m9 + vpbroadcastd m7, [pw_8192] + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + pmulhrsw m0, m7 + pmulhrsw m4, m7 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + vpermq m7, m0, q3120 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vextracti128 [tmpq], m0, 1 ; not enough registers + movu xm0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 + pmaddwd m8, m1, m12 ; a0 + pmaddwd m9, m2, m12 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m13 ; a1 + pmaddwd m4, m13 ; b1 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m14 ; a2 + pmaddwd m6, m14 ; b2 + paddd m8, m5 + paddd m9, m6 + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + vbroadcasti128 m5, [subpel_h_shufA] + HV_H_W8 m0, m5, m6, m7, m5, m6, m7 + vpbroadcastd m5, [pw_8192] + vpbroadcastd m7, [pd_32] + vbroadcasti128 m6, [tmpq] + pmulhrsw m0, m5 + paddd m8, m7 + paddd m9, m7 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, m15 ; a3 + paddd m8, m7 + pmaddwd m7, m6, m15 ; b3 + paddd m7, m9 + psrad m8, 6 + psrad m7, 6 + packssdw m8, m7 + vpermq m7, m8, q3120 + mova [tmpq+wq*0], xm7 + vextracti128 [tmpq+wq*2], m7, 1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w8_loop + add r5, 8 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .hv_w8_loop0 + RET + +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro REMAP_REG 2 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %xdefine r14_save r14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + %xdefine r14 r14_save + %undef r14_save + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movq xm%1, [srcq+ r4] + movq xm%2, [srcq+ r6] + movhps xm%1, [srcq+ r7] + movhps xm%2, [srcq+ r9] + vinserti128 m%1, [srcq+r10], 1 + vinserti128 m%2, [srcq+r11], 1 + vpbroadcastq 
m%5, [srcq+r13] + vpbroadcastq m%6, [srcq+ rX] + add srcq, ssq + movq xm%3, [srcq+ r4] + movq xm%4, [srcq+ r6] + movhps xm%3, [srcq+ r7] + movhps xm%4, [srcq+ r9] + vinserti128 m%3, [srcq+r10], 1 + vinserti128 m%4, [srcq+r11], 1 + vpbroadcastq m%7, [srcq+r13] + vpbroadcastq m%8, [srcq+ rX] + add srcq, ssq + vpblendd m%1, m%5, 0xc0 + vpblendd m%2, m%6, 0xc0 + vpblendd m%3, m%7, 0xc0 + vpblendd m%4, m%8, 0xc0 + pmaddubsw m%1, m15 + pmaddubsw m%2, m10 + pmaddubsw m%3, m15 + pmaddubsw m%4, m10 + phaddw m%1, m%2 + phaddw m%3, m%4 + phaddw m%1, m%3 + pmulhrsw m%1, m12 +%endmacro + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 +%else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+120] + %endif + %xdefine base_reg r11 + %define rndshift 6 +%endif + lea base_reg, [%1_8tap_scaled_avx2] +%define base base_reg-%1_8tap_scaled_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm +%if isprep && UNIX64 + movd xm14, mxd + vpbroadcastd m14, xm14 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 +%else + vpbroadcastd m14, mxm +%endif + mov dyd, dym +%ifidn %1, put + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+112] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+112] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + vpbroadcastd m10, [base+pd_0x3ff] + vpbroadcastd m12, [base+pw_8192] +%ifidn %1, put + vpbroadcastd m13, [base+pd_512] +%else + vpbroadcastd m13, [base+pd_32] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpblendd m15, m7, 0xaa + vpblendd m0, m2, 0xc0 ; 0 1 4 
5 + vpblendd m1, m3, 0xc0 ; 2 3 6 7 + pblendvb m15, m11, m8 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 + vextracti128 xm1, m0, 1 ; 4 5 6 7 + palignr xm2, xm1, xm0, 4 ; 1 2 3 4 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + pshufd xm4, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm4 ; 45 56 + punpckhwd xm4, xm1, xm4 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + pmaddwd xm8, xm4, xm11 + paddd xm5, xm6 + paddd xm7, xm8 + paddd xm5, xm13 + paddd xm5, xm7 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq], xm5, 0 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movq xm5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps xm3, xm0, q1032 ; 01 12 + shufps xm0, xm2, q1032 ; 23 34 + shufps xm2, xm4, q1032 ; 45 56 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 + palignr xm1, xm5, xm1, 12 + punpcklqdq xm1, xm1 ; 6 7 6 7 + punpcklwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movhps xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xm3, xm0 ; 01 12 + mova xm0, xm2 ; 23 34 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 ; 6 7 6 7 + palignr xm1, xm5, xm1, 8 ; 4 5 6 7 + pshufd xm5, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm5 ; 45 56 + punpckhwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +%endif +.w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd xm15, xm0 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m0, m9 + psrld m14, 10 + movu xm7, [srcq+ssq*0] + movu xm9, [srcq+ssq*1] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm8, [srcq+ssq*2] + movu xm10, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m7, [srcq+ssq*0], 1 + vinserti128 m9, [srcq+ssq*1], 1 + vinserti128 m15, xm15, 1 + vinserti128 m8, [srcq+ssq*2], 1 + vinserti128 m10, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pblendvb m15, m11, m0 + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + phaddw m7, m9 + phaddw m8, m10 + pmulhrsw m7, m12 ; 0 1 4 5 + pmulhrsw m8, m12 ; 2 3 6 7 + vextracti128 xm9, m7, 1 ; 4 5 + vextracti128 xm3, m8, 1 ; 6 7 + shufps xm4, xm7, xm8, q1032 ; 1 2 + shufps xm5, xm8, xm9, q1032 ; 3 4 + shufps xm6, xm9, xm3, q1032 ; 5 6 + psrldq xm11, xm3, 8 ; 7 _ + punpcklwd xm0, xm7, xm4 ; 01 + punpckhwd xm7, xm4 ; 12 + punpcklwd xm1, xm8, xm5 ; 23 + punpckhwd xm8, xm5 ; 34 + punpcklwd xm2, xm9, xm6 ; 45 + punpckhwd xm9, xm6 ; 56 + punpcklwd xm3, xm11 ; 67 + mova [rsp+0x00], xm7 + mova [rsp+0x10], xm8 + mova [rsp+0x20], xm9 +.w4_loop: + and myd, 0x3ff + mov r6d, 
64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm10, r6q + punpcklbw xm10, xm10 + psraw xm10, 8 + pshufd xm7, xm10, q0000 + pshufd xm8, xm10, q1111 + pshufd xm9, xm10, q2222 + pshufd xm10, xm10, q3333 + pmaddwd xm4, xm0, xm7 + pmaddwd xm5, xm1, xm8 + pmaddwd xm6, xm2, xm9 + pmaddwd xm7, xm3, xm10 + paddd xm4, xm5 + paddd xm6, xm7 + paddd xm4, xm13 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq], xm4 + add dstq, dsq +%else + movq [tmpq], xm4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + movu xm4, [srcq] + test myd, 0x400 + jz .w4_skip_line + mova xm0, [rsp+0x00] + mova [rsp+0x00], xm1 + mova xm1, [rsp+0x10] + mova [rsp+0x10], xm2 + mova xm2, [rsp+0x20] + mova [rsp+0x20], xm3 + pshufb xm4, xm14 + pmaddubsw xm4, xm15 + phaddw xm4, xm4 + pmulhrsw xm4, xm12 + punpcklwd xm3, xm11, xm4 + mova xm11, xm4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu xm5, [srcq+ssq*1] + movu m6, [rsp+0x10] + pshufb xm4, xm14 + pshufb xm5, xm14 + pmaddubsw xm4, xm15 + pmaddubsw xm5, xm15 + movu [rsp+0x00], m6 + phaddw xm4, xm5 + pmulhrsw xm4, xm12 + punpcklwd xm9, xm11, xm4 + mova [rsp+0x20], xm9 + psrldq xm11, xm4, 8 + mova xm0, xm1 + mova xm1, xm2 + mova xm2, xm3 + punpcklwd xm3, xm4, xm11 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +.w8: + mov dword [rsp+48], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [rsp+48], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+48], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+48], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+48], 16 + movifprep tmp_stridem, 256 +.w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+72], t0d + mov [rsp+56], srcq + mov [rsp+64], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .hloop +.hloop_prep: + dec dword [rsp+48] + jz .ret + add qword [rsp+64], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+16] + vpbroadcastd m15, [rsp+72] + pxor m9, m9 + mov srcq, [rsp+56] + mov r0q, [rsp+64] ; dstq / tmpq +.hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+16], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + mova [rsp], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 
4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+52], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .vloop +.skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+ rX] + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+ r4] + movq xm6, [srcq+ r6] + movhps xm5, [srcq+ r7] + movhps xm6, [srcq+ r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+ rX] + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + pmulhrsw m3, m12 + jmp .vloop +.dy1: + movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy1_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, 
[srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + add srcq, ss3q + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + pshufd xm8, xm10, q0000 + pshufd xm9, xm10, q1111 + pshufd xm11, xm10, q3333 + pshufd xm10, xm10, q2222 + vpblendd m0, m2, 0xc0 + pshufb m1, m14 + pshufb m0, m14 + pmaddubsw m1, m15 + pmaddubsw m0, m15 + phaddw m0, m1 + pmulhrsw m0, m12 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + mova xm3, xm0 + mova xm0, xm2 + paddd xm5, xm13 + paddd xm6, xm7 + pshufb xm1, xm14 + pmaddubsw xm1, xm15 + phaddw xm1, xm1 + pmulhrsw xm1, xm12 + palignr xm7, xm1, xm4, 12 + punpcklwd xm2, xm7, xm1 ; 67 78 + pmaddwd xm7, xm2, xm11 + mova xm4, xm1 + paddd xm5, xm6 + paddd xm5, xm7 + psrad xm5, rndshift + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +.dy1_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + vpermq m8, m8, q3120 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r11d, xm15, 1 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + movu xm2, [srcq+ssq*0] + movu xm3, [srcq+ssq*2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pcmpeqd m8, m9 + psrld m14, 10 + pinsrd xm15, [base+subpel_filters+r11*8+2], 1 + vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 + vinserti128 m2, [srcq+ssq*1], 1 + vinserti128 m3, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m4, [srcq+ssq*1], 1 + add srcq, ss3q + vpblendd m15, m7, 0x30 + punpcklqdq m15, m15 + pblendvb m15, m11, m8 + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vinserti128 m10, xm10, 1 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb xm5, xm14 + vpermq m2, m2, q3120 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m2, m3 + phaddw m4, m5 + pmulhrsw m2, m12 + pmulhrsw m4, m12 + palignr m5, m4, m2, 4 + pshufd m3, m4, q2121 + punpcklwd m0, m2, m5 ; 01 12 + punpckhwd m1, m2, m5 ; 23 34 + punpcklwd m2, m4, m3 ; 45 56 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + vinserti128 m11, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + mova m0, m1 + mova m1, m2 + paddd m4, m13 + paddd m5, m6 + pshufb m11, m14 + vpermq m11, m11, q3120 + pmaddubsw m11, m15 + phaddw m11, m11 + pmulhrsw m11, m12 + palignr m6, m11, m3, 12 + punpcklwd m2, m6, m11 ; 67 78 + mova m3, m11 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + 
packuswb xm4, xm4 + pshuflw xm4, xm4, q3120 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + pshufd xm4, xm4, q3120 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET +.dy1_w8: + mov dword [rsp+72], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+72], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+72], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+72], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+72], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+76], t0d + mov [rsp+80], srcq + mov [rsp+88], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + movq xm0, r4q + punpcklbw xm0, xm0 + psraw xm0, 8 + mova [rsp+96], xm0 + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+72] + jz .ret + add qword [rsp+88], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+32] + vpbroadcastd m15, [rsp+76] + pxor m9, m9 + mov srcq, [rsp+80] + mov r0q, [rsp+88] ; dstq / tmpq +.dy1_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+32], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+64], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + movu [rsp], m10 + vpbroadcastd m8, [rsp+0x60] + vpbroadcastd m9, [rsp+0x64] + vpbroadcastd m10, [rsp+0x68] + vpbroadcastd m11, [rsp+0x6c] + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] 
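The per-column offsets gathered here (r4/r6/r7/r9/r10/r11/r13/rX) come from the mx+dx*[0-7] positions built at the top of the hloop: positions are kept in 10-bit (1/1024-pel) fixed point, the integer part becomes the per-column source offset and the fraction selects the subpel filter. A sketch of that addressing with hypothetical names:

    #include <stdint.h>

    /* mx and dx are 10-bit fixed point (1024 == one full pixel step). */
    static void scaled_mc_columns(int mx, int dx, int w,
                                  int32_t *col_src, int32_t *col_filter)
    {
        for (int x = 0; x < w; x++) {
            int pos = mx + dx * x;              /* paddd m14, m8: mx+dx*[0-7]      */
            col_src[x]    = pos >> 10;          /* psrld m14, 10 -> gather offsets */
            col_filter[x] = (pos & 0x3ff) >> 6; /* pand pd_0x3ff, psrld 6          */
        }
    }

Vertically, my advances by dy per output row; the .dy1 and .dy2 entry points special-case dy == 1024 and dy == 2048 (steps of exactly one and two source rows), and when the vertical fraction is zero the cmovnz keeps the preloaded 64<<24 default, effectively an identity filter with its single tap of 64 in position 3.
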
+ vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop +.dy2: + movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*1] + movhps xm0, [srcq+ssq*2] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + movhps xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vpblendd m0, m2, 0x30 + vpblendd m1, m4, 0xc0 + vpblendd m0, m3, 0xc0 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 01 23 + punpckhwd xm2, xm1 ; 23 45 +.dy2_w2_loop: + movq xm6, [srcq+ssq*0] + vpbroadcastq m7, [srcq+ssq*1] + movhps xm6, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm3, xm8 + pmaddwd xm5, xm2, xm9 + vpblendd m6, m7, 0x30 + vpblendd m6, m1, 0xc0 + pshufb m6, m14 + pmaddubsw m6, m15 + phaddw m6, m6 + pmulhrsw m6, m12 + palignr m0, m6, m0, 8 + pshufd m2, m0, q3221 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 45 67 + punpckhwd xm2, xm1 ; 67 89 + pmaddwd xm6, xm3, xm10 + pmaddwd xm7, xm2, xm11 + paddd xm4, xm5 + paddd xm4, xm13 + paddd xm6, xm7 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 + packuswb xm4, xm4 + pextrw [dstq+dsq*0], xm4, 0 + pextrw [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +.dy2_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m8, m9 + psrld m14, 10 + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*2] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm1, [srcq+ssq*1] + movu xm3, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + 
lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vinserti128 m15, xm15, 1 + pshufb m14, m5 + paddb m14, m6 + vinserti128 m2, [srcq+ssq*0], 1 + vinserti128 m3, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pblendvb m15, m11, m8 + pshufb xm0, xm14 + pshufb m2, m14 + pshufb xm1, xm14 + pshufb m3, m14 + pmaddubsw xm0, xm15 + pmaddubsw m2, m15 + pmaddubsw xm1, xm15 + pmaddubsw m3, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + punpcklwd xm2, xm0, xm1 + punpckhwd m1, m0, m1 ; 23 45 + vinserti128 m0, m2, xm1, 1 ; 01 23 +.dy2_w4_loop: + movu xm6, [srcq+ssq*0] + movu xm7, [srcq+ssq*1] + vinserti128 m6, [srcq+ssq*2], 1 + vinserti128 m7, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psrld m2, m6, 16 + pslld m3, m7, 16 + paddw m6, m2 + paddw m7, m3 + pblendw m6, m7, 0xaa ; 67 89 + pmulhrsw m6, m12 + paddd m4, m5 + vperm2i128 m0, m1, m6, 0x21 ; 45 67 + mova m1, m6 + pmaddwd m6, m0, m10 + pmaddwd m7, m1, m11 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET +.dy2_w8: + mov dword [rsp+40], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+40], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+40], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+40], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+40], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+64], t0d + mov [rsp+48], srcq + mov [rsp+56], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + movq xm0, r4q + punpcklbw xm0, xm0 + psraw xm0, 8 + mova [rsp+0x50], xm0 + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+40] + jz .ret + add qword [rsp+56], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp] + vpbroadcastd m15, [rsp+64] + pxor m9, m9 + mov srcq, [rsp+48] + mov r0q, [rsp+56] ; dstq / tmpq +.dy2_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd 
r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + vpbroadcastd m8, [rsp+0x50] + vpbroadcastd m9, [rsp+0x54] + vpbroadcastd m11, [rsp+0x58] + vpbroadcastd m4, [rsp+0x5c] + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + SWAP m14, m4 +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m14 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+ rX] + add srcq, ssq + vpblendd m3, m5, 0xc0 + vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled + mov t0d, (5*15 << 16) | 5*15 + mov t1d, t0d + jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif + +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, +%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, + +BILIN_SCALED_FN put +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +BILIN_SCALED_FN prep +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + +%macro WARP_V 5 ; dst, 02, 46, 13, 57 + ; Can be done using gathers, but that's terribly slow on 
many CPU:s + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm8, [filterq+myq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 ; a e + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+deltaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; b f + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm9, [filterq+myq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 ; c g + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+gammaq] ; my += gamma + shr tmp2d, 10 + shr tmp1d, 10 + punpcklwd m8, m0 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; d h + punpcklwd m0, m9, m0 + punpckldq m9, m8, m0 + punpckhdq m0, m8, m0 + punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 + punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 + pmaddwd m%2, m8 + pmaddwd m9, m%3 + punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 + punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 + pmaddwd m8, m%4 + pmaddwd m0, m%5 + paddd m%2, m9 + paddd m0, m8 + paddd m%1, m0, m%2 +%endmacro + +cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts +%if WIN64 + sub rsp, 0xa0 +%endif + call mangle(private_prefix %+ _warp_affine_8x8_avx2).main +.loop: + psrad m7, 13 + psrad m0, 13 + packssdw m7, m0 + pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7 + vpermq m7, m7, q3120 + mova [tmpq+tsq*0], xm7 + vextracti128 [tmpq+tsq*2], m7, 1 + dec r4d + jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end + call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2 + lea tmpq, [tmpq+tsq*4] + jmp .loop + +cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ + beta, filter, tmp1, delta, my, gamma +%if WIN64 + sub rsp, 0xa0 + %assign xmm_regs_used 16 + %assign stack_size_padded 0xa0 + %assign stack_offset stack_offset+stack_size_padded +%endif + call .main + jmp .start +.loop: + call .main2 + lea dstq, [dstq+dsq*2] +.start: + psrad m7, 18 + psrad m0, 18 + packusdw m7, m0 + pavgw m7, m11 ; (x + (1 << 10)) >> 11 + vextracti128 xm0, m7, 1 + packuswb xm7, xm0 + pshufd xm7, xm7, q3120 + movq [dstq+dsq*0], xm7 + movhps [dstq+dsq*1], xm7 + dec r4d + jg .loop +.end: + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) 
due to call +%if WIN64 + mov abcdq, r5m + mov mxd, r6m + movaps [rsp+stack_offset+0x10], xmm6 + movaps [rsp+stack_offset+0x20], xmm7 + movaps [rsp+0x28], xmm8 + movaps [rsp+0x38], xmm9 + movaps [rsp+0x48], xmm10 + movaps [rsp+0x58], xmm11 + movaps [rsp+0x68], xmm12 + movaps [rsp+0x78], xmm13 + movaps [rsp+0x88], xmm14 + movaps [rsp+0x98], xmm15 +%endif + movsx alphad, word [abcdq+2*0] + movsx betad, word [abcdq+2*1] + mova m12, [warp_8x8_shufA] + mova m13, [warp_8x8_shufB] + vpbroadcastd m14, [pw_8192] + vpbroadcastd m15, [pd_32768] + pxor m11, m11 + lea filterq, [mc_warp_filter2] + lea tmp1q, [ssq*3+3] + add mxd, 512+(64<<10) + lea tmp2d, [alphaq*3] + sub srcq, tmp1q ; src -= src_stride*3 + 3 + sub betad, tmp2d ; beta -= alpha*3 + mov myd, r7m + call .h + psrld m1, m0, 16 + call .h + psrld m4, m0, 16 + call .h + pblendw m1, m0, 0xaa ; 02 + call .h + pblendw m4, m0, 0xaa ; 13 + call .h + psrld m2, m1, 16 + pblendw m2, m0, 0xaa ; 24 + call .h + psrld m5, m4, 16 + pblendw m5, m0, 0xaa ; 35 + call .h + psrld m3, m2, 16 + pblendw m3, m0, 0xaa ; 46 + movsx deltad, word [abcdq+2*2] + movsx gammad, word [abcdq+2*3] + add myd, 512+(64<<10) + mov r4d, 4 + lea tmp1d, [deltaq*3] + sub gammad, tmp1d ; gamma -= delta*3 +.main2: + call .h + psrld m6, m5, 16 + pblendw m6, m0, 0xaa ; 57 + WARP_V 7, 1, 3, 4, 6 + call .h + mova m1, m2 + mova m2, m3 + psrld m3, 16 + pblendw m3, m0, 0xaa ; 68 + WARP_V 0, 4, 6, 1, 3 + mova m4, m5 + mova m5, m6 + ret +ALIGN function_align +.h: + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + vbroadcasti128 m10, [srcq] + shr mxd, 10 + shr tmp1d, 10 + movq xm8, [filterq+mxq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + shr mxd, 10 + shr tmp1d, 10 + movq xm9, [filterq+mxq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + shr tmp2d, 10 + shr tmp1d, 10 + punpcklqdq m8, m0 ; 0 1 4 5 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 + punpcklqdq m9, m0 ; 2 3 6 7 + pshufb m0, m10, m12 + pmaddubsw m0, m8 + pshufb m10, m13 + pmaddubsw m10, m9 + add srcq, ssq + phaddw m0, m10 + pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 + paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword + ret + +%macro BIDIR_FN 1 ; op + %1 0 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq ], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + cmp hd, 8 + je .ret + %1 2 + lea dstq, [dstq+strideq*4] + vextracti128 xm1, m0, 1 + movd [dstq ], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 +.ret: + RET +.w8_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq*4] +.w8: + vextracti128 xm1, m0, 1 + movq [dstq ], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + %1_INC_PTR 4 + %1 0 + lea dstq, [dstq+strideq*4] +.w16: + vpermq m0, m0, q3120 + mova 
[dstq ], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + %1_INC_PTR 4 + %1 0 + lea dstq, [dstq+strideq*2] +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+strideq*1], m0 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + %1_INC_PTR 4 + %1 0 + add dstq, strideq +.w64: + vpermq m0, m0, q3120 + mova [dstq], m0 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+32], m0 + dec hd + jg .w64_loop + RET +.w128_loop: + %1 0 + add dstq, strideq +.w128: + vpermq m0, m0, q3120 + mova [dstq+0*32], m0 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+1*32], m0 + %1_INC_PTR 8 + %1 -4 + vpermq m0, m0, q3120 + mova [dstq+2*32], m0 + %1 -2 + vpermq m0, m0, q3120 + mova [dstq+3*32], m0 + dec hd + jg .w128_loop + RET +%endmacro + +%macro AVG 1 ; src_offset + mova m0, [tmp1q+(%1+0)*32] + paddw m0, [tmp2q+(%1+0)*32] + mova m1, [tmp1q+(%1+1)*32] + paddw m1, [tmp2q+(%1+1)*32] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + packuswb m0, m1 +%endmacro + +%macro AVG_INC_PTR 1 + add tmp1q, %1*32 + add tmp2q, %1*32 +%endmacro + +cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg %+ SUFFIX %+ _table + lea r6, [avg %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] + vpbroadcastd m2, [base+pw_1024] + add wq, r6 + BIDIR_FN AVG + +%macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 + ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 + ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 + mova m0, [tmp1q+(%1+0)*32] + psubw m2, m0, [tmp2q+(%1+0)*32] + mova m1, [tmp1q+(%1+1)*32] + psubw m3, m1, [tmp2q+(%1+1)*32] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%define W_AVG_INC_PTR AVG_INC_PTR + +cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg %+ SUFFIX %+ _table + lea r6, [w_avg %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + vpbroadcastw m4, r6m ; weight + movsxd wq, dword [r6+wq*4] + vpbroadcastd m5, [base+pw_2048] + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 + cmp dword r6m, 7 + jg .weight_gt7 + mov r6, tmp1q + pxor m0, m0 + mov tmp1q, tmp2q + psubw m4, m0, m4 ; -weight + mov tmp2q, r6 +.weight_gt7: + BIDIR_FN W_AVG + +%macro MASK 1 ; src_offset + ; (a * m + b * (64 - m) + 512) >> 10 + ; = ((a - b) * m + (b << 6) + 512) >> 10 + ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 + vpermq m3, [maskq+%1*16], q3120 + mova m0, [tmp2q+(%1+0)*32] + psubw m1, m0, [tmp1q+(%1+0)*32] + psubb m3, m4, m3 + paddw m1, m1 ; (b - a) << 1 + paddb m3, m3 + punpcklbw m2, m4, m3 ; -m << 9 + pmulhw m1, m2 + paddw m0, m1 + mova m1, [tmp2q+(%1+1)*32] + psubw m2, m1, [tmp1q+(%1+1)*32] + paddw m2, m2 + punpckhbw m3, m4, m3 + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%macro MASK_INC_PTR 1 + add maskq, %1*16 + add tmp2q, %1*32 + add tmp1q, %1*32 +%endmacro + +cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask %+ SUFFIX %+ _table + lea r7, [mask %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + vpbroadcastd m5, [base+pw_2048] + pxor m4, m4 + add wq, r7 + BIDIR_FN MASK + +%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 + mova m%1, 
[tmp1q+32*%3] + mova m1, [tmp2q+32*%3] + psubw m1, m%1 + pabsw m%2, m1 + psubusw m%2, m6, m%2 + psrlw m%2, 8 ; 64 - m + psllw m2, m%2, 10 + pmulhw m1, m2 + paddw m%1, m1 + mova m1, [tmp1q+32*%4] + mova m2, [tmp2q+32*%4] + psubw m2, m1 + pabsw m3, m2 + psubusw m3, m6, m3 + psrlw m3, 8 +%if %5 + packuswb m%2, m3 + psubb m%2, m5, m%2 + vpermq m%2, m%2, q3120 +%else + phaddw m%2, m3 +%endif + psllw m3, 10 + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m%1, m7 + pmulhrsw m1, m7 + packuswb m%1, m1 +%endmacro + +cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx2_table + lea r6, [blend_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movifnidn maskq, maskmp + movsxd wq, dword [r6+wq*4] + vpbroadcastd m4, [base+pb_64] + vpbroadcastd m5, [base+pw_512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + movd xm0, [dstq+dsq*0] + pinsrd xm0, [dstq+dsq*1], 1 + vpbroadcastd xm1, [dstq+dsq*2] + pinsrd xm1, [dstq+r6 ], 3 + mova xm6, [maskq] + psubb xm3, xm4, xm6 + punpcklbw xm2, xm3, xm6 + punpckhbw xm3, xm6 + mova xm6, [tmpq] + add maskq, 4*4 + add tmpq, 4*4 + punpcklbw xm0, xm6 + punpckhbw xm1, xm6 + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm3 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 + packuswb xm0, xm1 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + pextrd [dstq+dsq*2], xm0, 2 + pextrd [dstq+r6 ], xm0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + RET +ALIGN function_align +.w8: + movq xm1, [dstq+dsq*0] + movhps xm1, [dstq+dsq*1] + vpbroadcastq m2, [dstq+dsq*2] + vpbroadcastq m3, [dstq+r6 ] + mova m0, [maskq] + mova m6, [tmpq] + add maskq, 8*4 + add tmpq, 8*4 + vpblendd m1, m2, 0x30 + vpblendd m1, m3, 0xc0 + psubb m3, m4, m0 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + punpcklbw m0, m1, m6 + punpckhbw m1, m6 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + movq [dstq+dsq*2], xm1 + movhps [dstq+r6 ], xm1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +ALIGN function_align +.w16: + mova m0, [maskq] + mova xm1, [dstq+dsq*0] + vinserti128 m1, [dstq+dsq*1], 1 + psubb m3, m4, m0 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + mova m6, [tmpq] + add maskq, 16*2 + add tmpq, 16*2 + punpcklbw m0, m1, m6 + punpckhbw m1, m6 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16 + RET +ALIGN function_align +.w32: + mova m0, [maskq] + mova m1, [dstq] + mova m6, [tmpq] + add maskq, 32 + add tmpq, 32 + psubb m3, m4, m0 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + punpcklbw m0, m1, m6 + punpckhbw m1, m6 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .w32 + RET + +cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_v_avx2_table + lea r5, [blend_v_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add wq, r5 + add maskq, obmc_masks-blend_v_avx2_table + jmp wq +.w2: + vpbroadcastd xm2, [maskq+2*2] +.w2_s0_loop: + movd xm0, [dstq+dsq*0] + pinsrw xm0, [dstq+dsq*1], 1 + movd xm1, [tmpq] + add tmpq, 2*2 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_s0_loop + RET +ALIGN function_align +.w4: + vpbroadcastq xm2, [maskq+4*2] +.w4_loop: + 
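The derivation comments in the AVG/W_AVG/MASK macros above rearrange the averages so the products fit pmulhw's signed 16x16 -> high-16 multiply (for example ((a-b)*((w-16)<<12))>>16 equals ((a-b)*(w-16))>>4). In plain C, what they compute on the 16-bit prep() intermediates is the following, assuming 8 bpc scaling and a hypothetical clamp helper:

    #include <stdint.h>

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* a and b are prep() intermediates (roughly pixel*16 at 8 bpc). */
    static uint8_t avg1(int a, int b)          { return clip_u8((a + b + 16) >> 5); }              /* pw_1024       */
    static uint8_t w_avg1(int a, int b, int w) { return clip_u8((a*w + b*(16 - w) + 128) >> 8); }  /* weight 0..16  */
    static uint8_t mask1(int a, int b, int m)  { return clip_u8((a*m + b*(64 - m) + 512) >> 10); } /* mask 0..64    */

blend, blend_v and blend_h operate on pixels directly with 6-bit obmc/mask weights: the pmaddubsw against the interleaved (64-m, m) pairs followed by pmulhrsw with pw_512 amounts to (dst*(64-m) + tmp*m + 32) >> 6.
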
movd xm0, [dstq+dsq*0] + pinsrd xm0, [dstq+dsq*1], 1 + movq xm1, [tmpq] + add tmpq, 4*2 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m4, [maskq+8*2] +.w8_loop: + vpbroadcastq m2, [dstq+dsq*0] + movq xm0, [dstq+dsq*1] + vpblendd m0, m2, 0x30 + movq xm1, [tmpq+8*1] + vinserti128 m1, [tmpq+8*0], 1 + add tmpq, 8*2 + punpcklbw m0, m1 + pmaddubsw m0, m4 + pmulhrsw m0, m5 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movhps [dstq+dsq*0], xm0 + movq [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m3, [maskq+16*2] + vbroadcasti128 m4, [maskq+16*3] +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti128 m1, [dstq+dsq*1], 1 + mova m2, [tmpq] + add tmpq, 16*2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + mova xm3, [maskq+16*4] + vinserti128 m3, [maskq+16*6], 1 + mova xm4, [maskq+16*5] + vinserti128 m4, [maskq+16*7], 1 +.w32_loop: + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .w32_loop + RET + +cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_h_avx2_table + lea r5, [blend_h_avx2_table] + mov r6d, wd + tzcnt wd, wd + mov hd, hm + movsxd wq, dword [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add wq, r5 + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xm0, [dstq+dsq*0] + pinsrw xm0, [dstq+dsq*1], 1 + movd xm2, [maskq+hq*2] + movd xm1, [tmpq] + add tmpq, 2*2 + punpcklwd xm2, xm2 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +ALIGN function_align +.w4: + mova xm3, [blend_shuf] +.w4_loop: + movd xm0, [dstq+dsq*0] + pinsrd xm0, [dstq+dsq*1], 1 + movd xm2, [maskq+hq*2] + movq xm1, [tmpq] + add tmpq, 4*2 + pshufb xm2, xm3 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m4, [blend_shuf] + shufpd m4, m4, 0x03 +.w8_loop: + vpbroadcastq m1, [dstq+dsq*0] + movq xm0, [dstq+dsq*1] + vpblendd m0, m1, 0x30 + vpbroadcastd m3, [maskq+hq*2] + movq xm1, [tmpq+8*1] + vinserti128 m1, [tmpq+8*0], 1 + add tmpq, 8*2 + pshufb m3, m4 + punpcklbw m0, m1 + pmaddubsw m0, m3 + pmulhrsw m0, m5 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movhps [dstq+dsq*0], xm0 + movq [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m4, [blend_shuf] + shufpd m4, m4, 0x0c +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti128 m1, [dstq+dsq*1], 1 + vpbroadcastd m3, [maskq+hq*2] + mova m2, [tmpq] + add tmpq, 16*2 + pshufb m3, m4 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova 
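+; blend_v and blend_h use the same dst*(64-m) + tmp*m weighting, but take
+; their weights from the fixed obmc_masks table (stored as byte pairs ready
+; for pmaddubsw): blend_v loads one set of per-column pairs before its loop,
+; constant down the rows, while blend_h broadcasts one pair per row and, per
+; the "h * 3/4" comment above, only blends the top three quarters of the rows.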
[dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +ALIGN function_align +.w32: ; w32/w64/w128 + sub dsq, r6 +.w32_loop0: + vpbroadcastw m3, [maskq+hq*2] + mov wd, r6d +.w32_loop: + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 32 + sub wd, 32 + jg .w32_loop + add dstq, dsq + inc hq + jl .w32_loop0 + RET + +cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + bottomext, rightext + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor r12d, r12d + lea r10, [ihq-1] + cmp yq, ihq + cmovs r10, yq + test yq, yq + cmovs r10, r12 + imul r10, sstrideq + add srcq, r10 + + ; ref += iclip(x, 0, iw - 1) + lea r10, [iwq-1] + cmp xq, iwq + cmovs r10, xq + test xq, xq + cmovs r10, r12 + add srcq, r10 + + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) + lea bottomextq, [yq+bhq] + sub bottomextq, ihq + lea r3, [bhq-1] + cmovs bottomextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, r12 + cmp bottomextq, bhq + cmovns bottomextq, r3 + cmp topextq, bhq + cmovg topextq, r3 + + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + lea rightextq, [xq+bwq] + sub rightextq, iwq + lea r2, [bwq-1] + cmovs rightextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, r12 + cmp rightextq, bwq + cmovns rightextq, r2 + cmp leftextq, bwq + cmovns leftextq, r2 + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ + dst, dstride, src, sstride, bottomext, rightext + + ; center_h = bh - top_ext - bottom_ext + lea r3, [bottomextq+topextq] + sub centerhq, r3 + + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq + imul r2, dstrideq + add dstq, r2 + mov r9m, dstq + + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq + lea r3, [rightextq+leftextq] + sub centerwq, r3 + +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix +.v_loop_%3: +%if %1 + ; left extension + xor r3, r3 + vpbroadcastb m0, [srcq] +.left_loop_%3: + mova [dstq+r3], m0 + add r3, 32 + cmp r3, leftextq + jl .left_loop_%3 + + ; body + lea r12, [dstq+leftextq] +%endif + xor r3, r3 +.body_loop_%3: + movu m0, [srcq+r3] +%if %1 + movu [r12+r3], m0 +%else + movu [dstq+r3], m0 +%endif + add r3, 32 + cmp r3, centerwq + jl .body_loop_%3 + +%if %2 + ; right extension +%if %1 + add r12, centerwq +%else + lea r12, [dstq+centerwq] +%endif + xor r3, r3 + vpbroadcastb m0, [srcq+centerwq-1] +.right_loop_%3: + movu [r12+r3], m0 + add r3, 32 + cmp r3, rightextq + jl .right_loop_%3 + +%endif + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 +%endmacro + + test leftextq, leftextq + jnz .need_left_ext + test rightextq, rightextq + jnz .need_right_ext + v_loop 0, 0, 0 + jmp .body_done + +.need_left_ext: + test rightextq, rightextq + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: + ; bottom edge extension + test bottomextq, bottomextq + jz .top + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 +.bottom_x_loop: + mova 
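+; What emu_edge produces, as a scalar sketch mirroring the iclip() comments
+; above (loop variables are illustrative): a bw x bh block whose out-of-frame
+; samples are clamped to the nearest valid pixel, i.e.
+;     for (yy = 0; yy < bh; yy++)
+;         for (xx = 0; xx < bw; xx++)
+;             dst[yy*dstride + xx] =
+;                 src[iclip(y+yy, 0, ih-1)*sstride + iclip(x+xx, 0, iw-1)];
+; the code does this with one clipped copy of the valid centre plus the
+; replication loops for the left/right/top/bottom extensions.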
m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, bottomextq +.bottom_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .bottom_y_loop + add r1, 32 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end + mov srcq, r9m + mov dstq, dstm + xor r1, r1 +.top_x_loop: + mova m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, topextq +.top_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .top_y_loop + add r1, 32 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr + LEA r7, $$ +%define base r7-$$ + + vpbroadcastd m3, [base+pw_m256] + vpbroadcastd m7, [base+pd_63] + vbroadcasti128 m15, [base+pb_8x0_8x8] + pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] + pslld m5, 3 ; dx*8 + pslld m6, 14 + paddd m8, m2 ; mx+[0..7]*dx + pxor m2, m2 + + ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 + ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8 + +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx + +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + pand m9, m7 ; filter offset (masked) + + ; load source pixels - this ugly code is vpgatherdq emulation since + ; directly using vpgatherdq on Haswell is quite a bit slower :( + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vextracti128 xm0, m0, 1 + movq xm12, [srcq+r8] + movq xm13, [srcq+r10] + movhps xm12, [srcq+r9] + movhps xm13, [srcq+r11] + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vinserti128 m12, [srcq+r8], 1 + vinserti128 m13, [srcq+r10], 1 + vpbroadcastq m10, [srcq+r9] + vpbroadcastq m11, [srcq+r11] + vpblendd m12, m10, 11000000b + vpblendd m13, m11, 11000000b + + ; if no emulation is required, we don't need to shuffle or emulate edges + ; this also saves 2 quasi-vpgatherdqs + vptest m1, m1 + jz .filter + + movd r8d, xm1 + pextrd r9d, xm1, 1 + pextrd r10d, xm1, 2 + pextrd r11d, xm1, 3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + vextracti128 xm1, m1, 1 + movq xm14, [base+resize_shuf+4+r8] + movq xm0, [base+resize_shuf+4+r10] + movhps xm14, [base+resize_shuf+4+r9] + movhps xm0, [base+resize_shuf+4+r11] + movd r8d, xm1 + pextrd r9d, xm1, 1 + pextrd r10d, xm1, 2 + pextrd r11d, xm1, 3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + vinserti128 m14, [base+resize_shuf+4+r8], 1 + vinserti128 m0, [base+resize_shuf+4+r10], 1 + vpbroadcastq m10, [base+resize_shuf+4+r9] + vpbroadcastq m11, [base+resize_shuf+4+r11] + vpblendd m14, m10, 11000000b + vpblendd m0, m11, 11000000b + + paddb m14, m15 + paddb m0, m15 + pshufb m12, m14 + pshufb m13, m0 + +.filter: + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vextracti128 xm9, m9, 1 + movq xm10, [base+resize_filter+r8*8] + movq xm11, [base+resize_filter+r10*8] + movhps xm10, [base+resize_filter+r9*8] + movhps xm11, [base+resize_filter+r11*8] + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vinserti128 m10, [base+resize_filter+r8*8], 1 + vinserti128 m11, [base+resize_filter+r10*8], 1 + vpbroadcastq 
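+; Per-output-pixel math in this resize loop, roughly (mx0/dx are 14-bit
+; fixed-point source positions/steps, hence the shifts by 14):
+;     mx     = mx0 + x*dx                           ; after the 4<<14 centring at entry
+;     src_x  = iclip(mx, 0, (src_w-8) << 14) >> 14  ; clamped load position
+;     f      = resize_filter[(mx >> 8) & 63]        ; 8-tap subpel filter
+;     dst[x] = pixel((sum of f[k]*src[src_x+k] + 64) >> 7)
+; the difference between mx and its clamped value (psubd/psrad 14) selects a
+; resize_shuf pattern that replicates edge pixels when the 8-tap window would
+; read outside the frame.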
m14, [base+resize_filter+r9*8] + vpbroadcastq m1, [base+resize_filter+r11*8] + vpblendd m10, m14, 11000000b + vpblendd m11, m1, 11000000b + + pmaddubsw m12, m10 + pmaddubsw m13, m11 + phaddw m12, m13 + vextracti128 xm13, m12, 1 + phaddsw xm12, xm13 + pmulhrsw xm12, xm3 ; x=(x+64)>>7 + packuswb xm12, xm12 + movq [dstq+xq], xm12 + + paddd m4, m5 + add xd, 8 + cmp xd, dst_wd + jl .loop_x + + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx2_table + lea r7, [w_mask_420_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + pmovzxbd m9, [base+deint_shuf4] + vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign + add wq, r7 + W_MASK 0, 4, 0, 1 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + jg .w4_h16 +.w4_end: + vextracti128 xm0, m4, 1 + vpblendd xm1, xm4, xm0, 0x05 + vpblendd xm4, xm0, 0x0a + pshufd xm1, xm1, q2301 + psubw xm4, xm8, xm4 + psubw xm4, xm1 + psrlw xm4, 2 + packuswb xm4, xm4 + movq [maskq], xm4 + RET +.w4_h16: + W_MASK 0, 5, 2, 3 + lea dstq, [dstq+strideq*4] + phaddd m4, m5 + vextracti128 xm1, m0, 1 + psubw m4, m8, m4 + psrlw m4, 2 + vpermd m4, m9, m4 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq], xm4 + RET +.w8_loop: + add tmp1q, 2*32 + add tmp2q, 2*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 8 +.w8: + vextracti128 xm2, m4, 1 + vextracti128 xm1, m0, 1 + psubw xm4, xm8, xm4 + psubw xm4, xm2 + psrlw xm4, 2 + packuswb xm4, xm4 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + movq [maskq], xm4 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + W_MASK 0, 5, 2, 3 + punpckhqdq m1, m4, m5 + punpcklqdq m4, m5 + psubw m1, m8, m1 + psubw m1, m4 + psrlw m1, 2 + vpermq m0, m0, q3120 + packuswb m1, m1 + vpermd m1, m9, m1 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + mova [maskq], xm1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + W_MASK 0, 5, 2, 3 + psubw m4, m8, m4 + psubw m4, m5 + psrlw m4, 2 + vpermq m0, m0, q3120 + packuswb m4, m4 + vpermd m4, m9, m4 + mova [dstq+strideq*1], m0 + mova [maskq], xm4 + sub hd, 2 + jg .w32_loop + RET +.w64_loop_even: + psubw m10, m8, m4 + psubw m11, m8, m5 + dec hd +.w64_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + add dstq, strideq +.w64: + vpermq m0, m0, q3120 + 
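+; w_mask math, roughly: W_MASK produces 64-m per pixel, where
+; m = 38 + min((abs(tmp1-tmp2) + 8) >> 8, 26), via the saturating psubusw
+; against pw_6903 and psrlw 8; the output pixel is
+; tmp1 + (((tmp2-tmp1)*(64-m)) >> 6), i.e. (tmp1*m + tmp2*(64-m)) >> 6,
+; rounded to 8 bits by pmulhrsw with pw_2048. For 4:2:0 the stored mask byte
+; is (m00+m01+m10+m11 + 2 - sign) >> 2, which the code forms as
+; ((258-sign) - sum of the 64-m pair sums) >> 2.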
mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + test hd, 1 + jz .w64_loop_even + psubw m4, m10, m4 + psubw m5, m11, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq], m4 + add maskq, 32 + dec hd + jg .w64_loop + RET +.w128_loop_even: + psubw m12, m8, m4 + psubw m13, m8, m5 + dec hd +.w128_loop: + W_MASK 0, 4, 0, 1 + add dstq, strideq +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + add tmp1q, 8*32 + add tmp2q, 8*32 + test hd, 1 + jz .w128_even + psubw m4, m10, m4 + psubw m5, m11, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq+32*0], m4 + jmp .w128_odd +.w128_even: + psubw m10, m8, m4 + psubw m11, m8, m5 +.w128_odd: + W_MASK 0, 4, -4, -3 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + W_MASK 0, 5, -2, -1 + vpermq m0, m0, q3120 + mova [dstq+32*3], m0 + test hd, 1 + jz .w128_loop_even + psubw m4, m12, m4 + psubw m5, m13, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq+32*1], m4 + add maskq, 64 + dec hd + jg .w128_loop + RET + +cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx2_table + lea r7, [w_mask_422_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + pxor m9, m9 + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + pmovzxbd m10, [base+deint_shuf4] + vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign + add wq, r7 + mov maskq, maskmp + W_MASK 0, 4, 0, 1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + jg .w4_h16 +.w4_end: + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + psubb xm5, xm8, xm4 + pavgb xm5, xm9 + pshufd xm5, xm5, q3120 + mova [maskq], xm5 + RET +.w4_h16: + W_MASK 0, 5, 2, 3 + lea dstq, [dstq+strideq*4] + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermd m5, m10, m5 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq], m5 + RET +.w8_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8: + vextracti128 xm5, m4, 1 + vextracti128 xm1, m0, 1 + packuswb xm4, xm5 + psubb xm5, xm8, xm4 + pavgb xm5, xm9 + pshufd xm5, xm5, q3120 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + mova [maskq], xm5 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + mova [maskq], m5 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + add 
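+; For 4:2:2 the stored mask byte is (m0 + m1 + 1 - sign) >> 1 per horizontal
+; pair: W_MASK's phaddw leaves 128 - (m0+m1), so after packuswb the psubb
+; from wm_422_sign (128 - sign) gives m0 + m1 - sign, and pavgb against zero
+; adds the +1 and shifts right by 1.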
tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+strideq*1], m0 + mova [maskq], m5 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + add dstq, strideq + add maskq, 32 +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*1], m0 + mova [maskq], m5 + dec hd + jg .w64_loop + RET +.w128_loop: + add tmp1q, 32*8 + add tmp2q, 32*8 + W_MASK 0, 4, 0, 1 + add dstq, strideq + add maskq, 32*2 +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*1], m0 + mova [maskq+32*0], m5 + W_MASK 0, 4, 4, 5 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + W_MASK 0, 5, 6, 7 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*3], m0 + mova [maskq+32*1], m5 + dec hd + jg .w128_loop + RET + +cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx2_table + lea r7, [w_mask_444_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m5, [base+pb_64] + vpbroadcastd m7, [base+pw_2048] + add wq, r7 + W_MASK 0, 4, 0, 1, 1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + mova [maskq+32*0], m4 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + je .w4_end + W_MASK 0, 4, 2, 3, 1 + lea dstq, [dstq+strideq*4] + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq+32*1], m4 +.w4_end: + RET +.w8_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w8: + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + mova [maskq], m4 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [maskq], m4 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32 +.w32: + vpermq m0, m0, q3120 + mova [dstq], m0 + mova [maskq], m4 + dec hd + jg .w32_loop + RET +.w64_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32*2 +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + mova [maskq+32*0], m4 + W_MASK 0, 4, 2, 3, 1 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + 
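+; For 4:4:4 the mask is kept at full resolution: W_MASK is invoked with its
+; fifth argument set, so instead of phaddw it packs 64-m to bytes and
+; subtracts from pb_64, storing m itself (one mask byte per pixel).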
mova [maskq+32*1], m4 + dec hd + jg .w64_loop + RET +.w128_loop: + add tmp1q, 32*8 + add tmp2q, 32*8 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32*4 +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + mova [maskq+32*0], m4 + W_MASK 0, 4, 2, 3, 1 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + mova [maskq+32*1], m4 + W_MASK 0, 4, 4, 5, 1 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + mova [maskq+32*2], m4 + W_MASK 0, 4, 6, 7, 1 + vpermq m0, m0, q3120 + mova [dstq+32*3], m0 + mova [maskq+32*3], m4 + dec hd + jg .w128_loop + RET + +%endif ; ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/mc_avx512.asm dav1d-0.9.1/src/x86/mc_avx512.asm --- dav1d-0.7.1/src/x86/mc_avx512.asm 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/src/x86/mc_avx512.asm 2021-07-28 21:38:28.913852200 +0000 @@ -0,0 +1,2395 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
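+; The byte tables below are vpermb/pshufb index vectors rather than pixel
+; data: the wm_*_mask/wm_*_perm* tables reorder w_mask bytes for the chroma
+; subsampling layouts, while bilin_h_perm*/spel_h_perm* gather each pixel
+; together with its right-hand neighbours so a single pmaddubsw or vpdpbusd
+; per lane can evaluate the bilinear or 8-tap horizontal filter.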
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if HAVE_AVX512ICL && ARCH_X86_64 + +SECTION_RODATA 64 + +bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 + db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 + db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 + db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 +wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 + db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 + db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 + db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 +wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 + db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 + db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 + db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 +wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 + db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 + db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 + db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 +bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 + db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 + db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 + db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47 +bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 + db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 + db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 + db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 +bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 + db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 + db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 + db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 +bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 + db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 + db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 + db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 +bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 + db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 + db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 + db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 +bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 +spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 + db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 +spel_h_perm16b: db 4, 5, 6, 
7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 + db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 + db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 +spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 + db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 +spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 +spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 + db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 + db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 +spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 + db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 +spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 + db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 +spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 + db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47 +spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 + db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 +deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 +subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 +subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 +bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 + +wm_420_perm64: dq 0xfedcba9876543210 +wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 + +pb_127: times 4 db 127 +pw_m128 times 2 dw -128 +pw_512: times 2 dw 512 +pw_1024: times 2 dw 1024 +pw_2048: times 2 dw 2048 +pw_6903: times 2 dw 6903 +pw_8192: times 2 dw 8192 +pd_2: dd 2 +pd_32: dd 32 +pd_32768: dd 32768 + +%define pb_m64 (wm_sign+4) +%define pb_64 (wm_sign+8) + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + 
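+; The jump-table macros here (BASE/HV/BIDIR_JMP_TABLE) emit per-width
+; dispatch tables: each entry is the offset of a .w4/.w8/.../.w128 (or
+; .h_w*/.v_w*/.hv_w*) code path relative to a base label, stored as dw or dd.
+; Callers dispatch with the pattern used throughout this file, roughly:
+;     tzcnt wd, wm                 ; log2(width): 4..128 -> 2..7
+;     movzx wd, word [table+wq*2]  ; offset of the width-specific path
+;     add   wq, base
+;     jmp   wq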
%endif +%endmacro + +%macro BIDIR_JMP_TABLE 1-* + %xdefine %1_table (%%table - 2*%2) + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep) + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128 + +SECTION .text + +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro + +%macro WRAP_YMM 1+ +INIT_YMM cpuname + %1 +INIT_ZMM cpuname +%endmacro + +DECLARE_REG_TMP 3, 5, 6 + +INIT_ZMM avx512icl +cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea t2, [prep_avx512icl] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [t2+wq*2+table_offset(prep,)] + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd xmm0, [srcq+strideq*0] + pinsrd xmm0, [srcq+strideq*1], 1 + pinsrd xmm0, [srcq+strideq*2], 2 + pinsrd xmm0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmovzxbw ym0, xmm0 + psllw ym0, 4 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq xmm0, [srcq+strideq*0] + movq xmm1, [srcq+strideq*1] + vinserti128 ym0, ymm0, [srcq+strideq*2], 1 + vinserti128 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1 + pmovzxbw m0, ym0 + psllw m0, 4 + mova [tmpq], m0 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu xmm0, [srcq+strideq*0] + vinserti128 ym0, ymm0, [srcq+strideq*1], 1 + movu xmm1, [srcq+strideq*2] + vinserti128 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmovzxbw m0, ym0 + pmovzxbw m1, ym1 + psllw m0, 4 + psllw m1, 4 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmovzxbw m0, [srcq+strideq*0] + pmovzxbw m1, [srcq+strideq*1] + pmovzxbw m2, [srcq+strideq*2] + pmovzxbw m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .prep_w32 + RET +.prep_w64: + pmovzxbw m0, [srcq+strideq*0+32*0] + pmovzxbw m1, [srcq+strideq*0+32*1] + pmovzxbw m2, [srcq+strideq*1+32*0] + pmovzxbw m3, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .prep_w64 + RET +.prep_w128: + pmovzxbw m0, [srcq+32*0] + pmovzxbw m1, [srcq+32*1] + pmovzxbw m2, [srcq+32*2] + pmovzxbw m3, [srcq+32*3] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + add srcq, strideq + dec hd + jg .prep_w128 + RET +.h: + ; 16 * src[x] + (mx * (src[x + 1] - src[x])) + ; = (16 - mx) 
* src[x] + mx * src[x + 1] + imul mxyd, 0xff01 + add mxyd, 16 << 8 + vpbroadcastw m5, mxyd + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .hv + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + vbroadcasti32x4 ym4, [bilin_h_shuf4] +.h_w4_loop: + movq xmm0, [srcq+strideq*0] + movq xmm1, [srcq+strideq*1] + vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1 + vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1 + pshufb ym0, ym4 + pmaddubsw ym0, ym5 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + vbroadcasti32x4 m4, [bilin_h_shuf8] +.h_w8_loop: + movu xmm0, [srcq+strideq*0] + vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 + vinserti32x4 m0, [srcq+strideq*2], 2 + vinserti32x4 m0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pshufb m0, m4 + pmaddubsw m0, m5 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + mova m4, [bilin_h_perm16] +.h_w16_loop: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + movu ym1, [srcq+strideq*2] + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpermb m0, m4, m0 + vpermb m1, m4, m1 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: + mova m4, [bilin_h_perm32] +.h_w32_loop: + vpermb m0, m4, [srcq+strideq*0] + vpermb m1, m4, [srcq+strideq*1] + vpermb m2, m4, [srcq+strideq*2] + vpermb m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .h_w32_loop + RET +.h_w64: + mova m4, [bilin_h_perm32] +.h_w64_loop: + vpermb m0, m4, [srcq+strideq*0+32*0] + vpermb m1, m4, [srcq+strideq*0+32*1] + vpermb m2, m4, [srcq+strideq*1+32*0] + vpermb m3, m4, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .h_w64_loop + RET +.h_w128: + mova m4, [bilin_h_perm32] +.h_w128_loop: + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + vpermb m2, m4, [srcq+32*2] + vpermb m3, m4, [srcq+32*3] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + add srcq, strideq + dec hd + jg .h_w128_loop + RET +.v: + WIN64_SPILL_XMM 7 + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] + imul mxyd, 0xff01 + add mxyd, 16 << 8 + add wq, t2 + lea stride3q, [strideq*3] + vpbroadcastw m6, mxyd + jmp wq +.v_w4: + vpbroadcastd xm0, [srcq+strideq*0] + mov r3d, 0x29 + vbroadcasti32x4 ym3, [bilin_v_shuf4] + kmovb k1, r3d +.v_w4_loop: + vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ + vpbroadcastd ym2, [srcq+strideq*2] + vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ + lea srcq, [srcq+strideq*4] + vpbroadcastd ym0, [srcq+strideq*0] + punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ + pshufb ym2, ym3 + pmaddubsw ym2, ym6 + mova [tmpq], ym2 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + mova m5, [bilin_v_perm8] + vbroadcasti32x4 ym0, [srcq+strideq*0] +.v_w8_loop: + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vpbroadcastq ym0, [srcq+strideq*2] + vinserti32x4 m1, [srcq+stride3q ], 2 + 
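+; The "imul mxyd, 0xff01 / add mxyd, 16 << 8" above packs both bilinear
+; weights into one word: for 0 < mx < 16,
+;     mx*0xff01 + (16 << 8) == ((16-mx) << 8) | mx   (mod 2^16)
+; e.g. mx = 4 gives 0x0C04, the byte pair (4, 12). Broadcast and fed to
+; pmaddubsw against the (src[x+1], src[x]) pairs produced by the
+; bilin_h_shuf*/bilin_h_perm* tables, each lane becomes
+; mx*src[x+1] + (16-mx)*src[x]; the .v path reuses the trick for my.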
lea srcq, [srcq+strideq*4] + vinserti32x4 ym0, [srcq+strideq*0], 0 + vpermt2b m1, m5, m0 + pmaddubsw m1, m6 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m5, [bilin_v_perm16] + movu xm0, [srcq+strideq*0] +.v_w16_loop: + movu xm2, [srcq+strideq*2] + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vpermt2b m1, m5, m2 + vinserti32x4 ym2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + movu xm0, [srcq+strideq*0] + vpermt2b m2, m5, m0 + pmaddubsw m1, m6 + pmaddubsw m2, m6 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + mova m5, [bilin_v_perm32] + movu ym0, [srcq+strideq*0] +.v_w32_loop: + movu ym2, [srcq+strideq*1] + movu ym3, [srcq+strideq*2] + movu ym4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpermt2b m0, m5, m2 + vpermt2b m2, m5, m3 + vpermt2b m3, m5, m4 + pmaddubsw m1, m0, m6 + movu ym0, [srcq+strideq*0] + vpermt2b m4, m5, m0 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m4, m6 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + mova [tmpq+64*2], m3 + mova [tmpq+64*3], m4 + add tmpq, 64*4 + sub hd, 4 + jg .v_w32_loop + RET +.v_w64: + mova m5, [bilin_v_perm64] + vpermq m0, m5, [srcq+strideq*0] +.v_w64_loop: + vpermq m1, m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m1, m0 + punpckhbw m2, m1, m0 + vpermq m0, m5, [srcq+strideq*0] + punpcklbw m3, m0, m1 + punpckhbw m1, m0, m1 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m1, m6 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m2 + mova [tmpq+64*2], m3 + mova [tmpq+64*3], m1 + add tmpq, 64*4 + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + mova m5, [bilin_v_perm64] + vpermq m0, m5, [srcq+strideq*0+ 0] + vpermq m1, m5, [srcq+strideq*0+64] +.v_w128_loop: + vpermq m2, m5, [srcq+strideq*1+ 0] + vpermq m3, m5, [srcq+strideq*1+64] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m2, m0 + punpckhbw m0, m2, m0 + pmaddubsw m4, m6 + pmaddubsw m0, m6 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m0 + punpcklbw m4, m3, m1 + punpckhbw m1, m3, m1 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + mova [tmpq+64*2], m4 + mova [tmpq+64*3], m1 + vpermq m0, m5, [srcq+strideq*0+ 0] + vpermq m1, m5, [srcq+strideq*0+64] + punpcklbw m4, m0, m2 + punpckhbw m2, m0, m2 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + mova [tmpq+64*4], m4 + mova [tmpq+64*5], m2 + punpcklbw m4, m1, m3 + punpckhbw m3, m1, m3 + pmaddubsw m4, m6 + pmaddubsw m3, m6 + mova [tmpq+64*6], m4 + mova [tmpq+64*7], m3 + add tmpq, 64*8 + sub hd, 2 + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 + ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 7 + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + vpbroadcastw m6, mxyd + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.hv_w4: + vbroadcasti32x4 ym4, [bilin_h_shuf4] + vpbroadcastq ym0, [srcq+strideq*0] + pshufb ym0, ym4 + pmaddubsw ym0, ym5 +.hv_w4_loop: + movq xmm1, [srcq+strideq*1] + movq xmm2, [srcq+strideq*2] + vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1 + punpcklqdq ym1, ym2 + pshufb ym1, ym4 + pmaddubsw ym1, ym5 ; 1 2 3 4 + valignq ym2, ym1, ym0, 3 ; 0 1 2 3 + mova ym0, ym1 + psubw ym1, ym2 + pmulhrsw ym1, ym6 + paddw ym1, ym2 + mova [tmpq], ym1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m0, [srcq+strideq*0] + pshufb m0, m4 + 
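+; In the bilin .hv paths, my is pre-scaled with "shl mxyd, 11" so that one
+; pmulhrsw implements the "(my * (a - b) + 8) >> 4" step from the comment
+; above: (d*(my << 11) + (1 << 14)) >> 15 == (d*my + 8) >> 4. Each iteration
+; keeps the previous row's horizontal result (valignq/vshufi32x4) and adds
+; this correction to it before storing.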
pmaddubsw m0, m5 +.hv_w8_loop: + movu xmm1, [srcq+strideq*1] + vinserti128 ym1, ymm1, [srcq+strideq*2], 1 + vinserti128 m1, [srcq+stride3q ], 2 + lea srcq, [srcq+strideq*4] + vinserti128 m1, [srcq+strideq*0], 3 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 3 4 + valignq m2, m1, m0, 6 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + mova m4, [bilin_h_perm16] + vbroadcasti32x8 m0, [srcq+strideq*0] + vpermb m0, m4, m0 + pmaddubsw m0, m5 +.hv_w16_loop: + movu ym1, [srcq+strideq*1] + vinserti32x8 m1, [srcq+strideq*2], 1 + movu ym2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti32x8 m2, [srcq+strideq*0], 1 + vpermb m1, m4, m1 + vpermb m2, m4, m2 + pmaddubsw m1, m5 ; 1 2 + vshufi32x4 m3, m0, m1, q1032 ; 0 1 + pmaddubsw m0, m2, m5 ; 3 4 + vshufi32x4 m2, m1, m0, q1032 ; 2 3 + psubw m1, m3 + pmulhrsw m1, m6 + paddw m1, m3 + psubw m3, m0, m2 + pmulhrsw m3, m6 + paddw m3, m2 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m3 + add tmpq, 64*2 + sub hd, 4 + jg .hv_w16_loop + RET +.hv_w32: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+strideq*0] + pmaddubsw m0, m5 +.hv_w32_loop: + vpermb m1, m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermb m2, m4, [srcq+strideq*0] + pmaddubsw m1, m5 + psubw m3, m1, m0 + pmulhrsw m3, m6 + paddw m3, m0 + pmaddubsw m0, m2, m5 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+64*0], m3 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 2 + jg .hv_w32_loop + RET +.hv_w64: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w64_loop: + add srcq, strideq + vpermb m2, m4, [srcq+32*0] + vpermb m3, m4, [srcq+32*1] + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m7, m2, m0 + psubw m8, m3, m1 + pmulhrsw m7, m6 + pmulhrsw m8, m6 + paddw m7, m0 + mova m0, m2 + paddw m8, m1 + mova m1, m3 + mova [tmpq+64*0], m7 + mova [tmpq+64*1], m8 + add tmpq, 64*2 + dec hd + jg .hv_w64_loop + RET +.hv_w128: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + vpermb m2, m4, [srcq+32*2] + vpermb m3, m4, [srcq+32*3] + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 +.hv_w128_loop: + add srcq, strideq + vpermb m7, m4, [srcq+32*0] + vpermb m8, m4, [srcq+32*1] + vpermb m9, m4, [srcq+32*2] + vpermb m10, m4, [srcq+32*3] + REPX {pmaddubsw x, m5}, m7, m8, m9, m10 + psubw m11, m7, m0 + psubw m12, m8, m1 + psubw m13, m9, m2 + psubw m14, m10, m3 + REPX {pmulhrsw x, m6}, m11, m12, m13, m14 + paddw m11, m0 + mova m0, m7 + paddw m12, m1 + mova m1, m8 + paddw m13, m2 + mova m2, m9 + paddw m14, m3 + mova m3, m10 + mova [tmpq+64*0], m11 + mova [tmpq+64*1], m12 + mova [tmpq+64*2], m13 + mova [tmpq+64*3], m14 + add tmpq, 64*4 + dec hd + jg .hv_w128_loop + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + +%macro PREP_8TAP_H 0 + vpermb m10, m5, m0 + vpermb m11, m5, m1 + vpermb m12, m6, m0 + vpermb m13, m6, m1 + vpermb m14, m7, m0 + vpermb m15, m7, m1 + mova m0, m4 + vpdpbusd m0, m10, m8 + mova m2, m4 + vpdpbusd m2, m12, m8 + mova m1, m4 + vpdpbusd m1, m11, m8 + mova m3, m4 + vpdpbusd m3, m13, m8 + vpdpbusd m0, 
m12, m9 + vpdpbusd m2, m14, m9 + vpdpbusd m1, m13, m9 + vpdpbusd m3, m15, m9 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, + +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pd_2] + WIN64_SPILL_XMM 10 + cmp wd, 4 + je .h_w4 + tzcnt wd, wd + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + add wq, r7 + jmp wq +.h_w4: + movzx mxd, mxb + vbroadcasti128 ym5, [subpel_h_shufA] + mov r3d, 0x4 + dec srcq + vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + kmovb k1, r3d + lea stride3q, [strideq*3] +.h_w4_loop: + movq xm2, [srcq+strideq*0] + movq xm3, [srcq+strideq*1] + vpbroadcastq ym2{k1}, [srcq+strideq*2] + vpbroadcastq ym3{k1}, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pshufb ym2, ym5 + pshufb ym3, ym5 + mova ym0, ym4 + vpdpbusd ym0, ym2, ym6 + mova ym1, ym4 + vpdpbusd ym1, ym3, ym6 + packssdw ym0, ym1 + psraw ym0, 2 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + vbroadcasti128 m5, [subpel_h_shufA] + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + lea stride3q, [strideq*3] +.h_w8_loop: + movu xmm3, [srcq+strideq*0] + vinserti128 ym3, ymm3, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*2], 2 + vinserti128 m3, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pshufb m1, m3, m5 + pshufb m2, m3, m6 + mova m0, m4 + vpdpbusd m0, m1, m8 + mova m1, m4 + vpdpbusd m1, m2, m8 + pshufb m3, m7 + vpdpbusd m0, m2, m9 + vpdpbusd m1, m3, m9 + packssdw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + mova m5, [spel_h_perm16a] + mova m6, [spel_h_perm16b] + mova m7, [spel_h_perm16c] + lea stride3q, [strideq*3] +.h_w16_loop: + movu ym0, [srcq+strideq*0] + movu ym1, [srcq+strideq*2] + vinserti32x8 m0, [srcq+strideq*1], 1 + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: + mova m5, [spel_h_perm32a] + mova m6, [spel_h_perm32b] + mova m7, [spel_h_perm32c] +.h_w32_loop: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 2 + jg .h_w32_loop + RET +.h_w64: + xor r6d, r6d + jmp .h_start +.h_w128: + mov r6, -64*1 +.h_start: + mova m5, [spel_h_perm32a] + mova m6, [spel_h_perm32b] + mova m7, [spel_h_perm32c] + sub srcq, r6 + mov r5, r6 +.h_loop: + movu m0, [srcq+r6+32*0] + movu m1, 
[srcq+r6+32*1] + PREP_8TAP_H + add tmpq, 64*2 + add r6, 64 + jle .h_loop + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop + RET +.v: + movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. + shr myd, 16 ; Note that the code is 8-tap only, having + tzcnt wd, wd + cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 + cmove myd, mxd ; had a negligible effect on performance. + ; TODO: Would a 6-tap code path be worth it? + lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] + add wq, r7 + lea stride3q, [strideq*3] + sub srcq, stride3q + vpbroadcastd m7, [pw_8192] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + jmp wq +.v_w4: + movd xmm0, [srcq+strideq*0] + vpbroadcastd ymm1, [srcq+strideq*2] + vpbroadcastd xmm2, [srcq+strideq*1] + vpbroadcastd ymm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd ymm0, [srcq+strideq*0] + vpbroadcastd ymm2, [srcq+strideq*1] + vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd ymm0, [srcq+strideq*2] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 + vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw ymm2, ymm3 ; 23 34 45 56 +.v_w4_loop: + pinsrd xmm0, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpbroadcastd ymm3, [srcq+strideq*0] + vpbroadcastd ymm4, [srcq+strideq*1] + vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ + vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ + vpbroadcastd ymm0, [srcq+strideq*2] + vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb ymm3, ymm5 ; 67 78 89 9a + pmaddubsw ymm4, ymm1, ym8 + vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 + pmaddubsw ymm2, ym9 + paddw ymm4, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym11 + paddw ymm3, ymm4 + pmaddubsw ymm4, ymm1, ym10 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym7 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mov r3d, 0xf044 + kmovw k1, r3d + kshiftrw k2, k1, 8 + movq xm0, [srcq+strideq*0] + vpbroadcastq ym1, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m4, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpbroadcastq m6, [srcq+strideq*2] + vmovdqa64 ym0{k1}, ym1 + vmovdqa64 ym1{k1}, ym2 + vmovdqa64 m2{k1}, m3 + vmovdqa64 m3{k1}, m4 + vmovdqa64 m4{k1}, m5 + vmovdqa64 m5{k1}, m6 + punpcklbw ym0, ym1 ; 01 12 __ __ + punpcklbw m2, m3 ; 23 34 23 34 + punpcklbw m4, m5 ; 45 56 45 56 + vmovdqa64 m0{k2}, m2 ; 01 12 23 34 + vmovdqa64 m2{k2}, m4 ; 23 34 45 56 +.v_w8_loop: + vpbroadcastq m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + pmaddubsw m14, m0, m8 + pmaddubsw m15, m2, m9 + vpblendmq m0{k1}, m6, m1 + vpblendmq m2{k1}, m1, m3 + vpbroadcastq m6, [srcq+strideq*2] + paddw m14, m15 + punpcklbw m2, m0, m2 ; 67 78 67 78 + vpblendmq m12{k1}, m3, m5 + vpblendmq m13{k1}, m5, m6 + vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 + punpcklbw m4, m12, m13 ; 89 9a 89 9a + vmovdqa64 m2{k2}, m4 ; 67 78 89 9a + pmaddubsw m12, m0, m10 + pmaddubsw m13, m2, m11 + paddw m14, m12 + paddw m14, m13 + pmulhrsw m14, m7 + mova [tmpq], m14 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mov r3d, 0xf0 + kmovb 
k1, r3d + vbroadcasti128 m0, [srcq+strideq*0] + vbroadcasti128 m1, [srcq+strideq*1] + vbroadcasti128 m2, [srcq+strideq*2] + vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*2] + vmovdqa64 m0{k1}, m1 + vmovdqa64 m1{k1}, m2 + vmovdqa64 m2{k1}, m3 + vmovdqa64 m3{k1}, m4 + vmovdqa64 m4{k1}, m5 + vmovdqa64 m5{k1}, m6 + shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b + shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b + shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- + shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- + punpckhbw m2, m0, m1 ; 23a 23b 34a 34b + punpcklbw m0, m1 ; 01a 01b 12a 12b + punpcklbw m4, m5 ; 45a 45b 56a 56b +.v_w16_loop: + vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m5, [srcq+strideq*0] + vpblendmq m1{k1}, m6, m3 + vmovdqa64 m3{k1}, m5 + pmaddubsw m12, m0, m8 + pmaddubsw m13, m2, m8 + pmaddubsw m14, m2, m9 + pmaddubsw m15, m4, m9 + pmaddubsw m0, m4, m10 + vbroadcasti128 m2, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*2] + paddw m12, m14 + paddw m13, m15 + paddw m12, m0 + vmovdqa64 m5{k1}, m2 + vmovdqa64 m2{k1}, m6 + mova m0, m4 + shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b + shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab + punpcklbw m2, m1, m3 ; 67a 67b 78a 78b + punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab + pmaddubsw m14, m2, m10 + pmaddubsw m15, m2, m11 + paddw m13, m14 + paddw m12, m15 + pmaddubsw m14, m4, m11 + paddw m13, m14 + pmulhrsw m12, m7 + pmulhrsw m13, m7 + mova [tmpq+ 0], m12 + mova [tmpq+64], m13 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + mova m18, [bilin_v_perm64] + movu ym0, [srcq+strideq*0] + movu ym1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym2, [srcq+strideq*0] + movu ym3, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym4, [srcq+strideq*0] + movu ym5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym6, [srcq+strideq*0] + vpermq m0, m18, m0 + vpermq m1, m18, m1 + vpermq m2, m18, m2 + vpermq m3, m18, m3 + vpermq m4, m18, m4 + vpermq m5, m18, m5 + vpermq m6, m18, m6 + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + punpcklbw m3, m4 + punpcklbw m4, m5 + punpcklbw m5, m6 +.v_w32_loop: + movu ym12, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym13, [srcq+strideq*0] + pmaddubsw m14, m0, m8 + pmaddubsw m16, m2, m9 + pmaddubsw m15, m1, m8 + pmaddubsw m17, m3, m9 + mova m0, m2 + mova m1, m3 + vpermq m12, m18, m12 + vpermq m13, m18, m13 + paddw m14, m16 + paddw m15, m17 + pmaddubsw m16, m4, m10 + pmaddubsw m17, m5, m10 + punpcklbw m6, m12 + punpcklbw m12, m13 + mova m2, m4 + mova m3, m5 + paddw m14, m16 + paddw m15, m17 + pmaddubsw m16, m6, m11 + pmaddubsw m17, m12, m11 + mova m4, m6 + mova m5, m12 + paddw m14, m16 + paddw m15, m17 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + mova m6, m13 + mova [tmpq+ 0], m14 + mova [tmpq+64], m15 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: + mov wd, 64 + jmp .v_start +.v_w128: + mov wd, 128 +.v_start: + WIN64_SPILL_XMM 27 + mova m26, [bilin_v_perm64] + lea r6d, [hq+wq*2] + mov r5, srcq + mov r7, tmpq +.v_loop0: + vpermq m0, m26, [srcq+strideq*0] + vpermq m1, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m2, m26, [srcq+strideq*0] + vpermq m3, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m4, m26, [srcq+strideq*0] + vpermq m5, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m6, m26, [srcq+strideq*0] + punpckhbw m12, m0, m1 + punpcklbw m0, m1 + 
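+; The 8-tap vertical paths keep a sliding window of interleaved row pairs
+; (01 12 23 34, 45 56 ..., refreshed as new rows arrive) and evaluate the
+; filter as four pmaddubsw dot products against the coefficient pairs in
+; m8-m11, summed with paddw; pmulhrsw with pw_8192 then rounds the sum with
+; (a + 2) >> 2 before it is stored to the int16 tmp buffer, since
+; (a*8192 + (1 << 14)) >> 15 == (a + 2) >> 2.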
punpckhbw m13, m1, m2 + punpcklbw m1, m2 + punpckhbw m14, m2, m3 + punpcklbw m2, m3 + punpckhbw m15, m3, m4 + punpcklbw m3, m4 + punpckhbw m16, m4, m5 + punpcklbw m4, m5 + punpckhbw m17, m5, m6 + punpcklbw m5, m6 +.v_loop: + vpermq m18, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m19, m26, [srcq+strideq*0] + pmaddubsw m20, m0, m8 + pmaddubsw m21, m12, m8 + pmaddubsw m22, m1, m8 + pmaddubsw m23, m13, m8 + mova m0, m2 + mova m12, m14 + mova m1, m3 + mova m13, m15 + pmaddubsw m2, m9 + pmaddubsw m14, m9 + pmaddubsw m3, m9 + pmaddubsw m15, m9 + punpckhbw m24, m6, m18 + punpcklbw m6, m18 + paddw m20, m2 + paddw m21, m14 + paddw m22, m3 + paddw m23, m15 + mova m2, m4 + mova m14, m16 + mova m3, m5 + mova m15, m17 + pmaddubsw m4, m10 + pmaddubsw m16, m10 + pmaddubsw m5, m10 + pmaddubsw m17, m10 + punpckhbw m25, m18, m19 + punpcklbw m18, m19 + paddw m20, m4 + paddw m21, m16 + paddw m22, m5 + paddw m23, m17 + mova m4, m6 + mova m16, m24 + mova m5, m18 + mova m17, m25 + pmaddubsw m6, m11 + pmaddubsw m24, m11 + pmaddubsw m18, m11 + pmaddubsw m25, m11 + paddw m20, m6 + paddw m21, m24 + paddw m22, m18 + paddw m23, m25 + pmulhrsw m20, m7 + pmulhrsw m21, m7 + pmulhrsw m22, m7 + pmulhrsw m23, m7 + mova m6, m19 + mova [tmpq+wq*0+ 0], m20 + mova [tmpq+wq*0+64], m21 + mova [tmpq+wq*2+ 0], m22 + mova [tmpq+wq*2+64], m23 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_loop + add r5, 64 + add r7, 128 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .v_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + WIN64_SPILL_XMM 16 + cmp wd, 4 + je .hv_w4 + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] + vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + tzcnt wd, wd + vpbroadcastd m8, [pd_2] + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] + vpbroadcastd m9, [pd_32] + add wq, r7 + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + jmp wq +.hv_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + lea stride3q, [strideq*3] + sub srcq, stride3q + mov r3d, 0x04 + kmovb k1, r3d + kshiftlb k2, k1, 2 + kshiftlb k3, k1, 4 + vpbroadcastd m10, [pd_2] + vbroadcasti128 m16, [subpel_h_shufA] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + vpbroadcastd m11, [pd_32] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + movq xm3, [srcq+strideq*0] + vpbroadcastq ym2, [srcq+strideq*1] + vpbroadcastq ym3{k1}, [srcq+strideq*2] + vpbroadcastq m2{k2}, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3{k2}, [srcq+strideq*0] + vpbroadcastq m2{k3}, [srcq+strideq*1] + vpbroadcastq m3{k3}, [srcq+strideq*2] + mova m17, [spel_hv_perm4a] + movu m18, [spel_hv_perm4b] + mova m0, m10 + mova m1, m10 + pshufb m2, m16 + pshufb m3, m16 + vpdpbusd m0, m2, m8 + vpdpbusd m1, m3, m8 + packssdw m0, m1 ; _ 0 1 2 3 4 5 6 + psraw m0, 2 + vpermb m1, m17, m0 ; 01 12 23 34 + vpermb m2, m18, m0 ; 23 34 45 56 +.hv_w4_loop: + movq xm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + movq xm4, [srcq+strideq*0] + vpbroadcastq ym3{k1}, [srcq+strideq*1] + 
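+; The 8-tap .hv paths lean on AVX-512 VNNI: vpdpbusd accumulates the u8*s8
+; horizontal taps on top of a pd_2 bias and packssdw/psraw 2 keeps a 16-bit
+; (h + 2) >> 2 intermediate; the vertical taps are then vpdpwssd dot products
+; over a pd_32 bias, finished with psrad 6, i.e. (v + 32) >> 6, before
+; packing into the int16 tmp buffer.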
vpbroadcastq ym4{k1}, [srcq+strideq*2] + mova ym5, ym10 + mova ym6, ym10 + pshufb ym3, ym16 + pshufb ym4, ym16 + vpdpbusd ym5, ym3, ym8 + vpdpbusd ym6, ym4, ym8 + mova m7, m11 + packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ + psraw ym5, 2 + valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a + vpdpwssd m7, m1, m12 + vpdpwssd m7, m2, m13 + vpermb m1, m17, m0 ; 45 56 67 78 + vpermb m2, m18, m0 ; 67 78 89 9a + vpdpwssd m7, m1, m14 + vpdpwssd m7, m2, m15 + psrad m7, 6 + vpmovdw [tmpq], m7 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + vzeroupper + RET +.hv_w8: + WIN64_SPILL_XMM 24 + vbroadcasti128 m16, [subpel_h_shufA] + vbroadcasti128 m17, [subpel_h_shufB] + vbroadcasti128 m18, [subpel_h_shufC] + vinserti128 ym0, [srcq+strideq*0], 1 + vinserti128 m0, [srcq+strideq*1], 2 + vinserti128 m0, [srcq+strideq*2], 3 + movu xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 ym1, [srcq+strideq*0], 1 + vinserti128 m1, [srcq+strideq*1], 2 + vinserti128 m1, [srcq+strideq*2], 3 + mova m2, m8 + mova m4, m8 + mova m3, m8 + mova m5, m8 + pshufb m20, m0, m16 + pshufb m21, m0, m17 + pshufb m22, m0, m18 + pshufb m23, m1, m16 + pshufb m6, m1, m17 + pshufb m7, m1, m18 + vpdpbusd m2, m20, m10 + vpdpbusd m4, m21, m10 + vpdpbusd m2, m21, m11 + vpdpbusd m4, m22, m11 + vpdpbusd m3, m23, m10 + vpdpbusd m5, m6, m10 + vpdpbusd m3, m6, m11 + vpdpbusd m5, m7, m11 + packssdw m2, m4 + packssdw m3, m5 + psraw m2, 2 ; _ 0 1 2 + psraw m3, 2 ; 3 4 5 6 + valignq m0, m3, m2, 2 ; 0 1 2 3 + valignq m1, m3, m2, 4 ; 1 2 3 4 + valignq m2, m3, m2, 6 ; 2 3 4 5 + punpcklwd m4, m0, m1 ; 01a 12a 23a 34a + punpckhwd m5, m0, m1 ; 01b 12b 23b 34b + punpcklwd m6, m2, m3 ; 23a 34a 45a 56a + punpckhwd m7, m2, m3 ; 23b 34b 45b 56b +.hv_w8_loop: + movu xm19, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 ym19, [srcq+strideq*0], 1 + vinserti128 m19, [srcq+strideq*1], 2 + vinserti128 m19, [srcq+strideq*2], 3 + mova m20, m9 + mova m21, m9 + mova m22, m8 + mova m23, m8 + vpdpwssd m20, m4, m12 + vpdpwssd m21, m5, m12 + vpdpwssd m20, m6, m13 + vpdpwssd m21, m7, m13 + pshufb m0, m19, m16 + pshufb m1, m19, m17 + pshufb m2, m19, m18 + vpdpbusd m22, m0, m10 + vpdpbusd m23, m1, m10 + vpdpbusd m22, m1, m11 + vpdpbusd m23, m2, m11 + packssdw m22, m23 + psraw m22, 2 ; 7 8 9 A + valignq m0, m22, m3, 2 ; 4 5 6 7 + valignq m1, m22, m3, 4 ; 5 6 7 8 + valignq m2, m22, m3, 6 ; 6 7 8 9 + mova m3, m22 + punpcklwd m4, m0, m1 ; 45a 56a 67a 78a + punpckhwd m5, m0, m1 ; 45b 56b 67b 78b + punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa + punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab + vpdpwssd m20, m4, m14 + vpdpwssd m21, m5, m14 + vpdpwssd m20, m6, m15 + vpdpwssd m21, m7, m15 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [tmpq], m20 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + mov wd, 16*2 + jmp .hv_start +.hv_w32: + mov wd, 32*2 + jmp .hv_start +.hv_w64: + mov wd, 64*2 + jmp .hv_start +.hv_w128: + mov wd, 128*2 +.hv_start: + WIN64_SPILL_XMM 31 + mova m16, [spel_h_perm16a] + mova m17, [spel_h_perm16b] + mova m18, [spel_h_perm16c] + lea r6d, [hq+wq*8-256] + mov r5, srcq + mov r7, tmpq +.hv_loop0: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym1, [srcq+strideq*0] + vinserti32x8 m1, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym2, [srcq+strideq*0] + vinserti32x8 m2, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym3, [srcq+strideq*0] + mova m4, m8 + mova m5, m8 + mova m6, m8 + mova m7, m8 + vpermb m19, m16, m0 + vpermb m20, m17, m0 + vpermb m21, m18, m0 + vpermb m22, m16, m1 + vpermb 
m23, m17, m1 + vpermb m24, m18, m1 + vpermb m25, m16, m2 + vpermb m26, m17, m2 + vpermb m27, m18, m2 + vpermb ym28, ym16, ym3 + vpermb ym29, ym17, ym3 + vpermb ym30, ym18, ym3 + mova m0, m8 + mova m1, m8 + mova ym2, ym8 + mova ym3, ym8 + vpdpbusd m4, m19, m10 + vpdpbusd m5, m20, m10 + vpdpbusd m6, m22, m10 + vpdpbusd m7, m23, m10 + vpdpbusd m0, m25, m10 + vpdpbusd m1, m26, m10 + vpdpbusd ym2, ym28, ym10 + vpdpbusd ym3, ym29, ym10 + vpdpbusd m4, m20, m11 + vpdpbusd m5, m21, m11 + vpdpbusd m6, m23, m11 + vpdpbusd m7, m24, m11 + vpdpbusd m0, m26, m11 + vpdpbusd m1, m27, m11 + vpdpbusd ym2, ym29, ym11 + vpdpbusd ym3, ym30, ym11 + packssdw m4, m5 + packssdw m6, m7 + packssdw m0, m1 + packssdw ym2, ym3 + psraw m4, 2 ; 0a 0b 1a 1b + psraw m6, 2 ; 2a 2b 3a 3b + psraw m0, 2 ; 4a 4b 5a 5b + psraw ym2, 2 ; 6a 6b __ __ + vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b + vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b + vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b + punpcklwd m2, m4, m5 ; 01a 01c 12a 12c + punpckhwd m3, m4, m5 ; 01b 01d 12b 12d + punpcklwd m4, m6, m7 ; 23a 23c 34a 34c + punpckhwd m5, m6, m7 ; 23b 23d 34b 34d + punpcklwd m6, m0, m1 ; 45a 45c 56a 56c + punpckhwd m7, m0, m1 ; 45b 45d 56b 56d +.hv_loop: + movu ym19, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m19, [srcq+strideq*0], 1 + mova m20, m9 + mova m21, m9 + mova m22, m8 + mova m23, m8 + vpdpwssd m20, m2, m12 + vpdpwssd m21, m3, m12 + vpdpwssd m20, m4, m13 + vpdpwssd m21, m5, m13 + vpermb m24, m16, m19 + vpermb m25, m17, m19 + vpermb m26, m18, m19 + vpdpbusd m22, m24, m10 + vpdpbusd m23, m25, m10 + vpdpbusd m22, m25, m11 + vpdpbusd m23, m26, m11 + packssdw m22, m23 + psraw m22, 2 ; 7a 7b 8a 8b + vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b + mova m2, m4 + mova m3, m5 + mova m1, m22 + mova m4, m6 + mova m5, m7 + punpcklwd m6, m0, m1 ; 67a 67c 78a 78c + punpckhwd m7, m0, m1 ; 67b 67d 78b 78d + vpdpwssd m20, m4, m14 + vpdpwssd m21, m5, m14 + vpdpwssd m20, m6, m15 + vpdpwssd m21, m7, m15 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [tmpq+wq*0], ym20 + vextracti32x8 [tmpq+wq*1], m20, 1 + lea tmpq, [tmpq+wq*2] + sub hd, 2 + jg .hv_loop + add r5, 16 + add r7, 32 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .hv_loop0 + RET + +%macro BIDIR_FN 1 ; op + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM %1 0 + vextracti32x4 xmm1, ym0, 1 + movd [dstq ], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_ret + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_ret: + RET +.w4_h16: + vpbroadcastd m7, strided + pmulld m7, [bidir_sctr_w4] + %1 0 + kxnorw k1, k1, k1 + vpscatterdd [dstq+m7]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM %1 0 + vextracti128 xmm1, ym0, 1 + movq [dstq ], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + %1_INC_PTR 2 + lea dstq, [dstq+strideq*4] +.w8_h8: + %1 0 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq ], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq ], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + %1_INC_PTR 2 + lea dstq, [dstq+strideq*4] +.w16: + %1 0 + vpermq m0, m0, 
q3120 + mova [dstq ], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m7, [pb_02461357] +.w32_loop: + %1 0 + %1_INC_PTR 2 + vpermq m0, m7, m0 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m7, [pb_02461357] +.w64_loop: + %1 0 + %1_INC_PTR 2 + vpermq m0, m7, m0 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m7, [pb_02461357] +.w128_loop: + %1 0 + vpermq m6, m7, m0 + %1 2 + mova [dstq+64*0], m6 + %1_INC_PTR 4 + vpermq m6, m7, m0 + mova [dstq+64*1], m6 + add dstq, strideq + dec hd + jg .w128_loop + RET +%endmacro + +%macro AVG 1 ; src_offset + mova m0, [tmp1q+(%1+0)*mmsize] + paddw m0, [tmp2q+(%1+0)*mmsize] + mova m1, [tmp1q+(%1+1)*mmsize] + paddw m1, [tmp2q+(%1+1)*mmsize] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + packuswb m0, m1 +%endmacro + +%macro AVG_INC_PTR 1 + add tmp1q, %1*mmsize + add tmp2q, %1*mmsize +%endmacro + +cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx512icl_table + lea r6, [avg_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] + vpbroadcastd m2, [base+pw_1024] + add wq, r6 + BIDIR_FN AVG + +%macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 + ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 + ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 + mova m0, [tmp1q+(%1+0)*mmsize] + psubw m2, m0, [tmp2q+(%1+0)*mmsize] + mova m1, [tmp1q+(%1+1)*mmsize] + psubw m3, m1, [tmp2q+(%1+1)*mmsize] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%define W_AVG_INC_PTR AVG_INC_PTR + +cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg_avx512icl_table + lea r6, [w_avg_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + vpbroadcastw m4, r6m ; weight + movsxd wq, dword [r6+wq*4] + vpbroadcastd m5, [base+pw_2048] + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 + cmp dword r6m, 7 + jg .weight_gt7 + mov r6, tmp1q + pxor m0, m0 + mov tmp1q, tmp2q + psubw m4, m0, m4 ; -weight + mov tmp2q, r6 +.weight_gt7: + BIDIR_FN W_AVG + +%macro MASK 1 ; src_offset + ; (a * m + b * (64 - m) + 512) >> 10 + ; = ((a - b) * m + (b << 6) + 512) >> 10 + ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 +%if mmsize == 64 + vpermq m3, m8, [maskq+%1*32] +%else + vpermq m3, [maskq+%1*16], q3120 +%endif + mova m0, [tmp2q+(%1+0)*mmsize] + psubw m1, m0, [tmp1q+(%1+0)*mmsize] + psubb m3, m4, m3 + paddw m1, m1 ; (b - a) << 1 + paddb m3, m3 + punpcklbw m2, m4, m3 ; -m << 9 + pmulhw m1, m2 + paddw m0, m1 + mova m1, [tmp2q+(%1+1)*mmsize] + psubw m2, m1, [tmp1q+(%1+1)*mmsize] + paddw m2, m2 + punpckhbw m3, m4, m3 + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%macro MASK_INC_PTR 1 + add maskq, %1*32 + add tmp2q, %1*64 + add tmp1q, %1*64 +%endmacro + +cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx512icl_table + lea r7, [mask_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + pxor m4, m4 + mova m8, [base+bilin_v_perm64] + vpbroadcastd m5, [base+pw_2048] + add wq, r7 + BIDIR_FN MASK + +%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 
4:4:4 + mova m%1, [tmp1q+mmsize*%3] + mova m1, [tmp2q+mmsize*%3] + psubw m1, m%1 + pabsw m%2, m1 + psubusw m%2, m6, m%2 + psrlw m%2, 8 ; 64 - m + psllw m2, m%2, 10 + pmulhw m1, m2 + paddw m%1, m1 + mova m1, [tmp1q+mmsize*%4] + mova m2, [tmp2q+mmsize*%4] + psubw m2, m1 + pabsw m3, m2 + psubusw m3, m6, m3 + vpshldw m%2, m3, 8 + psllw m3, m%2, 10 +%if %5 + psubb m%2, m5, m%2 +%endif + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m%1, m7 + pmulhrsw m1, m7 + packuswb m%1, m1 +%endmacro + +cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx512icl_table + lea r7, [w_mask_420_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + vpbroadcastd m9, [base+pb_m64] ; -1 << 6 + mova ym10, [base+wm_420_mask+32] + vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6 + add wq, r7 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + mova m5, [wm_420_perm4] + cmp hd, 8 + jg .w4_h16 + WRAP_YMM W_MASK 0, 4, 0, 1 + vinserti128 ym5, [wm_420_perm4+32], 1 + vpermb ym4, ym5, ym4 + vpdpbusd ym8, ym4, ym9 + vextracti128 xmm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_end: + vpermb ym8, ym10, ym8 + movq [maskq], xm8 + RET +.w4_h16: + vpbroadcastd m11, strided + pmulld m11, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + vpdpbusd m8, m4, m9 + kxnorw k1, k1, k1 + vpermb m8, m10, m8 + mova [maskq], xm8 + vpscatterdd [dstq+m11]{k1}, m0 + RET +.w8: + mova m5, [wm_420_perm8] + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1 + vinserti128 ym5, [wm_420_perm8+32], 1 + vpermb ym4, ym5, ym4 + vpdpbusd ym8, ym4, ym9 + vpermb m8, m10, m8 + mova [maskq], xm8 + vextracti128 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 16 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + mova m1, m8 + vpdpbusd m1, m4, m9 + vpermb m1, m10, m1 + mova [maskq], xm1 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 + jg .w8_loop + RET +.w16: + mova m5, [wm_420_perm16] +.w16_loop: + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + mova m1, m8 + vpdpbusd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m0, q3120 + mova [maskq], xm1 + add maskq, 16 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m5, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpbusd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + mova [maskq], xm1 + add maskq, 16 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, 
[dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 + psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 +.w64_loop: + W_MASK 0, 4, 0, 2 + W_MASK 11, 5, 1, 3 + mova m2, m8 + vpdpbusd m2, m4, m9 + mova m3, m8 + vpdpbusd m3, m5, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermt2b m2, m10, m3 + mova m1, m0 + vpermt2q m0, m12, m11 + vpermt2q m1, m13, m11 + mova [maskq], ym2 + add maskq, 32 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64_loop + RET +.w128: + pmovzxbq m14, [wm_420_perm64] + mova m10, [wm_420_mask] + psrlq m15, m14, 4 +.w128_loop: + W_MASK 0, 12, 0, 4 + W_MASK 11, 13, 1, 5 + mova m4, m8 + vpdpbusd m4, m12, m9 + mova m5, m8 + vpdpbusd m5, m13, m9 + mova m1, m0 + vpermt2q m0, m14, m11 + vpermt2q m1, m15, m11 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*1+64*0], m1 + W_MASK 0, 12, 2, 6 + W_MASK 11, 13, 3, 7 + vprold m4, 16 + vprold m5, 16 + vpdpbusd m4, m12, m9 + vpdpbusd m5, m13, m9 + add tmp1q, 512 + add tmp2q, 512 + vpermt2b m4, m10, m5 + mova m1, m0 + vpermt2q m0, m14, m11 + vpermt2q m1, m15, m11 + mova [maskq], m4 + add maskq, 64 + mova [dstq+strideq*0+64*1], m0 + mova [dstq+strideq*1+64*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w128_loop + RET + +cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx512icl_table + lea r7, [w_mask_422_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + vpbroadcastd m9, [base+pw_m128] + mova m10, [base+wm_422_mask] + vpbroadcastd m11, [base+pb_127] + add wq, r7 + vpbroadcastd m8, [base+wm_sign+4+r6*4] + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM W_MASK 0, 4, 0, 1 + movhps xm10, [wm_422_mask+16] + vpdpwssd ym8, ym4, ym9 + vpermb ym8, ym10, ym8 + vextracti128 xmm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_end: + pand xm8, xm11 + mova [maskq], xm8 + RET +.w4_h16: + vpbroadcastd m5, strided + pmulld m5, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1 + vpdpwssd m8, m4, m9 + kxnorw k1, k1, k1 + vpermb m8, m10, m8 + pand ym8, ym11 + mova [maskq], ym8 + vpscatterdd [dstq+m5]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1 + movhps xm10, [wm_422_mask+16] + vpdpwssd ym8, ym4, ym9 + vpermb ym8, ym10, ym8 + pand xm8, xm11 + mova [maskq], xm8 + vextracti128 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 32 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + vpermb m1, m10, m1 + pand ym1, ym11 + mova [maskq], ym1 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 + jg .w8_loop + 
RET +.w16_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 32 + lea dstq, [dstq+strideq*4] +.w16: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + vpermb m1, m10, m1 + vpermq m0, m0, q3120 + pand ym1, ym11 + mova [maskq], ym1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m5, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + pand ym1, ym11 + mova [maskq], ym1 + add maskq, 32 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m5, [pb_02461357] +.w64_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + pand ym1, ym11 + mova [maskq], ym1 + add maskq, 32 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m13, [pb_02461357] +.w128_loop: + W_MASK 0, 4, 0, 1 + W_MASK 12, 5, 2, 3 + mova m2, m8 + vpdpwssd m2, m4, m9 + mova m3, m8 + vpdpwssd m3, m5, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermt2b m2, m10, m3 + vpermq m0, m13, m0 + vpermq m1, m13, m12 + pand m2, m11 + mova [maskq], m2 + add maskq, 64 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w128_loop + RET + +cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx512icl_table + lea r7, [w_mask_444_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m5, [base+pb_64] + vpbroadcastd m7, [base+pw_2048] + mova m8, [base+wm_444_mask] + add wq, r7 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM W_MASK 0, 4, 0, 1, 1 + vinserti128 ym8, [wm_444_mask+32], 1 + vpermb ym4, ym8, ym4 + mova [maskq], ym4 + vextracti128 xmm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xmm1 + pextrd [dstq+stride3q ], xmm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xmm1, 2 + pextrd [dstq+stride3q ], xmm1, 3 +.w4_end: + RET +.w4_h16: + vpbroadcastd m9, strided + pmulld m9, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + kxnorw k1, k1, k1 + mova [maskq], m4 + vpscatterdd [dstq+m9]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1, 1 + vinserti128 ym8, [wm_444_mask+32], 1 + vpermb ym4, ym8, ym4 + mova [maskq], ym4 + vextracti128 xmm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xmm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 64 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + mova [maskq], m4 + vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xmm2, m0, 2 + vextracti32x4 xmm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*2], xmm2 + movq [dstq+stride3q ], xmm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm2 + movhps [dstq+stride3q ], xmm3 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 64 + lea 
dstq, [dstq+strideq*4] +.w16: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + vpermq m0, m0, q3120 + mova [maskq], m4 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m9, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + add tmp1q, 128 + add tmp2q, 128 + vpermq m0, m9, m0 + mova [maskq], m4 + add maskq, 64 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m9, [pb_02461357] +.w64_loop: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + add tmp1q, 128 + add tmp2q, 128 + vpermq m0, m9, m0 + mova [maskq], m4 + add maskq, 64 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m11, [pb_02461357] +.w128_loop: + W_MASK 0, 4, 0, 1, 1 + W_MASK 10, 9, 2, 3, 1 + vpermb m4, m8, m4 + vpermb m9, m8, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermq m0, m11, m0 + vpermq m10, m11, m10 + mova [maskq+64*0], m4 + mova [maskq+64*1], m9 + add maskq, 128 + mova [dstq+64*0], m0 + mova [dstq+64*1], m10 + add dstq, strideq + dec hd + jg .w128_loop + RET + +%endif ; HAVE_AVX512ICL && ARCH_X86_64 diff -Nru dav1d-0.7.1/src/x86/mc_init_tmpl.c dav1d-0.9.1/src/x86/mc_init_tmpl.c --- dav1d-0.7.1/src/x86/mc_init_tmpl.c 2020-06-21 11:48:55.036126400 +0000 +++ dav1d-0.9.1/src/x86/mc_init_tmpl.c 2021-07-28 21:38:28.913852200 +0000 @@ -28,129 +28,12 @@ #include "src/cpu.h" #include "src/mc.h" -decl_mc_fn(dav1d_put_8tap_regular_avx2); -decl_mc_fn(dav1d_put_8tap_regular_ssse3); -decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2); -decl_mc_fn(dav1d_put_8tap_regular_smooth_ssse3); -decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2); -decl_mc_fn(dav1d_put_8tap_regular_sharp_ssse3); -decl_mc_fn(dav1d_put_8tap_smooth_avx2); -decl_mc_fn(dav1d_put_8tap_smooth_ssse3); -decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2); -decl_mc_fn(dav1d_put_8tap_smooth_regular_ssse3); -decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2); -decl_mc_fn(dav1d_put_8tap_smooth_sharp_ssse3); -decl_mc_fn(dav1d_put_8tap_sharp_avx2); -decl_mc_fn(dav1d_put_8tap_sharp_ssse3); -decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2); -decl_mc_fn(dav1d_put_8tap_sharp_regular_ssse3); -decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2); -decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3); -decl_mc_fn(dav1d_put_bilin_avx2); -decl_mc_fn(dav1d_put_bilin_ssse3); - -decl_mct_fn(dav1d_prep_8tap_regular_avx512icl); -decl_mct_fn(dav1d_prep_8tap_regular_avx2); -decl_mct_fn(dav1d_prep_8tap_regular_ssse3); -decl_mct_fn(dav1d_prep_8tap_regular_sse2); -decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl); -decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2); -decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3); -decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2); -decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl); -decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2); -decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3); -decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2); -decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl); -decl_mct_fn(dav1d_prep_8tap_smooth_avx2); -decl_mct_fn(dav1d_prep_8tap_smooth_ssse3); -decl_mct_fn(dav1d_prep_8tap_smooth_sse2); -decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl); -decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2); -decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3); -decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2); -decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl); 
-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2); -decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3); -decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2); -decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl); -decl_mct_fn(dav1d_prep_8tap_sharp_avx2); -decl_mct_fn(dav1d_prep_8tap_sharp_ssse3); -decl_mct_fn(dav1d_prep_8tap_sharp_sse2); -decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl); -decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2); -decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3); -decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2); -decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl); -decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2); -decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3); -decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2); -decl_mct_fn(dav1d_prep_bilin_avx512icl); -decl_mct_fn(dav1d_prep_bilin_avx2); -decl_mct_fn(dav1d_prep_bilin_ssse3); -decl_mct_fn(dav1d_prep_bilin_sse2); - -decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2); -decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2); -decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2); - -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2); -decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2); -decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2); - -decl_avg_fn(dav1d_avg_avx512icl); -decl_avg_fn(dav1d_avg_avx2); -decl_avg_fn(dav1d_avg_ssse3); -decl_w_avg_fn(dav1d_w_avg_avx512icl); -decl_w_avg_fn(dav1d_w_avg_avx2); -decl_w_avg_fn(dav1d_w_avg_ssse3); -decl_mask_fn(dav1d_mask_avx512icl); -decl_mask_fn(dav1d_mask_avx2); -decl_mask_fn(dav1d_mask_ssse3); -decl_w_mask_fn(dav1d_w_mask_420_avx512icl); -decl_w_mask_fn(dav1d_w_mask_420_avx2); -decl_w_mask_fn(dav1d_w_mask_420_ssse3); -decl_w_mask_fn(dav1d_w_mask_422_avx512icl); -decl_w_mask_fn(dav1d_w_mask_422_avx2); -decl_w_mask_fn(dav1d_w_mask_444_avx512icl); -decl_w_mask_fn(dav1d_w_mask_444_avx2); -decl_blend_fn(dav1d_blend_avx2); -decl_blend_fn(dav1d_blend_ssse3); -decl_blend_dir_fn(dav1d_blend_v_avx2); -decl_blend_dir_fn(dav1d_blend_v_ssse3); -decl_blend_dir_fn(dav1d_blend_h_avx2); -decl_blend_dir_fn(dav1d_blend_h_ssse3); - -decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2); -decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4); -decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3); -decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2); -decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2); -decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4); -decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3); -decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2); - -decl_emu_edge_fn(dav1d_emu_edge_avx2); -decl_emu_edge_fn(dav1d_emu_edge_ssse3); - -decl_resize_fn(dav1d_resize_avx2); -decl_resize_fn(dav1d_resize_ssse3); - -COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { +#if BITDEPTH == 8 +#define 
decl_fn(type, name) \ + decl_##type##_fn(name##_sse2); \ + decl_##type##_fn(name##_ssse3); \ + decl_##type##_fn(name##_avx2); \ + decl_##type##_fn(name##_avx512icl); #define init_mc_fn(type, name, suffix) \ c->mc[type] = dav1d_put_##name##_##suffix #define init_mct_fn(type, name, suffix) \ @@ -159,7 +42,87 @@ c->mc_scaled[type] = dav1d_put_##name##_##suffix #define init_mct_scaled_fn(type, name, suffix) \ c->mct_scaled[type] = dav1d_prep_##name##_##suffix +#else +#define decl_fn(type, name) \ + decl_##type##_fn(name##_16bpc_sse2); \ + decl_##type##_fn(name##_16bpc_ssse3); \ + decl_##type##_fn(name##_16bpc_avx2); \ + decl_##type##_fn(name##_16bpc_avx512icl); +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = dav1d_put_##name##_16bpc_##suffix +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = dav1d_prep_##name##_16bpc_##suffix +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = dav1d_put_##name##_16bpc_##suffix +#define init_mct_scaled_fn(type, name, suffix) \ + c->mct_scaled[type] = dav1d_prep_##name##_16bpc_##suffix +#endif + +decl_fn(mc, dav1d_put_8tap_regular); +decl_fn(mc, dav1d_put_8tap_regular_smooth); +decl_fn(mc, dav1d_put_8tap_regular_sharp); +decl_fn(mc, dav1d_put_8tap_smooth); +decl_fn(mc, dav1d_put_8tap_smooth_regular); +decl_fn(mc, dav1d_put_8tap_smooth_sharp); +decl_fn(mc, dav1d_put_8tap_sharp); +decl_fn(mc, dav1d_put_8tap_sharp_regular); +decl_fn(mc, dav1d_put_8tap_sharp_smooth); +decl_fn(mc, dav1d_put_bilin); + +decl_fn(mct, dav1d_prep_8tap_regular); +decl_fn(mct, dav1d_prep_8tap_regular_smooth); +decl_fn(mct, dav1d_prep_8tap_regular_sharp); +decl_fn(mct, dav1d_prep_8tap_smooth); +decl_fn(mct, dav1d_prep_8tap_smooth_regular); +decl_fn(mct, dav1d_prep_8tap_smooth_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp_regular); +decl_fn(mct, dav1d_prep_8tap_sharp_smooth); +decl_fn(mct, dav1d_prep_bilin); + +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth); +decl_fn(mc_scaled, dav1d_put_bilin_scaled); + +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth); +decl_fn(mct_scaled, dav1d_prep_bilin_scaled); + +decl_fn(avg, dav1d_avg); +decl_fn(w_avg, dav1d_w_avg); +decl_fn(mask, dav1d_mask); +decl_fn(w_mask, dav1d_w_mask_420); +decl_fn(w_mask, dav1d_w_mask_422); +decl_fn(w_mask, dav1d_w_mask_444); +decl_fn(blend, dav1d_blend); +decl_fn(blend_dir, dav1d_blend_v); +decl_fn(blend_dir, dav1d_blend_h); + +decl_fn(warp8x8, dav1d_warp_affine_8x8); +decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4); +decl_fn(warp8x8t, dav1d_warp_affine_8x8t); +decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4); + +decl_fn(emu_edge, dav1d_emu_edge); 
+decl_resize_fn(dav1d_resize_avx2); +decl_resize_fn(dav1d_resize_ssse3); + +COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) @@ -184,8 +147,6 @@ if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; -#if BITDEPTH == 8 - init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); @@ -195,8 +156,8 @@ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); - init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); @@ -206,7 +167,33 @@ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); + +#if BITDEPTH == 8 && ARCH_X86_64 + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); +#endif +#if BITDEPTH == 8 c->avg = dav1d_avg_ssse3; c->w_avg = dav1d_w_avg_ssse3; c->mask = dav1d_mask_ssse3; @@ -220,6 +207,19 @@ c->emu_edge = dav1d_emu_edge_ssse3; c->resize = dav1d_resize_ssse3; +#else + c->avg = dav1d_avg_16bpc_ssse3; + c->w_avg = dav1d_w_avg_16bpc_ssse3; + c->mask = dav1d_mask_16bpc_ssse3; + c->w_mask[0] = dav1d_w_mask_444_16bpc_ssse3; + c->w_mask[1] = dav1d_w_mask_422_16bpc_ssse3; + c->w_mask[2] = dav1d_w_mask_420_16bpc_ssse3; + c->blend = dav1d_blend_16bpc_ssse3; + c->blend_v = dav1d_blend_v_16bpc_ssse3; + c->blend_h = dav1d_blend_h_16bpc_ssse3; + c->warp8x8 = 
dav1d_warp_affine_8x8_16bpc_ssse3; + c->warp8x8t = dav1d_warp_affine_8x8t_16bpc_ssse3; + c->emu_edge = dav1d_emu_edge_16bpc_ssse3; #endif if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) @@ -234,7 +234,6 @@ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; -#if BITDEPTH == 8 init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); @@ -257,6 +256,7 @@ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); +#if BITDEPTH == 8 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); @@ -294,6 +294,19 @@ c->emu_edge = dav1d_emu_edge_avx2; c->resize = dav1d_resize_avx2; +#else + c->avg = dav1d_avg_16bpc_avx2; + c->w_avg = dav1d_w_avg_16bpc_avx2; + c->mask = dav1d_mask_16bpc_avx2; + c->w_mask[0] = dav1d_w_mask_444_16bpc_avx2; + c->w_mask[1] = dav1d_w_mask_422_16bpc_avx2; + c->w_mask[2] = dav1d_w_mask_420_16bpc_avx2; + c->blend = dav1d_blend_16bpc_avx2; + c->blend_v = dav1d_blend_v_16bpc_avx2; + c->blend_h = dav1d_blend_h_16bpc_avx2; + c->warp8x8 = dav1d_warp_affine_8x8_16bpc_avx2; + c->warp8x8t = dav1d_warp_affine_8x8t_16bpc_avx2; + c->emu_edge = dav1d_emu_edge_16bpc_avx2; #endif if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) diff -Nru dav1d-0.7.1/src/x86/mc_sse.asm dav1d-0.9.1/src/x86/mc_sse.asm --- dav1d-0.7.1/src/x86/mc_sse.asm 2020-06-21 11:48:55.036126400 +0000 +++ dav1d-0.9.1/src/x86/mc_sse.asm 2021-07-28 21:38:28.917852400 +0000 @@ -24,6 +24,7 @@ ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
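[Editorial aside, not part of the patch] The mc_init_tmpl.c hunk above replaces the long per-ISA declaration lists with bitdepth-templated macros: the same translation unit is compiled once per bitdepth, and at 16 bpc every symbol gains a "_16bpc" infix before the ISA suffix. A minimal compilable sketch of that token-pasting pattern is shown below; SUFFIXED() is a hypothetical stand-in used only to illustrate the expansion, while dav1d's real decl_fn()/init_*_fn() macros expand to decl_*_fn() declarations and c->mct[...] assignments as shown in the diff.

/* illustrative sketch of the bitdepth-suffixed naming scheme (assumption:
 * SUFFIXED() is not a dav1d macro, it only mirrors the pasting pattern) */
#include <stdio.h>

#define BITDEPTH 16 /* the template unit is compiled once per bitdepth */

#if BITDEPTH == 8
#define SUFFIXED(name, isa) name##_##isa
#else
#define SUFFIXED(name, isa) name##_16bpc_##isa
#endif

#define STR(x)  #x
#define XSTR(x) STR(x)

int main(void)
{
    /* prints "dav1d_prep_bilin_16bpc_avx2"; with BITDEPTH == 8 it would
     * print "dav1d_prep_bilin_avx2", matching the names in the hunk above */
    puts(XSTR(SUFFIXED(dav1d_prep_bilin, avx2)));
    return 0;
}

The rest of the diff continues below.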
+%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 @@ -54,12 +55,19 @@ subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 -bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 pb_8x0_8x8: times 8 db 0 times 8 db 8 -resize_mul: dd 0, 1, 2, 3 +bdct_lb_dw: times 4 db 0 + times 4 db 4 + times 4 db 8 + times 4 db 12 +rescale_mul: dd 0, 1, 2, 3 resize_shuf: times 5 db 0 db 1, 2, 3, 4, 5, 6 times 5+16 db 7 @@ -69,6 +77,7 @@ pw_1: times 8 dw 1 pw_2: times 8 dw 2 pw_8: times 8 dw 8 +pw_15: times 8 dw 15 pw_26: times 8 dw 26 pw_34: times 8 dw 34 pw_512: times 8 dw 512 @@ -82,6 +91,111 @@ pd_16384: times 4 dd 16484 pd_32768: times 4 dd 32768 pd_262144:times 4 dd 262144 +pd_0x3ff: times 4 dd 0x3ff +pd_0x4000:times 4 dd 0x4000 +pq_0x40000000: times 2 dq 0x40000000 + +const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage + ; [-1, 0) + db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0 + db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0 + db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0 + db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0 + db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0 + db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0 + db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0 + db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0 + db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0 + db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0 + db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0 + db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0 + db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0 + db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0 + db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0 + db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0 + db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0 + db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0 + db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0 + db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0 + db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0 + db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0 + db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0 + db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0 + db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0 + db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0 + db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0 + db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0 + db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0 + db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0 + db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0 + db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0 + ; [0, 1) + db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 
0, 127, 0, 0 + db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0 + db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1 + db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1 + db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1 + db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1 + db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1 + db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1 + db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2 + db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2 + db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2 + db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2 + db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2 + db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2 + db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2 + db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2 + db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2 + db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2 + db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2 + db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2 + db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2 + db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2 + db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2 + db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2 + db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2 + db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1 + db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2 + db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1 + db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1 + db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1 + db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0 + db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0 + ; [1, 2) + db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0 + db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1 + db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1 + db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1 + db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1 + db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2 + db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2 + db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2 + db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3 + db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3 + db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3 + db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4 + db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4 + db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4 + db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4 + db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4 + db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4 + db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4 + db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4 + db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4 + db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4 + db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4 + db 0, 3, 47, -17, 0, -13, 
104, 4, 0, 3, 45, -17, 0, -13, 106, 4 + db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3 + db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3 + db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3 + db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2 + db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2 + db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2 + db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1 + db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1 + db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0 + db 0, 0, 2, -1, 0, 0, 127, 0 pw_258: times 2 dw 258 @@ -165,9 +279,36 @@ HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 -%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX +%macro SCALED_JMP_TABLE 1-* + %xdefine %1_table (%%table - %2) + %xdefine %%base mangle(private_prefix %+ _%1) +%%table: + %rep %0 - 1 + dw %%base %+ .w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_1024: + %xdefine %1_dy1_table (%%dy_1024 - %2) + %rep %0 - 1 + dw %%base %+ .dy1_w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_2048: + %xdefine %1_dy2_table (%%dy_2048 - %2) + %rep %0 - 1 + dw %%base %+ .dy2_w%2 - %%base + %rotate 1 + %endrep +%endmacro -cextern mc_warp_filter +%if ARCH_X86_64 +SCALED_JMP_TABLE put_8tap_scaled_ssse3, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled_ssse3, 4, 8, 16, 32, 64, 128 +%endif + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX SECTION .text @@ -180,16 +321,18 @@ DECLARE_REG_TMP 7 %define base 0 %endif -; + %macro RESTORE_DSQ_32 1 %if ARCH_X86_32 mov %1, dsm ; restore dsq %endif %endmacro -; -cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak + +cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx LEA t0, put_ssse3 + movifnidn srcq, srcmp + movifnidn ssq, ssmp tzcnt wd, wm mov hd, hm test mxyd, mxyd @@ -295,20 +438,19 @@ .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 - imul mxyd, 0xff01 + imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] mova m0, [base+bilin_h_shuf4] - add mxyd, 16 << 8 + add mxyd, 0x00100010 movd m5, mxyd mov mxyd, r7m ; my - pshuflw m5, m5, q0000 - punpcklqdq m5, m5 + pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] mova m3, [base+pw_2048] add wq, t0 - RESTORE_DSQ_32 t0 + movifnidn dsq, dsmp jmp wq .h_w2: pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} @@ -445,23 +587,22 @@ RET .v: movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] - imul mxyd, 0xff01 + imul mxyd, 0x00ff00ff mova m5, [base+pw_2048] - add mxyd, 16 << 8 + add mxyd, 0x00100010 add wq, t0 movd m4, mxyd - pshuflw m4, m4, q0000 - punpcklqdq m4, m4 - RESTORE_DSQ_32 t0 + pshufd m4, m4, q0000 + movifnidn dsq, dsmp jmp wq .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: pinsrw m0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] - pshuflw m2, m0, q2301 + pshuflw m1, m0, q2301 pinsrw m0, [srcq+ssq*0], 0 ; 2 1 - punpcklbw m1, m0, m2 + punpcklbw m1, m0 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 @@ -476,11 +617,12 @@ .v_w4: movd m0, [srcq+ssq*0] .v_w4_loop: - movd m1, [srcq+ssq*1] + movd m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpckldq m2, m0, m1 ; 0 1 + mova m1, m0 movd m0, [srcq+ssq*0] - punpckldq m1, m0 ; 1 2 + punpckldq m1, m2 ; 0 1 + punpckldq m2, m0 ; 1 2 punpcklbw m1, m2 pmaddubsw m1, m4 pmulhrsw m1, m5 
@@ -496,11 +638,12 @@ .v_w8: movq m0, [srcq+ssq*0] .v_w8_loop: - movq m3, [srcq+ssq*1] + movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 + mova m1, m0 movq m0, [srcq+ssq*0] - punpcklbw m2, m0, m3 + punpcklbw m1, m2 + punpcklbw m2, m0 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 @@ -512,98 +655,102 @@ sub hd, 2 jg .v_w8_loop RET - ; %macro PUT_BILIN_V_W16 0 movu m0, [srcq+ssq*0] %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - punpcklbw m1, m3, m0 - punpckhbw m2, m3, m0 + mova m1, m0 + mova m2, m0 movu m0, [srcq+ssq*0] + punpcklbw m1, m3 + punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 - mova [dstq+dsq*0], m1 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 - pmaddubsw m1, m4 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 pmaddubsw m2, m4 - pmulhrsw m1, m5 + pmaddubsw m3, m4 pmulhrsw m2, m5 - packuswb m1, m2 - mova [dstq+dsq*1], m1 + pmulhrsw m3, m5 + packuswb m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop %endmacro - ; .v_w16: PUT_BILIN_V_W16 RET +.v_w128: + lea r6d, [hq+(7<<16)] + jmp .v_w16gt +.v_w64: + lea r6d, [hq+(3<<16)] + jmp .v_w16gt +.v_w32: + lea r6d, [hq+(1<<16)] .v_w16gt: - mov r4, dstq - mov r6, srcq + mov r4, srcq +%if ARCH_X86_64 + mov r7, dstq +%endif .v_w16gt_loop: -%if ARCH_X86_32 - mov bakm, t0q - RESTORE_DSQ_32 t0 PUT_BILIN_V_W16 - mov t0q, bakm +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 %else - PUT_BILIN_V_W16 + mov dstq, dstmp + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstmp, dstq %endif - mov hw, t0w - add r4, mmsize - add r6, mmsize - mov dstq, r4 - mov srcq, r6 - sub t0d, 1<<16 + sub r6d, 1<<16 jg .v_w16gt RET -.v_w32: - lea t0d, [hq+(1<<16)] - jmp .v_w16gt -.v_w64: - lea t0d, [hq+(3<<16)] - jmp .v_w16gt -.v_w128: - lea t0d, [hq+(7<<16)] - jmp .v_w16gt .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow - mova m7, [base+pw_2048] + mova m7, [base+pw_15] movd m6, mxyd add wq, t0 pshuflw m6, m6, q0000 + paddb m5, m5 punpcklqdq m6, m6 jmp wq .hv_w2: RESTORE_DSQ_32 t0 movd m0, [srcq+ssq*0] - pshufd m0, m0, q0000 ; src[x - src_stride] + punpckldq m0, m0 pshufb m0, m4 pmaddubsw m0, m5 .hv_w2_loop: - movd m1, [srcq+ssq*1] ; src[x] + movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - movhps m1, [srcq+ssq*0] ; src[x + src_stride] - pshufd m1, m1, q3120 + movd m2, [srcq+ssq*0] + punpckldq m1, m2 pshufb m1, m4 pmaddubsw m1, m5 ; 1 _ 2 _ shufps m2, m0, m1, q1032 ; 0 _ 1 _ mova m0, m1 - psubw m1, m2 ; src[x + src_stride] - src[x] - paddw m1, m1 - pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) - paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x]) - pmulhrsw m1, m7 + psubw m1, m2 ; 2 * (src[x + src_stride] - src[x]) + pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4 + pavgw m2, m7 ; src[x] + 8 + paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8 + psrlw m1, 4 packuswb m1, m1 %if ARCH_X86_64 movq r6, m1 @@ -620,8 +767,8 @@ RET .hv_w4: mova m4, [base+bilin_h_shuf4] - RESTORE_DSQ_32 t0 movddup xm0, [srcq+ssq*0] + movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: @@ -629,14 +776,14 @@ lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] pshufb m1, m4 - pmaddubsw m1, m5 ; 1 2 + pmaddubsw m1, m5 ; 1 2 shufps m2, m0, m1, q1032 ; 0 1 
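[Editorial aside, not part of the patch] The put_bilin comments just above quote the rounding identity the rewritten .hv path relies on: (16*a + my*(b - a) + 128) >> 8 equals (a + ((my*(b - a)) >> 4) + 8) >> 4, which the SSSE3 code builds from pmulhw (for the >>4 product, after pre-doubling the horizontal coefficients with paddb m5, m5), pavgw against pw_15 (for the "+ 8"), and a final psrlw by 4. A scalar reference sketch of that math follows; the function and variable names are illustrative, not dav1d API.

/* scalar model of the bilinear h and hv rounding described in the comments */
#include <stdint.h>

/* horizontal pass, 4-bit fractional position mx in [0, 16); the result is
 * kept scaled by 16 so the vertical pass can reuse it */
static int bilin_h_4bit(const uint8_t *row, int x, int mx)
{
    return (16 - mx) * row[x] + mx * row[x + 1]; /* == 16*row[x] + mx*(row[x+1] - row[x]) */
}

/* hv: blend the scaled horizontal outputs of two rows with my in [0, 16) */
static uint8_t bilin_hv_pixel(const uint8_t *row0, const uint8_t *row1,
                              int x, int mx, int my)
{
    const int a = bilin_h_4bit(row0, x, mx);
    const int b = bilin_h_4bit(row1, x, mx);
    return (uint8_t)((16 * a + my * (b - a) + 128) >> 8);
}

The rest of the diff continues below.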
mova m0, m1 psubw m1, m2 - paddw m1, m1 pmulhw m1, m6 + pavgw m2, m7 paddw m1, m2 - pmulhrsw m1, m7 + psrlw m1, 4 packuswb m1, m1 movd [dstq+dsq*0], m1 psrlq m1, 32 @@ -646,28 +793,28 @@ jg .hv_w4_loop RET .hv_w8: - RESTORE_DSQ_32 t0 - movu m0, [srcq+ssq*0+8*0] + movu m0, [srcq+ssq*0] + movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: - movu m2, [srcq+ssq*1+8*0] + movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m2, m4 pmaddubsw m2, m5 psubw m1, m2, m0 - paddw m1, m1 pmulhw m1, m6 + pavgw m0, m7 paddw m1, m0 - movu m0, [srcq+ssq*0+8*0] + movu m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 psubw m3, m0, m2 - paddw m3, m3 pmulhw m3, m6 + pavgw m2, m7 paddw m3, m2 - pmulhrsw m1, m7 - pmulhrsw m3, m7 + psrlw m1, 4 + psrlw m3, 4 packuswb m1, m3 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 @@ -675,27 +822,34 @@ sub hd, 2 jg .hv_w8_loop RET +.hv_w128: + lea r6d, [hq+(7<<16)] + jmp .hv_w16_start +.hv_w64: + lea r6d, [hq+(3<<16)] + jmp .hv_w16_start +.hv_w32: + lea r6d, [hq+(1<<16)] +.hv_w16_start: + mov r4, srcq +%if ARCH_X86_32 + %define m8 [dstq] +%else + mov r7, dstq +%endif .hv_w16: - xor t0d, t0d -.hv_w16gt: - mov r4, dstq - mov r6, srcq - %if WIN64 - movaps r4m, xmm8 - %endif + movifnidn dsq, dsmp +%if WIN64 + movaps r4m, m8 +%endif .hv_w16_loop0: - movu m0, [srcq+8*0] - movu m1, [srcq+8*1] + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w16_loop: -%if ARCH_X86_32 - %define m0tmp [dstq] -%else - %define m0tmp m8 -%endif add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] @@ -703,62 +857,51 @@ pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 - mova m0tmp, m2 + mova m8, m2 psubw m2, m0 - paddw m2, m2 pmulhw m2, m6 + pavgw m0, m7 paddw m2, m0 mova m0, m3 psubw m3, m1 - paddw m3, m3 pmulhw m3, m6 + pavgw m1, m7 paddw m3, m1 mova m1, m0 - mova m0, m0tmp - pmulhrsw m2, m7 - pmulhrsw m3, m7 + mova m0, m8 + psrlw m2, 4 + psrlw m3, 4 packuswb m2, m3 mova [dstq], m2 add dstq, dsmp dec hd jg .hv_w16_loop - movzx hd, t0w - add r4, mmsize - add r6, mmsize - mov dstq, r4 - mov srcq, r6 - sub t0d, 1<<16 +%if ARCH_X86_32 + mov dstq, dstm + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstm, dstq +%else + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%endif + sub r6d, 1<<16 jg .hv_w16_loop0 - %if WIN64 - movaps xmm8, r4m - %endif +%if WIN64 + movaps m8, r4m +%endif RET -.hv_w32: - lea t0d, [hq+(1<<16)] - jmp .hv_w16gt -.hv_w64: - lea t0d, [hq+(3<<16)] - jmp .hv_w16gt -.hv_w128: - lea t0d, [hq+(7<<16)] - jmp .hv_w16gt - -%macro PSHUFB_0X1X 1-2 ; dst[, src] - %if cpuflag(ssse3) - pshufb %1, %2 - %else - punpcklbw %1, %1 - psraw %1, 8 - pshufd %1, %1, q0000 - %endif -%endmacro %macro PSHUFB_BILIN_H8 2 ; dst, src %if cpuflag(ssse3) pshufb %1, %2 %else - mova %2, %1 - psrldq %1, 1 + psrldq %2, %1, 1 punpcklbw %1, %2 %endif %endmacro @@ -767,8 +910,7 @@ %if cpuflag(ssse3) pshufb %1, %2 %else - mova %2, %1 - psrldq %1, 1 + psrldq %2, %1, 1 punpckhbw %3, %1, %2 punpcklbw %1, %2 punpcklqdq %1, %3 @@ -805,17 +947,15 @@ %endmacro %macro PREP_BILIN 0 - -DECLARE_REG_TMP 3, 5, 6 %if ARCH_X86_32 - %define base t2-prep%+SUFFIX + %define base r6-prep%+SUFFIX %else - %define base 0 + %define base 0 %endif cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx - LEA t2, prep%+SUFFIX + LEA r6, prep%+SUFFIX tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd @@ -825,11 +965,12 @@ jnz .v .prep: %if notcpuflag(ssse3) - add t2, prep_ssse3 - prep_sse2 + add r6, prep_ssse3 - 
prep_sse2 jmp prep_ssse3 %else - movzx wd, word [t2+wq*2+table_offset(prep,)] - add wq, t2 + movzx wd, word [r6+wq*2+table_offset(prep,)] + pxor m4, m4 + add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: @@ -837,17 +978,16 @@ movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m2, m3 - lea srcq, [srcq+strideq*4] - pxor m1, m1 - punpcklbw m0, m1 - punpcklbw m2, m1 + punpcklbw m0, m4 + punpcklbw m2, m4 psllw m0, 4 psllw m2, 4 - mova [tmpq+mmsize*0], m0 - mova [tmpq+mmsize*1], m2 - add tmpq, 32 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET @@ -857,7 +997,6 @@ movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - pxor m4, m4 punpcklbw m0, m4 punpcklbw m1, m4 punpcklbw m2, m4 @@ -875,16 +1014,13 @@ jg .prep_w8 RET .prep_w16: - movq m0, [srcq+strideq*0+8*0] - movq m1, [srcq+strideq*0+8*1] - movq m2, [srcq+strideq*1+8*0] - movq m3, [srcq+strideq*1+8*1] + movu m1, [srcq+strideq*0] + movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - pxor m4, m4 - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 @@ -897,27 +1033,25 @@ sub hd, 2 jg .prep_w16 RET -.prep_w32: - mov t2d, 1 - jmp .prep_w32_vloop -.prep_w64: - mov t2d, 2 - jmp .prep_w32_vloop .prep_w128: - mov t2d, 4 + mov r3, -128 + jmp .prep_w32_start +.prep_w64: + mov r3, -64 + jmp .prep_w32_start +.prep_w32: + mov r3, -32 +.prep_w32_start: + sub srcq, r3 .prep_w32_vloop: - mov t1q, srcq - mov r3d, t2d + mov r6, r3 .prep_w32_hloop: - movq m0, [t1q+8*0] - movq m1, [t1q+8*1] - movq m2, [t1q+8*2] - movq m3, [t1q+8*3] - pxor m4, m4 - punpcklbw m0, m4 - punpcklbw m1, m4 - punpcklbw m2, m4 - punpcklbw m3, m4 + movu m1, [srcq+r6+16*0] + movu m3, [srcq+r6+16*1] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 @@ -927,10 +1061,9 @@ mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 - add t1q, 32 - dec r3d - jg .prep_w32_hloop - lea srcq, [srcq+strideq] + add r6, 32 + jl .prep_w32_hloop + add srcq, strideq dec hd jg .prep_w32_vloop RET @@ -938,40 +1071,31 @@ .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] - imul mxyd, 0xff01 %if cpuflag(ssse3) + imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] + add mxyd, 0x00100010 +%else + imul mxyd, 0xffff + add mxyd, 16 %endif - add mxyd, 16 << 8 movd m5, mxyd mov mxyd, r6m ; my -%if cpuflag(ssse3) - pshuflw m5, m5, q0000 - punpcklqdq m5, m5 -%else - PSHUFB_0X1X m5 -%endif + pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv -%if ARCH_X86_32 - mov t1, t2 ; save base reg for w4 -%endif - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] %if notcpuflag(ssse3) WIN64_SPILL_XMM 8 pxor m6, m6 %endif - add wq, t2 - lea stride3q, [strideq*3] + add wq, r6 jmp wq .h_w4: %if cpuflag(ssse3) - %if ARCH_X86_32 - mova m4, [t1-prep_ssse3+bilin_h_shuf4] - %else - mova m4, [bilin_h_shuf4] - %endif + mova m4, [base+bilin_h_shuf4] %endif + lea stride3q, [strideq*3] .h_w4_loop: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] @@ -989,6 +1113,8 @@ jg .h_w4_loop RET .h_w8: + lea stride3q, [strideq*3] +.h_w8_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] @@ -1008,7 +1134,7 @@ mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 - jg .h_w8 + 
jg .h_w8_loop RET .h_w16: movu m0, [srcq+strideq*0+8*0] @@ -1032,22 +1158,23 @@ sub hd, 2 jg .h_w16 RET -.h_w32: - mov t2d, 1 << 0 - jmp .h_w32_vloop -.h_w64: - mov t2d, 1 << 1 - jmp .h_w32_vloop .h_w128: - mov t2d, 1 << 3 + mov r3, -128 + jmp .h_w32_start +.h_w64: + mov r3, -64 + jmp .h_w32_start +.h_w32: + mov r3, -32 +.h_w32_start: + sub srcq, r3 .h_w32_vloop: - mov t1q, srcq - mov r3d, t2d + mov r6, r3 .h_w32_hloop: - movu m0, [t1q+8*0] - movu m1, [t1q+8*1] - movu m2, [t1q+8*2] - movu m3, [t1q+8*3] + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + movu m2, [srcq+r6+8*2] + movu m3, [srcq+r6+8*3] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 @@ -1061,11 +1188,10 @@ mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 - add t1q, 32 - shr r3d, 1 - jnz .h_w32_hloop - lea srcq, [srcq+strideq] - sub hd, 1 + add r6, 32 + jl .h_w32_hloop + add srcq, strideq + dec hd jg .h_w32_vloop RET .v: @@ -1073,19 +1199,19 @@ %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 %endif - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] - imul mxyd, 0xff01 - add mxyd, 16 << 8 - add wq, t2 - lea stride3q, [strideq*3] - movd m5, mxyd + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] %if cpuflag(ssse3) - pshuflw m5, m5, q0000 - punpcklqdq m5, m5 + imul mxyd, 0x00ff00ff + add mxyd, 0x00100010 %else - PSHUFB_0X1X m5 + imul mxyd, 0xffff pxor m6, m6 + add mxyd, 16 %endif + add wq, r6 + lea stride3q, [strideq*3] + movd m5, mxyd + pshufd m5, m5, q0000 jmp wq .v_w4: movd m0, [srcq+strideq*0] @@ -1094,46 +1220,41 @@ movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - punpcklwd m0, m1 ; 0 1 _ _ - punpcklwd m1, m2 ; 1 2 _ _ - punpcklbw m1, m0 - PMADDUBSW m1, m5, m6, m7, 0 - pshufd m1, m1, q3120 - mova [tmpq+16*0], m1 + punpckldq m0, m1 + punpckldq m1, m2 + punpcklbw m0, m1 ; 01 12 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m0 movd m0, [srcq+strideq*0] - punpcklwd m2, m3 ; 2 3 _ _ - punpcklwd m3, m0 ; 3 4 _ _ - punpcklbw m3, m2 - PMADDUBSW m3, m5, m6, m7, 0 - pshufd m3, m3, q3120 - mova [tmpq+16*1], m3 - add tmpq, 32 + punpckldq m2, m3 + punpckldq m3, m0 + punpcklbw m2, m3 ; 23 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 sub hd, 4 jg .v_w4_loop RET .v_w8: movq m0, [srcq+strideq*0] .v_w8_loop: - movq m1, [srcq+strideq*2] - movq m2, [srcq+strideq*1] + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - shufpd m4, m0, m1, 0x0c ; 0 2 + punpcklbw m0, m1 ; 01 + punpcklbw m1, m2 ; 12 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*0], m0 movq m0, [srcq+strideq*0] - shufpd m2, m3, 0x0c ; 1 3 - shufpd m1, m0, 0x0c ; 2 4 - punpcklbw m3, m2, m4 - PMADDUBSW m3, m5, m6, m7, 0 - mova [tmpq+16*0], m3 - punpckhbw m3, m2, m4 - PMADDUBSW m3, m5, m6, m7, 0 - mova [tmpq+16*2], m3 - punpcklbw m3, m1, m2 - punpckhbw m1, m2 + punpcklbw m2, m3 ; 23 + punpcklbw m3, m0 ; 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m1 PMADDUBSW m3, m5, m6, m7, 0 - PMADDUBSW m1, m5, m6, m7, 0 - mova [tmpq+16*1], m3 - mova [tmpq+16*3], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .v_w8_loop @@ -1143,48 +1264,48 @@ .v_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] - punpcklbw m3, m1, m0 - punpckhbw m4, m1, m0 - PMADDUBSW m3, m5, m6, m7, 0 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*0], m3 - mova [tmpq+16*1], m4 - punpcklbw m3, m2, m1 - punpckhbw m4, m2, m1 - PMADDUBSW m3, m5, m6, m7, 0 - PMADDUBSW m4, m5, 
m6, m7, 0 - mova [tmpq+16*2], m3 - mova [tmpq+16*3], m4 movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] + punpcklbw m4, m0, m1 + punpckhbw m0, m1 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + punpcklbw m4, m1, m2 + punpckhbw m1, m2 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0] - add tmpq, 16*8 - punpcklbw m1, m3, m2 - punpckhbw m4, m3, m2 PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq-16*4], m1 - mova [tmpq-16*3], m4 - punpcklbw m1, m0, m3 - punpckhbw m2, m0, m3 - PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*3], m1 PMADDUBSW m2, m5, m6, m7, 0 - mova [tmpq-16*2], m1 - mova [tmpq-16*1], m2 + mova [tmpq+16*4], m4 + punpcklbw m4, m3, m0 + punpckhbw m3, m0 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*5], m2 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*6], m4 + mova [tmpq+16*7], m3 + add tmpq, 16*8 sub hd, 4 jg .v_w16_loop RET -.v_w32: - lea t2d, [hq+(0<<16)] - mov t0d, 64 +.v_w128: + lea r3d, [hq+(3<<8)] + mov r6d, 256 jmp .v_w32_start .v_w64: - lea t2d, [hq+(1<<16)] - mov t0d, 128 + lea r3d, [hq+(1<<8)] + mov r6d, 128 jmp .v_w32_start -.v_w128: - lea t2d, [hq+(3<<16)] - mov t0d, 256 +.v_w32: + xor r3d, r3d + mov r6d, 64 .v_w32_start: %if ARCH_X86_64 %if WIN64 @@ -1192,7 +1313,7 @@ %endif mov r7, tmpq %endif - mov t1, srcq + mov r5, srcq .v_w32_hloop: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] @@ -1200,48 +1321,48 @@ movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] - punpcklbw m4, m2, m0 + punpcklbw m4, m0, m2 + punpckhbw m0, m2 PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m4 - punpckhbw m4, m2, m0 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*1], m4 - punpcklbw m4, m3, m1 + mova [tmpq+16*1], m0 + movu m0, [srcq+strideq*0+16*0] + punpcklbw m4, m1, m3 + punpckhbw m1, m3 PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*2], m4 - punpckhbw m4, m3, m1 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*3], m4 - add tmpq, t0q - movu m0, [srcq+strideq*0+16*0] + mova [tmpq+16*3], m1 movu m1, [srcq+strideq*0+16*1] - punpcklbw m4, m0, m2 + add tmpq, r6 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*0], m4 - punpckhbw m4, m0, m2 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*1], m4 - punpcklbw m4, m1, m3 + mova [tmpq+16*1], m2 + punpcklbw m4, m3, m1 + punpckhbw m3, m1 PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*2], m4 - punpckhbw m4, m1, m3 - PMADDUBSW m4, m5, m6, m7, 0 - mova [tmpq+16*3], m4 - add tmpq, t0q + mova [tmpq+16*3], m3 + add tmpq, r6 sub hd, 2 jg .v_w32_vloop - movzx hd, t2w - add t1, 32 - mov srcq, t1 + add r5, 32 + movzx hd, r3b + mov srcq, r5 %if ARCH_X86_64 - add r7, 2*16*2 + add r7, 16*4 mov tmpq, r7 %else mov tmpq, tmpmp - add tmpq, 2*16*2 + add tmpq, 16*4 mov tmpmp, tmpq %endif - sub t2d, 1<<16 + sub r3d, 1<<8 jg .v_w32_hloop %if WIN64 POP r7 @@ -1250,73 +1371,56 @@ .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] %assign stack_offset stack_offset - stack_size_padded %if cpuflag(ssse3) + imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 %else - WIN64_SPILL_XMM 10 -%endif - movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] -%if cpuflag(ssse3) - shl mxyd, 11 -%else + or mxyd, 1<<16 + 
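
The bilinear `.h` and `.hv` comments in the hunks above spell out the arithmetic these prep loops implement. As a reading aid, here is a small scalar sketch of the same two formulas in C; the function and parameter names are illustrative only (not dav1d's API), and it assumes `mx`/`my` are the 4-bit subpel fractions (0..15) while `a`/`b` are rows already scaled by 16, as in the prep path.

```c
#include <stdint.h>

/* Horizontal bilinear prep step, per the ".h" comment:
 *   16*src[x] + mx*(src[x+1] - src[x]) == (16 - mx)*src[x] + mx*src[x+1]
 * The result keeps 4 extra fractional bits, matching the plain copy
 * path that simply stores src[x] << 4. */
static inline int16_t bilin_h_prep(const uint8_t *src, int x, int mx)
{
    return (int16_t)((16 - mx) * src[x] + mx * src[x + 1]);
}

/* Vertical step of the ".hv" path, per its comment:
 *   (16*a + my*(b - a) + 8) >> 4 == a + (((b - a)*my + 8) >> 4)
 * where a and b are two horizontally filtered rows.  The SSSE3 code gets
 * the same rounding from pmulhrsw with my << 11, since
 *   ((d * (my << 11)) + (1 << 14)) >> 15 == (d*my + 8) >> 4. */
static inline int16_t bilin_hv_prep(int a, int b, int my)
{
    return (int16_t)(a + (((b - a) * my + 8) >> 4));
}
```
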
WIN64_SPILL_XMM 9 %if ARCH_X86_64 - mova m8, [pw_8] + mova m8, [base+pw_8] %else - %define m8 [pw_8] + %define m8 [base+pw_8] %endif pxor m7, m7 %endif movd m6, mxyd - add wq, t2 - pshuflw m6, m6, q0000 -%if cpuflag(ssse3) - punpcklqdq m6, m6 -%else - %if ARCH_X86_64 - psrlw m0, m8, 3 - punpcklwd m6, m0 - %else - punpcklwd m6, [base+pw_1] - %endif -%endif -%if ARCH_X86_32 - mov t1, t2 ; save base reg for w4 -%endif - lea stride3q, [strideq*3] + add wq, r6 + pshufd m6, m6, q0000 jmp wq .hv_w4: %if cpuflag(ssse3) - %if ARCH_X86_32 - mova m4, [t1-prep_ssse3+bilin_h_shuf4] - %else - mova m4, [bilin_h_shuf4] - %endif -%endif + mova m4, [base+bilin_h_shuf4] + movddup m0, [srcq+strideq*0] +%else movhps m0, [srcq+strideq*0] +%endif + lea r3, [strideq*3] PSHUFB_BILIN_H4 m0, m4, m3 PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] movhps m1, [srcq+strideq*2] - movq m2, [srcq+stride3q ] + movq m2, [srcq+r3 ] lea srcq, [srcq+strideq*4] movhps m2, [srcq+strideq*0] PSHUFB_BILIN_H4 m1, m4, m3 PSHUFB_BILIN_H4 m2, m4, m3 PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 - shufpd m3, m0, m1, 0x01 ; 0 1 - mova m0, m2 - PMADDUBSW m0, m5, m7, m4, 0 ; 3 4 - shufpd m2, m1, m0, 0x01 ; 2 3 - psubw m1, m3 + PMADDUBSW m2, m5, m7, m4, 0 ; 3 4 + shufpd m0, m1, 0x01 ; 0 1 + shufpd m3, m1, m2, 0x01 ; 2 3 + psubw m1, m0 PMULHRSW m1, m6, m4, m8, 4 - paddw m1, m3 - psubw m3, m0, m2 - PMULHRSW m3, m6, m4, m8, 4 - paddw m3, m2 + paddw m1, m0 + mova m0, m2 + psubw m2, m3 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m3 mova [tmpq+16*0], m1 - mova [tmpq+16*1], m3 + mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .hv_w4_loop @@ -1327,7 +1431,8 @@ PMADDUBSW m0, m5, m7, m4, 0 ; 0 .hv_w8_loop: movu m1, [srcq+strideq*1] - movu m2, [srcq+strideq*2] + lea srcq, [srcq+strideq*2] + movu m2, [srcq+strideq*0] PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PMADDUBSW m1, m5, m7, m4, 0 ; 1 @@ -1335,69 +1440,41 @@ psubw m3, m1, m0 PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 -%if notcpuflag(ssse3) && ARCH_X86_64 - SWAP m9, m7 -%endif - psubw m7, m2, m1 - PMULHRSW m7, m6, m4, m8, 4 - paddw m7, m1 + mova m0, m2 + psubw m2, m1 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m1 mova [tmpq+16*0], m3 - mova [tmpq+16*1], m7 -%if notcpuflag(ssse3) && ARCH_X86_64 - SWAP m7, m9 -%endif - movu m1, [srcq+stride3q ] - lea srcq, [srcq+strideq*4] - movu m0, [srcq+strideq*0] - PSHUFB_BILIN_H8 m1, m4 - PSHUFB_BILIN_H8 m0, m4 - PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3 - PMADDUBSW m0, m5, m7, m4, 0 ; 4 - psubw m3, m1, m2 - PMULHRSW m3, m6, m4, m8, 4 - paddw m3, m2 -%if notcpuflag(ssse3) && ARCH_X86_64 - SWAP m9, m7 -%endif - psubw m7, m0, m1 - PMULHRSW m7, m6, m4, m8, 4 - paddw m7, m1 - mova [tmpq+16*2], m3 - mova [tmpq+16*3], m7 -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m7, m9 - %else - pxor m7, m7 - %endif -%endif - add tmpq, 16*4 - sub hd, 4 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 jg .hv_w8_loop RET -.hv_w16: - mov t2d, hd - mov t0d, 32 - jmp .hv_w16_start -.hv_w32: - lea t2d, [hq+(1<<16)] - mov t0d, 64 +.hv_w128: + lea r3d, [hq+(7<<8)] + mov r5d, 256 jmp .hv_w16_start .hv_w64: - lea t2d, [hq+(3<<16)] - mov t0d, 128 + lea r3d, [hq+(3<<8)] + mov r5d, 128 jmp .hv_w16_start -.hv_w128: - lea t2d, [hq+(7<<16)] - mov t0d, 256 +.hv_w32: + lea r3d, [hq+(1<<8)] + mov r5d, 64 + jmp .hv_w16_start +.hv_w16: + xor r3d, r3d + mov r5d, 32 .hv_w16_start: +%if ARCH_X86_64 || cpuflag(ssse3) + mov r6, srcq +%endif %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq %endif - mov t1, srcq .hv_w16_hloop: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] @@ 
-1421,7 +1498,7 @@ PMULHRSW m0, m6, m4, m8, 4 paddw m0, m1 mova [tmpq+16*1], m0 - add tmpq, t0q + add tmpq, r5 movu m0, [srcq+strideq*0+8*0] PSHUFB_BILIN_H8 m0, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 2a @@ -1436,21 +1513,30 @@ PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*1], m2 - add tmpq, t0q + add tmpq, r5 sub hd, 2 jg .hv_w16_vloop - movzx hd, t2w - add t1, 16 - mov srcq, t1 + movzx hd, r3b %if ARCH_X86_64 + add r6, 16 add r7, 2*16 + mov srcq, r6 mov tmpq, r7 +%elif cpuflag(ssse3) + mov tmpq, tmpm + add r6, 16 + add tmpq, 2*16 + mov srcq, r6 + mov tmpm, tmpq %else - mov tmpq, tmpmp + mov srcq, srcm + mov tmpq, tmpm + add srcq, 16 add tmpq, 2*16 - mov tmpmp, tmpq + mov srcm, srcq + mov tmpm, tmpq %endif - sub t2d, 1<<16 + sub r3d, 1<<8 jg .hv_w16_hloop %if WIN64 POP r7 @@ -1463,6 +1549,19 @@ %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 %elif WIN64 @@ -1471,35 +1570,22 @@ DECLARE_REG_TMP 7, 8 %endif -%macro PUT_8TAP_FN 3 ; type, type_h, type_v -cglobal put_8tap_%1 - mov t0d, FILTER_%2 - mov t1d, FILTER_%3 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX) -%endif -%endmacro - -PUT_8TAP_FN regular, REGULAR, REGULAR -PUT_8TAP_FN regular_sharp, REGULAR, SHARP -PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH -PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR -PUT_8TAP_FN smooth, SMOOTH, SMOOTH -PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP -PUT_8TAP_FN sharp_regular, SHARP, REGULAR -PUT_8TAP_FN sharp, SHARP, SHARP -PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +FN put_8tap, sharp, SHARP, SHARP +FN put_8tap, sharp_smooth, SHARP, SMOOTH +FN put_8tap, smooth_sharp, SMOOTH, SHARP +FN put_8tap, smooth, SMOOTH, SMOOTH +FN put_8tap, sharp_regular, SHARP, REGULAR +FN put_8tap, regular_sharp, REGULAR, SHARP +FN put_8tap, smooth_regular, SMOOTH, REGULAR +FN put_8tap, regular_smooth, REGULAR, SMOOTH +FN put_8tap, regular, REGULAR, REGULAR %if ARCH_X86_32 %define base_reg r1 %define base base_reg-put_ssse3 - %define W32_RESTORE_DSQ mov dsq, dsm - %define W32_RESTORE_SSQ mov ssq, ssm %else %define base_reg r8 %define base 0 - %define W32_RESTORE_DSQ - %define W32_RESTORE_SSQ %endif cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 @@ -1530,10 +1616,9 @@ add wq, base_reg ; put_bilin mangling jump %assign stack_offset org_stack_offset -%if ARCH_X86_32 - mov dsq, dsm - mov ssq, ssm -%elif WIN64 + movifnidn dsq, dsmp + movifnidn ssq, ssmp +%if WIN64 pop r8 %endif lea r6, [ssq*3] @@ -1545,7 +1630,7 @@ test myd, 0xf00 %endif jnz .hv - W32_RESTORE_SSQ + movifnidn ssq, ssmp WIN64_SPILL_XMM 12 cmp wd, 4 jl .h_w2 @@ -1559,11 +1644,10 @@ shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] - movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0] - pshufd m5, m5, q0000 - movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4] - pshufd m6, m6, q0000 + movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3] mova m7, [base+pw_34] ; 2 + (8 << 2) + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 add wq, base_reg jmp wq .h_w2: @@ -1575,9 +1659,9 @@ dec srcq mova m4, [base+subpel_h_shuf4] movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] - pshufd m3, m3, q0000 mova m5, [base+pw_34] ; 2 + (8 << 2) - W32_RESTORE_DSQ 
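
The new `FN` macro above replaces the per-function `PUT_8TAP_FN`/`PREP_8TAP_FN` helpers with one generic dispatcher: each typed entry point only loads two packed filter selectors into `t0`/`t1` and jumps into the shared `put_8tap`/`prep_8tap` body (the final `regular` variant omits the jump because the body follows it directly). Below is a rough C analogue of that pattern; the names and the stub body are illustrative, not dav1d's API, and only the two `FILTER_*` values visible in the hunk are reproduced.

```c
#include <stdio.h>

/* The two packed selectors visible in the hunk; per the body's
 * "; 8tap_h, mx, 4tap_h" comments, each packs an 8-tap table offset in
 * the high half and a 4-tap fallback offset in the low half. */
enum {
    FILTER_SMOOTH = (1 * 15 << 16) | 4 * 15,
    FILTER_SHARP  = (2 * 15 << 16) | 3 * 15,
};

/* Stand-in for the single shared put_8tap/prep_8tap body that every
 * typed entry point jumps into (t0 = horizontal, t1 = vertical). */
static void put_8tap_shared(int t0_h, int t1_v)
{
    printf("h selector 0x%x, v selector 0x%x\n", t0_h, t1_v);
}

/* What "FN put_8tap, sharp_smooth, SHARP, SMOOTH" expands to, in spirit:
 * a thin wrapper that just fixes the two selectors. */
static void put_8tap_sharp_smooth(void)
{
    put_8tap_shared(FILTER_SHARP, FILTER_SMOOTH);
}

int main(void)
{
    put_8tap_sharp_smooth();
    return 0;
}
```
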
+ pshufd m3, m3, q0000 + movifnidn dsq, dsmp .h_w2_loop: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] @@ -1588,10 +1672,10 @@ paddw m0, m5 ; pw34 psraw m0, 6 packuswb m0, m0 - movd r4d, m0 - mov [dstq+dsq*0], r4w - shr r4d, 16 - mov [dstq+dsq*1], r4w + movd r6d, m0 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop @@ -1604,10 +1688,10 @@ %endif dec srcq movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] - pshufd m3, m3, q0000 - mova m5, [base+pw_34] ; 2 + (8 << 2) mova m6, [base+subpel_h_shufA] - W32_RESTORE_DSQ + mova m5, [base+pw_34] ; 2 + (8 << 2) + pshufd m3, m3, q0000 + movifnidn dsq, dsmp .h_w4_loop: movq m0, [srcq+ssq*0] ; 1 movq m1, [srcq+ssq*1] ; 2 @@ -1627,7 +1711,6 @@ sub hd, 2 jg .h_w4_loop RET - ; %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] %if ARCH_X86_32 pshufb %2, %1, [base+subpel_h_shufB] @@ -1648,18 +1731,17 @@ paddw %1, m7 ; pw34 psraw %1, 6 %endmacro - ; .h_w8: - movu m0, [srcq+ssq*0] - movu m1, [srcq+ssq*1] - PUT_8TAP_H m0, m2, m3, m4 + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] + PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 %if ARCH_X86_32 - movq [dstq ], m0 + movq [dstq], m0 add dstq, dsm - movhps [dstq ], m0 + movhps [dstq], m0 add dstq, dsm %else movq [dstq+dsq*0], m0 @@ -1669,39 +1751,35 @@ sub hd, 2 jg .h_w8 RET -.h_w16: - xor r6d, r6d - jmp .h_start -.h_w32: - mov r6, -16*1 - jmp .h_start -.h_w64: - mov r6, -16*3 - jmp .h_start .h_w128: - mov r6, -16*7 -.h_start: - sub srcq, r6 - sub dstq, r6 - mov r4, r6 -.h_loop: + mov r4, -16*7 + jmp .h_w16_start +.h_w64: + mov r4, -16*3 + jmp .h_w16_start +.h_w32: + mov r4, -16*1 + jmp .h_w16_start +.h_w16: + xor r4d, r4d +.h_w16_start: + sub srcq, r4 + sub dstq, r4 +.h_w16_loop_v: + mov r6, r4 +.h_w16_loop_h: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 mova [dstq+r6], m0 - add r6, mmsize - jle .h_loop + add r6, 16 + jle .h_w16_loop_h add srcq, ssq -%if ARCH_X86_32 - add dstq, dsm -%else - add dstq, dsq -%endif - mov r6, r4 + add dstq, dsmp dec hd - jg .h_loop + jg .h_w16_loop_v RET .v: %if ARCH_X86_32 @@ -1709,7 +1787,7 @@ shr ssd, 16 cmp hd, 6 cmovs ssd, mxd - lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3] + movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] %else %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 @@ -1717,12 +1795,12 @@ shr myd, 16 cmp hd, 6 cmovs myd, mxd - lea myq, [base_reg+myq*8+subpel_filters-put_ssse3] + movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] %endif tzcnt r6d, wd movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] + punpcklwd m0, m0 mova m7, [base+pw_512] - psrlw m2, m7, 1 ; 0x0100 add r6, base_reg %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] @@ -1730,20 +1808,16 @@ %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed - ALLOC_STACK -mmsize*4 + ALLOC_STACK -16*4 %assign regs_used 7 - movd m0, [ssq+0] - pshufb m0, m2 - mova subpel0, m0 - movd m0, [ssq+2] - pshufb m0, m2 - mova subpel1, m0 - movd m0, [ssq+4] - pshufb m0, m2 - mova subpel2, m0 - movd m0, [ssq+6] - pshufb m0, m2 - mova subpel3, m0 + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 + pshufd m1, m0, q2222 + mova subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 mov ssq, [rstk+stack_offset+gprsize*4] lea ssq, [ssq*3] sub srcq, ssq @@ -1754,47 +1828,46 @@ %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 - movd 
subpel0, [myq+0] - pshufb subpel0, m2 - movd subpel1, [myq+2] - pshufb subpel1, m2 - movd subpel2, [myq+4] - pshufb subpel2, m2 - movd subpel3, [myq+6] - pshufb subpel3, m2 lea ss3q, [ssq*3] + pshufd m8, m0, q0000 sub srcq, ss3q + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 %endif jmp r6 .v_w2: - movd m2, [srcq+ssq*0] ; 0 - pinsrw m2, [srcq+ssq*1], 2 ; 0 1 - pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2 + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq - pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3 - add srcq, ssq -%else - pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3 - lea srcq, [srcq+ssq*4] -%endif - movd m3, [srcq+ssq*0] ; 4 - movd m1, [srcq+ssq*1] ; 5 - movd m0, [srcq+ssq*2] ; 6 -%if ARCH_X86_32 + movd m2, [srcq+ssq*0] + movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, [srcq+ssq*0] + movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - add srcq, ssq %else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] add srcq, ss3q %endif - punpckldq m3, m1 ; 4 5 _ _ - punpckldq m1, m0 ; 5 6 _ _ - palignr m4, m3, m2, 4 ; 1 2 3 4 - punpcklbw m3, m1 ; 45 56 - punpcklbw m1, m2, m4 ; 01 12 - punpckhbw m2, m4 ; 23 34 + punpcklwd m1, m0 ; 0 1 + punpcklwd m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpcklwd m2, m5 ; 2 3 + punpcklwd m5, m3 ; 3 4 + punpcklwd m3, m4 ; 4 5 + punpcklwd m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 .v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 @@ -1802,17 +1875,14 @@ mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 - movd m4, [srcq+ssq*0] ; 7 - punpckldq m3, m0, m4 ; 6 7 _ _ - movd m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] - punpckldq m4, m0 ; 7 8 _ _ + punpcklwd m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpcklwd m4, m0 ; 7 8 punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 packuswb m5, m5 - pshuflw m5, m5, q2020 movd r6d, m5 mov [dstq+dsq*0], r6w shr r6d, 16 @@ -1828,51 +1898,46 @@ .v_w32: .v_w64: .v_w128: -%endif ; ARCH_X86_32 - lea r6d, [wq - 4] ; horizontal loop - mov r4, dstq -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize - %define srcm [rsp+mmsize*4+gprsize] -%endif - mov srcm, srcq -%else - mov r7, srcq + shl wd, 14 +%if STACK_ALIGNMENT < 16 + %define dstm [rsp+mmsize*4+gprsize] + mov dstm, dstq %endif - shl r6d, (16 - 2) ; (wq / 4) << 16 - mov r6w, hw + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq .v_w4_loop0: - movd m2, [srcq+ssq*0] ; 0 - movhps m2, [srcq+ssq*2] ; 0 _ 2 - movd m3, [srcq+ssq*1] ; 1 -%if ARCH_X86_32 - lea srcq, [srcq+ssq*2] - add srcq, ssq - movhps m3, [srcq+ssq*0] ; 1 _ 3 - lea srcq, [srcq+ssq*1] -%else - movhps m3, [srcq+ss3q ] ; 1 _ 3 - lea srcq, [srcq+ssq*4] %endif - pshufd m2, m2, q2020 ; 0 2 0 2 - pshufd m3, m3, q2020 ; 1 3 1 3 - punpckldq m2, m3 ; 0 1 2 3 - movd m3, [srcq+ssq*0] ; 4 - movd m1, [srcq+ssq*1] ; 5 - movd m0, [srcq+ssq*2] ; 6 + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq + movd m2, [srcq+ssq*0] + movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] %else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] add srcq, ss3q %endif - punpckldq m3, m1 ; 4 5 _ _ - punpckldq m1, m0 ; 5 6 _ _ - palignr m4, m3, m2, 4 ; 1 2 3 4 - punpcklbw m3, m1 ; 45 56 - punpcklbw m1, m2, m4 
; 01 12 - punpckhbw m2, m4 ; 23 34 + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m4 ; 4 5 + punpckldq m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 .v_w4_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 @@ -1880,10 +1945,8 @@ mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 - movd m4, [srcq+ssq*0] punpckldq m3, m0, m4 ; 6 7 _ _ - movd m0, [srcq+ssq*1] - lea srcq, [srcq+ssq*2] + movd m0, [srcq+ssq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 @@ -1891,24 +1954,21 @@ pmulhrsw m5, m7 packuswb m5, m5 movd [dstq+dsq*0], m5 - pshufd m5, m5, q0101 + psrlq m5, 32 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop - mov hw, r6w ; reset vertical loop - add r4, 4 - mov dstq, r4 %if ARCH_X86_32 - mov srcq, srcm - add srcq, 4 - mov srcm, srcq -%else - add r7, 4 - mov srcq, r7 -%endif - sub r6d, 1<<16 ; horizontal-- + mov dstq, dstm + add r4, 4 + movzx hd, r6w + add dstq, 4 + mov srcq, r4 + mov dstm, dstq + sub r6d, 1<<16 jg .v_w4_loop0 +%endif RET %if ARCH_X86_64 .v_w8: @@ -1916,56 +1976,51 @@ .v_w32: .v_w64: .v_w128: - lea r6d, [wq - 8] ; horizontal loop - mov r4, dstq - mov r7, srcq - shl r6d, 8 - 3; (wq / 8) << 8 - mov r6b, hb + lea r6d, [wq*8-64] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*4] .v_w8_loop0: - movq m4, [srcq+ssq*0] ; 0 - movq m5, [srcq+ssq*1] ; 1 - lea srcq, [srcq+ssq*2] - movq m6, [srcq+ssq*0] ; 2 - movq m0, [srcq+ssq*1] ; 3 - lea srcq, [srcq+ssq*2] - movq m1, [srcq+ssq*0] ; 4 - movq m2, [srcq+ssq*1] ; 5 - lea srcq, [srcq+ssq*2] ; - movq m3, [srcq+ssq*0] ; 6 - shufpd m4, m0, 0x0c - shufpd m5, m1, 0x0c - punpcklbw m1, m4, m5 ; 01 - punpckhbw m4, m5 ; 34 - shufpd m6, m2, 0x0c - punpcklbw m2, m5, m6 ; 12 - punpckhbw m5, m6 ; 45 - shufpd m0, m3, 0x0c - punpcklbw m3, m6, m0 ; 23 - punpckhbw m6, m0 ; 56 + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, ss3q + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, ss3q + movq m0, [srcq+ssq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw m6, m0 ; 56 .v_w8_loop: - movq m12, [srcq+ssq*1] ; 8 + movq m13, [srcq+ssq*1] lea srcq, [srcq+ssq*2] - movq m13, [srcq+ssq*0] ; 9 pmaddubsw m14, m1, subpel0 ; a0 - pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 + pmaddubsw m15, m2, subpel0 ; b0 mova m2, m4 pmaddubsw m3, subpel1 ; a1 + mova m12, m0 pmaddubsw m4, subpel1 ; b1 + movq m0, [srcq+ssq*0] paddw m14, m3 paddw m15, m4 mova m3, m5 - mova m4, m6 pmaddubsw m5, subpel2 ; a2 + mova m4, m6 pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m13 ; 67 + punpcklbw m13, m0 ; 78 paddw m14, m5 + mova m5, m12 + pmaddubsw m12, subpel3 ; a3 paddw m15, m6 - shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m6, m0 ; 67 - punpckhbw m6, m0 ; 78 - pmaddubsw m12, m5, subpel3 ; a3 - pmaddubsw m13, m6, subpel3 ; b3 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 @@ -1976,12 +2031,12 @@ lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop - movzx hd, r6b ; reset vertical loop add r4, 8 add r7, 8 - mov dstq, r4 - mov srcq, r7 - sub r6d, 1<<8 ; horizontal-- + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 @@ -2006,7 +2061,7 @@ cmp hd, 6 cmovs 
ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] - W32_RESTORE_SSQ + mov ssq, ssmp lea r6, [ssq*3] sub srcq, r6 %define base_reg r6 @@ -2019,7 +2074,6 @@ %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] - punpcklqdq m0, m0 punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m6, m0, q0000 @@ -2043,7 +2097,6 @@ %define subpelv1 m11 %define subpelv2 m12 %define subpelv3 m13 - punpcklqdq m0, m0 punpcklbw m0, m0 psraw m0, 8 ; sign-extend mova m8, [base+pw_8192] @@ -2058,22 +2111,21 @@ je .hv_w4 .hv_w2: mova m6, [base+subpel_h_shuf4] - ; movq m2, [srcq+ssq*0] ; 0 movhps m2, [srcq+ssq*1] ; 0 _ 1 - movq m0, [srcq+ssq*2] ; 2 %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] lea srcq, [srcq+ssq*2] - add srcq, ssq - movhps m0, [srcq+ssq*0] ; 2 _ 3 - lea srcq, [srcq+ssq*1] + movq m0, [srcq+ssq*0] ; 2 + movhps m0, [srcq+ssq*1] ; 2 _ 3 + lea srcq, [srcq+ssq*2] %else %define w8192reg m8 %define d512reg m9 - movhps m0, [srcq+ss3q ] ; 2 _ 3 - lea srcq, [srcq+ssq*4] + movq m0, [srcq+ssq*2] ; 2 + add srcq, ss3q + movhps m0, [srcq+ssq*0] ; 2 _ 3 %endif pshufb m2, m6 ; 0 ~ 1 ~ pshufb m0, m6 ; 2 ~ 3 ~ @@ -2081,43 +2133,42 @@ pmaddubsw m0, m7 ; subpel_filters phaddw m2, m0 ; 0 1 2 3 pmulhrsw m2, w8192reg - ; +%if ARCH_X86_32 movq m3, [srcq+ssq*0] ; 4 movhps m3, [srcq+ssq*1] ; 4 _ 5 - movq m0, [srcq+ssq*2] ; 6 -%if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq %else + movq m3, [srcq+ssq*1] ; 4 + movhps m3, [srcq+ssq*2] ; 4 _ 5 add srcq, ss3q %endif + movq m0, [srcq+ssq*0] ; 6 pshufb m3, m6 ; 4 ~ 5 ~ pshufb m0, m6 ; 6 ~ pmaddubsw m3, m7 ; subpel_filters pmaddubsw m0, m7 ; subpel_filters phaddw m3, m0 ; 4 5 6 _ pmulhrsw m3, w8192reg - ; palignr m4, m3, m2, 4; V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 punpckhwd m2, m4 ; V 23 34 2 3 3 4 pshufd m0, m3, q2121; V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 4 5 5 6 .hv_w2_loop: + movq m4, [srcq+ssq*1] ; V 7 + lea srcq, [srcq+ssq*2] ; V + movhps m4, [srcq+ssq*0] ; V 7 8 + pshufb m4, m6 + pmaddubsw m4, m7 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 ; V pmaddwd m2, subpelv1 ; V a1 b1 paddd m5, m2 ; V mova m2, m3 ; V pmaddwd m3, subpelv2 ; a2 b2 - paddd m5, m3 ; V - movq m4, [srcq+ssq*0] ; V 7 - movhps m4, [srcq+ssq*1] ; V 7 8 - lea srcq, [srcq+ssq*2] ; V - pshufb m4, m6 - pmaddubsw m4, m7 phaddw m4, m4 pmulhrsw m4, w8192reg + paddd m5, m3 ; V palignr m3, m4, m0, 12 mova m0, m4 punpcklwd m3, m0 ; V 67 78 @@ -2137,7 +2188,6 @@ RET %undef w8192reg %undef d512reg - ; .hv_w4: %define hv4_line_0_0 4 %define hv4_line_0_1 5 @@ -2149,14 +2199,12 @@ %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 - ; %macro SAVELINE_W4 3 mova [rsp+mmsize*hv4_line_%3_%2], %1 %endmacro %macro RESTORELINE_W4 3 mova %1, [rsp+mmsize*hv4_line_%3_%2] %endmacro - ; %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] @@ -2168,13 +2216,13 @@ mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 0 _ _ _ movhps m5, [srcq+ssq*1] ; 0 _ 1 _ - movq m4, [srcq+ssq*2] ; 2 _ _ _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] - add srcq, ssq - movhps m4, [srcq+ssq*0] ; 2 _ 3 _ - add srcq, ssq + movq m4, [srcq+ssq*0] ; 2 _ _ _ + movhps m4, [srcq+ssq*1] ; 2 _ 3 _ + lea srcq, [srcq+ssq*2] %else + movq m4, [srcq+ssq*2] ; 2 _ _ _ movhps m4, [srcq+ss3q ] ; 2 _ 3 _ lea srcq, [srcq+ssq*4] %endif @@ -2198,7 +2246,14 @@ mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 4 _ _ _ movhps m5, [srcq+ssq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movq m4, [srcq+ssq*0] ; 6 _ _ _ + add srcq, 
ssq +%else movq m4, [srcq+ssq*2] ; 6 _ _ _ + add srcq, ss3q +%endif pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ pmaddubsw m3, m7 ;H subpel_filters @@ -2214,13 +2269,6 @@ pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg ;H pw_8192 - ; -%if ARCH_X86_32 - lea srcq, [srcq+ssq*2] - add srcq, ssq -%else - add srcq, ss3q -%endif ;process high palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 @@ -2248,7 +2296,6 @@ mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 - ; mova m6, [base+subpel_h_shuf4] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ @@ -2280,10 +2327,10 @@ mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 - ; mova m6, [base+subpel_h_shuf4+16] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ + lea srcq, [srcq+ssq*2] pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ pmaddubsw m4, m7 ;H subpel_filters phaddw m4, m4 ;H 7 8 7 8 @@ -2295,12 +2342,10 @@ paddd m5, d512reg ; pd_512 paddd m5, m4 psrad m4, m5, 10 - ; RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 ; d -> w packuswb m5, m5 ; w -> b pshuflw m5, m5, q3120 - lea srcq, [srcq+ssq*2] movd [dstq+dsq*0], m5 psrlq m5, 32 movd [dstq+dsq*1], m5 @@ -2320,7 +2365,6 @@ %undef subpelv1 %undef subpelv2 %undef subpelv3 - ; .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 @@ -2355,7 +2399,7 @@ mov ssq, ssmp ALLOC_STACK -mmsize*13 %if STACK_ALIGNMENT < 16 - %define srcm [rsp+mmsize*13+gprsize*1] + %define dstm [rsp+mmsize*13+gprsize*1] %define dsm [rsp+mmsize*13+gprsize*2] mov r6, [rstk+stack_offset+gprsize*2] mov dsm, r6 @@ -2375,10 +2419,10 @@ mova subpelv2, m4 mova subpelv3, m5 lea r6, [ssq*3] + mov dstm, dstq sub srcq, r6 - mov srcm, srcq %else - ALLOC_STACK mmsize*5, 16 + ALLOC_STACK 16*5, 16 %define subpelh0 m10 %define subpelh1 m11 %define subpelv0 m12 @@ -2395,7 +2439,6 @@ movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 - punpcklqdq m1, m1 punpcklbw m1, m1 psraw m1, 8 ; sign-extend pshufd subpelv0, m1, q0000 @@ -2403,18 +2446,18 @@ pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 lea ss3q, [ssq*3] + mov r7, dstq sub srcq, ss3q - mov r7, srcq %endif - lea r6d, [wq-4] - mov r4, dstq - shl r6d, (16 - 2) - mov r6w, hw + shl wd, 14 + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq .hv_w8_loop0: movu m4, [srcq+ssq*0] ; 0 = _ _ movu m5, [srcq+ssq*1] ; 1 = _ _ +%if ARCH_X86_32 lea srcq, [srcq+ssq*2] - ; +%endif %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] %if ARCH_X86_32 pshufb %3, %1, [base+subpel_h_shufB] @@ -2433,7 +2476,6 @@ paddw %1, %3 ; A0+C4 phaddw %1, %2 %endmacro - ; %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] @@ -2441,12 +2483,17 @@ %endif HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ +%if ARCH_X86_32 movu m6, [srcq+ssq*0] ; 2 = _ _ movu m0, [srcq+ssq*1] ; 3 = _ _ lea srcq, [srcq+ssq*2] +%else + movu m6, [srcq+ssq*2] ; 2 = _ _ + add srcq, ss3q + movu m0, [srcq+ssq*0] ; 3 = _ _ +%endif HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ - ; mova m7, [base+pw_8192] pmulhrsw m4, m7 ; H pw_8192 pmulhrsw m5, m7 ; H pw_8192 @@ -2458,11 +2505,16 @@ SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 - ; mova m7, [base+subpel_h_shufA] +%if ARCH_X86_32 movu m4, [srcq+ssq*0] ; 4 = _ _ movu m5, [srcq+ssq*1] ; 5 = _ _ lea srcq, [srcq+ssq*2] +%else + movu m4, [srcq+ssq*1] ; 4 = _ _ + movu m5, [srcq+ssq*2] ; 5 = _ _ + add srcq, ss3q +%endif movu m6, 
[srcq+ssq*0] ; 6 = _ _ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ @@ -2474,7 +2526,6 @@ punpcklwd m4, m0, m1 ; 3 4 ~ punpcklwd m5, m1, m2 ; 4 5 ~ punpcklwd m6, m2, m3 ; 5 6 ~ - ; SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 @@ -2558,16 +2609,19 @@ RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: - movzx hd, r6w - add r4, 4 - mov dstq, r4 %if ARCH_X86_32 - mov srcq, srcm - add srcq, 4 - mov srcm, srcq + mov dstq, dstm + add r4, 4 + movzx hd, r6w + add dstq, 4 + mov srcq, r4 + mov dstm, dstq %else + add r4, 4 add r7, 4 - mov srcq, r7 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 %endif sub r6d, 1<<16 jg .hv_w8_loop0 @@ -2624,22 +2678,20 @@ %macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 %if cpuflag(ssse3) phaddw %1, %2 - %else - %ifnidn %1, %2 + %elifnidn %1, %2 %if %4 == 1 - mova %3, [pw_1] + mova %3, [base+pw_1] %endif pmaddwd %1, %3 pmaddwd %2, %3 packssdw %1, %2 - %else + %else %if %4 == 1 - pmaddwd %1, [pw_1] + pmaddwd %1, [base+pw_1] %else pmaddwd %1, %3 %endif packssdw %1, %1 - %endif %endif %endmacro @@ -2740,7 +2792,7 @@ %endif %endmacro -%macro PREP_8TAP_HV_LOAD 4 ; dst0, src_memloc, tmp[1-2] +%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] %if cpuflag(ssse3) movu %1, [%2] pshufb m2, %1, shufB @@ -2751,10 +2803,6 @@ PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4 PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4 %endif -%endmacro - -%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] - PREP_8TAP_HV_LOAD %{1:4} mova m1, m2 PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0 PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4 @@ -2765,15 +2813,6 @@ PHADDW %1, m1, %3, 1 %endmacro -%macro PREP_8TAP_FN 3 ; type, type_h, type_v -cglobal prep_8tap_%1 - mov t0d, FILTER_%2 - mov t1d, FILTER_%3 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX) -%endif -%endmacro - %macro PREP_8TAP 0 %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 @@ -2782,24 +2821,23 @@ %else DECLARE_REG_TMP 6, 7 %endif -PREP_8TAP_FN regular, REGULAR, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH + +FN prep_8tap, sharp, SHARP, SHARP +FN prep_8tap, sharp_smooth, SHARP, SMOOTH +FN prep_8tap, smooth_sharp, SMOOTH, SHARP +FN prep_8tap, smooth, SMOOTH, SMOOTH +FN prep_8tap, sharp_regular, SHARP, REGULAR +FN prep_8tap, regular_sharp, REGULAR, SHARP +FN prep_8tap, smooth_regular, SMOOTH, REGULAR +FN prep_8tap, regular_smooth, REGULAR, SMOOTH +FN prep_8tap, regular, REGULAR, REGULAR %if ARCH_X86_32 %define base_reg r2 %define base base_reg-prep%+SUFFIX - %define W32_RESTORE_SSQ mov strideq, stridem %else %define base_reg r7 %define base 0 - %define W32_RESTORE_SSQ %endif cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %assign org_stack_offset stack_offset @@ -2807,7 +2845,7 @@ add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v - movsxd wq, wm + mov wd, wm movifnidn srcd, srcm movifnidn hd, hm test mxd, 0xf00 @@ -2817,6 +2855,7 @@ LEA base_reg, prep_ssse3 tzcnt wd, wd movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] + pxor m4, m4 add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] @@ -2835,6 +2874,10 @@ %else WIN64_SPILL_XMM 16 %endif +%if ARCH_X86_32 + %define strideq r6 
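
The `PHADDW` macro in the hunk above is the SSE2 fallback for `phaddw`: when SSSE3 is unavailable it builds the pairwise horizontal add from `pmaddwd` against a vector of ones followed by `packssdw`. The intrinsics sketch below shows the same equivalence; it is a standalone illustration (the SSSE3 variant needs an SSSE3-enabled target such as `-mssse3`), not code taken from the file. The two forms differ only when a pair sum overflows int16, where `packssdw` saturates while `phaddw` wraps.

```c
#include <emmintrin.h>  /* SSE2: _mm_set1_epi16, _mm_madd_epi16, _mm_packs_epi32 */
#include <tmmintrin.h>  /* SSSE3: _mm_hadd_epi16 */

/* Native SSSE3 pairwise add of adjacent 16-bit lanes. */
static inline __m128i phaddw_ssse3(__m128i a, __m128i b)
{
    return _mm_hadd_epi16(a, b);
}

/* SSE2 emulation, mirroring the macro's fallback branch:
 * pmaddwd with ones sums each 16-bit pair into a 32-bit lane,
 * and packssdw narrows the two results back to 16-bit lanes. */
static inline __m128i phaddw_sse2(__m128i a, __m128i b)
{
    const __m128i ones = _mm_set1_epi16(1);
    return _mm_packs_epi32(_mm_madd_epi16(a, ones),
                           _mm_madd_epi16(b, ones));
}
```
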
+ mov strideq, stridem +%endif cmp wd, 4 je .h_w4 tzcnt wd, wd @@ -2852,16 +2895,13 @@ shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] - movd m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0] - pshufd m5, m5, q0000 - movd m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4] - pshufd m6, m6, q0000 + movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m7, [base+pw_8192] + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 %else - punpcklbw m5, m5 punpcklbw m6, m6 - psraw m5, 8 psraw m6, 8 %if ARCH_X86_64 mova m7, [pw_2] @@ -2869,6 +2909,8 @@ %else %define m15 m4 %endif + pshufd m5, m6, q1010 + punpckhqdq m6, m6 %endif add wq, base_reg jmp wq @@ -2880,10 +2922,10 @@ %endif dec srcq movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] - pshufd m4, m4, q0000 %if cpuflag(ssse3) mova m6, [base+pw_8192] mova m5, [base+subpel_h_shufA] + pshufd m4, m4, q0000 %else mova m6, [base+pw_2] %if ARCH_X86_64 @@ -2893,8 +2935,8 @@ %endif punpcklbw m4, m4 psraw m4, 8 + punpcklqdq m4, m4 %endif - W32_RESTORE_SSQ %if ARCH_X86_64 lea stride3q, [strideq*3] %endif @@ -2916,8 +2958,7 @@ pshufb m1, m5 pshufb m2, m5 pshufb m3, m5 -%else - %if ARCH_X86_64 +%elif ARCH_X86_64 movd m0, [srcq+strideq*0+0] movd m12, [srcq+strideq*0+1] movd m1, [srcq+strideq*1+0] @@ -2947,7 +2988,7 @@ punpcklqdq m1, m5 ; 1 punpcklqdq m2, m13 ; 2 punpcklqdq m3, m7 ; 3 - %else +%else movd m0, [srcq+strideq*0+0] movd m1, [srcq+strideq*0+1] movd m2, [srcq+strideq*0+2] @@ -2978,7 +3019,6 @@ lea srcq, [srcq+strideq*2] punpckldq m7, m5 punpcklqdq m3, m7 ; 3 - %endif %endif PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 PMADDUBSW m1, m4, m5, m7, 0 @@ -2994,14 +3034,7 @@ sub hd, 4 jg .h_w4_loop RET - ; .h_w8: -%if ARCH_X86_32 - mov r3, r2 - %define base_reg r3 - W32_RESTORE_SSQ -%endif -.h_w8_loop: %if cpuflag(ssse3) PREP_8TAP_H 0, srcq+strideq*0 PREP_8TAP_H 1, srcq+strideq*1 @@ -3017,51 +3050,42 @@ add tmpq, 16 dec hd %endif - jg .h_w8_loop + jg .h_w8 RET .h_w16: - mov r6, -16*1 + mov r3, -16*1 jmp .h_start .h_w32: - mov r6, -16*2 + mov r3, -16*2 jmp .h_start .h_w64: - mov r6, -16*4 + mov r3, -16*4 jmp .h_start .h_w128: - mov r6, -16*8 + mov r3, -16*8 .h_start: -%if ARCH_X86_32 - mov r3, r2 - %define base_reg r3 -%endif - sub srcq, r6 - mov r5, r6 - W32_RESTORE_SSQ + sub srcq, r3 + mov r5, r3 .h_loop: %if cpuflag(ssse3) - PREP_8TAP_H 0, srcq+r6+8*0 - PREP_8TAP_H 1, srcq+r6+8*1 + PREP_8TAP_H 0, srcq+r3+8*0 + PREP_8TAP_H 1, srcq+r3+8*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 - add r6, 16 + add r3, 16 %else - PREP_8TAP_H 0, srcq+r6 + PREP_8TAP_H 0, srcq+r3 mova [tmpq], m0 add tmpq, 16 - add r6, 8 + add r3, 8 %endif jl .h_loop add srcq, strideq - mov r6, r5 + mov r3, r5 dec hd jg .h_loop RET -%if ARCH_X86_32 - %define base_reg r2 -%endif - ; .v: LEA base_reg, prep%+SUFFIX %if ARCH_X86_32 @@ -3075,59 +3099,48 @@ shr myd, 16 cmp hd, 6 cmovs myd, mxd - lea myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m2, [base+pw_512] - psrlw m2, m2, 1 ; 0x0100 mova m7, [base+pw_8192] + punpcklwd m0, m0 +%else + punpcklbw m0, m0 + psraw m0, 8 %endif %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] -%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed +%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed %if cpuflag(ssse3) ALLOC_STACK -mmsize*4 %else ALLOC_STACK -mmsize*5 %endif %assign 
regs_used 7 - movd m0, [myq+0] - PSHUFB_0X1X m0, m2 - mova subpel0, m0 - movd m0, [myq+2] - PSHUFB_0X1X m0, m2 - mova subpel1, m0 - movd m0, [myq+4] - PSHUFB_0X1X m0, m2 - mova subpel2, m0 - movd m0, [myq+6] - PSHUFB_0X1X m0, m2 - mova subpel3, m0 - %if notcpuflag(ssse3) - mov r6, base_reg - %define base_reg r6 - %endif mov strideq, [rstk+stack_offset+gprsize*3] - lea strideq, [strideq*3] - sub [rstk+stack_offset+gprsize*2], strideq - mov strideq, [rstk+stack_offset+gprsize*3] - mov srcq, [rstk+stack_offset+gprsize*2] + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 + lea r5, [strideq*3] + pshufd m1, m0, q2222 + mova subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 + sub srcq, r5 %else %define subpel0 m8 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 - movd subpel0, [myq+0] - PSHUFB_0X1X subpel0, m2 - movd subpel1, [myq+2] - PSHUFB_0X1X subpel1, m2 - movd subpel2, [myq+4] - PSHUFB_0X1X subpel2, m2 - movd subpel3, [myq+6] - PSHUFB_0X1X subpel3, m2 + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 lea stride3q, [strideq*3] + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 sub srcq, stride3q cmp wd, 8 jns .v_w8 @@ -3151,35 +3164,34 @@ mov r5w, hw .v_w4_loop0: %endif - movd m2, [srcq+strideq*0] ; 0 - movhps m2, [srcq+strideq*2] ; 0 _ 2 - movd m3, [srcq+strideq*1] ; 1 + movd m1, [srcq+strideq*0] + movd m0, [srcq+strideq*1] %if ARCH_X86_32 lea srcq, [srcq+strideq*2] - movhps m3, [srcq+strideq*1] ; 1 _ 3 + movd m2, [srcq+strideq*0] + movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] -%else - movhps m3, [srcq+stride3q ] ; 1 _ 3 - lea srcq, [srcq+strideq*4] -%endif - pshufd m2, m2, q2020 ; 0 2 0 2 - pshufd m3, m3, q2020 ; 1 3 1 3 - punpckldq m2, m3 ; 0 1 2 3 - movd m3, [srcq+strideq*0] ; 4 - movd m1, [srcq+strideq*1] ; 5 - movd m0, [srcq+strideq*2] ; 6 -%if ARCH_X86_32 + movd m3, [srcq+strideq*0] + movd m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - add srcq, strideq %else + movd m2, [srcq+strideq*2] + add srcq, stride3q + movd m4, [srcq+strideq*0] + movd m3, [srcq+strideq*1] + movd m5, [srcq+strideq*2] add srcq, stride3q %endif - punpckldq m3, m1 ; 4 5 _ _ - punpckldq m1, m0 ; 5 6 _ _ - PALIGNR m4, m3, m2, 4 ; 1 2 3 4 - punpcklbw m3, m1 ; 45 56 - punpcklbw m1, m2, m4 ; 01 12 - punpckhbw m2, m4 ; 23 34 + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+strideq*0] + punpckldq m2, m4 ; 2 3 + punpckldq m4, m3 ; 3 4 + punpckldq m3, m5 ; 4 5 + punpckldq m5, m0 ; 5 6 + punpcklbw m2, m4 ; 23 34 + punpcklbw m3, m5 ; 45 56 .v_w4_loop: %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel0 @@ -3200,11 +3212,11 @@ %endif mova m2, m3 PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 + movd m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] paddw m5, m3 - movd m4, [srcq+strideq*0] punpckldq m3, m0, m4 ; 6 7 _ _ - movd m0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] + movd m0, [srcq+strideq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 %if notcpuflag(ssse3) @@ -3234,54 +3246,43 @@ sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 - mov hw, r5w ; reset vertical loop - mov tmpq, tmpm mov srcq, srcm - add tmpq, 8 + mov tmpq, tmpm + movzx hd, r5w add srcq, 4 - mov tmpm, tmpq + add tmpq, 8 mov srcm, srcq + mov tmpm, tmpq sub r5d, 1<<16 ; horizontal-- jg .v_w4_loop0 %endif RET -%if ARCH_X86_32 && notcpuflag(ssse3) - %define base_reg r2 -%endif - ; %if ARCH_X86_64 .v_w8: - lea r5d, [wq - 8] ; horizontal loop + lea r6d, [wq*8-64] + mov r5, srcq mov r8, tmpq - mov r6, srcq - shl r5d, 8 - 3; (wq / 8) << 8 - mov r5b, hb + lea r6d, 
[hq+r6*4] .v_w8_loop0: - movq m4, [srcq+strideq*0] - movq m5, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movq m6, [srcq+strideq*0] - movq m0, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] movq m1, [srcq+strideq*0] movq m2, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movq m3, [srcq+strideq*0] - shufpd m4, m0, 0x0c - shufpd m5, m1, 0x0c - punpcklbw m1, m4, m5 ; 01 - punpckhbw m4, m5 ; 34 - shufpd m6, m2, 0x0c - punpcklbw m2, m5, m6 ; 12 - punpckhbw m5, m6 ; 45 - shufpd m0, m3, 0x0c - punpcklbw m3, m6, m0 ; 23 - punpckhbw m6, m0 ; 56 + movq m3, [srcq+strideq*2] + add srcq, stride3q + movq m4, [srcq+strideq*0] + movq m5, [srcq+strideq*1] + movq m6, [srcq+strideq*2] + add srcq, stride3q + movq m0, [srcq+strideq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw m6, m0 ; 56 .v_w8_loop: -%if cpuflag(ssse3) - movq m12, [srcq+strideq*1] + movq m13, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m13, [srcq+strideq*0] +%if cpuflag(ssse3) pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 @@ -3294,64 +3295,59 @@ mova m4, m6 pmaddubsw m5, subpel2 ; a2 pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m0, m13 ; 67 + movq m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 paddw m14, m5 + mova m5, m12 + pmaddubsw m12, subpel3 ; a3 paddw m15, m6 - shufpd m6, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m6, m0 ; 67 - punpckhbw m6, m0 ; 78 - pmaddubsw m12, m5, subpel3 ; a3 - pmaddubsw m13, m6, subpel3 ; b3 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 - movu [tmpq+wq*0], m14 - movu [tmpq+wq*2], m15 %else mova m14, m1 PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 + mova m15, m2 + PMADDUBSW m15, subpel0, m7, m12, 0 ; b0 mova m1, m3 PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 + mova m2, m4 + PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 paddw m14, m3 mova m3, m5 PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 - paddw m14, m5 - movq m12, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - movq m13, [srcq+strideq*0] - shufpd m15, m0, m12, 0x0d - shufpd m0, m12, m13, 0x0c - punpcklbw m5, m15, m0 ; 67 - punpckhbw m15, m0 ; 78 - mova m13, m5 - PMADDUBSW m13, subpel3, m7, m12, 0 ; a3 - paddw m14, m13 - PMULHRSW_8192 m14, m14, [base+pw_2] - movu [tmpq+wq*0], m14 - mova m14, m2 - PMADDUBSW m14, subpel0, m7, m12, 0 ; b0 - mova m2, m4 - PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 - paddw m14, m4 + paddw m15, m4 mova m4, m6 PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 - paddw m14, m6 - mova m6, m15 - PMADDUBSW m15, subpel3, m7, m12, 0 ; b3 - paddw m14, m15 + paddw m15, m6 + punpcklbw m12, m0, m13 ; 67 + movq m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + PMADDUBSW m12, subpel3, m7, m6, 0 ; a3 + paddw m14, m12 + mova m6, m13 + PMADDUBSW m13, subpel3, m7, m12, 0 ; b3 + paddw m15, m13 PMULHRSW_8192 m14, m14, [base+pw_2] - movu [tmpq+wq*2], m14 + PMULHRSW_8192 m15, m15, [base+pw_2] %endif + movu [tmpq+wq*0], m14 + movu [tmpq+wq*2], m15 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w8_loop - movzx hd, r5b ; reset vertical loop + add r5, 8 add r8, 16 - add r6, 8 + movzx hd, r6b + mov srcq, r5 mov tmpq, r8 - mov srcq, r6 - sub r5d, 1<<8 ; horizontal-- + sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 @@ -3359,7 +3355,6 @@ %undef subpel1 %undef subpel2 %undef subpel3 - ; .hv: %assign stack_offset org_stack_offset cmp wd, 4 @@ -3373,16 +3368,12 @@ cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] - mov r5, r2; use as new base - 
%define base_reg r5 - %assign regs_used 2 + mov strideq, stridem + %assign regs_used 6 ALLOC_STACK -mmsize*14 %assign regs_used 7 - mov strideq, [rstk+stack_offset+gprsize*3] - lea strideq, [strideq*3 + 1] - sub [rstk+stack_offset+gprsize*2], strideq - mov strideq, [rstk+stack_offset+gprsize*3] - mov srcq, [rstk+stack_offset+gprsize*2] + lea r5, [strideq*3+1] + sub srcq, r5 %define subpelv0 [rsp+mmsize*0] %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] @@ -3445,9 +3436,9 @@ %define hv4_line_1_3 13 %if ARCH_X86_32 %if cpuflag(ssse3) - %define w8192reg [base+pw_8192] + %define w8192reg [base+pw_8192] %else - %define w8192reg [base+pw_2] + %define w8192reg [base+pw_2] %endif %define d32reg [base+pd_32] %else @@ -3466,13 +3457,13 @@ %endif movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ - movq m4, [srcq+strideq*2] ; 2 _ _ _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] - add srcq, strideq - movhps m4, [srcq+strideq*0] ; 2 _ 3 _ - add srcq, strideq + movq m4, [srcq+strideq*0] ; 2 _ _ _ + movhps m4, [srcq+strideq*1] ; 2 _ 3 _ + lea srcq, [srcq+strideq*2] %else + movq m4, [srcq+strideq*2] ; 2 _ _ _ movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif @@ -3506,7 +3497,14 @@ %endif movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m4, [srcq+strideq*0] ; 6 _ _ _ + add srcq, strideq +%else movq m4, [srcq+strideq*2] ; 6 _ _ _ + add srcq, stride3q +%endif PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters @@ -3531,12 +3529,6 @@ mova m2, [esp+mmsize*4] %endif %endif -%if ARCH_X86_32 - lea srcq, [srcq+strideq*2] - add srcq, strideq -%else - add srcq, stride3q -%endif ;process high PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 @@ -3572,7 +3564,6 @@ %define m15 m3 %endif %endif - ; %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %endif @@ -3620,7 +3611,6 @@ mova [esp+0xA0], m5 %endif %endif - ; %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] %endif @@ -3644,7 +3634,6 @@ paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 - ; RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 pshufd m5, m5, q3120 @@ -3666,7 +3655,6 @@ %undef subpelv1 %undef subpelv2 %undef subpelv3 - ; .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 @@ -3676,7 +3664,6 @@ %define hv8_line_6 4 shr mxd, 16 %if ARCH_X86_32 - %define base_reg r2 %define subpelh0 [rsp+mmsize*5] %define subpelh1 [rsp+mmsize*6] %define subpelv0 [rsp+mmsize*7] @@ -3692,28 +3679,28 @@ cmp hd, 6 cmovs myd, mxd movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] - ALLOC_STACK -mmsize*13 + mov strideq, stridem + %assign regs_used 6 + ALLOC_STACK -mmsize*14 + %assign regs_used 7 %if STACK_ALIGNMENT < mmsize - mov rstk, r2m - %define tmpm [rsp+mmsize*13+gprsize*1] - %define srcm [rsp+mmsize*13+gprsize*2] - %define stridem [rsp+mmsize*13+gprsize*3] - mov stridem, rstk + %define tmpm [rsp+mmsize*13+gprsize*1] + %define srcm [rsp+mmsize*13+gprsize*2] + %define stridem [rsp+mmsize*13+gprsize*3] + mov tmpm, tmpq + mov stridem, strideq %endif - mov r6, r2 - %define base_reg r6 + %if cpuflag(ssse3) pshufd m0, m1, q0000 pshufd m1, m1, q1111 - punpcklbw m5, m5 - %if notcpuflag(ssse3) - punpcklbw m0, m0 + %else punpcklbw m1, m1 - %endif - psraw m5, 8 - %if notcpuflag(ssse3) - psraw m0, 8 psraw m1, 8 + pshufd m0, m1, q1010 + punpckhqdq m1, m1 %endif + punpcklbw m5, m5 + psraw m5, 8 pshufd m2, 
m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 @@ -3724,12 +3711,9 @@ mova subpelv1, m3 mova subpelv2, m4 mova subpelv3, m5 - W32_RESTORE_SSQ - lea strided, [strided*3] - sub srcd, strided - sub srcd, 3 - mov srcm, srcd - W32_RESTORE_SSQ + lea r5, [strideq*3+3] + sub srcq, r5 + mov srcm, srcq %else ALLOC_STACK mmsize*5, 16 %define subpelh0 m10 @@ -3746,38 +3730,31 @@ cmp hd, 6 cmovs myd, mxd movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 - punpcklbw m1, m1 - %if notcpuflag(ssse3) - punpcklbw subpelh0, subpelh0 - punpcklbw subpelh1, subpelh1 + %else + punpcklbw m0, m0 + psraw m0, 8 + pshufd subpelh0, m0, q1010 + pshufd subpelh1, m0, q3232 + mova m7, [base+pw_2] %endif + punpcklbw m1, m1 psraw m1, 8 - %if notcpuflag(ssse3) - psraw subpelh0, 8 - psraw subpelh1, 8 - %endif pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 - %if notcpuflag(ssse3) - mova m7, [base+pw_2] - %endif - lea stride3q, [strideq*3] + lea stride3q, [strideq*3] sub srcq, 3 sub srcq, stride3q mov r6, srcq -%endif - lea r5d, [wq-4] -%if ARCH_X86_64 mov r8, tmpq -%else - mov tmpm, tmpq %endif - shl r5d, (16 - 2) - mov r5w, hw + lea r5d, [wq-4] + shl r5d, 14 + add r5d, hd .hv_w8_loop0: %if cpuflag(ssse3) %if ARCH_X86_64 @@ -3795,24 +3772,24 @@ %endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 +%if ARCH_X86_64 + PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 + add srcq, stride3q + PREP_8TAP_HV m0, srcq+strideq*0, m7, m9 +%else lea srcq, [srcq+strideq*2] -%if notcpuflag(ssse3) - %if ARCH_X86_64 - SWAP m9, m4 - %else + %if notcpuflag(ssse3) mova [esp], m4 %endif -%endif PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 lea srcq, [srcq+strideq*2] +%endif %if cpuflag(ssse3) mova m7, [base+pw_8192] %else mova m7, [base+pw_2] - %if ARCH_X86_64 - SWAP m4, m9 - %else + %if ARCH_X86_32 mova m4, [esp] %endif %endif @@ -3828,28 +3805,26 @@ SAVELINE_W8 3, m3 %if cpuflag(ssse3) mova m7, [base+subpel_h_shufA] +%endif +%if ARCH_X86_64 + PREP_8TAP_HV m4, srcq+strideq*1, m8, m9 + PREP_8TAP_HV m5, srcq+strideq*2, m8, m9 + add srcq, stride3q + PREP_8TAP_HV m6, srcq+strideq*0, m8, m9 %else - %if ARCH_X86_64 - SWAP m8, m7 - SWAP m9, m0 - %else + %if notcpuflag(ssse3) mova [esp+0x30], m0 %endif -%endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 - PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m6, srcq+strideq*0, m7, m0 +%endif %if cpuflag(ssse3) mova m7, [base+pw_8192] -%else - %if ARCH_X86_64 - SWAP m0, m9 - SWAP m7, m8 - %else +%elif ARCH_X86_32 mova m0, [esp+0x30] mova m7, [base+pw_2] - %endif %endif PMULHRSW_8192 m1, m4, m7 PMULHRSW_8192 m2, m5, m7 @@ -3906,8 +3881,8 @@ %endif %endif PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 - PREP_8TAP_HV m4, srcq+strideq*2, m5, m6 lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m4, srcq+strideq*0, m5, m6 %if cpuflag(ssse3) mova m5, [base+pw_8192] %else @@ -3937,24 +3912,1758 @@ RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: - movzx hd, r5w %if ARCH_X86_32 - add dword tmpm, 8 - mov tmpq, tmpm mov srcq, srcm + mov tmpq, tmpm + movzx hd, r5w add srcq, 4 + add tmpq, 8 mov srcm, srcq + mov tmpm, tmpq %else - add r8, 8 - mov tmpq, r8 add r6, 4 + add r8, 8 + movzx hd, r5b mov srcq, r6 + mov tmpq, r8 %endif sub r5d, 1<<16 jg .hv_w8_loop0 RET %endmacro +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro REMAP_REG 2 + %xdefine 
r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %xdefine r14_save r14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + %xdefine r14 r14_save + %undef r14_save + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3] + SWAP m%2, m%5 + movq m%1, [srcq+ r4] + movq m%2, [srcq+ r6] + movhps m%1, [srcq+ r7] + movhps m%2, [srcq+ r9] + movq m%3, [srcq+r10] + movq m%4, [srcq+r11] + movhps m%3, [srcq+r13] + movhps m%4, [srcq+ rX] + add srcq, ssq + movq m%5, [srcq+ r4] + movq m%6, [srcq+ r6] + movhps m%5, [srcq+ r7] + movhps m%6, [srcq+ r9] + movq m%7, [srcq+r10] + movq m%8, [srcq+r11] + movhps m%7, [srcq+r13] + movhps m%8, [srcq+ rX] + add srcq, ssq + pmaddubsw m%1, m%9 + pmaddubsw m%5, m%9 + pmaddubsw m%2, m%10 + pmaddubsw m%6, m%10 + pmaddubsw m%3, m%11 + pmaddubsw m%7, m%11 + pmaddubsw m%4, m%12 + pmaddubsw m%8, m%12 + phaddw m%1, m%2 + phaddw m%5, m%6 + phaddw m%3, m%4 + phaddw m%7, m%8 + phaddw m%1, m%3 + phaddw m%5, m%7 + pmulhrsw m%1, m12 + pmulhrsw m%5, m12 + SWAP m%2, m%5 +%endmacro + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled, 4, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled, 4, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 +%else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled, 4, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled, 4, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+0x138] + %endif + %xdefine base_reg r11 + %define rndshift 6 +%endif + LEA base_reg, %1_8tap_scaled_ssse3 +%define base base_reg-%1_8tap_scaled_ssse3 + tzcnt wd, wm + movd m8, dxm + movd m14, mxm + pshufd m8, m8, q0000 + pshufd m14, m14, q0000 +%if isprep && UNIX64 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 +%endif + mov dyd, dym +%ifidn %1, put + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x138] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+0x94] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + mova m10, [base+pd_0x3ff] + mova m12, [base+pw_8192] +%ifidn %1, put + mova m13, [base+pd_512] +%else + mova m13, [base+pd_32] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] + add wq, 
base_reg + jmp wq +%ifidn %1, put +.w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + movd r6d, m15 + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r6*8+2] + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + movq m0, [srcq+ssq*0] + movq m2, [srcq+ssq*2] + movhps m0, [srcq+ssq*1] + movhps m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + movq m1, [srcq+ssq*0] + movq m3, [srcq+ssq*2] + movhps m1, [srcq+ssq*1] + movhps m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + punpcklqdq m15, m15 + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 1 2 3 + pmulhrsw m1, m12 ; 4 5 6 7 + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m5, m3, m8 + pmaddwd m6, m0, m9 + pmaddwd m7, m2, m10 + pmaddwd m8, m4, m11 + paddd m5, m6 + paddd m7, m8 + paddd m5, m13 + paddd m5, m7 + psrad m5, 10 + packssdw m5, m5 + packuswb m5, m5 + pextrw r6d, m5, 0 + mov [dstq], r6w + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movq m5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps m3, m0, q1032 ; 01 12 + shufps m0, m2, q1032 ; 23 34 + shufps m2, m4, q1032 ; 45 56 + pshufb m5, m14 + pmaddubsw m5, m15 + phaddw m5, m5 + pmulhrsw m5, m12 + palignr m4, m5, m1, 12 + punpcklqdq m1, m4, m4 ; 6 7 6 7 + punpcklwd m4, m1, m5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movhps m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m3, m0 ; 01 12 + mova m0, m2 ; 23 34 + pshufb m5, m14 + pmaddubsw m5, m15 + phaddw m5, m5 + pmulhrsw m5, m12 ; 6 7 6 7 + palignr m4, m5, m1, 8 ; 4 5 6 7 + pshufd m5, m4, q0321 ; 5 6 7 _ + mova m1, m4 + punpcklwd m2, m4, m5 ; 45 56 + punpckhwd m4, m5 ; 67 __ + jmp .w2_loop + SWAP m15, m8, m9 +%endif +.w4: + mov myd, mym + mova m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd m15, t0d + pmaddwd m8, m7 + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + psrldq m7, m15, 8 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+r11*8+2] + movd m3, [base+subpel_filters+ r6*8+2] + movd m4, [base+subpel_filters+r13*8+2] + mova m5, [base+bdct_lb_dw] + movq m6, [base+subpel_s_shuf2] + pcmpeqd m0, m9 + psrld m14, 10 + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m3 + punpckldq m2, m4 + punpcklqdq m6, m6 + punpcklqdq m15, m2 + pshufb m14, m5 + paddb m14, m6 + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m3, 
[srcq+ssq*2] + movu m5, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pand m11, m0 + pandn m0, m15 + SWAP m15, m0 + por m15, m11 + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m3, m14 + pshufb m5, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + pmaddubsw m2, m15 + pmaddubsw m4, m15 + pmaddubsw m3, m15 + pmaddubsw m5, m15 + phaddw m7, m9 + phaddw m8, m10 + phaddw m9, m2, m4 + phaddw m3, m5 + pmulhrsw m7, m12 ; 0 1 + pmulhrsw m8, m12 ; 2 3 + pmulhrsw m9, m12 ; 4 5 + pmulhrsw m3, m12 ; 6 7 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + psrldq m11, m3, 8 ; 7 _ + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + punpcklwd m3, m11 ; 67 + mova [rsp+0x00], m7 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 +.w4_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m10, r6q + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m10 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + packssdw m4, m4 +%ifidn %1, put + packuswb m4, m4 + movd [dstq], m4 + add dstq, dsq +%else + movq [tmpq], m4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + movu m4, [srcq] + test myd, 0x400 + jz .w4_skip_line + mova m0, [rsp+0x00] + mova [rsp+0x00], m1 + mova m1, [rsp+0x10] + mova [rsp+0x10], m2 + mova m2, [rsp+0x20] + mova [rsp+0x20], m3 + pshufb m4, m14 + pmaddubsw m4, m15 + phaddw m4, m4 + pmulhrsw m4, m12 + punpcklwd m3, m11, m4 + mova m11, m4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m6, [rsp+0x10] + mova m7, [rsp+0x20] + pshufb m4, m14 + pshufb m5, m14 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m4, m5 + pmulhrsw m4, m12 + punpcklwd m9, m11, m4 + mova [rsp+0x00], m6 + mova [rsp+0x10], m7 + mova [rsp+0x20], m9 + psrldq m11, m4, 8 + mova m0, m1 + mova m1, m2 + mova m2, m3 + punpcklwd m3, m4, m11 + jmp .w4_loop + SWAP m0, m15 +.w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd m15, t0d + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + jmp .hloop +.hloop_prep: + dec dword [rsp+0x090] + jz .ret + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] + mova m10, [base+pd_0x3ff] + mova m15, [rsp+0x120] + pxor m9, m9 + mov srcq, [rsp+0x098] + mov r0q, [rsp+0x130] ; dstq / tmpq + paddd m14, m7 +.hloop: + mova m11, [base+pq_0x40000000] + psrld m4, m14, 10 + mova [rsp], m4 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + 
psrldq m4, m5, 8 + movd r4d, m5 + movd r6d, m4 + psrldq m5, 4 + psrldq m4, 4 + movd r7d, m5 + movd r9d, m4 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m11, m4 + pand m8, m11, m6 + pand m15, m11, m14 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m11, m5 + mova [rsp+0x10], m7 + mova [rsp+0x20], m8 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m9, [rsp+0x80] + mov myd, mym + mov dyd, dym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 + SWAP m14, m8 +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m5, m11, q0000 + pshufd m7, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m4, m5, m0 + pmaddwd m5, m5, m1 + pmaddwd m6, m7, m2 + pmaddwd m7, m7, m3 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 + pmaddwd m8, [rsp+0x70], m11 + pmaddwd m9, [rsp+0x80], m11 + paddd m4, m6 + paddd m5, m7 + paddd m4, m8 + paddd m5, m9 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+0x140], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .skip_line + mova m14, [base+unpckw] + movq m6, [srcq+r10] + movq m7, [srcq+r11] + movhps m6, [srcq+r13] + movhps m7, [srcq+ rX] + movq m4, [srcq+ r4] + movq m5, [srcq+ r6] + movhps m4, [srcq+ r7] + movhps m5, [srcq+ r9] + add srcq, ssq + mov myd, [rsp+0x140] + mov dyd, dym + pshufd m9, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m9 ; 3a 2a + pshufb m3, m9 ; 3b 2b + pmaddubsw m6, [rsp+0x30] + pmaddubsw m7, [rsp+0x40] + pmaddubsw m4, [rsp+0x10] + pmaddubsw m5, [rsp+0x20] 
+ phaddw m6, m7 + phaddw m4, m5 + phaddw m4, m6 + pmulhrsw m4, m12 + pshufb m5, [rsp+0x50], m14 ; 4a 5a + pshufb m6, [rsp+0x60], m14 ; 4b 5b + pshufb m7, [rsp+0x70], m9 ; 7a 6a + pshufb m8, [rsp+0x80], m9 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + punpckhwd m5, m7 ; 56a + punpckhwd m6, m8 ; 56b + punpcklwd m7, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m8, m4 ; 78b + mova [rsp+0x50], m5 + mova [rsp+0x60], m6 + mova [rsp+0x70], m7 + mova [rsp+0x80], m8 + jmp .vloop +.skip_line: + mova m0, [rsp+0x10] + mova m1, [rsp+0x20] + mova m14, [rsp+0x30] + mova m15, [rsp+0x40] + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15 + mov myd, [rsp+0x140] + mov dyd, dym + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [rsp+0x50] ; 23a + mova m3, [rsp+0x60] ; 23b + mova m5, [rsp+0x70] ; 45a + mova m6, [rsp+0x80] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [rsp+0x50], m5 + mova [rsp+0x60], m6 + mova [rsp+0x70], m7 + mova [rsp+0x80], m4 + jmp .vloop +.dy1: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy1_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + movd r6d, m15 + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r6*8+2] + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + movq m0, [srcq+ssq*0] + movq m2, [srcq+ssq*2] + movhps m0, [srcq+ssq*1] + movhps m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movq m1, [srcq+ssq*0] + movq m3, [srcq+ssq*2] + movhps m1, [srcq+ssq*1] + add srcq, ss3q + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + punpckldq m15, m7 + punpcklqdq m15, m15 + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + pshufd m8, m10, q0000 + pshufd m9, m10, q1111 + pshufd m11, m10, q3333 + pshufd m10, m10, q2222 + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + palignr m2, m1, m0, 4 + pshufd m4, m1, q2121 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + punpcklwd m2, m1, m4 ; 45 56 +.dy1_w2_loop: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m3, m8 + pmaddwd m6, m0, m9 + pmaddwd m7, m2, m10 + mova m3, m0 + mova m0, m2 + paddd m5, m13 + paddd m6, m7 + pshufb m1, m14 + pmaddubsw m1, m15 + phaddw m1, m1 + pmulhrsw m1, m12 + palignr m7, m1, m4, 12 + punpcklwd m2, m7, m1 ; 67 78 + pmaddwd m7, m2, m11 + mova m4, m1 + paddd m5, m6 + paddd m5, m7 + psrad m5, rndshift + packssdw m5, m5 + packuswb m5, m5 + pextrw r4d, m5, 0 + pextrw r6d, m5, 1 + mov [dstq+dsq*0], r4w + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET + SWAP m15, m8, m9 +%endif +.dy1_w4: + mov myd, mym + mova m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd m15, t0d + pmaddwd m8, m7 + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + psrldq m7, m15, 8 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + 
movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m4, [base+subpel_filters+r11*8+2] + movd m5, [base+subpel_filters+ r6*8+2] + movd m7, [base+subpel_filters+r13*8+2] + movq m6, [base+subpel_s_shuf2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pcmpeqd m8, m9 + psrld m14, 10 + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m5 + punpckldq m4, m7 + punpcklqdq m6, m6 + punpcklqdq m15, m4 + pshufb m14, [base+bdct_lb_dw] + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m7, [srcq+ssq*2] + add srcq, ss3q + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + paddb m14, m6 + movq m10, r4q + punpcklbw m10, m10 + psraw m10, 8 + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb m5, m14 + pshufb m7, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + pmaddubsw m7, m15 + phaddw m0, m1 + phaddw m2, m3 + phaddw m4, m5 + phaddw m6, m7, m7 + pmulhrsw m0, m12 ; 0 1 + pmulhrsw m2, m12 ; 2 3 + pmulhrsw m4, m12 ; 4 5 + pmulhrsw m6, m12 ; 6 _ + shufps m1, m0, m2, q1032 ; 1 2 + shufps m3, m2, m4, q1032 ; 3 4 + shufps m5, m4, m6, q1032 ; 5 6 + punpcklwd m7, m0, m1 ; 01 + punpckhwd m0, m1 ; 12 + punpcklwd m8, m2, m3 ; 23 + punpckhwd m2, m3 ; 34 + punpcklwd m9, m4, m5 ; 45 + punpckhwd m4, m5 ; 56 + pshufd m1, m10, q0000 + pshufd m3, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 + mova [rsp+0x00], m8 + mova [rsp+0x10], m2 + mova [rsp+0x20], m9 + mova [rsp+0x30], m4 +.dy1_w4_loop: + movu m11, [srcq+ssq*0] + pmaddwd m7, m1 + pmaddwd m8, m3 + pmaddwd m0, m1 + pmaddwd m2, m3 + pmaddwd m9, m5 + pmaddwd m4, m5 + paddd m7, m8 + paddd m0, m2 + movu m8, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m11, m14 + pmaddubsw m11, m15 + paddd m7, m13 + paddd m0, m13 + paddd m7, m9 + paddd m0, m4 + pshufb m8, m14 + pmaddubsw m8, m15 + phaddw m11, m8 + mova m8, [rsp+0x20] + pmulhrsw m11, m12 + punpcklwd m9, m6, m11 ; 67 + psrldq m6, m11, 8 + punpcklwd m4, m11, m6 ; 78 + pmaddwd m2, m9, m10 + pmaddwd m11, m4, m10 + paddd m7, m2 + mova m2, [rsp+0x30] + paddd m0, m11 + psrad m7, rndshift + psrad m0, rndshift + packssdw m7, m0 + mova m0, [rsp+0x10] +%ifidn %1, put + packuswb m7, m7 + psrldq m11, m7, 4 + movd [dstq+dsq*0], m7 + movd [dstq+dsq*1], m11 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m7 + add tmpq, 16 +%endif + sub hd, 2 + jz .ret + mova m7, [rsp+0x00] + mova [rsp+0x00], m8 + mova [rsp+0x10], m2 + mova [rsp+0x20], m9 + mova [rsp+0x30], m4 + jmp .dy1_w4_loop + SWAP m8, m15 +.dy1_w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + movq m3, r4q + punpcklbw m3, m3 + psraw m3, 8 + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + 
mov [rsp+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [rsp+0x140], m0 + mova [rsp+0x150], m1 + mova [rsp+0x160], m2 + mova [rsp+0x170], m3 +%if UNIX64 + mov hm, hd +%endif + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+0x090] + jz .ret + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] + mova m10, [base+pd_0x3ff] + mova m15, [rsp+0x120] + pxor m9, m9 + mov srcq, [rsp+0x098] + mov r0q, [rsp+0x130] ; dstq / tmpq + paddd m14, m7 +.dy1_hloop: + mova m11, [base+pq_0x40000000] + psrld m4, m14, 10 + mova [rsp], m4 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m4, m5, 8 + movd r4d, m5 + movd r6d, m4 + psrldq m5, 4 + psrldq m4, 4 + movd r7d, m5 + movd r9d, m4 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + punpcklbw m14, m14 + psraw m14, 8 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m8, m11, m4 + pand m9, m11, m6 + pand m15, m11, m7 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m7, m2 + pandn m5, m3 + por m8, m4 + por m9, m6 + por m15, m7 + por m11, m5 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m15, [rsp+0x80] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + SWAP m14, m8 + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m15; 23a + punpckhwd m3, m15 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 + mova m14, [base+unpckw] +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m9 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 + pmaddwd m15, [rsp+0x70], m11 + paddd m4, m6 + pmaddwd m6, [rsp+0x80], m11 + paddd m5, m7 + paddd m4, m15 + paddd m5, m6 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + movq m4, [srcq+ r4] + movq m5, [srcq+ r6] + movhps m4, [srcq+ r7] + movhps m5, [srcq+ r9] + movq 
m6, [srcq+r10] + movq m7, [srcq+r11] + movhps m6, [srcq+r13] + movhps m7, [srcq+ rX] + add srcq, ssq + pshufd m15, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m15 ; 3a 2a + pshufb m3, m15 ; 3b 2b + pmaddubsw m4, [rsp+0x10] + pmaddubsw m5, [rsp+0x20] + pmaddubsw m6, [rsp+0x30] + pmaddubsw m7, [rsp+0x40] + phaddw m4, m5 + phaddw m6, m7 + phaddw m4, m6 + pmulhrsw m4, m12 + pshufb m5, [rsp+0x70], m15 ; 7a 6a + pshufb m7, [rsp+0x80], m15 ; 7b 6b + pshufb m6, [rsp+0x50], m14 ; 4a 5a + pshufb m15, [rsp+0x60], m14 ; 4b 5b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m6 ; 34a + punpcklwd m3, m15 ; 34b + punpckhwd m6, m5 ; 56a + punpckhwd m15, m7 ; 56b + punpcklwd m5, m4 ; 78a + psrldq m4, 8 + punpcklwd m7, m4 ; 78b + mova [rsp+0x50], m6 + mova [rsp+0x60], m15 + mova [rsp+0x70], m5 + mova [rsp+0x80], m7 + jmp .dy1_vloop +.dy2: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + movd r6d, m15 + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r6*8+2] + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + movhps m0, [srcq+ssq*2] + movhps m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + punpckldq m15, m7 + punpcklqdq m15, m15 + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + movq m3, [srcq+ssq*0] + movhps m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m0, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + movq m11, r4q + punpcklbw m11, m11 + psraw m11, 8 + pslldq m2, m3, 8 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 + pshufd m1, m1, q3110 ; 1 3 3 5 + punpcklwd m3, m2, m1 ; 01 23 + punpckhwd m2, m1 ; 23 45 +.dy2_w2_loop: + movq m6, [srcq+ssq*0] + movq m7, [srcq+ssq*1] + movhps m6, [srcq+ssq*2] + movhps m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd m4, m3, m8 + pmaddwd m5, m2, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + phaddw m6, m7 + pmulhrsw m6, m12 + psrldq m7, m6, 8 + palignr m6, m0, 8 + palignr m7, m1, 8 + mova m0, m6 + mova m1, m7 + pshufd m6, m6, q3221 + pshufd m7, m7, q3221 + punpcklwd m3, m6, m7 ; 45 67 + punpckhwd m2, m6, m7 ; 67 89 + pmaddwd m6, m3, m10 + pmaddwd m7, m2, m11 + paddd m4, m5 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + packssdw m4, m4 + packuswb m4, m4 + movd r4d, m4 + mov [dstq+dsq*0], r4w + shr r4d, 16 + mov [dstq+dsq*1], r4w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET + SWAP m15, m8, m9 +%endif +.dy2_w4: + mov myd, mym + mova m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd m15, t0d + pmaddwd m8, m7 + mova m11, [base+pd_0x4000] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + psrldq m7, m15, 8 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd 
r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m4, [base+subpel_filters+r11*8+2] + movd m5, [base+subpel_filters+ r6*8+2] + movd m7, [base+subpel_filters+r13*8+2] + movq m6, [base+subpel_s_shuf2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pcmpeqd m8, m9 + psrld m14, 10 + movu m0, [srcq+ssq*0] + movu m2, [srcq+ssq*2] + movu m1, [srcq+ssq*1] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m5 + punpckldq m4, m7 + punpcklqdq m6, m6 + punpcklqdq m15, m4 + pshufb m14, [base+bdct_lb_dw] + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + paddb m14, m6 + movq m11, r4q + punpcklbw m11, m11 + psraw m11, 8 + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb m5, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m0, m2 + phaddw m1, m3 + phaddw m4, m5 + pmulhrsw m0, m12 ; 0 2 + pmulhrsw m1, m12 ; 1 3 + pmulhrsw m4, m12 ; 4 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + psrldq m5, m4, 8 ; 5 _ + punpckhwd m2, m0, m1 ; 23 + punpcklwd m0, m1 ; 01 + punpcklwd m4, m5 ; 45 +.dy2_w4_loop: + pmaddwd m0, m8 ; a0 + pmaddwd m5, m2, m8 ; b0 + pmaddwd m2, m9 ; a1 + pmaddwd m7, m4, m9 ; b1 + pmaddwd m3, m4, m10 ; a2 + paddd m0, m13 + paddd m5, m13 + paddd m0, m2 + paddd m5, m7 + paddd m0, m3 + movu m6, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m6, m14 + pshufb m7, m14 + pshufb m3, m14 + pshufb m1, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + pmaddubsw m3, m15 + pmaddubsw m1, m15 + phaddw m6, m7 + phaddw m3, m1 + pmulhrsw m6, m12 ; 6 7 + pmulhrsw m3, m12 ; 8 9 + psrldq m7, m6, 8 + psrldq m1, m3, 8 + punpcklwd m6, m7 ; 67 + punpcklwd m3, m1 ; 89 + mova m2, m6 + pmaddwd m1, m6, m10 ; b2 + pmaddwd m6, m11 ; a3 + pmaddwd m7, m3, m11 ; b3 + paddd m5, m1 + paddd m0, m6 + paddd m5, m7 + psrad m0, rndshift + psrad m5, rndshift + packssdw m0, m5 +%ifidn %1, put + packuswb m0, m0 + psrldq m1, m0, 4 + movd [dstq+dsq*0], m0 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m0 + add tmpq, 16 +%endif + mova m0, m4 + mova m4, m3 + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET + SWAP m8, m15 +.dy2_w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + movq m3, r4q + punpcklbw m3, m3 + psraw m3, 8 + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [rsp+0x140], m0 + mova [rsp+0x150], m1 + mova [rsp+0x160], m2 + mova [rsp+0x170], m3 +%if UNIX64 + mov hm, hd +%endif + jmp 
.dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+0x090] + jz .ret + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] + mova m10, [base+pd_0x3ff] + mova m15, [rsp+0x120] + pxor m9, m9 + mov srcq, [rsp+0x098] + mov r0q, [rsp+0x130] ; dstq / tmpq + paddd m14, m7 +.dy2_hloop: + mova m11, [base+pq_0x40000000] + psrld m4, m14, 10 + mova [rsp], m4 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m4, m5, 8 + movd r4d, m5 + movd r6d, m4 + psrldq m5, 4 + psrldq m4, 4 + movd r7d, m5 + movd r9d, m4 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m8, m11, m4 + pand m9, m11, m6 + pand m15, m11, m7 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m7, m2 + pandn m5, m3 + por m8, m4 + por m9, m6 + por m15, m7 + por m11, m5 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m15, [rsp+0x80] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + SWAP m14, m8 + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m15; 23a + punpckhwd m3, m15 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m9 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 + pmaddwd m15, [rsp+0x70], m11 + paddd m4, m6 + pmaddwd m6, [rsp+0x80], m11 + paddd m5, m7 + paddd m4, m15 + paddd m5, m6 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m8, [rsp+0x10] + mova m9, [rsp+0x20] + mova m10, [rsp+0x30] + mova m11, [rsp+0x40] + mova m0, m2 ; 01a + mova m1, m3 ; 01b + MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11 + mova m3, [rsp+0x50] ; 23a + mova m4, [rsp+0x60] ; 23b + mova m5, [rsp+0x70] ; 45a + mova m7, [rsp+0x80] ; 45b + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m14, m2, m6 ; 
67a + punpckhwd m2, m6 ; 67b + mova [rsp+0x50], m5 + mova [rsp+0x60], m7 + mova [rsp+0x70], m14 + mova [rsp+0x80], m2 + mova m2, m3 + mova m3, m4 + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 + jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) +%endmacro + +%if ARCH_X86_64 +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif +BILIN_SCALED_FN put +FN put_8tap_scaled, sharp, SHARP, SHARP +FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN put_8tap_scaled, smooth, SMOOTH, SMOOTH +FN put_8tap_scaled, sharp_regular, SHARP, REGULAR +FN put_8tap_scaled, regular_sharp, REGULAR, SHARP +FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN put_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif +BILIN_SCALED_FN prep +FN prep_8tap_scaled, sharp, SHARP, SHARP +FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH +FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR +FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP +FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN prep_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED prep +%endif + %if ARCH_X86_32 %macro SAVE_ALPHA_BETA 0 mov alpham, alphad @@ -4329,7 +6038,7 @@ mov PIC_reg, PIC_mem %endif sub betad, tmp2d ; beta -= alpha*3 - lea filterq, [PIC_sym(mc_warp_filter)] + lea filterq, [PIC_sym(mc_warp_filter2)] %if ARCH_X86_64 mov myd, r6m %if cpuflag(ssse3) @@ -5758,7 +7467,7 @@ %define m11 [base+pd_63] %define m10 [base+pb_8x0_8x8] %endif - pmaddwd m4, m7, [base+resize_mul] ; dx*[0,1,2,3] + pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] pslld m7, 2 ; dx*4 pslld m5, 14 paddd m6, m4 ; mx+[0..3]*dx diff -Nru dav1d-0.7.1/src/x86/msac.asm dav1d-0.9.1/src/x86/msac.asm --- dav1d-0.7.1/src/x86/msac.asm 2020-06-21 11:48:55.036126400 +0000 +++ dav1d-0.9.1/src/x86/msac.asm 2021-07-28 21:38:28.917852400 +0000 @@ -23,6 +23,7 @@ ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 64 ; avoids cacheline splits @@ -152,6 +153,7 @@ .renorm4: bsr ecx, t2d xor ecx, 15 ; d +.renorm5: shl t2d, cl shl t4, cl mov [t7+msac.rng], t2d @@ -412,13 +414,20 @@ sub t2d, t1d ; r - v sub t4, rax ; dif - vw cmovb t2d, t1d + mov t1d, [t0+msac.cnt] cmovb t4, t3 + movifnidn t7, t0 + mov ecx, 0xbfff setb al ; the upper 32 bits contains garbage but that's OK + sub ecx, t2d not t4 + ; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14) + ; i.e. 
(0 <= d <= 2) and v < (3 << 14) + shr ecx, 14 ; d %if ARCH_X86_64 == 0 movzx eax, al %endif - jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5 cglobal msac_decode_bool, 0, 6, 0 movifnidn t0, r0mp diff -Nru dav1d-0.7.1/tests/checkasm/checkasm.c dav1d-0.9.1/tests/checkasm/checkasm.c --- dav1d-0.7.1/tests/checkasm/checkasm.c 2020-06-21 11:48:55.040126600 +0000 +++ dav1d-0.9.1/tests/checkasm/checkasm.c 2021-07-28 21:38:28.917852400 +0000 @@ -518,9 +518,7 @@ } int main(int argc, char *argv[]) { - (void)func_new, (void)func_ref; state.seed = get_seed(); - int ret = 0; while (argc > 1) { if (!strncmp(argv[1], "--help", 6)) { @@ -568,6 +566,24 @@ dav1d_init_cpu(); +#ifdef readtime + if (state.bench_pattern) { + static int testing = 0; + checkasm_save_context(); + if (!testing) { + checkasm_set_signal_handler_state(1); + testing = 1; + readtime(); + checkasm_set_signal_handler_state(0); + } else { + fprintf(stderr, "checkasm: unable to access cycle counter\n"); + return 1; + } + } +#endif + + int ret = 0; + if (!state.function_listing) { fprintf(stderr, "checkasm: using random seed %u\n", state.seed); #if ARCH_X86_64 @@ -672,7 +688,9 @@ /* Indicate that the current test has failed, return whether verbose printing * is requested. */ int checkasm_fail_func(const char *const msg, ...) { - if (state.current_func_ver->cpu && state.current_func_ver->ok) { + if (state.current_func_ver && state.current_func_ver->cpu && + state.current_func_ver->ok) + { va_list arg; print_cpu_name(); @@ -737,10 +755,12 @@ void checkasm_set_signal_handler_state(const int enabled) { #ifdef _WIN32 +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) if (enabled) AddVectoredExceptionHandler(0, signal_handler); else RemoveVectoredExceptionHandler(signal_handler); +#endif #else void (*const handler)(int) = enabled ? signal_handler : SIG_DFL; signal(SIGBUS, handler); @@ -750,39 +770,91 @@ #endif } +static int check_err(const char *const file, const int line, + const char *const name, const int w, const int h, + int *const err) +{ + if (*err) + return 0; + if (!checkasm_fail_func("%s:%d", file, line)) + return 1; + *err = 1; + fprintf(stderr, "%s (%dx%d):\n", name, w, h); + return 0; +} + #define DEF_CHECKASM_CHECK_FUNC(type, fmt) \ int checkasm_check_##type(const char *const file, const int line, \ const type *buf1, ptrdiff_t stride1, \ const type *buf2, ptrdiff_t stride2, \ - const int w, int h, const char *const name) \ + const int w, int h, const char *const name, \ + const int align_w, const int align_h, \ + const int padding) \ { \ + int aligned_w = (w + align_w - 1) & ~(align_w - 1); \ + int aligned_h = (h + align_h - 1) & ~(align_h - 1); \ + int err = 0; \ stride1 /= sizeof(*buf1); \ stride2 /= sizeof(*buf2); \ int y = 0; \ for (y = 0; y < h; y++) \ if (memcmp(&buf1[y*stride1], &buf2[y*stride2], w*sizeof(*buf1))) \ break; \ - if (y == h) \ - return 0; \ - if (!checkasm_fail_func("%s:%d", file, line)) \ - return 1; \ - fprintf(stderr, "%s:\n", name); \ - while (h--) { \ - for (int x = 0; x < w; x++) \ - fprintf(stderr, " " fmt, buf1[x]); \ - fprintf(stderr, " "); \ - for (int x = 0; x < w; x++) \ - fprintf(stderr, " " fmt, buf2[x]); \ - fprintf(stderr, " "); \ - for (int x = 0; x < w; x++) \ - fprintf(stderr, "%c", buf1[x] != buf2[x] ? 
'x' : '.'); \ - buf1 += stride1; \ - buf2 += stride2; \ - fprintf(stderr, "\n"); \ + if (y != h) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + for (y = 0; y < h; y++) { \ + for (int x = 0; x < w; x++) \ + fprintf(stderr, " " fmt, buf1[x]); \ + fprintf(stderr, " "); \ + for (int x = 0; x < w; x++) \ + fprintf(stderr, " " fmt, buf2[x]); \ + fprintf(stderr, " "); \ + for (int x = 0; x < w; x++) \ + fprintf(stderr, "%c", buf1[x] != buf2[x] ? 'x' : '.'); \ + buf1 += stride1; \ + buf2 += stride2; \ + fprintf(stderr, "\n"); \ + } \ + buf1 -= h*stride1; \ + buf2 -= h*stride2; \ } \ - return 1; \ + for (y = -padding; y < 0; y++) \ + if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \ + (w + 2*padding)*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite above\n"); \ + break; \ + } \ + for (y = aligned_h; y < aligned_h + padding; y++) \ + if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \ + (w + 2*padding)*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite below\n"); \ + break; \ + } \ + for (y = 0; y < h; y++) \ + if (memcmp(&buf1[y*stride1 - padding], &buf2[y*stride2 - padding], \ + padding*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite left\n"); \ + break; \ + } \ + for (y = 0; y < h; y++) \ + if (memcmp(&buf1[y*stride1 + aligned_w], &buf2[y*stride2 + aligned_w], \ + padding*sizeof(*buf1))) { \ + if (check_err(file, line, name, w, h, &err)) \ + return 1; \ + fprintf(stderr, " overwrite right\n"); \ + break; \ + } \ + return err; \ } +DEF_CHECKASM_CHECK_FUNC(int8_t, "%4d") DEF_CHECKASM_CHECK_FUNC(uint8_t, "%02x") DEF_CHECKASM_CHECK_FUNC(uint16_t, "%04x") DEF_CHECKASM_CHECK_FUNC(int16_t, "%6d") diff -Nru dav1d-0.7.1/tests/checkasm/checkasm.h dav1d-0.9.1/tests/checkasm/checkasm.h --- dav1d-0.7.1/tests/checkasm/checkasm.h 2020-06-21 11:48:55.040126600 +0000 +++ dav1d-0.9.1/tests/checkasm/checkasm.h 2021-07-28 21:38:28.917852400 +0000 @@ -86,8 +86,6 @@ int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps, unsigned max_ulp, int len); -static void *func_ref, *func_new; - #define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */ /* Decide whether or not the specified function needs to be tested */ @@ -99,6 +97,7 @@ * is optional. */ #define declare_func(ret, ...)\ declare_new(ret, __VA_ARGS__)\ + void *func_ref, *func_new;\ typedef ret func_type(__VA_ARGS__);\ checkasm_save_context() @@ -116,7 +115,7 @@ #if HAVE_ASM #if ARCH_X86 -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(__clang__) #include #define readtime() (_mm_lfence(), __rdtsc()) #else @@ -127,6 +126,9 @@ } #define readtime readtime #endif +#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__) +#include +#define readtime() mach_absolute_time() #elif ARCH_AARCH64 #ifdef _MSC_VER #include @@ -280,9 +282,9 @@ #ifdef readtime #define bench_new(...)\ do {\ + func_type *tfunc = func_new;\ + checkasm_set_signal_handler_state(1);\ if (checkasm_bench_func()) {\ - checkasm_set_signal_handler_state(1);\ - func_type *tfunc = func_new;\ uint64_t tsum = 0;\ int tcount = 0;\ for (int ti = 0; ti < BENCH_RUNS; ti++) {\ @@ -297,33 +299,50 @@ tcount++;\ }\ }\ - checkasm_set_signal_handler_state(0);\ checkasm_update_bench(tcount, tsum);\ + } else {\ + tfunc(__VA_ARGS__);\ }\ + checkasm_set_signal_handler_state(0);\ } while (0) #else #define bench_new(...) 
do {} while (0) #endif + +#define PIXEL_RECT(name, w, h) \ + ALIGN_STK_64(pixel, name##_buf, ((h)+32)*((w)+64) + 64,); \ + ptrdiff_t name##_stride = sizeof(pixel)*((w)+64); \ + (void)name##_stride; \ + pixel *name = name##_buf + ((w)+64)*16 + 64 + +#define CLEAR_PIXEL_RECT(name) \ + memset(name##_buf, 0x99, sizeof(name##_buf)) \ + #define DECL_CHECKASM_CHECK_FUNC(type) \ int checkasm_check_##type(const char *const file, const int line, \ const type *const buf1, const ptrdiff_t stride1, \ const type *const buf2, const ptrdiff_t stride2, \ - const int w, const int h, const char *const name) + const int w, const int h, const char *const name, \ + const int align_w, const int align_h, \ + const int padding) +DECL_CHECKASM_CHECK_FUNC(int8_t); DECL_CHECKASM_CHECK_FUNC(uint8_t); DECL_CHECKASM_CHECK_FUNC(uint16_t); DECL_CHECKASM_CHECK_FUNC(int16_t); DECL_CHECKASM_CHECK_FUNC(int32_t); -#define PASTE(a,b) a ## b -#define CONCAT(a,b) PASTE(a,b) +#define CONCAT(a,b) a ## b -#define checkasm_check(prefix, ...) CONCAT(checkasm_check_, prefix)(__FILE__, __LINE__, __VA_ARGS__) +#define checkasm_check2(prefix, ...) CONCAT(checkasm_check_, prefix)(__FILE__, __LINE__, __VA_ARGS__) +#define checkasm_check(prefix, ...) checkasm_check2(prefix, __VA_ARGS__, 0, 0, 0) #ifdef BITDEPTH #define checkasm_check_pixel(...) checkasm_check(PIXEL_TYPE, __VA_ARGS__) +#define checkasm_check_pixel_padded(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 1, 1, 8) +#define checkasm_check_pixel_padded_align(...) checkasm_check2(PIXEL_TYPE, __VA_ARGS__, 8) #define checkasm_check_coef(...) checkasm_check(COEF_TYPE, __VA_ARGS__) #endif diff -Nru dav1d-0.7.1/tests/checkasm/filmgrain.c dav1d-0.9.1/tests/checkasm/filmgrain.c --- dav1d-0.7.1/tests/checkasm/filmgrain.c 2020-06-21 11:48:55.040126600 +0000 +++ dav1d-0.9.1/tests/checkasm/filmgrain.c 2021-07-28 21:38:28.917852400 +0000 @@ -34,6 +34,12 @@ #define UNIT_TEST 1 #include "src/fg_apply_tmpl.c" +#if BITDEPTH == 8 +#define checkasm_check_entry(...) checkasm_check(int8_t, __VA_ARGS__) +#else +#define checkasm_check_entry(...) checkasm_check(int16_t, __VA_ARGS__) +#endif + static const char ss_name[][4] = { [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420", [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422", @@ -65,11 +71,9 @@ call_ref(grain_lut_c, fg_data HIGHBD_TAIL_SUFFIX); call_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX); - if (memcmp(grain_lut_c, grain_lut_a, - GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry))) - { - fail(); - } + checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH, + grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH, + GRAIN_WIDTH, GRAIN_HEIGHT, "grain_lut"); bench_new(grain_lut_a, fg_data HIGHBD_TAIL_SUFFIX); } @@ -123,10 +127,11 @@ memset(grain_lut_a, 0xff, sizeof(grain_lut_a)); call_ref(grain_lut_c, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); call_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); - int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH; - for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++) - diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry)); - if (diff) fail(); + int w = ss_x ? 44 : GRAIN_WIDTH; + int h = ss_y ? 
38 : GRAIN_HEIGHT; + checkasm_check_entry(grain_lut_c[0], sizeof(entry) * GRAIN_WIDTH, + grain_lut_a[0], sizeof(entry) * GRAIN_WIDTH, + w, h, "grain_lut"); bench_new(grain_lut_a, grain_lut_y, fg_data, uv HIGHBD_TAIL_SUFFIX); } @@ -137,10 +142,10 @@ } static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { - ALIGN_STK_64(pixel, c_dst, 128 * 32,); - ALIGN_STK_64(pixel, a_dst, 128 * 32,); - ALIGN_STK_64(pixel, src, 128 * 32,); - const ptrdiff_t stride = 128 * sizeof(pixel); + PIXEL_RECT(c_dst, 128, 32); + PIXEL_RECT(a_dst, 128, 32); + PIXEL_RECT(src, 128, 32); + const ptrdiff_t stride = c_dst_stride; declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride, const Dav1dFilmGrainData *data, size_t pw, @@ -178,40 +183,70 @@ generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points, fg_data[0].num_y_points, scaling); - const int w = 1 + (rnd() & 127); - const int h = 1 + (rnd() & 31); - - for (int y = 0; y < 32; y++) - for (int x = 0; x < 128; x++) - src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max; - const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0; - fg_data[0].clip_to_restricted_range = rnd() & 1; fg_data[0].scaling_shift = (rnd() & 3) + 8; for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; fg_data[0].overlap_flag++) { - call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h, - row_num HIGHBD_TAIL_SUFFIX); - call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h, - row_num HIGHBD_TAIL_SUFFIX); + for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { + int w, h, row_num; + if (fg_data[0].overlap_flag) { + w = 35 + (rnd() % 93); + if (i == 0) { + row_num = 0; + h = 1 + (rnd() % 31); + } else { + row_num = 1 + (rnd() & 0x7ff); + if (i == 1) { + h = 3 + (rnd() % 30); + } else { + h = 1 + (rnd() & 1); + } + } + } else { + w = 1 + (rnd() & 127); + h = 1 + (rnd() & 31); + row_num = rnd() & 0x7ff; + } + + for (int y = 0; y < 32; y++) { + // Src pixels past the right edge can be uninitialized + for (int x = 0; x < 128; x++) + src[y * PXSTRIDE(stride) + x] = rnd(); + for (int x = 0; x < w; x++) + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + } + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h, + row_num HIGHBD_TAIL_SUFFIX); + call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut, h, + row_num HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst"); + checkasm_check_pixel_padded_align(c_dst, stride, a_dst, stride, + w, h, "dst", 32, 2); + } } fg_data[0].overlap_flag = 1; + for (int y = 0; y < 32; y++) { + // Make sure all pixels are in range + for (int x = 0; x < 128; x++) + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + } bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32, - row_num HIGHBD_TAIL_SUFFIX); + 1 HIGHBD_TAIL_SUFFIX); } report("fgy_32x32xn"); } static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { - ALIGN_STK_64(pixel, c_dst, 128 * 32,); - ALIGN_STK_64(pixel, a_dst, 128 * 32,); - ALIGN_STK_64(pixel, src, 128 * 32,); - ALIGN_STK_64(pixel, luma_src, 128 * 32,); - const ptrdiff_t lstride = 128 * sizeof(pixel); + PIXEL_RECT(c_dst, 128, 32); + PIXEL_RECT(a_dst, 128, 32); + PIXEL_RECT(src, 128, 32); + PIXEL_RECT(luma_src, 128, 32); + const ptrdiff_t lstride = luma_src_stride; declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride, const Dav1dFilmGrainData *data, size_t pw, @@ -224,7 +259,7 @@ const enum Dav1dPixelLayout layout = layout_idx + 1; const int 
ss_x = layout != DAV1D_PIXEL_LAYOUT_I444; const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420; - const ptrdiff_t stride = (ss_x ? 96 : 128) * sizeof(pixel); + const ptrdiff_t stride = c_dst_stride; for (int csfl = 0; csfl <= 1; csfl++) { if (check_func(dsp->fguv_32x32xn[layout_idx], @@ -258,17 +293,6 @@ dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0], fg_data, uv_pl HIGHBD_TAIL_SUFFIX); - const int w = 1 + (rnd() & (127 >> ss_x)); - const int h = 1 + (rnd() & (31 >> ss_y)); - - for (int y = 0; y < 32; y++) - for (int x = 0; x < 128; x++) - src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max; - for (int y = 0; y < 32; y++) - for (int x = 0; x < 128; x++) - luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max; - const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0; - if (csfl) { fg_data[0].num_y_points = 2 + (rnd() % 13); const int pad = 0xff / fg_data[0].num_y_points; @@ -301,17 +325,63 @@ for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1; fg_data[0].overlap_flag++) { - call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, - row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); - call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, - row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); - - checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst"); + for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) { + int w, h, row_num; + if (fg_data[0].overlap_flag) { + w = (36 >> ss_x) + (rnd() % (92 >> ss_x)); + if (i == 0) { + row_num = 0; + h = 1 + (rnd() & (31 >> ss_y)); + } else { + row_num = 1 + (rnd() & 0x7ff); + if (i == 1) { + h = (ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30)); + } else { + h = ss_y ? 1 : 1 + (rnd() & 1); + } + } + } else { + w = 1 + (rnd() & (127 >> ss_x)); + h = 1 + (rnd() & (31 >> ss_y)); + row_num = rnd() & 0x7ff; + } + + for (int y = 0; y < 32; y++) { + // Src pixels past the right edge can be uninitialized + for (int x = 0; x < 128; x++) { + src[y * PXSTRIDE(stride) + x] = rnd(); + luma_src[y * PXSTRIDE(lstride) + x] = rnd(); + } + for (int x = 0; x < w; x++) + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + for (int x = 0; x < (w << ss_x); x++) + luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; + } + + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); + call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, + row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); + call_new(a_dst, src, stride, fg_data, w, scaling, grain_lut[1], h, + row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); + + checkasm_check_pixel_padded_align(c_dst, stride, + a_dst, stride, + w, h, "dst", + 32 >> ss_x, 2); + } } fg_data[0].overlap_flag = 1; + for (int y = 0; y < 32; y++) { + // Make sure all pixels are in range + for (int x = 0; x < 128; x++) { + src[y * PXSTRIDE(stride) + x] &= bitdepth_max; + luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max; + } + } bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16, - row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); + 1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX); } } } diff -Nru dav1d-0.7.1/tests/checkasm/ipred.c dav1d-0.9.1/tests/checkasm/ipred.c --- dav1d-0.7.1/tests/checkasm/ipred.c 2020-06-21 11:48:55.040126600 +0000 +++ dav1d-0.9.1/tests/checkasm/ipred.c 2021-07-28 21:38:28.917852400 +0000 @@ -66,8 +66,8 @@ }; static void check_intra_pred(Dav1dIntraPredDSPContext *const c) { - ALIGN_STK_64(pixel, c_dst, 64 * 64,); - ALIGN_STK_64(pixel, a_dst, 64 * 64,); + 
PIXEL_RECT(c_dst, 64, 64); + PIXEL_RECT(a_dst, 64, 64); ALIGN_STK_64(pixel, topleft_buf, 257,); pixel *const topleft = topleft_buf + 128; @@ -89,7 +89,7 @@ for (int h = imax(w / 4, 4); h <= imin(w * 4, (mode == FILTER_PRED ? 32 : 64)); h <<= 1) { - const ptrdiff_t stride = w * sizeof(pixel); + const ptrdiff_t stride = c_dst_stride; int a = 0, maxw = 0, maxh = 0; if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */ @@ -112,12 +112,15 @@ for (int i = -h * 2; i <= w * 2; i++) topleft[i] = rnd() & bitdepth_max; + CLEAR_PIXEL_RECT(c_dst); + CLEAR_PIXEL_RECT(a_dst); call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh HIGHBD_TAIL_SUFFIX); call_new(a_dst, stride, topleft, w, h, a, maxw, maxh HIGHBD_TAIL_SUFFIX); - if (checkasm_check_pixel(c_dst, stride, a_dst, stride, - w, h, "dst")) + if (checkasm_check_pixel_padded(c_dst, stride, + a_dst, stride, + w, h, "dst")) { if (mode == Z1_PRED || mode == Z3_PRED) fprintf(stderr, "angle = %d (0x%03x)\n", diff -Nru dav1d-0.7.1/tests/checkasm/itx.c dav1d-0.9.1/tests/checkasm/itx.c --- dav1d-0.7.1/tests/checkasm/itx.c 2020-06-21 11:48:55.040126600 +0000 +++ dav1d-0.9.1/tests/checkasm/itx.c 2021-07-28 21:38:28.917852400 +0000 @@ -138,14 +138,21 @@ * dimensions are non-zero. This leads to braching to specific optimized * simd versions (e.g. dc-only) so that we get full asm coverage in this * test */ - const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]]; + + const enum TxClass tx_class = dav1d_tx_type_class[txtp]; + const uint16_t *const scan = dav1d_scans[tx]; const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0; const int sub_low = subsh > 1 ? sub_high - 8 : 0; int n, eob; for (n = 0, eob = 0; n < sw * sh; n++) { - const int rc = scan[n]; - const int rcx = rc % sh, rcy = rc / sh; + int rc, rcx, rcy; + if (tx_class == TX_CLASS_2D) + rc = scan[n], rcx = rc % sh, rcy = rc / sh; + else if (tx_class == TX_CLASS_H) + rcx = n % sh, rcy = n / sh, rc = n; + else /* tx_class == TX_CLASS_V */ + rcx = n / sw, rcy = n % sw, rc = rcy * sh + rcx; /* Pick a random eob within this sub-itx */ if (rcx > sub_high || rcy > sub_high) { @@ -156,8 +163,18 @@ if (eob) eob += rnd() % (n - eob - 1); - for (n = eob + 1; n < sw * sh; n++) - coeff[scan[n]] = 0; + if (tx_class == TX_CLASS_2D) + for (n = eob + 1; n < sw * sh; n++) + coeff[scan[n]] = 0; + else if (tx_class == TX_CLASS_H) + for (n = eob + 1; n < sw * sh; n++) + coeff[n] = 0; + else /* tx_class == TX_CLASS_V */ { + for (int rcx = eob / sw, rcy = eob % sw; rcx < sh; rcx++, rcy = -1) + while (++rcy < sw) + coeff[rcy * sh + rcx] = 0; + n = sw * sh; + } for (; n < 32 * 32; n++) coeff[n] = rnd(); return eob; diff -Nru dav1d-0.7.1/tests/checkasm/loopfilter.c dav1d-0.9.1/tests/checkasm/loopfilter.c --- dav1d-0.7.1/tests/checkasm/loopfilter.c 2020-06-21 11:48:55.040126600 +0000 +++ dav1d-0.9.1/tests/checkasm/loopfilter.c 2021-07-28 21:38:28.917852400 +0000 @@ -33,13 +33,12 @@ #include "src/loopfilter.h" static void init_lpf_border(pixel *const dst, const ptrdiff_t stride, - int E, int I, int H, const int bitdepth_max) + int E, int I, const int bitdepth_max) { const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; const int F = 1 << bitdepth_min_8; E <<= bitdepth_min_8; I <<= bitdepth_min_8; - H <<= bitdepth_min_8; const int filter_type = rnd() % 4; const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2); @@ -171,7 +170,7 @@ L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx]; } init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? 
128 : 1, - lut.e[L], lut.i[L], L >> 4, bitdepth_max); + lut.e[L], lut.i[L], bitdepth_max); } memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16); diff -Nru dav1d-0.7.1/tests/checkasm/looprestoration.c dav1d-0.9.1/tests/checkasm/looprestoration.c --- dav1d-0.7.1/tests/checkasm/looprestoration.c 2020-06-21 11:48:55.040126600 +0000 +++ dav1d-0.9.1/tests/checkasm/looprestoration.c 2021-07-28 21:38:28.917852400 +0000 @@ -27,139 +27,156 @@ #include "tests/checkasm/checkasm.h" +#include #include #include "src/levels.h" #include "src/looprestoration.h" #include "src/tables.h" +static int to_binary(int x) { /* 0-15 -> 0000-1111 */ + return (x & 1) + 5 * (x & 2) + 25 * (x & 4) + 125 * (x & 8); +} + static void init_tmp(pixel *buf, const ptrdiff_t stride, const int w, const int h, const int bitdepth_max) { + const int noise_mask = bitdepth_max >> 4; + const int x_off = rnd() & 7, y_off = rnd() & 7; + for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) - buf[x] = rnd() & bitdepth_max; + for (int x = 0; x < w; x++) { + buf[x] = (((x + x_off) ^ (y + y_off)) & 8 ? bitdepth_max : 0) ^ + (rnd() & noise_mask); + } buf += PXSTRIDE(stride); } } static void check_wiener(Dav1dLoopRestorationDSPContext *const c, const int bpc) { - ALIGN_STK_64(pixel, c_dst, 448 * 64,); - ALIGN_STK_64(pixel, a_dst, 448 * 64,); - ALIGN_STK_64(pixel, h_edge, 448 * 8,); + ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32; + ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32; + ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32; pixel left[64][4]; + LooprestorationParams params; + int16_t (*const filter)[8] = params.filter; declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*const left)[4], const pixel *lpf, ptrdiff_t lpf_stride, - int w, int h, const int16_t filterh[7], - const int16_t filterv[7], enum LrEdgeFlags edges - HIGHBD_DECL_SUFFIX); - - for (int pl = 0; pl < 2; pl++) { - if (check_func(c->wiener, "wiener_%s_%dbpc", - pl ? "chroma" : "luma", bpc)) - { - int16_t filter[2][3], filter_v[7], filter_h[7]; - - filter[0][0] = pl ? 0 : (rnd() & 15) - 5; - filter[0][1] = (rnd() & 31) - 23; - filter[0][2] = (rnd() & 63) - 17; - filter[1][0] = pl ? 0 : (rnd() & 15) - 5; - filter[1][1] = (rnd() & 31) - 23; - filter[1][2] = (rnd() & 63) - 17; - - filter_h[0] = filter_h[6] = filter[0][0]; - filter_h[1] = filter_h[5] = filter[0][1]; - filter_h[2] = filter_h[4] = filter[0][2]; - filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2); - - filter_v[0] = filter_v[6] = filter[1][0]; - filter_v[1] = filter_v[5] = filter[1][1]; - filter_v[2] = filter_v[4] = filter[1][2]; - filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2); + int w, int h, const LooprestorationParams *params, + enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + + for (int t = 0; t < 2; t++) { + if (check_func(c->wiener[t], "wiener_%dtap_%dbpc", t ? 5 : 7, bpc)) { + filter[0][0] = filter[0][6] = t ? 0 : (rnd() & 15) - 5; + filter[0][1] = filter[0][5] = (rnd() & 31) - 23; + filter[0][2] = filter[0][4] = (rnd() & 63) - 17; + filter[0][3] = -(filter[0][0] + filter[0][1] + filter[0][2]) * 2; +#if BITDEPTH != 8 + filter[0][3] += 128; +#endif + + filter[1][0] = filter[1][6] = t ? 
0 : (rnd() & 15) - 5; + filter[1][1] = filter[1][5] = (rnd() & 31) - 23; + filter[1][2] = filter[1][4] = (rnd() & 63) - 17; + filter[1][3] = 128 - (filter[1][0] + filter[1][1] + filter[1][2]) * 2; const int base_w = 1 + (rnd() % 384); const int base_h = 1 + (rnd() & 63); const int bitdepth_max = (1 << bpc) - 1; - init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max); - init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max); + init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max); + init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max); init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max); for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) { const int w = edges & LR_HAVE_RIGHT ? 256 : base_w; const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h; - memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel)); + memcpy(a_src, c_src, 448 * 64 * sizeof(pixel)); - call_ref(c_dst + 32, 448 * sizeof(pixel), left, - h_edge + 32, 448 * sizeof(pixel), - w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX); - call_new(a_dst + 32, 448 * sizeof(pixel), left, - h_edge + 32, 448 * sizeof(pixel), - w, h, filter_h, filter_v, edges HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel), - a_dst + 32, 448 * sizeof(pixel), - w, h, "dst"); + call_ref(c_dst, 448 * sizeof(pixel), left, + h_edge, 448 * sizeof(pixel), + w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); + call_new(a_dst, 448 * sizeof(pixel), left, + h_edge, 448 * sizeof(pixel), + w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); + if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel), + a_dst, 448 * sizeof(pixel), + w, h, "dst")) + { + fprintf(stderr, "size = %dx%d, edges = %04d\n", + w, h, to_binary(edges)); + break; + } } - bench_new(a_dst + 32, 448 * sizeof(pixel), left, - h_edge + 32, 448 * sizeof(pixel), - 256, 64, filter_h, filter_v, 0xf HIGHBD_TAIL_SUFFIX); + bench_new(a_dst, 448 * sizeof(pixel), left, + h_edge, 448 * sizeof(pixel), + 256, 64, ¶ms, 0xf HIGHBD_TAIL_SUFFIX); } } } static void check_sgr(Dav1dLoopRestorationDSPContext *const c, const int bpc) { - ALIGN_STK_64(pixel, c_dst, 448 * 64,); - ALIGN_STK_64(pixel, a_dst, 448 * 64,); - ALIGN_STK_64(pixel, h_edge, 448 * 8,); + ALIGN_STK_64(pixel, c_src, 448 * 64,), *const c_dst = c_src + 32; + ALIGN_STK_64(pixel, a_src, 448 * 64,), *const a_dst = a_src + 32; + ALIGN_STK_64(pixel, edge_buf, 448 * 8,), *const h_edge = edge_buf + 32; pixel left[64][4]; + LooprestorationParams params; declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*const left)[4], const pixel *lpf, ptrdiff_t lpf_stride, - int w, int h, int sgr_idx, - const int16_t sgr_wt[7], enum LrEdgeFlags edges - HIGHBD_DECL_SUFFIX); - - for (int sgr_idx = 14; sgr_idx >= 6; sgr_idx -= 4) { - if (check_func(c->selfguided, "selfguided_%s_%dbpc", - sgr_idx == 6 ? "mix" : sgr_idx == 10 ? "3x3" : "5x5", bpc)) - { - int16_t sgr_wt[2]; - - sgr_wt[0] = dav1d_sgr_params[sgr_idx][0] ? (rnd() & 127) - 96 : 0; - sgr_wt[1] = dav1d_sgr_params[sgr_idx][1] ? (rnd() & 127) - 32 : - iclip(128 - sgr_wt[0], -32, 95); + int w, int h, const LooprestorationParams *params, + enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); + + static const struct { char name[4]; uint8_t idx; } sgr_data[3] = { + { "5x5", 14 }, + { "3x3", 10 }, + { "mix", 0 }, + }; + + for (int i = 0; i < 3; i++) { + if (check_func(c->sgr[i], "sgr_%s_%dbpc", sgr_data[i].name, bpc)) { + const uint16_t *const sgr_params = dav1d_sgr_params[sgr_data[i].idx]; + params.sgr.s0 = sgr_params[0]; + params.sgr.s1 = sgr_params[1]; + params.sgr.w0 = sgr_params[0] ? 
(rnd() & 127) - 96 : 0; + params.sgr.w1 = (sgr_params[1] ? 160 - (rnd() & 127) : 33) - params.sgr.w0; const int base_w = 1 + (rnd() % 384); const int base_h = 1 + (rnd() & 63); const int bitdepth_max = (1 << bpc) - 1; - init_tmp(c_dst, 448 * sizeof(pixel), 448, 64, bitdepth_max); - init_tmp(h_edge, 448 * sizeof(pixel), 448, 8, bitdepth_max); + init_tmp(c_src, 448 * sizeof(pixel), 448, 64, bitdepth_max); + init_tmp(edge_buf, 448 * sizeof(pixel), 448, 8, bitdepth_max); init_tmp((pixel *) left, 4 * sizeof(pixel), 4, 64, bitdepth_max); for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) { const int w = edges & LR_HAVE_RIGHT ? 256 : base_w; const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h; - memcpy(a_dst, c_dst, 448 * 64 * sizeof(pixel)); + memcpy(a_src, c_src, 448 * 64 * sizeof(pixel)); - call_ref(c_dst + 32, 448 * sizeof(pixel), left, - h_edge + 32, 448 * sizeof(pixel), - w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX); - call_new(a_dst + 32, 448 * sizeof(pixel), left, - h_edge + 32, 448 * sizeof(pixel), - w, h, sgr_idx, sgr_wt, edges HIGHBD_TAIL_SUFFIX); - checkasm_check_pixel(c_dst + 32, 448 * sizeof(pixel), - a_dst + 32, 448 * sizeof(pixel), - w, h, "dst"); + call_ref(c_dst, 448 * sizeof(pixel), left, + h_edge, 448 * sizeof(pixel), + w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); + call_new(a_dst, 448 * sizeof(pixel), left, + h_edge, 448 * sizeof(pixel), + w, h, ¶ms, edges HIGHBD_TAIL_SUFFIX); + if (checkasm_check_pixel(c_dst, 448 * sizeof(pixel), + a_dst, 448 * sizeof(pixel), + w, h, "dst")) + { + fprintf(stderr, "size = %dx%d, edges = %04d\n", + w, h, to_binary(edges)); + break; + } } - bench_new(a_dst + 32, 448 * sizeof(pixel), left, - h_edge + 32, 448 * sizeof(pixel), - 256, 64, sgr_idx, sgr_wt, 0xf HIGHBD_TAIL_SUFFIX); + bench_new(a_dst, 448 * sizeof(pixel), left, + h_edge, 448 * sizeof(pixel), + 256, 64, ¶ms, 0xf HIGHBD_TAIL_SUFFIX); } } } diff -Nru dav1d-0.7.1/tests/checkasm/msac.c dav1d-0.9.1/tests/checkasm/msac.c --- dav1d-0.7.1/tests/checkasm/msac.c 2020-06-21 11:48:55.044126500 +0000 +++ dav1d-0.9.1/tests/checkasm/msac.c 2021-07-28 21:38:28.917852400 +0000 @@ -140,11 +140,11 @@ report("decode_symbol"); } -static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) { +static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) { MsacContext s_c, s_a; + declare_func(unsigned, MsacContext *s, uint16_t *cdf); if (check_func(c->bool_adapt, "msac_decode_bool_adapt")) { - declare_func(unsigned, MsacContext *s, uint16_t *cdf); uint16_t cdf[2][2]; for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); @@ -165,9 +165,13 @@ bench_new(&s_a, cdf[1]); } } +} +static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf) { + MsacContext s_c, s_a; + + declare_func(unsigned, MsacContext *s); if (check_func(c->bool_equi, "msac_decode_bool_equi")) { - declare_func(unsigned, MsacContext *s); dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); s_a = s_c; for (int i = 0; i < 64; i++) { @@ -180,9 +184,13 @@ } bench_new(&s_a); } +} +static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) { + MsacContext s_c, s_a; + + declare_func(unsigned, MsacContext *s, unsigned f); if (check_func(c->bool, "msac_decode_bool")) { - declare_func(unsigned, MsacContext *s, unsigned f); dav1d_msac_init(&s_c, buf, BUF_SIZE, 1); s_a = s_c; for (int i = 0; i < 64; i++) { @@ -197,6 +205,12 @@ bench_new(&s_a, 16384); } +} + +static void check_decode_bool_funcs(MsacDSPContext *const c, uint8_t *const buf) { + 
check_decode_bool_adapt(c, buf); + check_decode_bool_equi(c, buf); + check_decode_bool(c, buf); report("decode_bool"); } @@ -204,8 +218,8 @@ ALIGN_STK_16(uint16_t, cdf, 2, [16]); MsacContext s_c, s_a; + declare_func(unsigned, MsacContext *s, uint16_t *cdf); if (check_func(c->hi_tok, "msac_decode_hi_tok")) { - declare_func(unsigned, MsacContext *s, uint16_t *cdf); for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); s_a = s_c; @@ -272,6 +286,6 @@ buf[i] = rnd(); check_decode_symbol(&c, buf); - check_decode_bool(&c, buf); + check_decode_bool_funcs(&c, buf); check_decode_hi_tok(&c, buf); } diff -Nru dav1d-0.7.1/tests/checkasm/x86/checkasm.asm dav1d-0.9.1/tests/checkasm/x86/checkasm.asm --- dav1d-0.7.1/tests/checkasm/x86/checkasm.asm 2020-06-21 11:48:55.044126500 +0000 +++ dav1d-0.9.1/tests/checkasm/x86/checkasm.asm 2021-07-28 21:38:28.917852400 +0000 @@ -23,8 +23,9 @@ ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -%define private_prefix checkasm %include "config.asm" +%undef private_prefix +%define private_prefix checkasm %include "ext/x86/x86inc.asm" SECTION_RODATA 16 diff -Nru dav1d-0.7.1/tests/header_test.c dav1d-0.9.1/tests/header_test.c --- dav1d-0.7.1/tests/header_test.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/tests/header_test.c 2021-07-28 21:38:28.917852400 +0000 @@ -0,0 +1,33 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include DAV1D_TEST_HEADER + +int main(void) +{ + return 0; +} diff -Nru dav1d-0.7.1/tests/libfuzzer/dav1d_fuzzer.c dav1d-0.9.1/tests/libfuzzer/dav1d_fuzzer.c --- dav1d-0.7.1/tests/libfuzzer/dav1d_fuzzer.c 2020-06-21 11:48:55.044126500 +0000 +++ dav1d-0.9.1/tests/libfuzzer/dav1d_fuzzer.c 2021-07-28 21:38:28.917852400 +0000 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "src/cpu.h" @@ -38,8 +39,6 @@ #ifdef DAV1D_ALLOC_FAIL -#include - #include "alloc_fail.h" static unsigned djb_xor(const uint8_t * c, size_t len) { @@ -56,6 +55,39 @@ #define DAV1D_FUZZ_MAX_SIZE 4096 * 4096 +// search for "--cpumask xxx" in argv and remove both parameters +int LLVMFuzzerInitialize(int *argc, char ***argv) { + int i = 1; + for (; i < *argc; i++) { + if (!strcmp((*argv)[i], "--cpumask")) { + const char * cpumask = (*argv)[i+1]; + if (cpumask) { + char *end; + unsigned res; + if (!strncmp(cpumask, "0x", 2)) { + cpumask += 2; + res = (unsigned) strtoul(cpumask, &end, 16); + } else { + res = (unsigned) strtoul(cpumask, &end, 0); + } + if (end != cpumask && !end[0]) { + dav1d_set_cpu_flags_mask(res); + } + } + break; + } + } + + for (; i < *argc - 2; i++) { + (*argv)[i] = (*argv)[i + 2]; + } + + *argc = i; + + return 0; +} + + // expects ivf input int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) @@ -146,15 +178,21 @@ dav1d_data_unref(&buf); } - do { - memset(&pic, 0, sizeof(pic)); - err = dav1d_get_picture(ctx, &pic); - if (err == 0) - dav1d_picture_unref(&pic); - } while (err != DAV1D_ERR(EAGAIN)); + memset(&pic, 0, sizeof(pic)); + if ((err = dav1d_get_picture(ctx, &pic)) == 0) { + /* Test calling dav1d_picture_unref() after dav1d_close() */ + do { + Dav1dPicture pic2 = { 0 }; + if ((err = dav1d_get_picture(ctx, &pic2)) == 0) + dav1d_picture_unref(&pic2); + } while (err != DAV1D_ERR(EAGAIN)); + + dav1d_close(&ctx); + dav1d_picture_unref(&pic); + return 0; + } cleanup: - dav1d_flush(ctx); dav1d_close(&ctx); end: return 0; diff -Nru dav1d-0.7.1/tests/libfuzzer/dav1d_fuzzer.h dav1d-0.9.1/tests/libfuzzer/dav1d_fuzzer.h --- dav1d-0.7.1/tests/libfuzzer/dav1d_fuzzer.h 2020-06-21 11:48:55.044126500 +0000 +++ dav1d-0.9.1/tests/libfuzzer/dav1d_fuzzer.h 2021-07-28 21:38:28.917852400 +0000 @@ -31,6 +31,7 @@ #include #include +int LLVMFuzzerInitialize(int *argc, char ***argv); int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size); #endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */ diff -Nru dav1d-0.7.1/tests/libfuzzer/main.c dav1d-0.9.1/tests/libfuzzer/main.c --- dav1d-0.7.1/tests/libfuzzer/main.c 2020-06-21 11:48:55.044126500 +0000 +++ dav1d-0.9.1/tests/libfuzzer/main.c 2021-07-28 21:38:28.917852400 +0000 @@ -40,7 +40,7 @@ // expects ivf input -int main(const int argc, char *const *const argv) { +int main(int argc, char *argv[]) { int ret = -1; FILE *f = NULL; int64_t fsize; @@ -48,6 +48,10 @@ uint8_t *data = NULL; size_t size = 0; + if (LLVMFuzzerInitialize(&argc, &argv)) { + return 1; + } + if (argc != 2) { fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]); return -1; diff -Nru dav1d-0.7.1/tests/meson.build dav1d-0.9.1/tests/meson.build --- dav1d-0.7.1/tests/meson.build 2020-06-21 11:48:55.044126500 +0000 +++ dav1d-0.9.1/tests/meson.build 2021-07-28 21:38:28.921852400 +0000 @@ -31,8 +31,6 @@ subdir_done() endif -libdav1d_nasm_objs_if_needed = [] - if is_asm_enabled checkasm_sources = files( 'checkasm/checkasm.c', @@ -62,25 +60,25 @@ checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects() endforeach - checkasm_nasm_objs = [] - if 
host_machine.cpu_family() == 'aarch64' - checkasm_sources += files('checkasm/arm/checkasm_64.S') + checkasm_asm_objs = [] + checkasm_asm_sources = [] + if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64' + checkasm_asm_sources += files('checkasm/arm/checkasm_64.S') elif host_machine.cpu_family().startswith('arm') - checkasm_sources += files('checkasm/arm/checkasm_32.S') + checkasm_asm_sources += files('checkasm/arm/checkasm_32.S') elif host_machine.cpu_family().startswith('x86') - checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm')) + checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm')) endif - m_lib = cc.find_library('m', required: false) - - if meson.version().version_compare('< 0.48.999') - libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs + if use_gaspp + checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources) + else + checkasm_sources += checkasm_asm_sources endif checkasm = executable('checkasm', checkasm_sources, - checkasm_nasm_objs, - libdav1d_nasm_objs_if_needed, + checkasm_asm_objs, objects: [ checkasm_bitdepth_objs, @@ -94,16 +92,57 @@ thread_dependency, rt_dependency, libdl_dependency, - m_lib, + libm_dependency, ], ) - test('checkasm', checkasm, is_parallel: false) + test('checkasm', checkasm, suite: 'checkasm', timeout: 180, is_parallel: false) + benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench') endif +c99_extension_flag = cc.first_supported_argument( + '-Werror=c11-extensions', + '-Werror=c99-c11-compat', + '-Wc11-extensions', + '-Wc99-c11-compat', +) + +# dav1d_api_headers +foreach header : dav1d_api_headers + target = header + '_test' + + header_test_exe = executable(target, + 'header_test.c', + include_directories: dav1d_inc_dirs, + c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag], + build_by_default: true + ) + + test(target, header_test_exe, suite: 'headers') +endforeach + + # fuzzing binaries -if meson.version().version_compare('>=0.49') - subdir('libfuzzer') +subdir('libfuzzer') + +# seek stress test binary, depends on dav1d cli tool +if get_option('enable_tools') + seek_stress_sources = files('seek_stress.c') + seek_stress = executable('seek_stress', + seek_stress_sources, rev_target, + objects: [ + dav1d.extract_objects('dav1d_cli_parse.c'), + dav1d_input_objs.extract_objects('input/input.c', 'input/ivf.c'), + ], + include_directories: [dav1d_inc_dirs, include_directories('../tools')], + link_with: libdav1d, + dependencies: [ + thread_dependency, + rt_dependency, + getopt_dependency, + libm_dependency, + ], + ) endif # Include dav1d test data repository with additional tests diff -Nru dav1d-0.7.1/tests/seek_stress.c dav1d-0.9.1/tests/seek_stress.c --- dav1d-0.7.1/tests/seek_stress.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/tests/seek_stress.c 2021-07-28 21:38:28.921852400 +0000 @@ -0,0 +1,243 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * Copyright © 2020, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "vcs_version.h" +#include "cli_config.h" + +#include +#include +#include +#include + +#include "dav1d/dav1d.h" +#include "input/input.h" +#include "input/demuxer.h" +#include "dav1d_cli_parse.h" + +#define NUM_RAND_SEEK 3 +#define NUM_REL_SEEK 4 +#define NUM_END_SEEK 2 + +const Demuxer annexb_demuxer = { .name = "" }; +const Demuxer section5_demuxer = { .name = "" }; + +#ifdef _WIN32 +#include +static unsigned get_seed(void) { + return GetTickCount(); +} +#else +#ifdef __APPLE__ +#include +#else +#include +#endif +static unsigned get_seed(void) { +#ifdef __APPLE__ + return (unsigned) mach_absolute_time(); +#elif defined(HAVE_CLOCK_GETTIME) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec); +#endif +} +#endif + +static uint32_t xs_state[4]; + +static void xor128_srand(unsigned seed) { + xs_state[0] = seed; + xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff); + xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff); + xs_state[3] = ~seed; +} + +// xor128 from Marsaglia, George (July 2003). "Xorshift RNGs". +// Journal of Statistical Software. 8 (14). +// doi:10.18637/jss.v008.i14. 
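(Note on the seeding above: xor128_srand() expands the single 32-bit seed into all four state words, so a stress run is reproducible from the seed alone. As a worked example, a hypothetical seed of 0x00000001 would give the state words 0x00000001, 0x0000fffe, 0xffff0001 and 0xfffffffe. The generator itself, taken from the cited paper, follows.)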
+static int xor128_rand(void) { + const uint32_t x = xs_state[0]; + const uint32_t t = x ^ (x << 11); + + xs_state[0] = xs_state[1]; + xs_state[1] = xs_state[2]; + xs_state[2] = xs_state[3]; + uint32_t w = xs_state[3]; + + w = (w ^ (w >> 19)) ^ (t ^ (t >> 8)); + xs_state[3] = w; + + return w >> 1; +} + +static inline int decode_frame(Dav1dPicture *const p, + Dav1dContext *const c, Dav1dData *const data) +{ + int res; + memset(p, 0, sizeof(*p)); + if ((res = dav1d_send_data(c, data)) < 0) { + if (res != DAV1D_ERR(EAGAIN)) { + fprintf(stderr, "Error decoding frame: %s\n", + strerror(DAV1D_ERR(res))); + return res; + } + } + if ((res = dav1d_get_picture(c, p)) < 0) { + if (res != DAV1D_ERR(EAGAIN)) { + fprintf(stderr, "Error decoding frame: %s\n", + strerror(DAV1D_ERR(res))); + return res; + } + } else dav1d_picture_unref(p); + return 0; +} + +static int decode_rand(DemuxerContext *const in, Dav1dContext *const c, + Dav1dData *const data, const double fps) +{ + int res = 0; + Dav1dPicture p; + const int num_frames = xor128_rand() % (int)(fps * 5); + for (int i = 0; i < num_frames; i++) { + if ((res = decode_frame(&p, c, data))) break; + if (input_read(in, data) || data->sz == 0) break; + } + return res; +} + +static int decode_all(DemuxerContext *const in, + Dav1dContext *const c, Dav1dData *const data) +{ + int res = 0; + Dav1dPicture p; + do { if ((res = decode_frame(&p, c, data))) break; + } while (!input_read(in, data) && data->sz > 0); + return res; +} + +static int seek(DemuxerContext *const in, Dav1dContext *const c, + const uint64_t pts, Dav1dData *const data) +{ + int res; + if ((res = input_seek(in, pts))) return res; + Dav1dSequenceHeader seq; + do { if ((res = input_read(in, data))) break; + } while (dav1d_parse_sequence_header(&seq, data->data, data->sz)); + dav1d_flush(c); + return res; +} + +int main(const int argc, char *const *const argv) { + const char *version = dav1d_version(); + if (strcmp(version, DAV1D_VERSION)) { + fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n", + version, DAV1D_VERSION); + return EXIT_FAILURE; + } + + CLISettings cli_settings; + Dav1dSettings lib_settings; + DemuxerContext *in; + Dav1dContext *c; + Dav1dData data; + unsigned total, i_fps[2], i_timebase[2]; + double timebase, spf, fps; + uint64_t pts; + + xor128_srand(get_seed()); + parse(argc, argv, &cli_settings, &lib_settings); + + if (input_open(&in, "ivf", cli_settings.inputfile, + i_fps, &total, i_timebase) < 0 || + !i_timebase[0] || !i_timebase[1] || !i_fps[0] || !i_fps[1]) + { + return EXIT_SUCCESS; + } + if (dav1d_open(&c, &lib_settings)) + return EXIT_FAILURE; + + timebase = (double)i_timebase[1] / i_timebase[0]; + spf = (double)i_fps[1] / i_fps[0]; + fps = (double)i_fps[0] / i_fps[1]; + if (fps < 1) goto end; + +#define FRAME_OFFSET_TO_PTS(foff) \ + (uint64_t)llround(((foff) * spf) * 1000000000.0) +#define TS_TO_PTS(ts) \ + (uint64_t)llround(((ts) * timebase) * 1000000000.0) + + // seek at random pts + for (int i = 0; i < NUM_RAND_SEEK; i++) { + pts = FRAME_OFFSET_TO_PTS(xor128_rand() % total); + if (seek(in, c, pts, &data)) continue; + if (decode_rand(in, c, &data, fps)) goto end; + } + pts = TS_TO_PTS(data.m.timestamp); + + // seek left / right randomly with random intervals within 1s + for (int i = 0, tries = 0; + i - tries < NUM_REL_SEEK && tries < NUM_REL_SEEK / 2; + i++) + { + const int sign = xor128_rand() & 1 ? 
-1 : +1; + const float diff = (xor128_rand() % 100) / 100.f; + int64_t new_pts = pts + sign * FRAME_OFFSET_TO_PTS(diff * fps); + const int64_t new_ts = llround(new_pts / (timebase * 1000000000.0)); + new_pts = TS_TO_PTS(new_ts); + if (new_pts < 0 || (uint64_t)new_pts >= FRAME_OFFSET_TO_PTS(total)) { + if (seek(in, c, FRAME_OFFSET_TO_PTS(total / 2), &data)) break; + pts = TS_TO_PTS(data.m.timestamp); + tries++; + continue; + } + if (seek(in, c, new_pts, &data)) + if (seek(in, c, 0, &data)) goto end; + if (decode_rand(in, c, &data, fps)) goto end; + pts = TS_TO_PTS(data.m.timestamp); + } + + unsigned shift = 0; + do { + shift += 5; + if (shift > total) + shift = total; + } while (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data)); + + // simulate seeking after the end of the file + for (int i = 0; i < NUM_END_SEEK; i++) { + if (seek(in, c, FRAME_OFFSET_TO_PTS(total - shift), &data)) goto end; + if (decode_all(in, c, &data)) goto end; + int num_flush = 1 + 64 + xor128_rand() % 64; + while (num_flush--) dav1d_flush(c); + } + +end: + input_close(in); + dav1d_close(&c); + return EXIT_SUCCESS; +} diff -Nru dav1d-0.7.1/THANKS.md dav1d-0.9.1/THANKS.md --- dav1d-0.7.1/THANKS.md 2020-06-21 11:48:54.948126300 +0000 +++ dav1d-0.9.1/THANKS.md 2021-07-28 21:38:28.849851600 +0000 @@ -16,13 +16,16 @@ And all the dav1d Authors (git shortlog -sn), including: -Janne Grunau, Ronald S. Bultje, Martin Storsjö, Henrik Gramner, James Almer, -Marvin Scholz, Luc Trudeau, Jean-Baptiste Kempf, Victorien Le Couviour--Tuffet, -David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Nathan E. Egge, -Francois Cartegnie, Konstantin Pavlov, Liwei Wang, Xuefeng Jiang, -Derek Buitenhuis, Raphaël Zumer, Niklas Haas, Michael Bradshaw, Kyle Siefring, -Raphael Zumer, Boyuan Xiao, Thierry Foucu, Matthias Dressel, Thomas Daede, -Rupert Swarbrick, Jan Beich, Dale Curtis, SmilingWolf, Tristan Laurent, -Vittorio Giovara, Rostislav Pehlivanov, Shiz, skal, Steinar Midtskogen, -Luca Barbato, Justin Bull, Jean-Yves Avenard, Timo Gurr, Fred Barbier, -Anisse Astier, Pablo Stebler, Nicolas Frattaroli, Mark Shuttleworth. +Martin Storsjö, Janne Grunau, Henrik Gramner, Ronald S. Bultje, James Almer, +Marvin Scholz, Luc Trudeau, Victorien Le Couviour--Tuffet, Jean-Baptiste Kempf, +Hugo Beauzée-Luyssen, Matthias Dressel, Konstantin Pavlov, David Michael Barr, +Steve Lhomme, Niklas Haas, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, +Nathan E. Egge, Derek Buitenhuis, Michael Bradshaw, Raphaël Zumer, +Xuefeng Jiang, Luca Barbato, Jan Beich, Wan-Teh Chang, Justin Bull, Boyuan Xiao, +Dale Curtis, Kyle Siefring, Raphael Zumer, Rupert Swarbrick, Thierry Foucu, +Thomas Daede, Colin Lee, Emmanuel Gil Peyrot, Lynne, Michail Alvanos, +Nico Weber, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, +Dmitriy Sychov, Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, +Mark Shuttleworth, Matthieu Bouron, Nicolas Frattaroli, Pablo Stebler, +Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Sylvestre Ledru, Timo Gurr, +Tristan Matthews, Xavier Claessens, Xu Guangxin, kossh1 and skal. 
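Back in the seek_stress tool above, all seeks are expressed as nanosecond pts values: FRAME_OFFSET_TO_PTS scales a frame index by the stream's seconds-per-frame, and TS_TO_PTS scales a container timestamp by its timebase. A minimal standalone sketch of that arithmetic, using assumed stream parameters (30 fps and a 1/90000 timebase, neither taken from the patch):

    /* Sketch only: mirrors seek_stress.c's FRAME_OFFSET_TO_PTS / TS_TO_PTS
     * conversions with assumed stream parameters (30 fps, 1/90000 timebase). */
    #include <inttypes.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const double spf      = 1.0 / 30.0;     /* seconds per frame */
        const double timebase = 1.0 / 90000.0;  /* seconds per container tick */

        /* frame offset 60 -> 2 s -> 2000000000 ns */
        const uint64_t pts_a = (uint64_t)llround(60 * spf * 1000000000.0);

        /* container timestamp 180000 ticks -> 2 s -> 2000000000 ns */
        const uint64_t pts_b = (uint64_t)llround(180000 * timebase * 1000000000.0);

        printf("%" PRIu64 " %" PRIu64 "\n", pts_a, pts_b);  /* both 2000000000 */
        return 0;
    }

Both conversions land on the same pts, which is what lets the test pick a random frame offset, seek to it, and then keep working in pts values re-derived from the demuxer's reported timestamps.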
diff -Nru dav1d-0.7.1/tools/dav1d.c dav1d-0.9.1/tools/dav1d.c --- dav1d-0.7.1/tools/dav1d.c 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/dav1d.c 2021-07-28 21:38:28.921852400 +0000 @@ -124,11 +124,15 @@ else b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)", n, num, 100.0 * n / num); - if (i_fps && b < end) { + if (b < end) { const double d_fps = 1e9 * n / elapsed; - const double speed = d_fps / i_fps; - b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)", - d_fps, i_fps, speed); + if (i_fps) { + const double speed = d_fps / i_fps; + b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)", + d_fps, i_fps, speed); + } else { + b += snprintf(b, end - b, " - %.2lf fps", d_fps); + } } if (!istty) strcpy(b > end - 2 ? end - 2 : b, "\n"); @@ -193,7 +197,6 @@ seq_skip); } - //getc(stdin); if (cli_settings.limit != 0 && cli_settings.limit < total) total = cli_settings.limit; diff -Nru dav1d-0.7.1/tools/dav1d_cli_parse.c dav1d-0.9.1/tools/dav1d_cli_parse.c --- dav1d-0.7.1/tools/dav1d_cli_parse.c 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/dav1d_cli_parse.c 2021-07-28 21:38:28.921852400 +0000 @@ -26,6 +26,7 @@ */ #include "config.h" +#include "cli_config.h" #include #include @@ -51,6 +52,7 @@ ARG_REALTIME_CACHE, ARG_FRAME_THREADS, ARG_TILE_THREADS, + ARG_POSTFILTER_THREADS, ARG_VERIFY, ARG_FILM_GRAIN, ARG_OPPOINT, @@ -73,6 +75,7 @@ { "realtimecache", 1, NULL, ARG_REALTIME_CACHE }, { "framethreads", 1, NULL, ARG_FRAME_THREADS }, { "tilethreads", 1, NULL, ARG_TILE_THREADS }, + { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS }, { "verify", 1, NULL, ARG_VERIFY }, { "filmgrain", 1, NULL, ARG_FILM_GRAIN }, { "oppoint", 1, NULL, ARG_OPPOINT }, @@ -82,6 +85,12 @@ { NULL, 0, NULL, 0 }, }; +#if HAVE_XXHASH_H +#define AVAILABLE_MUXERS "'md5', 'xxh3', 'yuv', 'yuv4mpeg2' or 'null'" +#else +#define AVAILABLE_MUXERS "'md5', 'yuv', 'yuv4mpeg2' or 'null'" +#endif + #if ARCH_AARCH64 || ARCH_ARM #define ALLOWED_CPU_MASKS " or 'neon'" #elif ARCH_PPC64LE @@ -106,8 +115,8 @@ fprintf(stderr, "Supported options:\n" " --input/-i $file: input file\n" " --output/-o $file: output file\n" - " --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from extension)\n" - " --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n" + " --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from content)\n" + " --muxer $name: force muxer type (" AVAILABLE_MUXERS "; default: detect from extension)\n" " --quiet/-q: disable status messages\n" " --frametimes $file: dump frame times to file\n" " --limit/-l $num: stop decoding after $num frames\n" @@ -117,7 +126,8 @@ " --version/-v: print version and exit\n" " --framethreads $num: number of frame threads (default: 1)\n" " --tilethreads $num: number of tile threads (default: 1)\n" - " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n" + " --pfthreads $num: number of postfilter threads (default: 1)\n" + " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5 or xxh3)\n" " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n" " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" " --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n" @@ -198,24 +208,26 @@ { "avx2", X86_CPU_MASK_AVX2 }, { "avx512icl", X86_CPU_MASK_AVX512ICL }, #endif - { 0 }, + { "none", 0 }, }; +#define ARRAY_SIZE(n) 
(sizeof(n)/sizeof(*(n))) + static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl, - const int option, const char *app) + const int tbl_sz, const int option, const char *app) { char str[1024]; strcpy(str, "any of "); - for (int n = 0; tbl[n].str; n++) { + for (int n = 0; n < tbl_sz; n++) { if (!strcmp(tbl[n].str, optarg)) return tbl[n].val; if (n) { - if (!tbl[n + 1].str) - strcat(str, " or "); - else + if (n < tbl_sz - 1) strcat(str, ", "); + else + strcat(str, " or "); } strcat(str, tbl[n].str); } @@ -295,6 +307,10 @@ lib_settings->n_tile_threads = parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]); break; + case ARG_POSTFILTER_THREADS: + lib_settings->n_postfilter_threads = + parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]); + break; case ARG_VERIFY: cli_settings->verify = optarg; break; @@ -325,7 +341,7 @@ fprintf(stderr, "%s\n", dav1d_version()); exit(0); case ARG_CPU_MASK: - dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl, + dav1d_set_cpu_flags_mask(parse_enum(optarg, cpu_mask_tbl, ARRAY_SIZE(cpu_mask_tbl), ARG_CPU_MASK, argv[0])); break; default: @@ -338,8 +354,11 @@ if (cli_settings->verify) { if (cli_settings->outputfile) usage(argv[0], "Verification (--verify) requires output file (-o/--output) to not set"); - if (cli_settings->muxer && !strcmp(cli_settings->muxer, "md5")) - usage(argv[0], "Verification (--verify) requires the md5 muxer (--muxer md5)"); + if (cli_settings->muxer && strcmp(cli_settings->muxer, "md5") && + strcmp(cli_settings->muxer, "xxh3")) + { + usage(argv[0], "Verification (--verify) requires a checksum muxer (md5 or xxh3)"); + } cli_settings->outputfile = "-"; if (!cli_settings->muxer) @@ -347,7 +366,8 @@ } if (!grain_specified && cli_settings->muxer && - !strcmp(cli_settings->muxer, "md5")) + (!strcmp(cli_settings->muxer, "md5") || + !strcmp(cli_settings->muxer, "xxh3"))) { lib_settings->apply_grain = 0; } diff -Nru dav1d-0.7.1/tools/dav1d.manifest dav1d-0.9.1/tools/dav1d.manifest --- dav1d-0.7.1/tools/dav1d.manifest 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/tools/dav1d.manifest 2021-07-28 21:38:28.921852400 +0000 @@ -0,0 +1,10 @@ + + + + + + true + UTF-8 + + + diff -Nru dav1d-0.7.1/tools/dav1d.rc.in dav1d-0.9.1/tools/dav1d.rc.in --- dav1d-0.7.1/tools/dav1d.rc.in 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/tools/dav1d.rc.in 2021-07-28 21:38:28.921852400 +0000 @@ -0,0 +1,33 @@ +#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0 +#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@" +#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0 +#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@" + +#include + +1 RT_MANIFEST "dav1d.manifest" +1 VERSIONINFO +FILETYPE VFT_APP +FILEOS VOS_NT_WINDOWS32 +PRODUCTVERSION PROJECT_VERSION_NUMBER +FILEVERSION API_VERSION_NUMBER +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + BEGIN + VALUE "CompanyName", "VideoLAN" + VALUE "ProductName", "dav1d" + VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR + VALUE "FileVersion", API_VERSION_NUMBER_STR + VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder" + VALUE "InternalName", "dav1d" + VALUE "OriginalFilename", "dav1d.exe" + VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors" + END + END + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1252 + END +END diff -Nru 
dav1d-0.7.1/tools/input/annexb.c dav1d-0.9.1/tools/input/annexb.c --- dav1d-0.7.1/tools/input/annexb.c 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/input/annexb.c 2021-07-28 21:38:28.921852400 +0000 @@ -191,5 +191,6 @@ .probe_sz = PROBE_SIZE, .open = annexb_open, .read = annexb_read, + .seek = NULL, .close = annexb_close, }; diff -Nru dav1d-0.7.1/tools/input/demuxer.h dav1d-0.9.1/tools/input/demuxer.h --- dav1d-0.7.1/tools/input/demuxer.h 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/input/demuxer.h 2021-07-28 21:38:28.921852400 +0000 @@ -39,6 +39,7 @@ int (*open)(DemuxerPriv *ctx, const char *filename, unsigned fps[2], unsigned *num_frames, unsigned timebase[2]); int (*read)(DemuxerPriv *ctx, Dav1dData *data); + int (*seek)(DemuxerPriv *ctx, uint64_t pts); void (*close)(DemuxerPriv *ctx); } Demuxer; diff -Nru dav1d-0.7.1/tools/input/input.c dav1d-0.9.1/tools/input/input.c --- dav1d-0.7.1/tools/input/input.c 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/input/input.c 2021-07-28 21:38:28.921852400 +0000 @@ -128,6 +128,10 @@ return ctx->impl->read(ctx->data, data); } +int input_seek(DemuxerContext *const ctx, const uint64_t pts) { + return ctx->impl->seek ? ctx->impl->seek(ctx->data, pts) : -1; +} + void input_close(DemuxerContext *const ctx) { ctx->impl->close(ctx->data); free(ctx); diff -Nru dav1d-0.7.1/tools/input/input.h dav1d-0.9.1/tools/input/input.h --- dav1d-0.7.1/tools/input/input.h 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/input/input.h 2021-07-28 21:38:28.921852400 +0000 @@ -36,6 +36,7 @@ const char *const name, const char *const filename, unsigned fps[2], unsigned *num_frames, unsigned timebase[2]); int input_read(DemuxerContext *ctx, Dav1dData *data); +int input_seek(DemuxerContext *ctx, uint64_t pts); void input_close(DemuxerContext *ctx); #endif /* DAV1D_INPUT_INPUT_H */ diff -Nru dav1d-0.7.1/tools/input/ivf.c dav1d-0.9.1/tools/input/ivf.c --- dav1d-0.7.1/tools/input/ivf.c 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/input/ivf.c 2021-07-28 21:38:28.921852400 +0000 @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -38,6 +39,10 @@ typedef struct DemuxerPriv { FILE *f; + int broken; + double timebase; + uint64_t last_ts; + uint64_t step; } IvfInputContext; static const uint8_t probe_data[] = { @@ -61,13 +66,12 @@ static int ivf_open(IvfInputContext *const c, const char *const file, unsigned fps[2], unsigned *const num_frames, unsigned timebase[2]) { - size_t res; uint8_t hdr[32]; if (!(c->f = fopen(file, "rb"))) { fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); return -1; - } else if ((res = fread(hdr, 32, 1, c->f)) != 1) { + } else if (fread(hdr, 32, 1, c->f) != 1) { fprintf(stderr, "Failed to read stream header: %s\n", strerror(errno)); fclose(c->f); return -1; @@ -87,11 +91,17 @@ timebase[1] = rl32(&hdr[20]); const unsigned duration = rl32(&hdr[24]); - uint8_t data[4]; + uint8_t data[8]; + c->broken = 0; for (*num_frames = 0;; (*num_frames)++) { - if ((res = fread(data, 4, 1, c->f)) != 1) - break; // EOF - fseeko(c->f, rl32(data) + 8, SEEK_CUR); + if (fread(data, 4, 1, c->f) != 1) break; // EOF + size_t sz = rl32(data); + if (fread(data, 8, 1, c->f) != 1) break; // EOF + const uint64_t ts = rl64(data); + if (*num_frames && ts <= c->last_ts) + c->broken = 1; + c->last_ts = ts; + fseeko(c->f, sz, SEEK_CUR); } uint64_t fps_num = (uint64_t) timebase[0] * *num_frames; @@ -113,34 +123,68 @@ } else { fps[0] = fps[1] = 0; } + c->timebase = (double)timebase[0] / timebase[1]; + 
c->step = duration / *num_frames; fseeko(c->f, 32, SEEK_SET); + c->last_ts = 0; return 0; } -static int ivf_read(IvfInputContext *const c, Dav1dData *const buf) { +static inline int ivf_read_header(IvfInputContext *const c, ptrdiff_t *const sz, + int64_t *const off_, uint64_t *const ts) +{ uint8_t data[8]; - uint8_t *ptr; - size_t res; + int64_t const off = ftello(c->f); + if (off_) *off_ = off; + if (fread(data, 4, 1, c->f) != 1) return -1; // EOF + *sz = rl32(data); + if (!c->broken) { + if (fread(data, 8, 1, c->f) != 1) return -1; + *ts = rl64(data); + } else { + if (fseeko(c->f, 8, SEEK_CUR)) return -1; + *ts = off > 32 ? c->last_ts + c->step : 0; + } + return 0; +} - const int64_t off = ftello(c->f); - if ((res = fread(data, 4, 1, c->f)) != 1) - return -1; // EOF - const ptrdiff_t sz = rl32(data); - if ((res = fread(data, 8, 1, c->f)) != 1) - return -1; // EOF - ptr = dav1d_data_create(buf, sz); - if (!ptr) return -1; - buf->m.offset = off; - buf->m.timestamp = rl64(data); - if ((res = fread(ptr, sz, 1, c->f)) != 1) { +static int ivf_read(IvfInputContext *const c, Dav1dData *const buf) { + uint8_t *ptr; + ptrdiff_t sz; + int64_t off; + uint64_t ts; + if (ivf_read_header(c, &sz, &off, &ts)) return -1; + if (!(ptr = dav1d_data_create(buf, sz))) return -1; + if (fread(ptr, sz, 1, c->f) != 1) { fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno)); dav1d_data_unref(buf); return -1; } + buf->m.offset = off; + buf->m.timestamp = ts; + c->last_ts = ts; + return 0; +} +static int ivf_seek(IvfInputContext *const c, const uint64_t pts) { + uint64_t cur; + const uint64_t ts = llround((pts * c->timebase) / 1000000000.0); + if (ts <= c->last_ts) + if (fseeko(c->f, 32, SEEK_SET)) goto error; + while (1) { + ptrdiff_t sz; + if (ivf_read_header(c, &sz, NULL, &cur)) goto error; + if (cur >= ts) break; + if (fseeko(c->f, sz, SEEK_CUR)) goto error; + c->last_ts = cur; + } + if (fseeko(c->f, -12, SEEK_CUR)) goto error; return 0; +error: + fprintf(stderr, "Failed to seek: %s\n", strerror(errno)); + return -1; } static void ivf_close(IvfInputContext *const c) { @@ -154,5 +198,6 @@ .probe_sz = sizeof(probe_data), .open = ivf_open, .read = ivf_read, + .seek = ivf_seek, .close = ivf_close, }; diff -Nru dav1d-0.7.1/tools/input/parse.h dav1d-0.9.1/tools/input/parse.h --- dav1d-0.7.1/tools/input/parse.h 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/input/parse.h 2021-07-28 21:38:28.921852400 +0000 @@ -29,22 +29,24 @@ #ifndef DAV1D_INPUT_PARSE_H #define DAV1D_INPUT_PARSE_H +#include + #include "dav1d/headers.h" static int leb128(FILE *const f, size_t *const len) { + uint64_t val = 0; unsigned i = 0, more; - *len = 0; do { - uint8_t byte; - if (fread(&byte, 1, 1, f) < 1) + uint8_t v; + if (fread(&v, 1, 1, f) < 1) return -1; - more = byte & 0x80; - const unsigned bits = byte & 0x7f; - if (i <= 3 || (i == 4 && bits < (1 << 4))) - *len |= bits << (i * 7); - else if (bits) return -1; - if (++i == 8 && more) return -1; - } while (more); + more = v & 0x80; + val |= ((uint64_t) (v & 0x7F)) << (i * 7); + i++; + } while (more && i < 8); + if (val > UINT_MAX || more) + return -1; + *len = (size_t) val; return i; } @@ -52,18 +54,18 @@ // with author's permission static int leb(const uint8_t *ptr, int sz, size_t *const len) { + uint64_t val = 0; unsigned i = 0, more; - *len = 0; do { if (!sz--) return -1; - const int byte = *ptr++; - more = byte & 0x80; - const unsigned bits = byte & 0x7f; - if (i <= 3 || (i == 4 && bits < (1 << 4))) - *len |= bits << (i * 7); - else if (bits) return -1; - if (++i == 
8 && more) return -1; - } while (more); + const int v = *ptr++; + more = v & 0x80; + val |= ((uint64_t) (v & 0x7F)) << (i * 7); + i++; + } while (more && i < 8); + if (val > UINT_MAX || more) + return -1; + *len = (size_t) val; return i; } diff -Nru dav1d-0.7.1/tools/input/section5.c dav1d-0.9.1/tools/input/section5.c --- dav1d-0.7.1/tools/input/section5.c 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/input/section5.c 2021-07-28 21:38:28.921852400 +0000 @@ -181,5 +181,6 @@ .probe_sz = PROBE_SIZE, .open = section5_open, .read = section5_read, + .seek = NULL, .close = section5_close, }; diff -Nru dav1d-0.7.1/tools/meson.build dav1d-0.9.1/tools/meson.build --- dav1d-0.7.1/tools/meson.build 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/meson.build 2021-07-28 21:38:28.921852400 +0000 @@ -39,6 +39,18 @@ 'output/yuv.c', ) +# hacky check for xxhash.h to allow copying it to tools/output +if not get_option('xxhash_muxer').disabled() + xxhash_include = '-I' + meson.current_source_dir() / 'output' + if cc.has_header_symbol('xxhash.h', 'XXH3_createState', args : xxhash_include) + dav1d_output_sources += 'output/xxhash.c' + xxh3_found = true + elif get_option('xxhash_muxer').enabled() + # manual error since 'required' kw arg in has_header_symbol() was only added in meson 0.50 + error( 'C symbol XXH3_createState not found in header xxhash.h') + endif +endif + dav1d_input_objs = static_library('dav1d_input', dav1d_input_sources, @@ -69,6 +81,8 @@ # Configuratin data for cli_config.h cli_cdata = configuration_data() +cli_cdata.set10('HAVE_XXHASH_H', get_variable('xxh3_found', false)) + cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_cdata) # dav1d cli tool sources @@ -77,12 +91,33 @@ 'dav1d_cli_parse.c', ) +if host_machine.system() == 'windows' + rc_file = configure_file( + input : 'dav1d.rc.in', + output : 'dav1d.rc', + configuration : rc_data + ) + + dav1d_rc_obj = winmod.compile_resources(rc_file, + depend_files : files('dav1d.manifest'), + include_directories : include_directories('.') + ) +else + dav1d_rc_obj = [] +endif + dav1d = executable('dav1d', dav1d_sources, + dav1d_rc_obj, rev_target, cli_config_h_target, link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs], include_directories : [dav1d_inc_dirs], - dependencies : [getopt_dependency, thread_dependency, rt_dependency], + dependencies : [ + getopt_dependency, + thread_dependency, + rt_dependency, + libm_dependency, + ], install : true, ) diff -Nru dav1d-0.7.1/tools/output/md5.c dav1d-0.9.1/tools/output/md5.c --- dav1d-0.7.1/tools/output/md5.c 2020-06-21 11:48:55.048126500 +0000 +++ dav1d-0.9.1/tools/output/md5.c 2021-07-28 21:38:28.921852400 +0000 @@ -37,14 +37,7 @@ #include "output/muxer.h" -static const uint8_t s[][4] = { - { 7, 12, 17, 22, }, - { 5, 9, 14, 20, }, - { 4, 11, 16, 23, }, - { 6, 10, 15, 21, }, -}; - -static const unsigned k[] = { +static const uint32_t k[64] = { 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, @@ -63,7 +56,6 @@ 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, }; - #if ENDIANNESS_BIG #define NE2LE_32(x) (((x & 0x00ff) << 24) |\ ((x & 0xff00) << 8) |\ @@ -85,8 +77,11 @@ #endif typedef struct MuxerPriv { - unsigned abcd[4]; - uint8_t data[64]; + uint32_t abcd[4]; + union { + uint8_t data[64]; + uint32_t data32[16]; + }; uint64_t len; FILE *f; #if ENDIANNESS_BIG @@ -120,42 +115,48 @@ return 0; } -static inline unsigned leftrotate(const unsigned x, const unsigned 
c) { +static inline uint32_t leftrotate(const uint32_t x, const int c) { return (x << c) | (x >> (32 - c)); } -static void md5_body(MD5Context *md5, const uint8_t *const _data) { - const uint32_t *data = (uint32_t *) _data; - - unsigned a = md5->abcd[0]; - unsigned b = md5->abcd[1]; - unsigned c = md5->abcd[2]; - unsigned d = md5->abcd[3]; - unsigned i; - - for (i = 0; i < 64; i++) { - unsigned f, g, tmp; - - if (i < 16) { - f = (b & c) | (~b & d); - g = i; - } else if (i < 32) { - f = (d & b) | (~d & c); - g = (5 * i + 1) & 15; - } else if (i < 48) { - f = b ^ c ^ d; - g = (3 * i + 5) & 15; - } else { - f = c ^ (b | ~d); - g = (7 * i) & 15; - } - - tmp = d; - d = c; - c = b; - b += leftrotate(a + f + k[i] + NE2LE_32(data[g]), s[i >> 4][i & 3]); - a = tmp; - } +#define F(i) do { \ + a = b + leftrotate(a + ((b & c) | (~b & d)) + k[i + 0] + NE2LE_32(data[i + 0]), 7); \ + d = a + leftrotate(d + ((a & b) | (~a & c)) + k[i + 1] + NE2LE_32(data[i + 1]), 12); \ + c = d + leftrotate(c + ((d & a) | (~d & b)) + k[i + 2] + NE2LE_32(data[i + 2]), 17); \ + b = c + leftrotate(b + ((c & d) | (~c & a)) + k[i + 3] + NE2LE_32(data[i + 3]), 22); \ +} while (0) + +#define G(i) do { \ + a = b + leftrotate(a + ((d & b) | (~d & c)) + k[i + 0] + NE2LE_32(data[(i + 1) & 15]), 5); \ + d = a + leftrotate(d + ((c & a) | (~c & b)) + k[i + 1] + NE2LE_32(data[(i + 6) & 15]), 9); \ + c = d + leftrotate(c + ((b & d) | (~b & a)) + k[i + 2] + NE2LE_32(data[(i + 11) & 15]), 14); \ + b = c + leftrotate(b + ((a & c) | (~a & d)) + k[i + 3] + NE2LE_32(data[(i + 0) & 15]), 20); \ +} while (0) + +#define H(i) do { \ + a = b + leftrotate(a + (b ^ c ^ d) + k[i + 0] + NE2LE_32(data[( 5 - i) & 15]), 4); \ + d = a + leftrotate(d + (a ^ b ^ c) + k[i + 1] + NE2LE_32(data[( 8 - i) & 15]), 11); \ + c = d + leftrotate(c + (d ^ a ^ b) + k[i + 2] + NE2LE_32(data[(11 - i) & 15]), 16); \ + b = c + leftrotate(b + (c ^ d ^ a) + k[i + 3] + NE2LE_32(data[(14 - i) & 15]), 23); \ +} while (0) + +#define I(i) do { \ + a = b + leftrotate(a + (c ^ (b | ~d)) + k[i + 0] + NE2LE_32(data[( 0 - i) & 15]), 6); \ + d = a + leftrotate(d + (b ^ (a | ~c)) + k[i + 1] + NE2LE_32(data[( 7 - i) & 15]), 10); \ + c = d + leftrotate(c + (a ^ (d | ~b)) + k[i + 2] + NE2LE_32(data[(14 - i) & 15]), 15); \ + b = c + leftrotate(b + (d ^ (c | ~a)) + k[i + 3] + NE2LE_32(data[( 5 - i) & 15]), 21); \ +} while (0) + +static void md5_body(MD5Context *const md5, const uint32_t *const data) { + uint32_t a = md5->abcd[0]; + uint32_t b = md5->abcd[1]; + uint32_t c = md5->abcd[2]; + uint32_t d = md5->abcd[3]; + + F( 0); F( 4); F( 8); F(12); + G(16); G(20); G(24); G(28); + H(32); H(36); H(40); H(44); + I(48); I(52); I(56); I(60); md5->abcd[0] += a; md5->abcd[1] += b; @@ -167,19 +168,19 @@ if (!len) return; if (md5->len & 63) { - const unsigned tmp = imin(len, 64 - (md5->len & 63)); + const unsigned tmp = umin(len, 64 - (md5->len & 63)); memcpy(&md5->data[md5->len & 63], data, tmp); len -= tmp; data += tmp; md5->len += tmp; if (!(md5->len & 63)) - md5_body(md5, md5->data); + md5_body(md5, md5->data32); } while (len >= 64) { memcpy(md5->data, data, 64); - md5_body(md5, md5->data); + md5_body(md5, md5->data32); md5->len += 64; data += 64; len -= 64; @@ -251,12 +252,12 @@ static void md5_finish(MD5Context *const md5) { static const uint8_t bit[2] = { 0x80, 0x00 }; - uint64_t len = NE2LE_64(md5->len << 3); + const uint64_t len = NE2LE_64(md5->len << 3); md5_update(md5, &bit[0], 1); while ((md5->len & 63) != 56) md5_update(md5, &bit[1], 1); - md5_update(md5, (uint8_t *) &len, 8); + 
md5_update(md5, (const uint8_t *) &len, 8); } static void md5_close(MD5Context *const md5) { @@ -278,23 +279,20 @@ fclose(md5->f); } -static int md5_verify(MD5Context *const md5, const char *const md5_str) { +static int md5_verify(MD5Context *const md5, const char *md5_str) { md5_finish(md5); if (strlen(md5_str) < 32) - return 0; + return -1; - const char *p = md5_str; - unsigned abcd[4] = { 0 }; + uint32_t abcd[4] = { 0 }; char t[3] = { 0 }; for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - unsigned val; + for (int j = 0; j < 32; j += 8) { char *ignore; - memcpy(t, p, 2); - p += 2; - val = (unsigned) strtoul(t, &ignore, 16); - abcd[i] |= val << (8 * j); + memcpy(t, md5_str, 2); + md5_str += 2; + abcd[i] |= (uint32_t) strtoul(t, &ignore, 16) << j; } } diff -Nru dav1d-0.7.1/tools/output/output.c dav1d-0.9.1/tools/output/output.c --- dav1d-0.7.1/tools/output/output.c 2020-06-21 11:48:55.052126400 +0000 +++ dav1d-0.9.1/tools/output/output.c 2021-07-28 21:38:28.921852400 +0000 @@ -26,6 +26,7 @@ */ #include "config.h" +#include "cli_config.h" #include #include @@ -44,11 +45,15 @@ extern const Muxer null_muxer; extern const Muxer md5_muxer; +extern const Muxer xxh3_muxer; extern const Muxer yuv_muxer; extern const Muxer y4m2_muxer; static const Muxer *muxers[] = { &null_muxer, &md5_muxer, +#if HAVE_XXHASH_H + &xxh3_muxer, +#endif &yuv_muxer, &y4m2_muxer, NULL diff -Nru dav1d-0.7.1/tools/output/xxhash.c dav1d-0.9.1/tools/output/xxhash.c --- dav1d-0.7.1/tools/output/xxhash.c 1970-01-01 00:00:00.000000000 +0000 +++ dav1d-0.9.1/tools/output/xxhash.c 2021-07-28 21:38:28.921852400 +0000 @@ -0,0 +1,142 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018-2021, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include + +#define XXH_INLINE_ALL +#include "xxhash.h" + +#include "output/muxer.h" + +typedef struct MuxerPriv { + XXH3_state_t* state; + FILE *f; +} xxh3Context; + +static int xxh3_open(xxh3Context *const xxh3, const char *const file, + const Dav1dPictureParameters *const p, + const unsigned fps[2]) +{ + xxh3->state = XXH3_createState(); + if (!xxh3->state) return DAV1D_ERR(ENOMEM); + XXH_errorcode err = XXH3_128bits_reset(xxh3->state); + if (err != XXH_OK) { + XXH3_freeState(xxh3->state); + xxh3->state = NULL; + return DAV1D_ERR(ENOMEM); + } + + if (!strcmp(file, "-")) { + xxh3->f = stdout; + } else if (!(xxh3->f = fopen(file, "wb"))) { + XXH3_freeState(xxh3->state); + xxh3->state = NULL; + fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno)); + return -1; + } + + return 0; +} + +static int xxh3_write(xxh3Context *const xxh3, Dav1dPicture *const p) { + const int hbd = p->p.bpc > 8; + const int w = p->p.w, h = p->p.h; + uint8_t *yptr = p->data[0]; + + for (int y = 0; y < h; y++) { + XXH3_128bits_update(xxh3->state, yptr, w << hbd); + yptr += p->stride[0]; + } + + if (p->p.layout != DAV1D_PIXEL_LAYOUT_I400) { + const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int cw = (w + ss_hor) >> ss_hor; + const int ch = (h + ss_ver) >> ss_ver; + for (int pl = 1; pl <= 2; pl++) { + uint8_t *uvptr = p->data[pl]; + + for (int y = 0; y < ch; y++) { + XXH3_128bits_update(xxh3->state, uvptr, cw << hbd); + uvptr += p->stride[1]; + } + } + } + + dav1d_picture_unref(p); + + return 0; +} + +static void xxh3_close(xxh3Context *const xxh3) { + XXH128_hash_t hash = XXH3_128bits_digest(xxh3->state); + XXH3_freeState(xxh3->state); + XXH128_canonical_t c; + XXH128_canonicalFromHash(&c, hash); + + for (int i = 0; i < 16; i++) + fprintf(xxh3->f, "%2.2x", c.digest[i]); + fprintf(xxh3->f, "\n"); + + if (xxh3->f != stdout) + fclose(xxh3->f); +} + +static int xxh3_verify(xxh3Context *const xxh3, const char * xxh3_str) { + XXH128_hash_t hash = XXH3_128bits_digest(xxh3->state); + XXH3_freeState(xxh3->state); + + if (strlen(xxh3_str) < 32) + return -1; + + XXH128_canonical_t c; + char t[3] = { 0 }; + for (int i = 0; i < 16; i++) { + char *ignore; + memcpy(t, xxh3_str, 2); + xxh3_str += 2; + c.digest[i] = (unsigned char) strtoul(t, &ignore, 16); + } + XXH128_hash_t verify = XXH128_hashFromCanonical(&c); + + return !XXH128_isEqual(hash, verify); +} + +const Muxer xxh3_muxer = { + .priv_data_size = sizeof(xxh3Context), + .name = "xxh3", + .extension = "xxh3", + .write_header = xxh3_open, + .write_picture = xxh3_write, + .write_trailer = xxh3_close, + .verify = xxh3_verify, +}; diff -Nru dav1d-0.7.1/tools/output/y4m2.c dav1d-0.9.1/tools/output/y4m2.c --- dav1d-0.7.1/tools/output/y4m2.c 2020-06-21 11:48:55.052126400 +0000 +++ dav1d-0.9.1/tools/output/y4m2.c 2021-07-28 21:38:28.921852400 +0000 @@ -28,6 +28,7 @@ #include "config.h" #include +#include #include #include #include @@ -77,8 +78,17 @@ chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? 
DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] : ss_names[p->p.layout][p->seq_hdr->hbd]; - fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n", - p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name); + const unsigned fw = p->p.w; + const unsigned fh = p->p.h; + uint64_t aw = (uint64_t)fh * p->frame_hdr->render_width; + uint64_t ah = (uint64_t)fw * p->frame_hdr->render_height; + uint64_t gcd = ah; + for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b); + aw /= gcd; + ah /= gcd; + + fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A%"PRIu64":%"PRIu64" C%s\n", + fw, fh, c->fps[0], c->fps[1], aw, ah, ss_name); return 0; }
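The y4m2 header change above derives a pixel aspect ratio for the new YUV4MPEG2 "A" field by reducing fh*render_width : fw*render_height with a compact Euclidean gcd loop. A small standalone check of that reduction, using assumed dimensions (a 1440x1080 coded frame meant to display as 1920x1080; these numbers are not from the patch):

    /* Sketch only: same gcd reduction as the y4m2 muxer header change,
     * applied to assumed frame/render sizes. */
    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        const unsigned fw = 1440, fh = 1080;              /* coded frame size */
        const unsigned render_w = 1920, render_h = 1080;  /* display size */

        uint64_t aw = (uint64_t)fh * render_w;  /* 2073600 */
        uint64_t ah = (uint64_t)fw * render_h;  /* 1555200 */
        uint64_t gcd = ah;
        for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
        aw /= gcd;  /* 4 */
        ah /= gcd;  /* 3 */

        printf("A%" PRIu64 ":%" PRIu64 "\n", aw, ah);  /* prints A4:3 */
        return 0;
    }

So anamorphic 1440x1080 content is announced with 4:3 pixels, while the common case of equal frame and render sizes reduces to A1:1, matching what most players already assume.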